Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 12 additions & 19 deletions scripts/us_epa/parent_company/download_existing_facilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
from pathlib import Path

import pandas as pd
import requests

from absl import app
from absl import flags
Expand All @@ -27,12 +26,10 @@
if str(REPO_ROOT) not in sys.path:
sys.path.insert(0, str(REPO_ROOT))

from util.dc_api_wrapper import get_dc_api_key
from util.dc_api_wrapper import get_datacommons_client

FLAGS = flags.FLAGS

_V2_SPARQL_URL = "https://api.datacommons.org/v2/sparql"


def _define_flags() -> None:
flags.DEFINE_string('output_path', 'tmp_data', 'Output directory')
Expand All @@ -42,23 +39,19 @@ def download_existing_facilities(output_path: str) -> str:
Path(output_path).mkdir(exist_ok=True)
out_file = os.path.join(output_path, 'existing_facilities.csv')

q = "SELECT DISTINCT ?dcid WHERE {?a typeOf EpaReportingFacility . ?a dcid ?dcid }"
headers = {"Content-Type": "application/json"}
api_key = get_dc_api_key()
if api_key:
headers["X-API-Key"] = api_key
response = requests.post(_V2_SPARQL_URL, json={"query": q}, headers=headers)
response.raise_for_status()
res = response.json()

client = get_datacommons_client()
response = client.node.fetch_property_values(
node_dcids="EpaReportingFacility", properties="typeOf", out=False)
facility_nodes = response.get_properties().get("EpaReportingFacility",
{}).get("typeOf", [])
facility_ids = []
for row in res.get('rows', []):
cells = row.get('cells', [])
if not cells:
facility_ids_set = set()
for node in facility_nodes:
value = getattr(node, "dcid", None)
if not value or value in facility_ids_set:
continue
value = cells[0].get('value')
if value:
facility_ids.append(value)
facility_ids_set.add(value)
facility_ids.append(value)

df = pd.DataFrame.from_dict({"epaGhgrpFacilityId": facility_ids})
df.to_csv(out_file, mode="w", header=True, index=False)
Expand Down
74 changes: 33 additions & 41 deletions scripts/us_epa/parent_company/download_existing_facilities_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,62 +19,54 @@
from pathlib import Path
from unittest import mock

import requests_mock
from absl.testing import absltest

REPO_ROOT = Path(__file__).resolve().parents[3]
sys.path.insert(0, str(REPO_ROOT))

from scripts.us_epa.parent_company.download_existing_facilities import (
download_existing_facilities,)
from scripts.us_epa.parent_company.download_existing_facilities import (
_V2_SPARQL_URL,)


class DownloadExistingFacilitiesTest(absltest.TestCase):

def test_download_existing_facilities(self):
response = {
"header": ["?dcid"],
"rows": [
{
"cells": [{
"value": "epaGhgrpFacilityId/1001"
}]
},
{
"cells": [{
"value": "epaGhgrpFacilityId/1002"
}]
},
],
facility_nodes = [
mock.Mock(dcid="epaGhgrpFacilityId/1001"),
mock.Mock(dcid="epaGhgrpFacilityId/1002"),
mock.Mock(dcid="epaGhgrpFacilityId/1001"),
mock.Mock(dcid=None),
]
mock_response = mock.Mock()
mock_response.get_properties.return_value = {
"EpaReportingFacility": {
"typeOf": facility_nodes,
}
}
mock_client = mock.Mock()
mock_client.node.fetch_property_values.return_value = mock_response

with tempfile.TemporaryDirectory() as tmp_dir:
with requests_mock.Mocker() as mocker:
mocker.post(_V2_SPARQL_URL, json=response)
with mock.patch(
"scripts.us_epa.parent_company."
"download_existing_facilities.get_dc_api_key",
return_value="test-key"):
output_path = download_existing_facilities(tmp_dir)
with mock.patch(
"scripts.us_epa.parent_company."
"download_existing_facilities.get_datacommons_client",
return_value=mock_client):
output_path = download_existing_facilities(tmp_dir)

self.assertTrue(os.path.exists(output_path))
with open(output_path, "r", encoding="utf-8") as handle:
contents = handle.read()
self.assertEqual(
contents,
"epaGhgrpFacilityId\n"
"epaGhgrpFacilityId/1001\n"
"epaGhgrpFacilityId/1002\n",
)
self.assertLen(mocker.request_history, 1)
request = mocker.request_history[0]
self.assertEqual(request.headers.get("X-API-Key"), "test-key")
self.assertEqual(
request.json().get("query"),
"SELECT DISTINCT ?dcid WHERE {?a typeOf "
"EpaReportingFacility . ?a dcid ?dcid }",
)
self.assertTrue(os.path.exists(output_path))
with open(output_path, "r", encoding="utf-8") as handle:
contents = handle.read()
self.assertEqual(
contents,
"epaGhgrpFacilityId\n"
"epaGhgrpFacilityId/1001\n"
"epaGhgrpFacilityId/1002\n",
)
mock_client.node.fetch_property_values.assert_called_once_with(
node_dcids="EpaReportingFacility",
properties="typeOf",
out=False,
)


if __name__ == "__main__":
Expand Down
Loading