diff --git a/scripts/us_epa/parent_company/download_existing_facilities.py b/scripts/us_epa/parent_company/download_existing_facilities.py index 1156172e42..1e618231af 100644 --- a/scripts/us_epa/parent_company/download_existing_facilities.py +++ b/scripts/us_epa/parent_company/download_existing_facilities.py @@ -18,7 +18,6 @@ from pathlib import Path import pandas as pd -import requests from absl import app from absl import flags @@ -27,12 +26,10 @@ if str(REPO_ROOT) not in sys.path: sys.path.insert(0, str(REPO_ROOT)) -from util.dc_api_wrapper import get_dc_api_key +from util.dc_api_wrapper import get_datacommons_client FLAGS = flags.FLAGS -_V2_SPARQL_URL = "https://api.datacommons.org/v2/sparql" - def _define_flags() -> None: flags.DEFINE_string('output_path', 'tmp_data', 'Output directory') @@ -42,23 +39,19 @@ def download_existing_facilities(output_path: str) -> str: Path(output_path).mkdir(exist_ok=True) out_file = os.path.join(output_path, 'existing_facilities.csv') - q = "SELECT DISTINCT ?dcid WHERE {?a typeOf EpaReportingFacility . ?a dcid ?dcid }" - headers = {"Content-Type": "application/json"} - api_key = get_dc_api_key() - if api_key: - headers["X-API-Key"] = api_key - response = requests.post(_V2_SPARQL_URL, json={"query": q}, headers=headers) - response.raise_for_status() - res = response.json() - + client = get_datacommons_client() + response = client.node.fetch_property_values( + node_dcids="EpaReportingFacility", properties="typeOf", out=False) + facility_nodes = response.get_properties().get("EpaReportingFacility", + {}).get("typeOf", []) facility_ids = [] - for row in res.get('rows', []): - cells = row.get('cells', []) - if not cells: + facility_ids_set = set() + for node in facility_nodes: + value = getattr(node, "dcid", None) + if not value or value in facility_ids_set: continue - value = cells[0].get('value') - if value: - facility_ids.append(value) + facility_ids_set.add(value) + facility_ids.append(value) df = pd.DataFrame.from_dict({"epaGhgrpFacilityId": facility_ids}) df.to_csv(out_file, mode="w", header=True, index=False) diff --git a/scripts/us_epa/parent_company/download_existing_facilities_test.py b/scripts/us_epa/parent_company/download_existing_facilities_test.py index 12b5c7c6f7..fcdc41a9ac 100644 --- a/scripts/us_epa/parent_company/download_existing_facilities_test.py +++ b/scripts/us_epa/parent_company/download_existing_facilities_test.py @@ -19,7 +19,6 @@ from pathlib import Path from unittest import mock -import requests_mock from absl.testing import absltest REPO_ROOT = Path(__file__).resolve().parents[3] @@ -27,54 +26,47 @@ from scripts.us_epa.parent_company.download_existing_facilities import ( download_existing_facilities,) -from scripts.us_epa.parent_company.download_existing_facilities import ( - _V2_SPARQL_URL,) class DownloadExistingFacilitiesTest(absltest.TestCase): def test_download_existing_facilities(self): - response = { - "header": ["?dcid"], - "rows": [ - { - "cells": [{ - "value": "epaGhgrpFacilityId/1001" - }] - }, - { - "cells": [{ - "value": "epaGhgrpFacilityId/1002" - }] - }, - ], + facility_nodes = [ + mock.Mock(dcid="epaGhgrpFacilityId/1001"), + mock.Mock(dcid="epaGhgrpFacilityId/1002"), + mock.Mock(dcid="epaGhgrpFacilityId/1001"), + mock.Mock(dcid=None), + ] + mock_response = mock.Mock() + mock_response.get_properties.return_value = { + "EpaReportingFacility": { + "typeOf": facility_nodes, + } } + mock_client = mock.Mock() + mock_client.node.fetch_property_values.return_value = mock_response + with tempfile.TemporaryDirectory() as tmp_dir: - with requests_mock.Mocker() as mocker: - mocker.post(_V2_SPARQL_URL, json=response) - with mock.patch( - "scripts.us_epa.parent_company." - "download_existing_facilities.get_dc_api_key", - return_value="test-key"): - output_path = download_existing_facilities(tmp_dir) + with mock.patch( + "scripts.us_epa.parent_company." + "download_existing_facilities.get_datacommons_client", + return_value=mock_client): + output_path = download_existing_facilities(tmp_dir) - self.assertTrue(os.path.exists(output_path)) - with open(output_path, "r", encoding="utf-8") as handle: - contents = handle.read() - self.assertEqual( - contents, - "epaGhgrpFacilityId\n" - "epaGhgrpFacilityId/1001\n" - "epaGhgrpFacilityId/1002\n", - ) - self.assertLen(mocker.request_history, 1) - request = mocker.request_history[0] - self.assertEqual(request.headers.get("X-API-Key"), "test-key") - self.assertEqual( - request.json().get("query"), - "SELECT DISTINCT ?dcid WHERE {?a typeOf " - "EpaReportingFacility . ?a dcid ?dcid }", - ) + self.assertTrue(os.path.exists(output_path)) + with open(output_path, "r", encoding="utf-8") as handle: + contents = handle.read() + self.assertEqual( + contents, + "epaGhgrpFacilityId\n" + "epaGhgrpFacilityId/1001\n" + "epaGhgrpFacilityId/1002\n", + ) + mock_client.node.fetch_property_values.assert_called_once_with( + node_dcids="EpaReportingFacility", + properties="typeOf", + out=False, + ) if __name__ == "__main__":