Skip to content

Commit 9c84c38

Browse files
committed
Support .tar.gz files from Campina
1 parent eaac47e commit 9c84c38

3 files changed

Lines changed: 65 additions & 9 deletions

File tree

src/processing/instrument_process.py

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@
55
import logging
66
import re
77
import shutil
8+
import tarfile
89
from collections import defaultdict
910
from pathlib import Path
1011
from typing import Literal, NoReturn
@@ -160,7 +161,7 @@ def process_mira_10(self) -> None:
160161

161162
def process_mira_35(self) -> None:
162163
full_paths, self.uuid.raw = self.download_instrument(
163-
filename_suffix={".znc", ".znc.gz"},
164+
filename_suffix={".znc", ".znc.gz", ".znc.tar.gz"},
164165
allow_empty=True,
165166
exclude_pattern=r"[a-z]+\.znc",
166167
)
@@ -925,11 +926,11 @@ def _get_valid_uuids(
925926

926927

927928
def _unzip_gz_files(full_paths: list[Path]) -> list[Path]:
928-
paths_out = []
929+
paths_out: list[Path] = []
929930
for path_in in full_paths:
930931
try:
931-
paths_out.append(unzip_gz_file(path_in))
932-
except (EOFError, gzip.BadGzipFile) as err:
932+
paths_out.extend(unzip_gz_file(path_in))
933+
except (EOFError, gzip.BadGzipFile, tarfile.TarError) as err:
933934
logging.warning("Cannot unzip gz file %s: %s", path_in, err)
934935
return paths_out
935936

src/processing/utils.py

Lines changed: 16 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,8 @@
22
import gzip
33
import logging
44
import shutil
5+
import tarfile
6+
from collections.abc import Iterable
57
from pathlib import Path
68
from typing import Literal
79
from uuid import UUID
@@ -220,16 +222,25 @@ def build_file_landing_page_url(uuid: str | UUID) -> str:
220222
return f"{base}/file/{uuid}"
221223

222224

223-
def unzip_gz_file(path_in: Path) -> Path:
225+
def unzip_gz_file(path_in: Path) -> Iterable[Path]:
224226
if path_in.suffix != ".gz":
225-
return path_in
227+
yield path_in
228+
return
226229
path_out = path_in.parent / path_in.stem
227230
logging.debug(f"Decompressing {path_in} to {path_out}")
228231
with gzip.open(path_in, "rb") as file_in:
229-
with open(path_out, "wb") as file_out:
230-
shutil.copyfileobj(file_in, file_out)
232+
if path_out.suffix == ".tar":
233+
with tarfile.TarFile(fileobj=file_in) as tar:
234+
for tarinfo in tar:
235+
info_out = path_in.parent / tarinfo.name
236+
logging.debug("Extracting to %s", info_out)
237+
tar.extract(tarinfo, path_in.parent, filter="data")
238+
yield info_out
239+
else:
240+
with open(path_out, "wb") as file_out:
241+
shutil.copyfileobj(file_in, file_out)
242+
yield path_out
231243
path_in.unlink()
232-
return path_out
233244

234245

235246
def create_product_put_payload(

tests/unit/test_utils_module.py

Lines changed: 44 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,7 @@
1+
import gzip
2+
import shutil
3+
import tarfile
4+
from io import BytesIO
15
from pathlib import Path
26

37
import netCDF4
@@ -9,6 +13,7 @@
913

1014
from processing import netcdf_comparer
1115
from processing.netcdf_comparer import NCDiff
16+
from processing.utils import unzip_gz_file
1217

1318
test_file_path = Path(__file__).parent.absolute()
1419

@@ -614,3 +619,42 @@ def test_compression(tmp_path: Path) -> None:
614619
nc1.createVariable("time", "f8", ("time",), zlib=False)
615620
nc2.createVariable("time", "f8", ("time",), zlib=True)
616621
assert netcdf_comparer.nc_difference(old_file, new_file) == NCDiff.MINOR
622+
623+
624+
def test_unzip_regular_gz_file(tmp_path: Path) -> None:
625+
regular_file = tmp_path / "test.txt"
626+
regular_file.write_text("This is a test file.")
627+
gz_file = tmp_path / "test.txt.gz"
628+
with open(regular_file, "rb") as f_in:
629+
with gzip.open(gz_file, "wb") as f_out:
630+
shutil.copyfileobj(f_in, f_out)
631+
632+
result = list(unzip_gz_file(gz_file))
633+
assert len(result) == 1
634+
assert result[0].name == "test.txt"
635+
assert result[0].read_text() == "This is a test file."
636+
assert not gz_file.exists() # Ensure the .gz file is deleted
637+
638+
639+
def test_unzip_tar_gz_file(tmp_path: Path) -> None:
640+
tar_gz_file = tmp_path / "test.tar.gz"
641+
with tarfile.open(tar_gz_file, "w:gz") as tar:
642+
info = tarfile.TarInfo("subdir/test.txt")
643+
info.size = len(b"This is a test file inside a tar.")
644+
tar.addfile(info, fileobj=BytesIO(b"This is a test file inside a tar."))
645+
646+
result = list(unzip_gz_file(tar_gz_file))
647+
assert len(result) == 1
648+
assert result[0].name == "test.txt"
649+
assert result[0].read_text() == "This is a test file inside a tar."
650+
assert not tar_gz_file.exists() # Ensure the .tar.gz file is deleted
651+
652+
653+
def test_unzip_non_gz_file(tmp_path: Path) -> None:
654+
non_gz_file = tmp_path / "test.txt"
655+
non_gz_file.write_text("This is a non-gz file.")
656+
657+
result = list(unzip_gz_file(non_gz_file))
658+
assert len(result) == 1
659+
assert result[0].name == "test.txt"
660+
assert result[0].read_text() == "This is a non-gz file."

0 commit comments

Comments
 (0)