-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcombine_jsons.py
More file actions
59 lines (47 loc) · 1.86 KB
/
combine_jsons.py
File metadata and controls
59 lines (47 loc) · 1.86 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import glob
import json
import os
from datetime import datetime
import sys
# Directory that filter_items() joins with each item's image name (after
# removing 'images/') to check the image file exists on disk. The path is
# relative, so it is resolved against the current working directory.
images_root = 'scrap_pdfs/output_pages_images'
def combine_jsons(directory, output_file):
    """Merge every ``*.json`` file in *directory* into one filtered JSON file.

    Each input file must contain a JSON array. The arrays are concatenated,
    passed through ``filter_items``, and written to
    ``<YYYYMMDD>_<item count>_<output_file>`` in the current working
    directory.

    Args:
        directory: Folder whose top-level ``*.json`` files are merged.
        output_file: Base name of the output file; the date and the number
            of retained items are prepended to it.

    Raises:
        ValueError: If an input file's top-level JSON value is not a list.

    Note:
        The output is written to the working directory with a ``.json``
        suffix. If *directory* is the working directory, a later run will
        pick up earlier outputs as inputs.
    """
    combined_data = []

    # Sort the matches: glob order is filesystem-dependent, and sorting
    # makes the order of the merged output reproducible.
    json_files = sorted(glob.glob(os.path.join(directory, '*.json')))

    for file in json_files:
        # Explicit UTF-8 so decoding does not depend on the platform locale.
        with open(file, 'r', encoding='utf-8') as f:
            data = json.load(f)
        # list.extend() would silently add dict keys or string characters,
        # so reject anything that is not a JSON array and name the file.
        if not isinstance(data, list):
            raise ValueError(
                f"Expected a JSON array in {file}, got {type(data).__name__}"
            )
        combined_data.extend(data)

    # Drop items that fail the answer/image checks.
    filtered_data = filter_items(combined_data)

    # Prefix the output name with today's date and the retained item count.
    date_prefix = datetime.now().strftime("%Y%m%d")
    output_filename = f"{date_prefix}_{len(filtered_data)}_{output_file}"

    # ensure_ascii=False emits raw non-ASCII characters, so the file must be
    # opened as UTF-8 to avoid UnicodeEncodeError on non-UTF-8 locales.
    with open(output_filename, 'w', encoding='utf-8') as f:
        json.dump(filtered_data, f, ensure_ascii=False, indent=2)

    print(f"Combined JSON files saved to: {output_filename}")
def filter_items(data):
    """Return the items of *data* that pass the answer check and image check.

    An item is kept when ``is_gpt_value_valid`` accepts its
    ``'conversations'`` list and the file named by its ``'image'`` entry
    (with every ``'images/'`` substring removed) exists under
    ``images_root``. Each rejected item is reported on stdout.

    Args:
        data: Iterable of dicts with ``'conversations'`` and ``'image'`` keys.

    Returns:
        A new list holding the retained items in their original order.
    """
    kept = []
    for item in data:
        turns = item['conversations']

        # First gate: the gpt answers must be acceptable.
        if not is_gpt_value_valid(turns):
            print(f"Skipping item with image name: {item['image']}")
            continue

        # Second gate: the referenced image must be present on disk.
        relative_name = item['image'].replace('images/', '')
        image_path = os.path.join(images_root, relative_name)
        if not os.path.exists(image_path):
            print(f"Skipping item with image name: {item['image']} (file not found)")
            continue

        kept.append(item)
    return kept
def is_gpt_value_valid(conversations):
    """Tell whether every 'gpt' turn in *conversations* has a usable value.

    A 'gpt' turn is rejected when its ``'value'`` is empty or
    whitespace-only, or contains the substring ``'&#$'``. Turns from other
    speakers are ignored, so a conversation with no 'gpt' turn is valid.

    Args:
        conversations: Iterable of dicts with ``'from'`` and ``'value'`` keys.

    Returns:
        ``False`` if any 'gpt' turn is rejected, otherwise ``True``.
    """
    # Lazy generator: values are read in order and only for 'gpt' turns,
    # and all() stops at the first rejected one.
    gpt_texts = (
        turn['value'] for turn in conversations if turn['from'] == 'gpt'
    )
    return all(
        text.strip() != '' and '&#$' not in text
        for text in gpt_texts
    )
# Script entry point: python combine_jsons.py <directory>
# Guarded so that importing this module does not read sys.argv or write files.
if __name__ == "__main__":
    # Fail with a usage message instead of a bare IndexError when the
    # directory argument is missing. Extra arguments are ignored.
    if len(sys.argv) < 2:
        sys.exit(f"Usage: {sys.argv[0]} <directory>")
    directory = sys.argv[1]
    output_file = 'combined.json'
    combine_jsons(directory, output_file)