2828from datetime import datetime
2929import dateparser
3030import sys
31+ from typing import Dict
3132from common .rate_limiter import RateLimiter
3233
3334logger = logging .getLogger (__name__ )
@@ -86,7 +87,7 @@ def execute_search(self, params):
8687 else :
8788 metadata = pd .DataFrame (raw_metadata )
8889 metadata = self .sanitize_metadata (metadata )
89- metadata = filter_duplicates (metadata , original_service )
90+ metadata = filter_duplicates (metadata , original_service , params )
9091 metadata = pd .concat (
9192 [metadata , parse_annotations_for_all (metadata , "subject_orig" )],
9293 axis = 1 ,
@@ -240,7 +241,11 @@ def handle_contentproviders(self, request_id, params):
240241pattern_annotations = re .compile (r"([A-Za-z]+:[\w'\- ]+);?" )
241242
242243
243- def filter_duplicates (df , service ):
244+ def filter_duplicates (df , service , params ):
245+ if logger .isEnabledFor (logging .DEBUG ):
246+ logger .debug (f"Filtering duplicates for service: { service } " )
247+ logger .debug (f"Initial number of records: { len (df )} " )
248+ _log_dataframe (df , params , "initial_records" )
244249 df .drop_duplicates ("id" , inplace = True , keep = "first" )
245250 df ["is_anchor" ] = False
246251 df ["doi_duplicate" ] = False
@@ -303,6 +308,9 @@ def filter_duplicates(df, service):
303308 if c in filtered .columns :
304309 filtered .drop (c , axis = 1 , inplace = True )
305310
311+ if logger .isEnabledFor (logging .DEBUG ):
312+ logger .debug (f"Number of records after filtering: { len (filtered )} " )
313+ _log_dataframe (filtered , params , "filtered_records" )
306314 return filtered
307315
308316
@@ -362,3 +370,22 @@ def sanitize_year(year_str):
362370 sanitized_year = year_str # here we keep the original string
363371
364372 return sanitized_year
373+
374+ def _log_dataframe (df : pd .DataFrame , params : Dict [str , str ], name : str , ):
375+ vis_id = params .get ('vis_id' )
376+
377+ columns_to_print = ['id' , 'title' , 'doi' , 'merged_dois' , 'paper_abstract' , 'link' , 'subject' , 'subject_orig' , 'oa_state' ]
378+
379+ available_columns = df .columns .tolist ()
380+ columns_to_print = [col for col in columns_to_print if col in available_columns ]
381+
382+ transformed = df .copy ().reindex (columns = columns_to_print )
383+
384+ transformed = transformed .fillna (value = 'missing' )
385+
386+ # create folder
387+ folder = f'./output/{ vis_id } '
388+ if not os .path .exists (folder ):
389+ os .makedirs (folder )
390+ file_path = f"{ folder } /{ name } .csv"
391+ transformed .to_csv (file_path , index = False )
0 commit comments