diff --git a/src/harmony/parsing/excel_parser.py b/src/harmony/parsing/excel_parser.py index a775637..0148dd2 100644 --- a/src/harmony/parsing/excel_parser.py +++ b/src/harmony/parsing/excel_parser.py @@ -38,6 +38,9 @@ from harmony.schemas.requests.text import RawFile, Instrument re_header_column = re.compile(r'(?i)(?:question|text|pergunta)') +QUESTION_COLUMN_HINTS = re.compile( + r'(?i)(question|text|pergunta|item|description|scale|statement|content)' +) def clean_option_no(option_could_be_int): @@ -59,39 +62,57 @@ def convert_excel_to_instruments(file: RawFile) -> List[Instrument]: instruments = [] for sheet_idx, (sheet_name, df_questions) in enumerate(sheet_name_to_dataframe.items()): - # check we have 3 columns. If more or less, adjust it by deleting or inserting. - if len(df_questions.columns) > 3: - if str(df_questions[df_questions.columns[3]].iloc[0]).lower() == "filename": - if len(df_questions.columns) > 4 and str( - df_questions[df_questions.columns[4]].iloc[0]).lower() == "language": - df_questions.drop(columns=df_questions.columns[5:], inplace=True) - else: - df_questions.drop(columns=df_questions.columns[4:], inplace=True) - else: - df_questions.drop(columns=df_questions.columns[3:], inplace=True) - elif len(df_questions.columns) < 3: - col_avg_lengths = [0] * len(df_questions.columns) - for col_idx, col_name in enumerate(df_questions.columns): - col_avg_lengths[col_idx] = df_questions[col_name].apply(lambda s: len(str(s))).mean() - biggest_col = int(np.argmax(col_avg_lengths)) - if biggest_col == 0: - df_questions.insert(0, "question_no", [str(n) for n in range(len(df_questions))]) - if len(df_questions.columns) < 3: - df_questions.insert(2, "options", [""] * len(df_questions)) + # Strip blank rows before assignment + df_questions.dropna(how='all', inplace=True) + df_questions.reset_index(drop=True, inplace=True) - # standardise the column names - if len(df_questions.columns) == 3: - df_questions.columns = ["question_no", "question", "options"] - elif len(df_questions.columns) == 4: - df_questions.columns = ["question_no", "question", "options", "filename"] + if len(df_questions) == 0: + continue + + # Find the question column semantically + question_col = None + for col in df_questions.columns: + if QUESTION_COLUMN_HINTS.search(str(col)): + question_col = col + break + + if question_col is None and len(df_questions) > 0: + for col in df_questions.columns: + val = str(df_questions[col].iloc[0]) + if QUESTION_COLUMN_HINTS.search(val): + question_col = col + break + + if question_col is None: + # Fall back: longest average string length column + avg_lens = df_questions.apply(lambda c: c.astype(str).str.len().mean()) + question_col = avg_lens.idxmax() + + question_col_idx = df_questions.columns.get_loc(question_col) + + # Ensure we have question_no, question, options + if question_col_idx > 0: + question_no_col = df_questions.columns[0] + else: + df_questions.insert(0, "generated_question_no", [str(n) for n in range(len(df_questions))]) + question_no_col = "generated_question_no" + question_col_idx += 1 + + if question_col_idx < len(df_questions.columns) - 1: + options_col = df_questions.columns[question_col_idx + 1] else: - df_questions.columns = ["question_no", "question", "options", "filename", "language"] + df_questions["generated_options"] = "" + options_col = "generated_options" + + # standardise the column names + df_questions = df_questions[[question_no_col, question_col, options_col]].copy() + df_questions.columns = ["question_no", "question", "options"] # Check if header row present, in which case remove it rows_to_delete = [] for i in range(len(df_questions)): - if df_questions.question.iloc[i] is None or type(df_questions.question.iloc[i]) is not str or \ - re_header_column.match(df_questions.question.iloc[i]): + val = df_questions.question.iloc[i] + if val is None or type(val) is not str or QUESTION_COLUMN_HINTS.search(val): rows_to_delete.append(i) break diff --git a/tests/test_excel_tolerant.py b/tests/test_excel_tolerant.py new file mode 100644 index 0000000..1d7f899 --- /dev/null +++ b/tests/test_excel_tolerant.py @@ -0,0 +1,29 @@ +import base64 +import io +import unittest +import uuid +from harmony.parsing.wrapper_all_parsers import convert_files_to_instruments +from harmony.schemas.requests.text import RawFile + + +class TestExcelTolerantFormat(unittest.TestCase): + def test_excel_tolerant_format(self): + with open("tests/wellbeing-scales-list.xlsx", "rb") as f: + file_as_bytes = f.read() + + file_as_base64 = base64.b64encode(file_as_bytes).decode("ascii") + + harmony_file = RawFile( + file_type="xlsx", + content="," + file_as_base64, + file_id=uuid.uuid4().hex, + file_name="wellbeing-scales-list.xlsx" + ) + + instruments = convert_files_to_instruments([harmony_file]) + + self.assertGreater(len(instruments), 0) + self.assertGreater(len(instruments[0].questions), 0) + + for q in instruments[0].questions: + self.assertTrue(len(q.question_text.strip()) > 0) \ No newline at end of file diff --git a/tests/wellbeing-scales-list.xlsx b/tests/wellbeing-scales-list.xlsx new file mode 100644 index 0000000..29ac2c9 Binary files /dev/null and b/tests/wellbeing-scales-list.xlsx differ