From abf1783c127cb6a0cbafc75c8a73dff24ec8430a Mon Sep 17 00:00:00 2001 From: Rachit-Mangawa <145089194+Rachit-Mangawa@users.noreply.github.com> Date: Thu, 11 Jun 2026 17:33:50 +0530 Subject: [PATCH] fix: tolerate non-standard Excel formats (issue #136) - Drop fully blank rows before column assignment - Add semantic column detection across header row - Fall back to longest-string heuristic when no keyword match - Add regression test with attached wellbeing-scales-list.xlsx Closes #136 --- src/harmony/parsing/excel_parser.py | 75 ++++++++++++++++++---------- tests/test_excel_tolerant.py | 29 +++++++++++ tests/wellbeing-scales-list.xlsx | Bin 0 -> 8617 bytes 3 files changed, 77 insertions(+), 27 deletions(-) create mode 100644 tests/test_excel_tolerant.py create mode 100644 tests/wellbeing-scales-list.xlsx diff --git a/src/harmony/parsing/excel_parser.py b/src/harmony/parsing/excel_parser.py index a775637..0148dd2 100644 --- a/src/harmony/parsing/excel_parser.py +++ b/src/harmony/parsing/excel_parser.py @@ -38,6 +38,9 @@ from harmony.schemas.requests.text import RawFile, Instrument re_header_column = re.compile(r'(?i)(?:question|text|pergunta)') +QUESTION_COLUMN_HINTS = re.compile( + r'(?i)(question|text|pergunta|item|description|scale|statement|content)' +) def clean_option_no(option_could_be_int): @@ -59,39 +62,57 @@ def convert_excel_to_instruments(file: RawFile) -> List[Instrument]: instruments = [] for sheet_idx, (sheet_name, df_questions) in enumerate(sheet_name_to_dataframe.items()): - # check we have 3 columns. If more or less, adjust it by deleting or inserting. - if len(df_questions.columns) > 3: - if str(df_questions[df_questions.columns[3]].iloc[0]).lower() == "filename": - if len(df_questions.columns) > 4 and str( - df_questions[df_questions.columns[4]].iloc[0]).lower() == "language": - df_questions.drop(columns=df_questions.columns[5:], inplace=True) - else: - df_questions.drop(columns=df_questions.columns[4:], inplace=True) - else: - df_questions.drop(columns=df_questions.columns[3:], inplace=True) - elif len(df_questions.columns) < 3: - col_avg_lengths = [0] * len(df_questions.columns) - for col_idx, col_name in enumerate(df_questions.columns): - col_avg_lengths[col_idx] = df_questions[col_name].apply(lambda s: len(str(s))).mean() - biggest_col = int(np.argmax(col_avg_lengths)) - if biggest_col == 0: - df_questions.insert(0, "question_no", [str(n) for n in range(len(df_questions))]) - if len(df_questions.columns) < 3: - df_questions.insert(2, "options", [""] * len(df_questions)) + # Strip blank rows before assignment + df_questions.dropna(how='all', inplace=True) + df_questions.reset_index(drop=True, inplace=True) - # standardise the column names - if len(df_questions.columns) == 3: - df_questions.columns = ["question_no", "question", "options"] - elif len(df_questions.columns) == 4: - df_questions.columns = ["question_no", "question", "options", "filename"] + if len(df_questions) == 0: + continue + + # Find the question column semantically + question_col = None + for col in df_questions.columns: + if QUESTION_COLUMN_HINTS.search(str(col)): + question_col = col + break + + if question_col is None and len(df_questions) > 0: + for col in df_questions.columns: + val = str(df_questions[col].iloc[0]) + if QUESTION_COLUMN_HINTS.search(val): + question_col = col + break + + if question_col is None: + # Fall back: longest average string length column + avg_lens = df_questions.apply(lambda c: c.astype(str).str.len().mean()) + question_col = avg_lens.idxmax() + + question_col_idx = df_questions.columns.get_loc(question_col) + + # Ensure we have question_no, question, options + if question_col_idx > 0: + question_no_col = df_questions.columns[0] + else: + df_questions.insert(0, "generated_question_no", [str(n) for n in range(len(df_questions))]) + question_no_col = "generated_question_no" + question_col_idx += 1 + + if question_col_idx < len(df_questions.columns) - 1: + options_col = df_questions.columns[question_col_idx + 1] else: - df_questions.columns = ["question_no", "question", "options", "filename", "language"] + df_questions["generated_options"] = "" + options_col = "generated_options" + + # standardise the column names + df_questions = df_questions[[question_no_col, question_col, options_col]].copy() + df_questions.columns = ["question_no", "question", "options"] # Check if header row present, in which case remove it rows_to_delete = [] for i in range(len(df_questions)): - if df_questions.question.iloc[i] is None or type(df_questions.question.iloc[i]) is not str or \ - re_header_column.match(df_questions.question.iloc[i]): + val = df_questions.question.iloc[i] + if val is None or type(val) is not str or QUESTION_COLUMN_HINTS.search(val): rows_to_delete.append(i) break diff --git a/tests/test_excel_tolerant.py b/tests/test_excel_tolerant.py new file mode 100644 index 0000000..1d7f899 --- /dev/null +++ b/tests/test_excel_tolerant.py @@ -0,0 +1,29 @@ +import base64 +import io +import unittest +import uuid +from harmony.parsing.wrapper_all_parsers import convert_files_to_instruments +from harmony.schemas.requests.text import RawFile + + +class TestExcelTolerantFormat(unittest.TestCase): + def test_excel_tolerant_format(self): + with open("tests/wellbeing-scales-list.xlsx", "rb") as f: + file_as_bytes = f.read() + + file_as_base64 = base64.b64encode(file_as_bytes).decode("ascii") + + harmony_file = RawFile( + file_type="xlsx", + content="," + file_as_base64, + file_id=uuid.uuid4().hex, + file_name="wellbeing-scales-list.xlsx" + ) + + instruments = convert_files_to_instruments([harmony_file]) + + self.assertGreater(len(instruments), 0) + self.assertGreater(len(instruments[0].questions), 0) + + for q in instruments[0].questions: + self.assertTrue(len(q.question_text.strip()) > 0) \ No newline at end of file diff --git a/tests/wellbeing-scales-list.xlsx b/tests/wellbeing-scales-list.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..29ac2c9598884bfc0732f773eae420ac1691d360 GIT binary patch literal 8617 zcmaiZ1yo$ivNi-8G`Kq?xVyUr8{D12-GaMoa0|gLxH|-QcL)~T-9z%jIrqQG&3pH} z+H243>Akw9yX>p30!l+bqJsed05CzBL2a<#2>1Ef)rvvS0c7RK@ZQ$J($Ln{lHS$Y zivDG!Oi{?9n-RHn7sK$h?kF!%k#-jquBl)jBCxey#!AYJ!}{X!v#c3DAR8vLXT-uZ z*kE|pey^88{Ge3{Xv_t0>YLB7#PB>?{iLg}GCD+elQxZ2twS8^f=^XBvZ3;h852e} z_4VFV-nsq6M|~^Mgh_=~!X>ZX5a*olfw+>D1(dxK7O_jA@6X8_*=ZbY4@c~Ya{~B5 zbFBERXu4{DJ=eDpv|)a&nFjnI=x<6fPKfq>F-KTROFs9F6cGdSkUq+Ax0k#}-`u9? zOQY309A*01%8h08x<=l)IXcX80Ht97?GxNM>(5dsegy^w{I7C){kNQcmr#__pv^KP za@#Ep2EvQ zQm24ExH3}A8;BjhAt(A7dc!rM_|*hA%%-8H88}iG@L`)IB1{VFo8mc-m{l6QD%7e^ zkL{!HiW-j(anZ%#tf`}o(=u!=A$S|!fT>OqL3Z|3Gy+S-Qqq5RPKc2(s~;W#{kkxF ze77Wztf?%9lOlvAT`lk~Ty=c5F(#v%Mp8OFbG>X^6HFgPGJq~UaP)M*+cX;*n=H1?gNB?>q;?p=mTSiP_K$Y2HEuU`s8;g|LpJTMCb|tJUQra25yQ_jBK4v+u^1q%MSx##sUEU%FZAb@A|#5l7zg1 zM?Hr)Qnm@W0rN077&6kE%S~g(u)=sD+e)~QHpZ`!9+|Uu(I^J{WV)y`$O4d z`&bR%>2Mz;39}Y-%^G$?t;cW%5;wP$G#^ik?laydRlL|uE5NMu5gZK64&tBe2JLUV zaWVs0gBbq$WP0(gUbSVLkBrD(r)ui09MDOJ)agYQ%?Bb3it*mQ^C{9s2-O*RkGBK} z8`YKj?^g1Nt6V*IMygD$YLe$Wq@xK_mBrq$Cpj8ug~DpS2^wIZ8*e#&91wno7K^M4 zTVHh}qcN{`N5I4Qw^&E-fE!=g~YUhd; z8(q1?0n0`gXNoz#^CM^?aDlYY!)n}|(2zYR`!V}b0Li7Cqq73Km4nC#x27j^y$$?y z{H(|2{ketzEsQ5)w?||kefl~gGfR3(KCs2qEpn{I41+R=DWyAAH+ zi&&uAl5ng+%*X-*eJ{*Id|kTkh1=S)Dy6rBXXV^WGTF_FqoKO%L8%^;9ja0df**Hf zU$K7_-kZY>>>gvnh%L4!UgOLp9J#^r4=S1V=eps_5gU`9JTqfAxF(*$r)-}W7Y|1r zi!<@&Q5^GT{EWgOmN%-Dwp$Ld5g?O6V+riL%X=U;M9e~68}kO z>)P(ylu|!|7CDWWuAeYtSI~6=Sph;+NF1OQb%oZWR&MAn%b(&9-9DZe>9WaB3^GAI z&j}PJ7-zuB!RH zVH+`Pcp&R>C%K&AM$CXq_Qw1vx7>7IkTrFhDlpTEXwC6tFsh6*dJNM)7v?jfl7l}C z42ih%TWSz<=6d?#)h4WXA8IOUIeErBvYfSAKcU&!SCX_ShLfR;c`wea zjBrIp9C9-;6K`(I-wYz=`1;_8zhAS!PF9WOw2lV*1fk7F`w4s3`YCt;!d@BGvN%Zz zJF7sT&|2@)WZU1V*FLL&) z>E&t`(0R0OY$|d+2VB;^}G(hfphCeIjI|G6ky_GiEMB>;6c^8h^{e20CUuKD?|KxmyXoqL_uG>21Ji=<)ZGN1ltfP+Q$#UrCmf?HQ&nzDW*31Kf znInSI+N{Oe483y~h!}equ@glje})w?S6om2Wtjfj~}<41az71s-jhI*#d_&kms1d8Qixd6&D{IPJ{7 zoFSDyRB$%rjV_2U{87%T1>Ku
FqkgVmfZzD`vkD z7k0{BDk(Y?@d$tW*iiD`alTktJJ0?toXZA()zDMr1IP{eB72Fs0ACiUjCG<~e9^Y6 zGMKb&&xZ1b<|;AjgInoUaqI18#cIwn+aZ>ihaWS;M!JoN-5&S2NL7nxfjZ&F$u+`k zB$FsA2AN~FyURTbo21KT2*w-TKcl&*4X6Dqabo9HlJUkY*4>i* zs3N0Bv+3b7Jab#!Ufv#7;lA#=s72Ryvop)Q?antcfJdlrm?8p&EAT!nWlQ#g7msD1zOgMA zq2-64+GzBl_D-F6aquMW6F11;XAHWCoXC(u#Vg_oS+tpS+jrn0c=iAU=9{Cv{|@aJ9U9ShCaGjP|V)0Y`PM zR3Rfkp&@cqdSR^7*^sZ)v zEna**6BXukX5i#<2*T0_a8w8W%#r=)=j9J1m<~(9S7~h47M7*oW);H&$PTZ~UU3Zj zgsTW>Z1nIt%+ccB$$zSIe?=^iQ6FZyy7*aqFP(<3S?^xj*4AW!)H?{^5Unl($tR&L zENUn0%6+Z$)c^_i5ON6vT!vZP;e6SF#M!9dpRO&T~Aie`oQaSPTMw}1lQo>>zhBkVtqc8F%hKaUO@ zx0nRJ1@mDiP5C5%Ke$&OCMW>m?AtfUJ`RvYUg9QBDUt}8Y(w)ahduy1jV{FV2Hxq7=Gfer!EhTZK zu<;FHL1SCcDVr^dus_n7fW($c7D;xRFm0vdkJxdbG_t<<$8rZiL%bM%P5epI?vfE9`0=pR83NuJsrl>aK2DN^Sr0FfCw;^AT1-rvYJsvWf%gbv4 zwEKZy2e{p-k$M~GXl$I4+>77LOU-F{(tlNyd}CC8)lYxUTAbj}X&5Zfo zTUnpw4&L^Xe938(({OJ=?*4~L1FK;9{uTX)tJ+D6YlatO;*trPSB3@yD}D1%$b|l{ ze9X+i0c5P?03817t1c|>9^S<>&u;YF;| zpA^c&-(_a|BDyZ^`EP7N6>%_cS6Q3GMpOW3)v1V7mR+2+?%BMt&>$V)%JOy^vRfc#j-sTwFJrTQ^ zb8q|f?ml?1c&Q|cNLj=fql_kel2)qB6M(l3v3i)*IQvmtR3*KDX;xpO3g#v{pkY@` zOBYJmMY~QR#%aDonFDeRdlg;QJ0myGIfhF;PjT2E!G8g;jTZk@~ z0CJ$r@;ho2wcs_*f(f7?H$uZJL0}y_b_ET)ba(JK<-XPJ?zbZJ#L_EtJH%1CkJtLAF(*GWIuH9#Uay>ULptM9|-9 zEUu^SeD_Qbgm327d%G7p`e~5eQ^;Dym(k!Dl+{&OZJY(*Q479XB6fG;_@j(Zu)G)9NCYaht7rEe(S-ccIm78iN2;wCc z0E3vvb@>1mTwoaf>{_Uh##TkhhcoJNNNo`ox-9Z)m2Ho(%G}ShV?8#>A2b)ZL!cbf zYvbq8snK9xN!M-E?khh*6=lE0J~!6V*ksDq5Rm~*n-^kT`Ic($ER?^mN_=CS4r7mZ z*0V-J01$}@{2UnWbryx4DY>D1f#zgOD$hb55YW1j{OQrJ1I}}oFG#hjA*vbYVKszV z)k{(ZE}q0-u&T$$GcXQpQNJ^)I7sqQ+(vxK10R<=cp@rj_6PhAJ=&RJ@z(wL?Ad1J z;jAApwxTgkDki%f+O4VriEcAyh)6M&!TWP$iZ;mfvk}ar9l@%&=NZFuczfTCShe2e z4TSsTnUH64eLG*G@S8$%MleMy#_ed8&;n;3K8tGgP+VFN0>p)k)Le_*conEwWIMzf z2~IlrD8lthueX!2>liHGxqXE~@r4zcjn2d5$R?L4ZXd>c#GF(%V!6h%fWtn93RzRr zYtBfO{QMPg1`J+1HX9n;BlMjr=L5F;lZvtRXN$nqRL&z$qGGQaq9i>}JSJwg_(4S2E!QY6+0^yigifa(xPDzPO$G0ub<_;I^aUc984(B* zwshq_i-4JdfQ0zP1KTIn-EbY;{nbaz5zFG{^haldPEXIr{q2Ic~nM0D2e`~wwU){S~8k(N$5@lM;W z^*FCtSqv2R$cWk+X)lI?srGr)KQWlyMu5r>`5NSs;{#?;r6+B)^bmO?6R)> z@=bo5S6@H;XkObI8B%77WIqw)1NYU&R=#CcjjE{W=nJjuIw&-@b&xY5EO$L1ll!T;GUM(o-!-5{!XY5IP-V zN7L%>_Uxs-+qbXyii5vE!bdtdRo2pqUM)(ip3KS*{G;u#5R441;IV_R=2So{Ze?BpFemmWZ;aSETzlcIAd4| z;n9l)e-jFPg)D=qoq*%6Io3^sjx8-FL_-OC>~hal+;rv@yQ8>A6J@r$qsx&{pV zb^h-)AMYt%)K)uKtL^%%w!MFBHX#45HUm4m7p-N)4%>7wB8%NVg%22}bLO#%4KN*u z=#bP!_%=sqn}Vr%4-z8ZKjI?CIJaGV<>6*&H@OU$TqG1$9%DDwP9TS%(Wy8pAQ`?P z%Z#JVscf4V6BdCEH69~cP_rX^MIEX^y0GwpE+!v@ctkDra9S65=gq|K4>)quZ&oosgKp#qZzD=v&cgaN;QK87zTmX!XoUzj41xK` zT-50^tXjT-DiFOBeuBK?jmnzMD+=Gu!KF)BA!t4|Uzv?RD$kX+y@LvKQcTR9&zSPk zWpa<o#By1eCO{FcuRXZp;AC{K!fpeN|NP4bx;0e`Uq1`; z%k%x;D=9A(_+LKY?C4}`{g*t)s^Gd9(StTt3IU7D$}q80wGo^=%C{d@>%kpu2V^6> zJjnb{qC<%I<)-bM?NIL|R# zg4O!WYhucl>frC@>wWEbLdt!uhP0^&yhUAZsXDdx;bv62!k|gwCOHXneWP|cMb!p{ zuBF2D8^ws-)kN}$+s;_!09<3Z`C*|u;yw(V4Gz`7*)Ny9uWw{yDl%?3T{PO%xY=jZNcZH$_#ywZ@gYB=L9Kh9CgSO`dj+29*M;# zg6}GnJ5*(orQ!8y$B7K;6d_WE=Pf1S@6VcyoEi7A=qEq@z>uyL24d^?ZoJOrk-Tpe zJoT1Zj-v1MI;|fp$urEvQyK^)3(~*<@z`U+y$5Rq8V>86ZPJn(XKXr{f9>^#%B`jY z*HBGVd2B(+gmw`*05i}nUQvZR>-+i*o`K4b4*o^~qr1H}L$>ZykTDgnqnk${#~XR2 zX>)2cWbV{tNZ5?3FG0iLA4cXa)g6j8uVVWEX}ei9A!!9-6C0nw-k&&FPLXNqK;V}o zpIrZ7tJ<+W6da=q<*2<%&ul|U{~F@S3h|RjL{6?>6;DDUHfVbIX|`m-kA$?6^0m)l z9jU_m69(mw2r&E4F3j(XhQ}@tN6){F`)b>59fiqfyDAskR834=ld7&UciXT(tmGF|Hr!2WyT|Dy){Czs!o ztd|q|FJgX1la~ejd0zjM!|xfzOEUJ0!Z80MHT!eo@0j+IaQvcGtlvZLztWFCr~e+s zUx4oyB|e9^e@E_r!Qh{>e~*>_skv{@*7Wb$M*oice=_(z#Jq&mU(}5E7lZ#j!2ZeR jcN725b@bqaf&DMh2`CNqY#Cr+2+tSb*~lUYUw-{R8MyYd literal 0 HcmV?d00001