diff --git a/repository/BioParsers-Tests/BioPhylipParserTest.class.st b/repository/BioParsers-Tests/BioPhylipParserTest.class.st index 2c2e750d..55a66c27 100644 --- a/repository/BioParsers-Tests/BioPhylipParserTest.class.st +++ b/repository/BioParsers-Tests/BioPhylipParserTest.class.st @@ -5,250 +5,366 @@ Class { #package : 'BioParsers-Tests' } -{ #category : 'testing' } -BioPhylipParserTest >> firstLineTokenizer [ - - ^ BioPhylipParser new firstLineTokenizer -] - -{ #category : 'testing' } -BioPhylipParserTest >> phylipInterleavedDNA [ - - ^ ' 6 13 -Archaeopt CGATGCTTAC CGCCGATGCT -HesperorniCGTTACTCGT TGTCGTTACT -BaluchitheTAATGTTAAT TGTTAATGTT -B. virginiTAATGTTCGT TGTTAATGTT -BrontosaurCAAAACCCAT CATCAAAACC -B.subtilisGGCAGCCAAT CACGGCAGCC - -TACCGCCGAT GCTTACCGC -CGTTGTCGTT ACTCGTTGT -AATTGTTAAT GTTAATTGT -CGTTGTTAAT GTTCGTTGT -CATCATCAAA ACCCATCAT -AATCACGGCA GCCAATCAC - -CCCCGCCCCC GCTTACCGC -CCCCGTCCCC ACTCGTTGT -CCCCGTCCCC GTTAATTGT -CCCCGTCCCC GTTCGTTGT -CCCCATCCCC ACCCATCAT -CCCCACCCCC GCCAATCAC -' -] - -{ #category : 'testing' } -BioPhylipParserTest >> phylipInterleavedProtein [ - - ^ ' 5 176 -cox2_leitaMAFILSFWMI FLLDSVIVLL SFVCFVCVWI CALLFSTVLL VSKLNNIYCT -cox2_crifaMAFILSFWMI FLIDAVIVLL SFVCFVCIWI CSLFFSSFLL VSKINNVYCT -cox2_bsaltMSFIISFWML FLIDSLIVLL SGAIFVCIWI CSLFFLCILF ICKLDYIFCS -cox2_trybbMSFILTFWMI FLMDSIIVLI SFSIFLSVWI CALIIATVLT VTKINNIYCT -cox2_tborrMLFFINQLLL LLVDTFVILE IFSLFVCVFI IVMYILFINY NIFLKNINVY - -WDFTASKFID VYWFTIGGMF SLGLLLRLCL LLYFGHLNFV SFDLCKVVGF -WDFTASKFID AYWFTIGGMF VLCLLLRLCL LLYFGCLNFV SFDLCKVVGF -WDFISAKFID LYWFTLGCLF IVCLLIRLCL LLYFSCLNFV CFDLCKCIGF -WDFISSKFID TYWFVLGMMF ILCLLLRLCL LLYFSCINFV SFDLCKVIGF -LDFIGSKYLD LYWFLIGIFF VIVLLIRLCL LLYYSWISLL IFDLCKIMGF - -QWYWVYFIFG ETTIFSNLIL ESDYMIGDLR LLQCNHVLTL LSLVIYKLWL -QWYWVYFIFG ETTIFSNLIL ESDYLIGDLR LLQCNHVLTL LSLVIYKLWL -QWYWVYFIFG ETTIFSNLIL ESDYLIGDLR LLQCNHVLTL LSLVIYKVWL -QWYWVYFLFG ETTIFSNLIL ESDYLIGDLR ILQCNHVLTL LSLVIYKLWV -QWYWIFFVFK ENVIFSNLLI ESDYWIGDLR 
LLQCNNTFNL ICLVVYKIWV - -SAVDVIHSFA ISSLGVKVEN LVAVMK -SAVDVIHSFA VSSLGIKVDC IPGRCN -SAIDVIHSFT LANLGIKVD? ?PGRCN -SAVDVIHSFT ISSLGIKVEN PGRCNE -TSIDVIHSFT ISTLGIKIDC IPGRCN -' -] - -{ #category : 'testing' } -BioPhylipParserTest >> speciesDNALineTokenizer [ - - ^ BioPhylipParser new speciesDNALineTokenizer -] - -{ #category : 'testing' } -BioPhylipParserTest >> speciesDNANamedBlockTokenizer [ - - ^ BioPhylipParser new speciesDNANamedBlockTokenizer -] - -{ #category : 'testing' } -BioPhylipParserTest >> testTokenizeDNASpeciesBlock01 [ - " Private - Answer a with a sample phylip DNA " - - | speciesBlock expectedResult firstRecord | - - speciesBlock := 'Archaeopt CGATGCTTAC CGC -HesperorniCGTTACTCGT TGT -BaluchitheTAATGTTAAT TGT -B. virginiTAATGTTCGT TGT -BrontosaurCAAAACCCAT CAT -B.subtilisGGCAGCCAAT CAC'. - expectedResult := #(#('Archaeopt ' 'CGATGCTTAC CGC' nil) #('Hesperorni' 'CGTTACTCGT TGT' nil) #('Baluchithe' 'TAATGTTAAT TGT' nil) #('B. virgini' 'TAATGTTCGT TGT' nil) #('Brontosaur' 'CAAAACCCAT CAT' nil)). - - parseResult := self speciesDNANamedBlockTokenizer parse: speciesBlock. - firstRecord := parseResult first. - - self assert: firstRecord first equals: 'Archaeopt '. - self assert: firstRecord second equals: 'CGATGCTTAC CGC'. - -] - -{ #category : 'testing' } -BioPhylipParserTest >> testTokenizeDNASpeciesBlock02 [ - " Private - Answer a with a sample phylip DNA " - - | speciesBlock expectedResult firstRecord | - - speciesBlock := 'Archaeopt CGATGCTTAC CGC -Hes CGTTACTCGT TGT -BaluchitheTAATGTTAAT TGT -B. virginiTAATGTTCGT TGT -BrontosaurCAAAACCCAT CAT -B.subtilisGGCAGCCAAT CAC'. - expectedResult := #( - #('Archaeopt ' 'CGATGCTTAC CGC') - #('Hesperorni' 'CGTTACTCGT TGT') - #('Baluchithe' 'TAATGTTAAT TGT') - #('B. virgini' 'TAATGTTCGT TGT') - #('Brontosaur' 'CAAAACCCAT CAT')). - - parseResult := self speciesDNANamedBlockTokenizer parse: speciesBlock. - firstRecord := parseResult first. - - self assert: firstRecord first equals: 'Archaeopt '. 
- self assert: firstRecord second equals: 'CGATGCTTAC CGC'. - -] - -{ #category : 'testing' } -BioPhylipParserTest >> testTokenizeDNASpeciesLine01 [ - " Private - Answer a with a sample phylip DNA " - - | speciesLineBlock expectedResult | - - speciesLineBlock := 'Archaeopt CGATGCTTAC CGC'. - expectedResult := #('Archaeopt ' 'CGATGCTTACCGC'). - parseResult := self speciesDNALineTokenizer parse: speciesLineBlock. - - self assert: (parseResult bioHasEqualElements: expectedResult ). -] - -{ #category : 'testing' } -BioPhylipParserTest >> testTokenizeDNASpeciesLine02 [ - " Private - Answer a with a sample phylip DNA " - - | speciesLineBlock expectedResult | - - speciesLineBlock := 'Archaeopt CGATGCTTACCGC'. - expectedResult := #('Archaeopt ' 'CGATGCTTACCGC'). - parseResult := self speciesDNALineTokenizer parse: speciesLineBlock. - - self assert: (parseResult bioHasEqualElements: expectedResult). - -] - -{ #category : 'testing' } -BioPhylipParserTest >> testTokenizeDNASpeciesLine03 [ - " Private - Answer a with a sample phylip DNA " - - | speciesLineBlock expectedResult | - - speciesLineBlock := 'B. virginiTAATGTTCGT TGT'. - expectedResult := #('B. virgini' 'TAATGTTCGTTGT'). - parseResult := self speciesDNALineTokenizer parse: speciesLineBlock. - - self assert: (parseResult bioHasEqualElements: expectedResult). - -] - -{ #category : 'testing' } -BioPhylipParserTest >> testTokenizeFirstLine01 [ - " Private - Answer a with a sample phylip DNA " - - | firstLine | - - firstLine := '6 13 -'. - parseResult := self firstLineTokenizer parse: firstLine. - self assert: (parseResult bioHasEqualElements: #('6' '13') ). - -] - -{ #category : 'testing' } -BioPhylipParserTest >> testTokenizeFirstLine02 [ - " Private - Answer a with a sample phylip DNA " - - | firstLine | - - firstLine := ' 6 13 -'. - parseResult := self firstLineTokenizer parse: firstLine. - self assert: (parseResult bioHasEqualElements: #('6' '13') ). 
- -] - -{ #category : 'testing' } -BioPhylipParserTest >> testTokenizeFirstLine03 [ - " Private - Answer a with a sample phylip DNA " - - | firstLine | - - firstLine := '6 13 -'. - parseResult := self firstLineTokenizer parse: firstLine. - self assert: (parseResult bioHasEqualElements: #('6' '13') ). - -] - -{ #category : 'testing' } -BioPhylipParserTest >> testTokenizeInterleavedDNA [ - " Private - Answer a with a sample phylip DNA " - - | phylipString | - phylipString := self phylipInterleavedDNA. - - parseResult := BioParser tokenizePhylipInterleavedDNA: phylipString. - self assert: parseResult size equals: 4. - self assert: parseResult first equals: 6. - self assert: parseResult second equals: 13. - self assert: (parseResult third bioHasEqualElements: - #( 'Archaeopt ' 'Hesperorni' 'Baluchithe' 'B. virgini' - 'Brontosaur' 'B.subtilis' )). - self assert: (parseResult fourth bioHasEqualElements: - #( 'CGATGCTTACCGCCGATGCTTACCGCCGATGCTTACCGCCCCCGCCCCCGCTTACCGC' - 'CGTTACTCGTTGTCGTTACTCGTTGTCGTTACTCGTTGTCCCCGTCCCCACTCGTTGT' - 'TAATGTTAATTGTTAATGTTAATTGTTAATGTTAATTGTCCCCGTCCCCGTTAATTGT' - 'TAATGTTCGTTGTTAATGTTCGTTGTTAATGTTCGTTGTCCCCGTCCCCGTTCGTTGT' - 'CAAAACCCATCATCAAAACCCATCATCAAAACCCATCATCCCCATCCCCACCCATCAT' - 'GGCAGCCAATCACGGCAGCCAATCACGGCAGCCAATCACCCCCACCCCCGCCAATCAC' )) -] - -{ #category : 'testing' } -BioPhylipParserTest >> testTokenizeInterleavedProtein [ - - | phylipString | - phylipString := self phylipInterleavedProtein. - parseResult := BioParser tokenizePhylipInterleavedProtein: phylipString. +{ #category : 'as yet unclassified' } +BioPhylipParserTest >> testAmbiguousDNAAlphabetDetection [ + "Note: Ambiguous DNA codes like N, R, Y overlap with amino acid codes. + BioSmalltalk detects such sequences as protein alphabet by default. + This test verifies that sequences are still created correctly." + | phylip aln seq | + phylip := '2 10 +Seq1 AACGTGGNNA +Seq2 CCGTATGGNN +'. + aln := BioPhylipParser parseString: phylip. + seq := aln sequences first. 
+ "Sequence is created and can be used regardless of alphabet detection" + self assert: seq size equals: 10. + self assert: (seq asString includesSubstring: 'NN') +] + +{ #category : 'tests' } +BioPhylipParserTest >> testAsPhylipStringRelaxed [ + + | phylip aln output | + phylip := '3 10 +Homo_sapiens AACGTGGCCA +Pan_troglodytes CCGTATGGCC +Gorilla GGCTTTGACC +'. + aln := BioPhylipParser parseString: phylip. + output := BioPhylipParser new asPhylipStringRelaxed: aln. + self assert: (output includesSubstring: '3 10'). + self assert: (output includesSubstring: 'Homo_sapiens'). + self assert: (output includesSubstring: 'AACGTGGCCA') +] + +{ #category : 'tests' } +BioPhylipParserTest >> testAutoDetectsInterleaved [ + + | phylip aln | + phylip := '2 20 +S1 ATGCTAGCTA +S2 CCGCTAGCTA +GCTAGCTAGC +GCTAGCTAGC +'. + aln := BioPhylipParser parseString: phylip. + self assert: aln size equals: 2. + self assert: aln numberOfBases equals: 20 +] + +{ #category : 'tests' } +BioPhylipParserTest >> testAutoDetectsSequential [ + + | phylip aln | + "Sequential: taxon data on consecutive lines, continuation lines have no name" + phylip := '2 20 +S1 ATGC +GCTAGCTAGCTAGCTA +S2 CCGC +TAGCTAGCTAGCTAGC +'. + aln := BioPhylipParser parseString: phylip. + self assert: aln size equals: 2. + self assert: aln numberOfBases equals: 20 +] + +{ #category : 'tests' } +BioPhylipParserTest >> testClassicStrict5Taxa [ + + | phylip aln | + phylip := '5 13 +Alpha AACGTGGCCACAT +Beta AAGGTCGCCACAC +Gamma CAGTTCGCCACAA +Delta GAGATTTCCGCCT +Epsilon GAGATCTCCGCCC +'. + aln := BioPhylipParser parseString: phylip. + self assert: aln size equals: 5. + self assert: aln numberOfBases equals: 13. + self assert: (aln sequenceNames includes: 'Alpha'). + self assert: (aln sequenceNames includes: 'Epsilon'). + self assert: aln sequences first asString equals: 'AACGTGGCCACAT'. 
+ self assert: aln sequences last asString equals: 'GAGATCTCCGCCC' +] + +{ #category : 'as yet unclassified' } +BioPhylipParserTest >> testDNAAlphabetDetection [ + | phylip aln seq | + phylip := '2 10 +Seq1 AACGTGGCCA +Seq2 CCGTATGGCA +'. + aln := BioPhylipParser parseString: phylip. + seq := aln sequences first. + self assert: (seq alphabet class name) equals: #BioIUPACUnambiguousDNA +] + +{ #category : 'tests' } +BioPhylipParserTest >> testGapsAndAmbiguityCodes [ + + | phylip aln | + phylip := '3 10 +Seq1 AAC-GG??TN +Seq2 CCN-AT???K +Seq3 GGRYY??-KM +'. + aln := BioPhylipParser parseString: phylip. + self assert: aln size equals: 3. + self assert: aln numberOfBases equals: 10. + self assert: aln sequences first asString equals: 'AAC-GG??TN' +] + +{ #category : 'tests' } +BioPhylipParserTest >> testInterleavedWithBlankLines [ + + | phylip aln | + phylip := '3 40 +Taxon1 ATGCTAGCTAGCTAGCTAGC +Taxon2 CCGCTAGCTAGCTAGCTAGC +Taxon3 GGGCTAGCTAGCTAGCTAGC + +TAGCTAGCTAGCTAGCTAGC +TAGCTAGCTAGCTAGCTAGC +TAGCTAGCTAGCTAGCTAGC +'. + aln := BioPhylipParser parseString: phylip. + self assert: aln size equals: 3. + self assert: aln numberOfBases equals: 40 +] + +{ #category : 'tests' } +BioPhylipParserTest >> testMinimalFile [ + + | phylip aln | + phylip := '2 1 +A T +B G +'. + aln := BioPhylipParser parseString: phylip. + self assert: aln size equals: 2. + self assert: aln numberOfBases equals: 1. + self assert: aln sequences first asString equals: 'T'. + self assert: aln sequences last asString equals: 'G' +] + +{ #category : 'tests' } +BioPhylipParserTest >> testParserProperties [ + + | phylip aln | + phylip := '3 10 +Taxon1 AACGTGGCCA +Taxon2 CCGTATGGCC +Taxon3 GGCTTTGACC +'. + aln := BioPhylipParser parseString: phylip. + self assert: aln size equals: 3. + self assert: aln numberOfBases equals: 10. + self assert: (aln sequenceNames includes: 'Taxon1'). 
+ self assert: (aln sequenceNames includes: 'Taxon3') +] + +{ #category : 'as yet unclassified' } +BioPhylipParserTest >> testProteinAlphabetDetection [ + | phylip aln seq | + phylip := '2 15 +Human MVKQLEARKRPEQQE +Mouse MVKQLEARHRPEQQK +'. + aln := BioPhylipParser parseString: phylip. + seq := aln sequences first. + self assert: (seq alphabet class name) equals: #BioIUPACProtein +] + +{ #category : 'tests' } +BioPhylipParserTest >> testProteinSequences [ - self assert: parseResult size equals: 4. - self assert: parseResult first equals: 5. - self assert: parseResult second equals: 176. - self assert: (parseResult third bioHasEqualElements: - #( 'cox2_leita' 'cox2_crifa' 'cox2_bsalt' 'cox2_trybb' - 'cox2_tborr' )). - self assert: (parseResult fourth bioHasEqualElements: - #( 'MAFILSFWMIFLLDSVIVLLSFVCFVCVWICALLFSTVLLVSKLNNIYCTWDFTASKFIDVYWFTIGGMFSLGLLLRLCLLLYFGHLNFVSFDLCKVVGFQWYWVYFIFGETTIFSNLILESDYMIGDLRLLQCNHVLTLLSLVIYKLWLSAVDVIHSFAISSLGVKVENLVAVMK' - 'MAFILSFWMIFLIDAVIVLLSFVCFVCIWICSLFFSSFLLVSKINNVYCTWDFTASKFIDAYWFTIGGMFVLCLLLRLCLLLYFGCLNFVSFDLCKVVGFQWYWVYFIFGETTIFSNLILESDYLIGDLRLLQCNHVLTLLSLVIYKLWLSAVDVIHSFAVSSLGIKVDCIPGRCN' - 'MSFIISFWMLFLIDSLIVLLSGAIFVCIWICSLFFLCILFICKLDYIFCSWDFISAKFIDLYWFTLGCLFIVCLLIRLCLLLYFSCLNFVCFDLCKCIGFQWYWVYFIFGETTIFSNLILESDYLIGDLRLLQCNHVLTLLSLVIYKVWLSAIDVIHSFTLANLGIKVD??PGRCN' - 'MSFILTFWMIFLMDSIIVLISFSIFLSVWICALIIATVLTVTKINNIYCTWDFISSKFIDTYWFVLGMMFILCLLLRLCLLLYFSCINFVSFDLCKVIGFQWYWVYFLFGETTIFSNLILESDYLIGDLRILQCNHVLTLLSLVIYKLWVSAVDVIHSFTISSLGIKVENPGRCNE' - 'MLFFINQLLLLLVDTFVILEIFSLFVCVFIIVMYILFINYNIFLKNINVYLDFIGSKYLDLYWFLIGIFFVIVLLIRLCLLLYYSWISLLIFDLCKIMGFQWYWIFFVFKENVIFSNLLIESDYWIGDLRLLQCNNTFNLICLVVYKIWVTSIDVIHSFTISTLGIKIDCIPGRCN' )) + | phylip aln | + "Protein sequences with amino acid codes" + phylip := '3 10 +Human MVKQLEARKR +Mouse MVKQLEARHR +Chicken GGCTTTGACC +'. + aln := BioPhylipParser parseString: phylip. + self assert: aln size equals: 3. + self assert: aln numberOfBases equals: 10. 
+ self assert: (aln sequenceNames includes: 'Human') +] + +{ #category : 'as yet unclassified' } +BioPhylipParserTest >> testRNAAlphabetDetection [ + | phylip aln seq | + phylip := '2 10 +Seq1 AACGUGGUUU +Seq2 CCGUAUGGAU +'. + aln := BioPhylipParser parseString: phylip. + seq := aln sequences first. + self assert: (seq alphabet class name) equals: #BioIUPACUnambiguousRNA +] + +{ #category : 'tests' } +BioPhylipParserTest >> testRelaxedInterleaved [ + + | phylip aln | + phylip := '3 40 +Homo_sapiens ATGCTAGCTAGCTAGCTAGC +Pan_troglodytes CCGCTAGCTAGCTAGCTAGC +Gorilla_gorilla GGGCTAGCTAGCTAGCTAGC +TAGCTAGCTAGCTAGCTAGC +TAGCTAGCTAGCTAGCTAGC +TAGCTAGCTAGCTAGCTAGC +'. + aln := BioPhylipParser parseString: phylip. + self assert: aln size equals: 3. + self assert: aln numberOfBases equals: 40. + self assert: (aln sequenceNames includes: 'Homo_sapiens'). + self assert: (aln sequenceNames includes: 'Gorilla_gorilla') +] + +{ #category : 'tests' } +BioPhylipParserTest >> testRelaxedSequential [ + + | phylip aln | + phylip := '4 20 +Homo_sapiens AACGTGGCCACATACGTGGC +Pan_troglodytes AAGGTCGCCACACAAGGTCC +Gorilla_gorilla CAGTTCGCCACAACAGTTCC +Pongo_abelii GAGATTTCCGCCTGAGATTT +'. + aln := BioPhylipParser parseString: phylip. + self assert: aln size equals: 4. + self assert: aln numberOfBases equals: 20. + self assert: (aln sequenceNames includes: 'Homo_sapiens'). + self assert: (aln sequenceNames includes: 'Pongo_abelii') +] + +{ #category : 'tests' } +BioPhylipParserTest >> testRoundTripStrict [ + + | phylip aln output | + phylip := '3 10 +Taxon1 AACGTGGCCA +Taxon2 CCGTATGGCC +Taxon3 GGCTTTGACC +'. + aln := BioPhylipParser parseString: phylip. + output := BioPhylipParser new asPhylipStringStrict: aln. + self assert: (output includesSubstring: '3 10'). + self assert: (output includesSubstring: 'Taxon1'). 
+ self assert: (output includesSubstring: 'AACGTGGCCA') +] + +{ #category : 'tests' } +BioPhylipParserTest >> testSequentialWithWrapping [ + + | phylip aln | + phylip := '3 40 +Taxon1 ATGCTAGCTAGCTAGCTAGC +TAGCTAGCTAGCTAGCTAGC +Taxon2 CCGCTAGCTAGCTAGCTAGC +TAGCTAGCTAGCTAGCTAGC +Taxon3 GGGCTAGCTAGCTAGCTAGC +TAGCTAGCTAGCTAGCTAGC +'. + aln := BioPhylipParser parseString: phylip. + self assert: aln size equals: 3. + self assert: aln numberOfBases equals: 40. + self assert: aln sequences first asString size equals: 40 +] + +{ #category : 'tests' } +BioPhylipParserTest >> testSpacesInSequences [ + + | phylip aln | + phylip := '2 10 +Seq1 AACGTGGCCA +Seq2 CCGTATGGCC +'. + aln := BioPhylipParser parseString: phylip. + self assert: aln size equals: 2. + self assert: aln numberOfBases equals: 10. + self assert: aln sequences first asString equals: 'AACGTGGCCA' +] + +{ #category : 'tests' } +BioPhylipParserTest >> testStrictInterleaved [ + + | phylip aln | + phylip := '3 40 +Taxon1 ATGCTAGCTAGCTAGCTAGC +Taxon2 CCGCTAGCTAGCTAGCTAGC +Taxon3 GGGCTAGCTAGCTAGCTAGC +TAGCTAGCTAGCTAGCTAGC +TAGCTAGCTAGCTAGCTAGC +TAGCTAGCTAGCTAGCTAGC +'. + aln := BioPhylipParser parseString: phylip. + self assert: aln size equals: 3. + self assert: aln numberOfBases equals: 40. + self + assert: aln sequenceNames asArray + equals: #( 'Taxon1' 'Taxon2' 'Taxon3' ). + self + assert: aln sequences first asString + equals: 'ATGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGC' +] + +{ #category : 'tests' } +BioPhylipParserTest >> testStrictSequential [ + + | phylip aln | + phylip := '5 13 +Alpha AACGTGGCCACAT +Beta AAGGTCGCCACAC +Gamma CAGTTCGCCACAA +Delta GAGATTTCCGCCT +Epsilon GAGATCTCCGCCC +'. + aln := BioPhylipParser parseString: phylip. + self assert: aln size equals: 5. + self assert: aln numberOfBases equals: 13. + self assert: (aln sequenceNames includes: 'Alpha'). + self assert: (aln sequenceNames includes: 'Epsilon'). 
+ self assert: aln sequences first asString equals: 'AACGTGGCCACAT' +] + +{ #category : 'tests' } +BioPhylipParserTest >> testThreeBlockInterleaved [ + + | phylip aln | + phylip := '2 60 +S1 ATGCTAGCTAGCTAGCTAGCTAGCTAGCTA +S2 ATGCTAGCTAGCTAGCTAGCTAGCTAGCTA +GCTAGCTAGCTAGCTAGCTAGCTAGCTAGC +GCTAGCTAGCTAGCTAGCTAGCTAGCTAGC +'. + aln := BioPhylipParser parseString: phylip. + self assert: aln size equals: 2. + self assert: aln numberOfBases equals: 60. + self assert: aln sequences first asString size equals: 60 +] + +{ #category : 'tests' } +BioPhylipParserTest >> testValidatorRejectsWrongSeqLength [ + + | phylip | + phylip := '2 10 +Seq1 AACGTG +Seq2 CCGTAT +'. + self should: [ BioPhylipParser parseString: phylip ] raise: Error +] + +{ #category : 'tests' } +BioPhylipParserTest >> testValidatorRejectsWrongTaxaCount [ + + | phylip | + phylip := '3 10 +Seq1 AACGTGGCCA +Seq2 CCGTATGGCC +'. + self should: [ BioPhylipParser parseString: phylip ] raise: Error ] diff --git a/repository/BioParsers-Tests/BioPhylipPetitParserTest.class.st b/repository/BioParsers-Tests/BioPhylipPetitParserTest.class.st new file mode 100644 index 00000000..9975e9b2 --- /dev/null +++ b/repository/BioParsers-Tests/BioPhylipPetitParserTest.class.st @@ -0,0 +1,254 @@ +Class { + #name : 'BioPhylipPetitParserTest', + #superclass : 'BioAbstractParserTest', + #category : 'BioParsers-Tests', + #package : 'BioParsers-Tests' +} + +{ #category : 'testing' } +BioPhylipPetitParserTest >> firstLineTokenizer [ + + ^ BioPhylipParser new firstLineTokenizer +] + +{ #category : 'testing' } +BioPhylipPetitParserTest >> phylipInterleavedDNA [ + + ^ ' 6 13 +Archaeopt CGATGCTTAC CGCCGATGCT +HesperorniCGTTACTCGT TGTCGTTACT +BaluchitheTAATGTTAAT TGTTAATGTT +B. 
virginiTAATGTTCGT TGTTAATGTT +BrontosaurCAAAACCCAT CATCAAAACC +B.subtilisGGCAGCCAAT CACGGCAGCC + +TACCGCCGAT GCTTACCGC +CGTTGTCGTT ACTCGTTGT +AATTGTTAAT GTTAATTGT +CGTTGTTAAT GTTCGTTGT +CATCATCAAA ACCCATCAT +AATCACGGCA GCCAATCAC + +CCCCGCCCCC GCTTACCGC +CCCCGTCCCC ACTCGTTGT +CCCCGTCCCC GTTAATTGT +CCCCGTCCCC GTTCGTTGT +CCCCATCCCC ACCCATCAT +CCCCACCCCC GCCAATCAC +' +] + +{ #category : 'testing' } +BioPhylipPetitParserTest >> phylipInterleavedProtein [ + + ^ ' 5 176 +cox2_leitaMAFILSFWMI FLLDSVIVLL SFVCFVCVWI CALLFSTVLL VSKLNNIYCT +cox2_crifaMAFILSFWMI FLIDAVIVLL SFVCFVCIWI CSLFFSSFLL VSKINNVYCT +cox2_bsaltMSFIISFWML FLIDSLIVLL SGAIFVCIWI CSLFFLCILF ICKLDYIFCS +cox2_trybbMSFILTFWMI FLMDSIIVLI SFSIFLSVWI CALIIATVLT VTKINNIYCT +cox2_tborrMLFFINQLLL LLVDTFVILE IFSLFVCVFI IVMYILFINY NIFLKNINVY + +WDFTASKFID VYWFTIGGMF SLGLLLRLCL LLYFGHLNFV SFDLCKVVGF +WDFTASKFID AYWFTIGGMF VLCLLLRLCL LLYFGCLNFV SFDLCKVVGF +WDFISAKFID LYWFTLGCLF IVCLLIRLCL LLYFSCLNFV CFDLCKCIGF +WDFISSKFID TYWFVLGMMF ILCLLLRLCL LLYFSCINFV SFDLCKVIGF +LDFIGSKYLD LYWFLIGIFF VIVLLIRLCL LLYYSWISLL IFDLCKIMGF + +QWYWVYFIFG ETTIFSNLIL ESDYMIGDLR LLQCNHVLTL LSLVIYKLWL +QWYWVYFIFG ETTIFSNLIL ESDYLIGDLR LLQCNHVLTL LSLVIYKLWL +QWYWVYFIFG ETTIFSNLIL ESDYLIGDLR LLQCNHVLTL LSLVIYKVWL +QWYWVYFLFG ETTIFSNLIL ESDYLIGDLR ILQCNHVLTL LSLVIYKLWV +QWYWIFFVFK ENVIFSNLLI ESDYWIGDLR LLQCNNTFNL ICLVVYKIWV + +SAVDVIHSFA ISSLGVKVEN LVAVMK +SAVDVIHSFA VSSLGIKVDC IPGRCN +SAIDVIHSFT LANLGIKVD? 
?PGRCN +SAVDVIHSFT ISSLGIKVEN PGRCNE +TSIDVIHSFT ISTLGIKIDC IPGRCN +' +] + +{ #category : 'testing' } +BioPhylipPetitParserTest >> speciesDNALineTokenizer [ + + ^ BioPhylipParser new speciesDNALineTokenizer +] + +{ #category : 'testing' } +BioPhylipPetitParserTest >> speciesDNANamedBlockTokenizer [ + + ^ BioPhylipParser new speciesDNANamedBlockTokenizer +] + +{ #category : 'testing' } +BioPhylipPetitParserTest >> testTokenizeDNASpeciesBlock01 [ + " Private - Answer a with a sample phylip DNA " + + | speciesBlock expectedResult firstRecord | + + speciesBlock := 'Archaeopt CGATGCTTAC CGC +HesperorniCGTTACTCGT TGT +BaluchitheTAATGTTAAT TGT +B. virginiTAATGTTCGT TGT +BrontosaurCAAAACCCAT CAT +B.subtilisGGCAGCCAAT CAC'. + expectedResult := #(#('Archaeopt ' 'CGATGCTTAC CGC' nil) #('Hesperorni' 'CGTTACTCGT TGT' nil) #('Baluchithe' 'TAATGTTAAT TGT' nil) #('B. virgini' 'TAATGTTCGT TGT' nil) #('Brontosaur' 'CAAAACCCAT CAT' nil)). + + parseResult := self speciesDNANamedBlockTokenizer parse: speciesBlock. + firstRecord := parseResult first. + + self assert: firstRecord first equals: 'Archaeopt '. + self assert: firstRecord second equals: 'CGATGCTTAC CGC'. + +] + +{ #category : 'testing' } +BioPhylipPetitParserTest >> testTokenizeDNASpeciesBlock02 [ + " Private - Answer a with a sample phylip DNA " + + | speciesBlock expectedResult firstRecord | + + speciesBlock := 'Archaeopt CGATGCTTAC CGC +Hes CGTTACTCGT TGT +BaluchitheTAATGTTAAT TGT +B. virginiTAATGTTCGT TGT +BrontosaurCAAAACCCAT CAT +B.subtilisGGCAGCCAAT CAC'. + expectedResult := #( + #('Archaeopt ' 'CGATGCTTAC CGC') + #('Hesperorni' 'CGTTACTCGT TGT') + #('Baluchithe' 'TAATGTTAAT TGT') + #('B. virgini' 'TAATGTTCGT TGT') + #('Brontosaur' 'CAAAACCCAT CAT')). + + parseResult := self speciesDNANamedBlockTokenizer parse: speciesBlock. + firstRecord := parseResult first. + + self assert: firstRecord first equals: 'Archaeopt '. + self assert: firstRecord second equals: 'CGATGCTTAC CGC'. 
+ +] + +{ #category : 'testing' } +BioPhylipPetitParserTest >> testTokenizeDNASpeciesLine01 [ + " Tokenize one species line whose sequence has an embedded blank; the expected tokens are the name and the sequence without that blank " + + | speciesLineBlock expectedResult | + + speciesLineBlock := 'Archaeopt CGATGCTTAC CGC'. + expectedResult := #('Archaeopt ' 'CGATGCTTACCGC'). + parseResult := self speciesDNALineTokenizer parse: speciesLineBlock. + + self assert: (parseResult bioHasEqualElements: expectedResult ). +] + +{ #category : 'testing' } +BioPhylipPetitParserTest >> testTokenizeDNASpeciesLine02 [ + " Tokenize one species line whose sequence has no embedded blank; the expected tokens are the name and the unchanged sequence " + + | speciesLineBlock expectedResult | + + speciesLineBlock := 'Archaeopt CGATGCTTACCGC'. + expectedResult := #('Archaeopt ' 'CGATGCTTACCGC'). + parseResult := self speciesDNALineTokenizer parse: speciesLineBlock. + + self assert: (parseResult bioHasEqualElements: expectedResult). + +] + +{ #category : 'testing' } +BioPhylipPetitParserTest >> testTokenizeDNASpeciesLine03 [ + " Tokenize one species line whose name contains a period and a blank and runs directly into the sequence " + + | speciesLineBlock expectedResult | + + speciesLineBlock := 'B. virginiTAATGTTCGT TGT'. + expectedResult := #('B. virgini' 'TAATGTTCGTTGT'). + parseResult := self speciesDNALineTokenizer parse: speciesLineBlock. + + self assert: (parseResult bioHasEqualElements: expectedResult). + +] + +{ #category : 'testing' } +BioPhylipPetitParserTest >> testTokenizeFirstLine01 [ + " Tokenize a header line; the expected tokens are the strings 6 and 13 " + + | firstLine | + + firstLine := '6 13 +'. + parseResult := self firstLineTokenizer parse: firstLine. + self assert: (parseResult bioHasEqualElements: #('6' '13') ). + +] + +{ #category : 'testing' } +BioPhylipPetitParserTest >> testTokenizeFirstLine02 [ + " Tokenize a header line with leading whitespace; the expected tokens are still the strings 6 and 13 " + + | firstLine | + + firstLine := ' 6 13 +'. + parseResult := self firstLineTokenizer parse: firstLine. + self assert: (parseResult bioHasEqualElements: #('6' '13') ).
+ +] + +{ #category : 'testing' } +BioPhylipPetitParserTest >> testTokenizeFirstLine03 [ + " Tokenize a header line; the expected tokens are the strings 6 and 13 " + + | firstLine | + + firstLine := '6 13 +'. + parseResult := self firstLineTokenizer parse: firstLine. + self assert: (parseResult bioHasEqualElements: #('6' '13') ). + +] + +{ #category : 'testing' } +BioPhylipPetitParserTest >> testTokenizeInterleavedDNA [ + " Tokenize the sample answered by phylipInterleavedDNA and check the four answered elements: the two header numbers, the names and the concatenated sequences " + + | phylipString | + phylipString := self phylipInterleavedDNA. + + parseResult := BioParser tokenizePhylipInterleavedDNA: phylipString. + self assert: parseResult size equals: 4. + self assert: parseResult first equals: 6. + self assert: parseResult second equals: 13. + self assert: (parseResult third bioHasEqualElements: + #( 'Archaeopt ' 'Hesperorni' 'Baluchithe' 'B. virgini' + 'Brontosaur' 'B.subtilis' )). + self assert: (parseResult fourth bioHasEqualElements: + #( 'CGATGCTTACCGCCGATGCTTACCGCCGATGCTTACCGCCCCCGCCCCCGCTTACCGC' + 'CGTTACTCGTTGTCGTTACTCGTTGTCGTTACTCGTTGTCCCCGTCCCCACTCGTTGT' + 'TAATGTTAATTGTTAATGTTAATTGTTAATGTTAATTGTCCCCGTCCCCGTTAATTGT' + 'TAATGTTCGTTGTTAATGTTCGTTGTTAATGTTCGTTGTCCCCGTCCCCGTTCGTTGT' + 'CAAAACCCATCATCAAAACCCATCATCAAAACCCATCATCCCCATCCCCACCCATCAT' + 'GGCAGCCAATCACGGCAGCCAATCACGGCAGCCAATCACCCCCACCCCCGCCAATCAC' )) +] + +{ #category : 'testing' } +BioPhylipPetitParserTest >> testTokenizeInterleavedProtein [ + + | phylipString | + phylipString := self phylipInterleavedProtein. + parseResult := BioParser tokenizePhylipInterleavedProtein: phylipString. + + self assert: parseResult size equals: 4. + self assert: parseResult first equals: 5. + self assert: parseResult second equals: 176. + self assert: (parseResult third bioHasEqualElements: + #( 'cox2_leita' 'cox2_crifa' 'cox2_bsalt' 'cox2_trybb' + 'cox2_tborr' )).
+ self assert: (parseResult fourth bioHasEqualElements: + #( 'MAFILSFWMIFLLDSVIVLLSFVCFVCVWICALLFSTVLLVSKLNNIYCTWDFTASKFIDVYWFTIGGMFSLGLLLRLCLLLYFGHLNFVSFDLCKVVGFQWYWVYFIFGETTIFSNLILESDYMIGDLRLLQCNHVLTLLSLVIYKLWLSAVDVIHSFAISSLGVKVENLVAVMK' + 'MAFILSFWMIFLIDAVIVLLSFVCFVCIWICSLFFSSFLLVSKINNVYCTWDFTASKFIDAYWFTIGGMFVLCLLLRLCLLLYFGCLNFVSFDLCKVVGFQWYWVYFIFGETTIFSNLILESDYLIGDLRLLQCNHVLTLLSLVIYKLWLSAVDVIHSFAVSSLGIKVDCIPGRCN' + 'MSFIISFWMLFLIDSLIVLLSGAIFVCIWICSLFFLCILFICKLDYIFCSWDFISAKFIDLYWFTLGCLFIVCLLIRLCLLLYFSCLNFVCFDLCKCIGFQWYWVYFIFGETTIFSNLILESDYLIGDLRLLQCNHVLTLLSLVIYKVWLSAIDVIHSFTLANLGIKVD??PGRCN' + 'MSFILTFWMIFLMDSIIVLISFSIFLSVWICALIIATVLTVTKINNIYCTWDFISSKFIDTYWFVLGMMFILCLLLRLCLLLYFSCINFVSFDLCKVIGFQWYWVYFLFGETTIFSNLILESDYLIGDLRILQCNHVLTLLSLVIYKLWVSAVDVIHSFTISSLGIKVENPGRCNE' + 'MLFFINQLLLLLVDTFVILEIFSLFVCVFIIVMYILFINYNIFLKNINVYLDFIGSKYLDLYWFLIGIFFVIVLLIRLCLLLYYSWISLLIFDLCKIMGFQWYWIFFVFKENVIFSNLLIESDYWIGDLRLLQCNNTFNLICLVVYKIWVTSIDVIHSFTISTLGIKIDCIPGRCN' )) +] diff --git a/repository/BioParsers/BioPhylipParser.class.st b/repository/BioParsers/BioPhylipParser.class.st index 46e0ce8a..bea56d9f 100644 --- a/repository/BioParsers/BioPhylipParser.class.st +++ b/repository/BioParsers/BioPhylipParser.class.st @@ -1,218 +1,618 @@ -" -Documentation taken from http://bioweb2.pasteur.fr/docs/phylip/doc/main.html#inputfiles - -" Class { #name : 'BioPhylipParser', - #superclass : 'BioAbstractTextParser', - #category : 'BioParsers-Core', + #superclass : 'BioObject', + #instVars : [ + 'numTaxa', + 'numChars', + 'taxaNames', + 'sequences', + 'isInterleaved', + 'isStrict', + 'currentLine', + 'lines', + 'errorLine' + ], + #category : 'BioParsers-PHYLIP', #package : 'BioParsers', - #tag : 'Core' + #tag : 'PHYLIP' } -{ #category : 'accessing-dna' } -BioPhylipParser >> buildDNAResults: aCollection [ - " Answer an identified object for the receiver's parsing output " +{ #category : 'as yet unclassified' } +BioPhylipParser class >> parseFile: aFileReference [ + ^ self new parseFile: 
aFileReference +] - | tokenized sequences | - - tokenized := self buildTokens: aCollection. - sequences := tokenized third - with: tokenized fourth - do: [: first : snd | BioSequence newAmbiguousDNA: snd named: first ]. - ^ BioPhylip new - numberOfTaxa: tokenized first; - numberOfCharacters: tokenized second; - sequences: sequences. +{ #category : 'as yet unclassified' } +BioPhylipParser class >> parseString: aString [ + ^ self new parseString: aString ] -{ #category : 'accessing-protein' } -BioPhylipParser >> buildProteinResults: aCollection [ - " Answer an identified object for the receiver's parsing output " +{ #category : 'converting' } +BioPhylipParser >> asAlignment [ + "Build a BioAlignment from the parsed data. + Uses BioSequence class>>newNamed:sequence: which auto-detects the alphabet." + | alignment | + alignment := BioAlignment new. + 1 to: numTaxa do: [ :i | + | seq | + seq := BioSequence newNamed: (taxaNames at: i) sequence: (sequences at: i) asUppercase. + alignment addFromSequence: seq ]. + ^ alignment +] - | tokenized sequences | - - tokenized := self buildTokens: aCollection. - sequences := tokenized third - with: tokenized fourth - do: [: first : snd | BioSequence newProtein: snd named: first ]. - ^ BioPhylip new - numberOfTaxa: tokenized first; - numberOfCharacters: tokenized second; - sequences: sequences. +{ #category : 'converting' } +BioPhylipParser >> asPhylipStringRelaxed: anAlignment [ + "Answer a relaxed PHYLIP string from anAlignment (variable-length names, sequential)." + + ^ String streamContents: [ :s | + s + nextPutAll: anAlignment size asString; + space; + nextPutAll: anAlignment numberOfBases asString; + cr. + anAlignment sequences do: [ :seq | + s + nextPutAll: (seq name ifNil: [ 'Unnamed' ]); + space; + nextPutAll: seq asString; + cr ] ] +] + +{ #category : 'converting' } +BioPhylipParser >> asPhylipStringStrict: anAlignment [ + "Answer a strict PHYLIP string from anAlignment (10-char names, sequential)." 
+ + ^ String streamContents: [ :s | + s + nextPutAll: anAlignment size asString; + space; + nextPutAll: anAlignment numberOfBases asString; + cr. + anAlignment sequences do: [ :seq | + | name padded | + name := seq name ifNil: [ 'Unnamed' ]. + padded := name size > 10 + ifTrue: [ name copyFrom: 1 to: 10 ] + ifFalse: [ + name + , + (String new: 10 - name size withAll: Character space) ]. + s + nextPutAll: padded; + nextPutAll: seq asString; + cr ] ] +] + +{ #category : 'private' } +BioPhylipParser >> cleanSequence: aString [ + "Remove whitespace from sequence data. PHYLIP allows spaces within sequences." + + ^ aString reject: [ :c | c isSeparator ] ] { #category : 'accessing' } -BioPhylipParser >> buildTokens: aCollection [ - " Answer a tokenized parsing aCollection " - - ^ Array - with: (self taxaNumberFrom: aCollection) - with: (aCollection first second asNumber) - with: (aCollection second collect: #first) - with: (self buildTokensFrom: aCollection). +BioPhylipParser >> currentLine [ + ^ currentLine +] + +{ #category : 'accessing' } +BioPhylipParser >> currentLine: anInt [ + + currentLine := anInt +] +{ #category : 'private' } +BioPhylipParser >> detectFormat [ + "Auto-detect strict/relaxed and sequential/interleaved." + + | firstDataLine nameField | + currentLine > lines size ifTrue: [ + Error signal: 'No data lines after header' ]. + firstDataLine := lines at: currentLine. + nameField := self extractNameFromLine: firstDataLine. + isStrict := nameField size <= 10 and: [ + firstDataLine size >= 10 and: [ + (firstDataLine + copyFrom: 1 + to: (10 min: firstDataLine size)) trimBoth + = nameField ] ]. + isInterleaved := self detectInterleaved ] -{ #category : 'accessing-private' } -BioPhylipParser >> buildTokensBlock [ +{ #category : 'private' } +BioPhylipParser >> detectInterleaved [ + "Determine if the file is interleaved or sequential. + If the first taxon has numChars chars, it is sequential (or single-block). 
+ Otherwise, skip first N taxa lines, then check if the next N lines + are pure sequence (=> interleaved) or include a new taxon name (=> sequential). Answers false when no data line exists or when nothing follows the first block." + + | i firstSeqLen firstDataLineIdx consecutiveContLines | + firstDataLineIdx := currentLine. + [ + firstDataLineIdx <= lines size and: [ + (lines at: firstDataLineIdx) trimBoth isEmpty ] ] whileTrue: [ + firstDataLineIdx := firstDataLineIdx + 1 ]. + firstDataLineIdx > lines size ifTrue: [ ^ false ]. + firstSeqLen := (self extractSequenceFromLine: + (lines at: firstDataLineIdx)) size. + firstSeqLen >= numChars ifTrue: [ ^ false ]. "The first data line already carries at least numChars sequence characters." + i := firstDataLineIdx. "Advance i past numTaxa non-blank lines, i.e. the first block." + numTaxa timesRepeat: [ + [ i <= lines size and: [ (lines at: i) trimBoth isEmpty ] ] + whileTrue: [ i := i + 1 ]. + i := i + 1 ]. + [ i <= lines size and: [ (lines at: i) trimBoth isEmpty ] ] + whileTrue: [ i := i + 1 ]. + i > lines size ifTrue: [ ^ false ]. + consecutiveContLines := 0. "Count non-blank lines after the first block that do not look like a new taxon line." + [ i <= lines size and: [ consecutiveContLines < numTaxa ] ] + whileTrue: [ + | trimmed | + trimmed := (lines at: i) trimBoth. + trimmed isEmpty + ifTrue: [ i := i + 1 ] + ifFalse: [ + (self nextLineLooksLikeNewTaxon: (lines at: i)) ifTrue: [ + ^ false ]. + consecutiveContLines := consecutiveContLines + 1. + i := i + 1 ] ]. + ^ consecutiveContLines >= numTaxa +] + +{ #category : 'private' } +BioPhylipParser >> detectSequenceClass [ + "Answer the sequence class to use for creating sequences. + BioSequence newNamed:sequence: auto-detects the alphabet from the sequence content." + ^ BioSequence +] - ^ [: node | - OrderedCollection - with: node first - with: (node second collect: #allButLast) - with: (((node third reject: [: line | line first isEmpty ]) collect: #first) collect: #withoutBlanks ) ] +{ #category : 'private' } +BioPhylipParser >> detectStrictOrRelaxed [ + "Detect whether the file uses strict (10-char names) or relaxed (variable-length names) format. When currentLine is beyond the last line isStrict is set to false." + + | firstDataLine nameField | + currentLine > lines size ifTrue: [ + isStrict := false. + ^ self ].
+ firstDataLine := lines at: currentLine. + nameField := self extractNameFromLineRelaxed: firstDataLine. "Strict only when the relaxed-style name has at most 10 characters, the line has at least 10 characters, and the trimmed first 10 columns equal that name." + isStrict := nameField size <= 10 and: [ + firstDataLine size >= 10 and: [ + (firstDataLine + copyFrom: 1 + to: (10 min: firstDataLine size)) trimBoth + = nameField ] ] ] { #category : 'accessing' } -BioPhylipParser >> buildTokensFrom: aCollection [ +BioPhylipParser >> errorAt: lineNum message: aString [ + "Signal a parse error with line number context. The error text is built from lineNum and aString and raised with self error:." - | taxaNumber collection seqIndex seqBlock | + self error: + 'Phylip parse error at line ' , lineNum asString , ': ' , aString +] - taxaNumber := self taxaNumberFrom: aCollection. - collection := self buildTokensFromFirstBlock: aCollection. - seqIndex := 1. - (seqBlock := aCollection third) doWithIndex: [:seq :index | - seqIndex = (taxaNumber + 1) - ifTrue: [seqIndex := 1]. - index <= seqBlock size - ifFalse: [ ^ collection ]. - collection - at: seqIndex - put: (String - streamContents: [:str | str - nextPutAll: (collection at: seqIndex); - nextPutAll: (seqBlock at: index)]). - seqIndex := seqIndex + 1]. - ^ collection +{ #category : 'private' } +BioPhylipParser >> extractNameFromLine: aLine [ + "Extract the taxon name from a data line. + In relaxed mode: name is everything before the first whitespace. + In strict mode: name is the first 10 characters (trimmed). + Returns the trimmed name string. When isStrict is nil the relaxed rule is used." + + | trimmed wsIdx | + isStrict ifNotNil: [ + isStrict ifTrue: [ + aLine size < 10 ifTrue: [ ^ aLine trimBoth ]. + ^ (aLine copyFrom: 1 to: 10) trimBoth ] ]. + "Relaxed or unknown: name ends at first whitespace" + trimmed := aLine trimBoth. + wsIdx := self indexOfFirstWhitespaceIn: trimmed. + wsIdx = 0 ifTrue: [ ^ trimmed ]. + ^ trimmed copyFrom: 1 to: wsIdx - 1 +] +{ #category : 'private' } +BioPhylipParser >> extractNameFromLineRelaxed: aLine [ + "Extract the taxon name assuming relaxed format (name ends at first whitespace). Answers the whole trimmed line when it contains no whitespace." + + | trimmed wsIdx | + trimmed := aLine trimBoth.
+ wsIdx := self indexOfFirstWhitespaceIn: trimmed. + wsIdx = 0 ifTrue: [ ^ trimmed ]. + ^ trimmed copyFrom: 1 to: wsIdx - 1 +] + +{ #category : 'private' } +BioPhylipParser >> extractSequenceFromLine: aLine [ + "Extract the sequence portion from a data line (after the taxon name). Strict mode: everything from column 11 on with whitespace removed, or an empty string when the line has 10 or fewer characters. Relaxed mode, or isStrict still nil (same fallback as extractNameFromLine:): everything after the first whitespace of the trimmed line with whitespace removed, or an empty string when there is no whitespace." + + | nameSeq nameEnd | + isStrict == true ifTrue: [ "identity test so an unset (nil) isStrict falls through to the relaxed rule instead of failing" + aLine size <= 10 ifTrue: [ ^ '' ]. + ^ self cleanSequence: (aLine copyFrom: 11 to: aLine size) ]. + "Relaxed: find end of name, rest is sequence" + nameSeq := aLine trimBoth. + nameEnd := self indexOfFirstWhitespaceIn: nameSeq. + nameEnd = 0 ifTrue: [ ^ '' ]. + ^ self cleanSequence: + (nameSeq copyFrom: nameEnd + 1 to: nameSeq size) ] { #category : 'accessing' } -BioPhylipParser >> buildTokensFromFirstBlock: aCollection [ +BioPhylipParser >> indexOfFirstWhitespaceIn: aString [ + "Answer the 1-based index of the first separator character in aString (any character answering true to isSeparator, not only space and tab), + or 0 if none." - ^ aCollection second collect: [: seq | seq second withoutBlanks ] + 1 to: aString size do: [ :i | + (aString at: i) isSeparator ifTrue: [ ^ i ] ]. + ^ 0 ] -{ #category : 'accessing-dna' } -BioPhylipParser >> dnaInterleaveLineTokenizer [ +{ #category : 'initialization' } +BioPhylipParser >> initialize [ - ^ (self dnaInterleaveSequenceTokenizer , #newline asPParser) star + super initialize. + taxaNames := OrderedCollection new. + sequences := OrderedCollection new. + currentLine := 0. + lines := #( ).
+ errorLine := 0 ] -{ #category : 'accessing-dna' } -BioPhylipParser >> dnaInterleaveSequenceTokenizer [ +{ #category : 'accessing' } +BioPhylipParser >> isInterleaved [ "Answer the isInterleaved flag." ^ isInterleaved +] - ^ #dnaLetter asPParser trimBlanks star flatten +{ #category : 'accessing' } +BioPhylipParser >> isInterleaved: aBool [ "Set the isInterleaved flag to aBool." isInterleaved := aBool ] -{ #category : 'accessing-private' } -BioPhylipParser >> firstLineTokenizer [ - " Answer a Parser for parsing the first line of the format " - - ^ (#number asPParser / self parserForAnyButNumber) , - (self parserForAnyButNumber) , - #blank asPParser plus optional flatten , - #newline asPParser ==> [ : node | - node asOrderedCollection - removeAllSuchThat: [ : elem | elem allSatisfy: [ : e | e = Character space ] ]; - copyWithoutAll: { - Character lf asString . - Character cr asString } ] -] - -{ #category : 'accessing-dna' } -BioPhylipParser >> parseInterleavedDNA: aString [ - " Answer an object with the result of parsing aString with the receiver's parser " - - | parseResults | - - parseResults := self parseString: aString. - ^ self isSuccess - ifTrue: [ results := self buildDNAResults: parseResults ] - ifFalse: [ self signalInvalidObject: parseResults ]. +{ #category : 'testing' } +BioPhylipParser >> isSequenceChar: c [ + "^ true if c occurs in the literal below: the IUPAC nucleotide codes in upper and lower case plus hyphen, question mark and period. NOTE(review): letters used only by amino acids (for example L, F, P, E, I, Q) are absent from the literal, so protein residues are only partly covered - confirm whether that is intended." + + ^ 'ACGTURYNWSMKHDVBacgturynwsmkhbdvb-?.' includes: c ] -{ #category : 'accessing-protein' } -BioPhylipParser >> parseInterleavedProtein: aString [ - " Answer an object with the result of parsing aString with the receiver's parser " - - | parseResults | - - parseResults := self parseString: aString. - ^ self isSuccess - ifTrue: [ results := self buildProteinResults: parseResults ] - ifFalse: [ self signalInvalidObject: parseResults ].
+{ #category : 'accessing' } +BioPhylipParser >> isStrict [ "Answer the isStrict flag." ^ isStrict ] -{ #category : 'accessing-private' } -BioPhylipParser >> parserForAnyButNumber [ +{ #category : 'accessing' } +BioPhylipParser >> isStrict: aBool [ "Set the isStrict flag to aBool." isStrict := aBool +] - ^ #digit asPParser negate plus , #number asPParser ==> [: n | n second ] +{ #category : 'private' } +BioPhylipParser >> lineStartsWithName: aLine [ + "Check if aLine starts with a taxon name (vs being a pure sequence continuation line). + In strict mode: the first 10 chars contain a name followed by spaces/padding. + In relaxed mode: the line has a name (non-sequence-like prefix) followed by whitespace + and then sequence data. Answers false for a blank line. The strict check reads the first 10 characters of the trimmed line." + + | trimmed wsIdx potentialName first10 | + trimmed := aLine trimBoth. + trimmed isEmpty ifTrue: [ ^ false ]. + isStrict ifTrue: [ + trimmed size < 10 ifTrue: [ ^ true ]. "Short lines in strict mode must be names" + first10 := (trimmed copyFrom: 1 to: 10) trimBoth. + ^ (self looksLikeSequence: first10) not ]. + "Relaxed: check for name + whitespace + sequence pattern" + wsIdx := self indexOfFirstWhitespaceIn: trimmed. + wsIdx = 0 ifTrue: [ ^ false ]. "No whitespace = pure sequence continuation" + wsIdx > 50 ifTrue: [ ^ false ]. "Whitespace too far in = sequence not name" + potentialName := trimmed copyFrom: 1 to: wsIdx - 1.
+ ^ (self looksLikeSequence: potentialName) not ] -{ #category : 'accessing-dna' } -BioPhylipParser >> speciesDNALineTokenizer [ - " Answer a Parser for parsing the species names line " - - ^ ((PP2PredicateObjectNode noneOf: self speciesFobiddenNames) times: 10) flatten , - self dnaInterleaveSequenceTokenizer +{ #category : 'accessing' } +BioPhylipParser >> lines: anArray [ "Set the lines to be parsed to anArray." lines := anArray ] -{ #category : 'accessing-dna' } -BioPhylipParser >> speciesDNANamedBlockTokenizer [ - " Answer a Parser for parsing the sequence blocks " - - ^ (self speciesDNALineTokenizer , #newline asPParser flatten) star +{ #category : 'private' } +BioPhylipParser >> looksLikeSequence: aString [ + "Check if aString looks like sequence data (mostly IUPAC characters). Answers true when more than 90 percent of its characters satisfy isSequenceChar:, and false for an empty string." + + | seqChars totalChars ratio | + totalChars := aString size. + totalChars = 0 ifTrue: [ ^ false ]. + seqChars := 0. + aString do: [ :c | + (self isSequenceChar: c) ifTrue: [ seqChars := seqChars + 1 ] ]. + ratio := seqChars / totalChars. + ^ ratio > 0.9 ] -{ #category : 'accessing-private' } -BioPhylipParser >> speciesFobiddenNames [ - " Private - Answer a with receiver's not allowed Characters in a species name " - - ^ OrderedCollection new - add: Character cr; - add: Character lf; - add: $[; - add: $]; - add: $(; - add: $); - add: $:; - add: $;; - add: $,; - yourself +{ #category : 'private' } +BioPhylipParser >> nextLineLooksLikeContinuation: aLine [ + "Check if aLine looks like a sequence continuation (no name prefix). Answers true for a blank line and, in strict mode, for any trimmed line shorter than 10 characters." + + | trimmed wsIdx potentialName first10 | + trimmed := aLine trimBoth. + trimmed isEmpty ifTrue: [ ^ true ]. + isStrict ifTrue: [ + trimmed size >= 10 ifTrue: [ + first10 := (trimmed copyFrom: 1 to: 10) trimBoth. + first10 isEmpty ifTrue: [ ^ true ]. + ^ self looksLikeSequence: first10 ]. + ^ true ]. + "Relaxed: check if line starts with a name" + wsIdx := self indexOfFirstWhitespaceIn: trimmed. + wsIdx = 0 ifTrue: [ ^ true ]. + wsIdx > 30 ifTrue: [ ^ true ].
+ potentialName := trimmed copyFrom: 1 to: wsIdx - 1. + ^ self looksLikeSequence: potentialName +] + +{ #category : 'private' } +BioPhylipParser >> nextLineLooksLikeNewTaxon: aLine [ + "Check if aLine looks like a new taxon name + sequence (vs pure sequence continuation). + Answers true if it looks like a new taxon line; detectInterleaved treats that as evidence of a sequential layout. + Answers false for a blank line and for a line that looks like a pure sequence continuation." + + | trimmed wsIdx potentialName first10 | + trimmed := aLine trimBoth. + trimmed isEmpty ifTrue: [ ^ false ]. + isStrict ifTrue: [ "Strict: first 10 chars should be a name (not sequence chars)" + trimmed size < 10 ifTrue: [ ^ (self looksLikeSequence: trimmed) not ]. "Parenthesised: a bare trailing not would be sent to the String before the keyword message" + first10 := (trimmed copyFrom: 1 to: 10) trimBoth. + first10 isEmpty ifTrue: [ ^ false ]. + ^ (self looksLikeSequence: first10) not ]. + "Relaxed: name ends at first whitespace, check if it looks like a name" + wsIdx := self indexOfFirstWhitespaceIn: trimmed. + wsIdx = 0 ifTrue: [ ^ false ]. "No whitespace = pure sequence, not a name" + wsIdx > 30 ifTrue: [ ^ false ]. "Whitespace very far in = probably sequence not name" + potentialName := trimmed copyFrom: 1 to: wsIdx - 1. + ^ (self looksLikeSequence: potentialName) not ] { #category : 'accessing' } -BioPhylipParser >> taxaNumberFrom: aCollection [ +BioPhylipParser >> numChars [ "Answer the character count read from the header." ^ numChars +] - ^ aCollection first first asNumber +{ #category : 'accessing' } +BioPhylipParser >> numTaxa [ "Answer the taxa count read from the header." ^ numTaxa ] -{ #category : 'accessing-dna' } -BioPhylipParser >> tokenizeInterleavedDNA [ - " Private - Tokenize the receiver's epression as DNA data " +{ #category : 'parsing' } +BioPhylipParser >> parseAsInterleaved: aString [ + "Parse assuming interleaved format. Resets the receiver, drops empty lines, reads the header, parses the body as interleaved, validates, and answers the alignment built by asAlignment." + + self initialize. + lines := (aString lines select: [ :l | l notEmpty ]) asArray. + self parseHeader. + isInterleaved := true. + self detectStrictOrRelaxed. + self parseBodyInterleaved. + self validate.
+ ^ self asAlignment +] - parser := - ( self firstLineTokenizer , - self speciesDNANamedBlockTokenizer , - self dnaInterleaveLineTokenizer ) ==> self buildTokensBlock. - ^ self tokenize. - - +{ #category : 'parsing' } +BioPhylipParser >> parseAsSequential: aString [ + "Parse assuming sequential format. Resets the receiver, drops empty lines, reads the header, parses the body as sequential, validates, and answers the alignment built by asAlignment." + + self initialize. + lines := (aString lines select: [ :l | l notEmpty ]) asArray. + self parseHeader. + isInterleaved := false. + self detectStrictOrRelaxed. + self parseBodySequential. + self validate. + ^ self asAlignment ] -{ #category : 'accessing-protein' } -BioPhylipParser >> tokenizeInterleavedProtein [ - " Private - Tokenize the receiver's epression as Protein data " +{ #category : 'parsing' } +BioPhylipParser >> parseBody [ + "Parse the data body after the header. Dispatches to sequential or interleaved parser according to isInterleaved." - parser := - self firstLineTokenizer , - (((PP2PredicateObjectNode noneOf: self speciesFobiddenNames) times: 10) flatten , - #proteinLetterGapped asPParser trimBlanks star flatten , - #newline asPParser) star , - (#proteinLetterGapped asPParser trimBlanks star flatten , #newline asPParser) star ==> self buildTokensBlock. - ^ self tokenize. - + isInterleaved + ifTrue: [ self parseBodyInterleaved ] + ifFalse: [ self parseBodySequential ] +] + +{ #category : 'parsing' } +BioPhylipParser >> parseBodyInterleaved [ + "Parse interleaved format: first block has N name+seq lines, subsequent blocks are seq-only. Blank lines are skipped; signals an Error when the input ends before numTaxa name lines have been read." + + | totalRead | + numTaxa timesRepeat: [ + [ + currentLine <= lines size and: [ + (lines at: currentLine) trimBoth isEmpty ] ] whileTrue: [ + currentLine := currentLine + 1 ]. + currentLine > lines size ifTrue: [ + Error signal: 'Unexpected end of input in interleaved block' ]. + taxaNames add: (self extractNameFromLine: (lines at: currentLine)). + sequences add: + (self extractSequenceFromLine: (lines at: currentLine)). + currentLine := currentLine + 1 ]. + totalRead := 0.
+ [ currentLine <= lines size ] whileTrue: [ + | line trimmed | + line := lines at: currentLine. + trimmed := line trimBoth. + trimmed isEmpty + ifTrue: [ currentLine := currentLine + 1 ] + ifFalse: [ + | idx | "Round-robin: the k-th non-blank continuation line (k counted from 0) is appended to the sequence at index (k \\ numTaxa) + 1." + idx := totalRead \\ numTaxa. + idx := idx + 1. + sequences + at: idx + put: (sequences at: idx) , (self cleanSequence: trimmed). + totalRead := totalRead + 1. + currentLine := currentLine + 1 ] ] +] + +{ #category : 'parsing' } +BioPhylipParser >> parseBodySequential [ + "Parse sequential format: each taxon's data appears on consecutive lines. + A taxon starts with a name line, followed by optional continuation lines. + We detect name lines by checking if the line starts with a taxon name + (has a recognizable name prefix followed by whitespace then sequence data). Once numTaxa names have been read every further non-blank line is treated as a continuation. charsRead is updated but not consulted in this method." + + | taxonIndex currentSeq charsRead | + taxonIndex := 0. + currentSeq := ''. + charsRead := 0. + [ currentLine <= lines size ] whileTrue: [ + | line trimmed name seq | + line := lines at: currentLine. + trimmed := line trimBoth. + trimmed isEmpty + ifTrue: [ currentLine := currentLine + 1 ] + ifFalse: [ + (taxonIndex < numTaxa and: [ self lineStartsWithName: line ]) + ifTrue: [ "New taxon line - save previous sequence if any" + taxonIndex > 0 ifTrue: [ sequences add: currentSeq ]. + name := self extractNameFromLine: line. + seq := self extractSequenceFromLine: line. + taxaNames add: name. + currentSeq := seq. + charsRead := seq size. + taxonIndex := taxonIndex + 1 ] + ifFalse: [ "Continuation of current taxon" + seq := self cleanSequence: trimmed. + currentSeq := currentSeq , seq. + charsRead := charsRead + seq size ]. + currentLine := currentLine + 1 ] ]. + "Add last sequence" + currentSeq ifNotEmpty: [ sequences add: currentSeq ] +] + +{ #category : 'parsing' } +BioPhylipParser >> parseFile: aFileReference [ + "Parse a PHYLIP format file. Answer the result of parseString: applied to the contents of aFileReference."
+ + ^ self parseString: aFileReference asFileReference contents +] + +{ #category : 'parsing' } +BioPhylipParser >> parseHeader [ + "Read the taxa count and the character count from the first line, store them in numTaxa and numChars, and leave currentLine at 2. Signals an Error when there are no lines, when either count cannot be read, when fewer than 2 taxa are declared, or when fewer than 1 character is declared." + | header firstNum secondNum | + lines isEmpty ifTrue: [ "test the input itself, not the cursor, so the guard does not depend on the value currentLine had before the call" + Error signal: 'Empty PHYLIP input' ]. + header := lines first trimBoth. + firstNum := self readFirstNumber: header. + secondNum := self readSecondNumber: header. + firstNum ifNil: [ Error signal: 'Cannot read taxa count' ]. + secondNum ifNil: [ Error signal: 'Cannot read character count' ]. + firstNum < 2 ifTrue: [ Error signal: 'Need at least 2 taxa' ]. + secondNum < 1 ifTrue: [ Error signal: 'Need at least 1 character' ]. + numTaxa := firstNum. + numChars := secondNum. + currentLine := 2 +] + +{ #category : 'parsing' } +BioPhylipParser >> parseString: aString [ + "Parse a PHYLIP format string. Auto-detect sequential/interleaved. + Try sequential first; if that attempt signals any Error, try interleaved. An Error from the interleaved attempt is not caught here." + + | aln | + aln := [ self parseAsSequential: aString ] + on: Error + do: [ nil ]. + aln ifNotNil: [ ^ aln ]. + ^ self parseAsInterleaved: aString +] + +{ #category : 'private' } +BioPhylipParser >> readFirstNumber: aString [ + "Read the first integer from the header line: the text before the first space, or before the first tab when there is no space. Answers nil when the line has neither, or when the conversion signals an Error." + + | idx | + idx := aString indexOf: Character space startingAt: 1. + idx = 0 ifTrue: [ + idx := aString indexOf: Character tab startingAt: 1 ]. + idx = 0 ifTrue: [ ^ nil ]. + ^ [ (aString copyFrom: 1 to: idx - 1) asInteger ] + on: Error + do: [ nil ] +] + +{ #category : 'private' } +BioPhylipParser >> readSecondNumber: aString [ + "Read the second integer from the header line: the trimmed text after the first space, or after the first tab when there is no space. Answers nil when the line has neither, or when the conversion signals an Error." + + | idx rest | + idx := aString indexOf: Character space startingAt: 1. + idx = 0 ifTrue: [ + idx := aString indexOf: Character tab startingAt: 1 ]. + idx = 0 ifTrue: [ ^ nil ]. + rest := (aString copyFrom: idx + 1 to: aString size) trimBoth. + ^ [ rest asInteger ] + on: Error + do: [ nil ] +] + +{ #category : 'initialization' } +BioPhylipParser >> resetForParse: aString [ + "Reset parser state and set up lines from aString."
+ + self initialize. + lines := (aString lines select: [ :l | l notEmpty ]) asArray. + currentLine := 2. "NOTE(review): parseHeader also assigns currentLine := 2 when it finishes, so this assignment looks redundant - confirm." + self parseHeader +] + +{ #category : 'accessing' } +BioPhylipParser >> sequences [ "Answer the collection of parsed sequence strings." ^ sequences +] + +{ #category : 'accessing' } +BioPhylipParser >> taxaNames [ "Answer the collection of parsed taxon names." ^ taxaNames +] + +{ #category : 'private' } +BioPhylipParser >> tryParseInterleaved: aString [ + "Try to parse assuming interleaved format. Signal error on failure. On success the alignment is answered from inside the protected block. NOTE(review): restoreTo: is not defined anywhere in the visible code - confirm it exists, otherwise the handler fails before ex pass is reached." + + | savedState | + savedState := self deepCopy. + [ + self resetForParse: aString. + isInterleaved := true. + self detectStrictOrRelaxed. + self parseBodyInterleaved. + self validate. + ^ self asAlignment ] + on: Error + do: [ :ex | "Restore state and re-raise" + savedState restoreTo: self. + ex pass ] +] + +{ #category : 'private' } +BioPhylipParser >> tryParseSequential: aString [ + "Try to parse assuming sequential format. Performs the same steps as parseAsSequential: and lets any Error propagate." + + self initialize. + lines := (aString lines select: [ :l | l notEmpty ]) asArray. + self parseHeader. + isInterleaved := false. + self detectStrictOrRelaxed. + self parseBodySequential. + self validate. + ^ self asAlignment +] + +{ #category : 'accessing' } +BioPhylipParser >> validate [ + "Validate the parsed data: the number of names and the number of sequences must both equal numTaxa. Signals an Error describing the first mismatch." + + taxaNames size = numTaxa ifFalse: [ + Error signal: 'Expected ' , numTaxa asString , ' taxa but found ' + , taxaNames size asString ]. + sequences size = numTaxa ifFalse: [ + Error signal: + 'Expected ' , numTaxa asString , ' sequences but found ' + , sequences size asString ].
+ 1 to: numTaxa do: [ :i | "each sequence must have exactly numChars characters" + (sequences at: i) size = numChars ifFalse: [ + Error signal: 'Sequence for ' , (taxaNames at: i) , ' has ' + , (sequences at: i) size asString + , ' chars but header specifies ' , numChars asString ] ] ] diff --git a/repository/BioParsers/BioPhylipPetitParser.class.st b/repository/BioParsers/BioPhylipPetitParser.class.st new file mode 100644 index 00000000..140b5dd8 --- /dev/null +++ b/repository/BioParsers/BioPhylipPetitParser.class.st @@ -0,0 +1,218 @@ +" +Documentation taken from http://bioweb2.pasteur.fr/docs/phylip/doc/main.html#inputfiles + +" +Class { + #name : 'BioPhylipPetitParser', + #superclass : 'BioAbstractTextParser', + #category : 'BioParsers-Core', + #package : 'BioParsers', + #tag : 'Core' +} + +{ #category : 'accessing-dna' } +BioPhylipPetitParser >> buildDNAResults: aCollection [ + " Answer a BioPhylip holding the taxa count, the character count and one ambiguous DNA BioSequence per taxon, built from the parsing output aCollection " + + | tokenized sequences | + + tokenized := self buildTokens: aCollection. + sequences := tokenized third + with: tokenized fourth + collect: [: first : snd | BioSequence newAmbiguousDNA: snd named: first ]. " with:collect: answers the new sequences; with:do: would answer the receiver, i.e. the names " + ^ BioPhylip new + numberOfTaxa: tokenized first; + numberOfCharacters: tokenized second; + sequences: sequences. + +] + +{ #category : 'accessing-protein' } +BioPhylipPetitParser >> buildProteinResults: aCollection [ + " Answer a BioPhylip holding the taxa count, the character count and one protein BioSequence per taxon, built from the parsing output aCollection " + + | tokenized sequences | + + tokenized := self buildTokens: aCollection. + sequences := tokenized third + with: tokenized fourth + collect: [: first : snd | BioSequence newProtein: snd named: first ]. " with:collect: answers the new sequences; with:do: would answer the receiver, i.e. the names " + ^ BioPhylip new + numberOfTaxa: tokenized first; + numberOfCharacters: tokenized second; + sequences: sequences.
+] + +{ #category : 'accessing' } +BioPhylipPetitParser >> buildTokens: aCollection [ + " Answer an Array of four elements taken from the parsing output aCollection: the number from taxaNumberFrom:, the second header value as a number, the first element of every entry in aCollection second, and the result of buildTokensFrom: " + + ^ Array + with: (self taxaNumberFrom: aCollection) + with: (aCollection first second asNumber) + with: (aCollection second collect: #first) + with: (self buildTokensFrom: aCollection). + + +] + +{ #category : 'accessing-private' } +BioPhylipPetitParser >> buildTokensBlock [ + " Answer a one-argument block that reshapes a parse node into an OrderedCollection of three elements: node first, node second with the last item of each entry dropped, and the blank-stripped first items of those entries of node third whose first item is not empty " + ^ [: node | + OrderedCollection + with: node first + with: (node second collect: #allButLast) + with: (((node third reject: [: line | line first isEmpty ]) collect: #first) collect: #withoutBlanks ) ] +] + +{ #category : 'accessing' } +BioPhylipPetitParser >> buildTokensFrom: aCollection [ + " Answer the strings from buildTokensFromFirstBlock: with every entry of aCollection third appended in turn, cycling through positions 1 to taxaNumber " + | taxaNumber collection seqIndex seqBlock | + + taxaNumber := self taxaNumberFrom: aCollection. + collection := self buildTokensFromFirstBlock: aCollection. + seqIndex := 1. + (seqBlock := aCollection third) doWithIndex: [:seq :index | + seqIndex = (taxaNumber + 1) + ifTrue: [seqIndex := 1]. + index <= seqBlock size + ifFalse: [ ^ collection ]. + collection + at: seqIndex + put: (String + streamContents: [:str | str + nextPutAll: (collection at: seqIndex); + nextPutAll: (seqBlock at: index)]). + seqIndex := seqIndex + 1].
+ ^ collection + +] + +{ #category : 'accessing' } +BioPhylipPetitParser >> buildTokensFromFirstBlock: aCollection [ + " Answer the second element of every entry in aCollection second, with blanks removed " + ^ aCollection second collect: [: seq | seq second withoutBlanks ] +] + +{ #category : 'accessing-dna' } +BioPhylipPetitParser >> dnaInterleaveLineTokenizer [ + " Answer a parser built from dnaInterleaveSequenceTokenizer followed by a newline, repeated with star " + ^ (self dnaInterleaveSequenceTokenizer , #newline asPParser) star +] + +{ #category : 'accessing-dna' } +BioPhylipPetitParser >> dnaInterleaveSequenceTokenizer [ + " Answer a parser built from the dnaLetter parser with trimBlanks, star and flatten applied " + ^ #dnaLetter asPParser trimBlanks star flatten +] + +{ #category : 'accessing-private' } +BioPhylipPetitParser >> firstLineTokenizer [ + " Answer a Parser for parsing the first line of the format. The attached action removes elements made only of spaces and answers a copy without the lf and cr strings " + + ^ (#number asPParser / self parserForAnyButNumber) , + (self parserForAnyButNumber) , + #blank asPParser plus optional flatten , + #newline asPParser ==> [ : node | + node asOrderedCollection + removeAllSuchThat: [ : elem | elem allSatisfy: [ : e | e = Character space ] ]; + copyWithoutAll: { + Character lf asString . + Character cr asString } ] +] + +{ #category : 'accessing-dna' } +BioPhylipPetitParser >> parseInterleavedDNA: aString [ + " Answer an object with the result of parsing aString with the receiver's parser. On success the value of buildDNAResults: is stored in results and answered; otherwise signalInvalidObject: is sent with the parse results " + + | parseResults | + + parseResults := self parseString: aString. + ^ self isSuccess + ifTrue: [ results := self buildDNAResults: parseResults ] + ifFalse: [ self signalInvalidObject: parseResults ]. + +] + +{ #category : 'accessing-protein' } +BioPhylipPetitParser >> parseInterleavedProtein: aString [ + " Answer an object with the result of parsing aString with the receiver's parser. On success the value of buildProteinResults: is stored in results and answered; otherwise signalInvalidObject: is sent with the parse results " + + | parseResults | + + parseResults := self parseString: aString. + ^ self isSuccess + ifTrue: [ results := self buildProteinResults: parseResults ] + ifFalse: [ self signalInvalidObject: parseResults ]. +
+] + +{ #category : 'accessing-private' } +BioPhylipPetitParser >> parserForAnyButNumber [ + " Answer a parser for one or more non-digit characters followed by a number; its action answers the second parsed element " + ^ #digit asPParser negate plus , #number asPParser ==> [: n | n second ] +] + +{ #category : 'accessing-dna' } +BioPhylipPetitParser >> speciesDNALineTokenizer [ + " Answer a Parser for parsing the species names line: 10 characters that are not in speciesFobiddenNames, followed by the DNA sequence tokenizer " + + ^ ((PP2PredicateObjectNode noneOf: self speciesFobiddenNames) times: 10) flatten , + self dnaInterleaveSequenceTokenizer +] + +{ #category : 'accessing-dna' } +BioPhylipPetitParser >> speciesDNANamedBlockTokenizer [ + " Answer a Parser for parsing the sequence blocks " + + ^ (self speciesDNALineTokenizer , #newline asPParser flatten) star +] + +{ #category : 'accessing-private' } +BioPhylipPetitParser >> speciesFobiddenNames [ + " Private - Answer a Collection with the Characters that are not allowed in a species name " + + ^ OrderedCollection new + add: Character cr; + add: Character lf; + add: $[; + add: $]; + add: $(; + add: $); + add: $:; + add: $;; + add: $,; + yourself +] + +{ #category : 'accessing' } +BioPhylipPetitParser >> taxaNumberFrom: aCollection [ + " Answer the first element of the first entry of aCollection converted to a number " + ^ aCollection first first asNumber +] + +{ #category : 'accessing-dna' } +BioPhylipPetitParser >> tokenizeInterleavedDNA [ + " Private - Tokenize the receiver's expression as DNA data " + + parser := + ( self firstLineTokenizer , + self speciesDNANamedBlockTokenizer , + self dnaInterleaveLineTokenizer ) ==> self buildTokensBlock. + ^ self tokenize. + + +] + +{ #category : 'accessing-protein' } +BioPhylipPetitParser >> tokenizeInterleavedProtein [ + " Private - Tokenize the receiver's expression as Protein data " + + parser := + self firstLineTokenizer , + (((PP2PredicateObjectNode noneOf: self speciesFobiddenNames) times: 10) flatten , + #proteinLetterGapped asPParser trimBlanks star flatten , + #newline asPParser) star , + (#proteinLetterGapped asPParser trimBlanks star flatten , #newline asPParser) star ==> self buildTokensBlock. + ^ self tokenize. + +]