From cee6bfd69205a753cd59ac13d196695a6a777538 Mon Sep 17 00:00:00 2001 From: Hernan Morales Date: Mon, 27 Apr 2026 15:38:56 -0300 Subject: [PATCH 1/2] Add GFF3 parser Repackaging BioParserTests -> BioParser-Tests --- .../BaselineOfBioSmalltalk.class.st | 6 +- .../BioFormatters/BioGFF3Formatter.class.st | 52 - .../BioAbstractFASTAParserTest.class.st | 355 ++++ .../BioAbstractParserTest.class.st | 22 + .../BioAccessionParserTest.class.st | 136 ++ .../BioCGCParserTest.class.st | 31 + .../BioDNANucleotideParserTest.class.st | 66 + .../BioDNASequenceParserTest.class.st | 45 + .../BioDegenerateBaseParserTest.class.st | 110 ++ .../BioEMBLParserTest.class.st | 33 + ...trezXMLGenBankAccessionParserTest.class.st | 1445 +++++++++++++++++ .../BioFASTAParserTest.class.st | 515 ++++++ .../BioParsers-Tests/BioGFF3Test.class.st | 315 ++++ .../BioGenBankParserTest.class.st | 51 + .../BioGenIdParserTest.class.st | 33 + .../BioMAFParserTest.class.st | 81 + .../BioNCBIIdParserTest.class.st | 83 + .../BioPhylipParserTest.class.st | 254 +++ .../BioProteinParserTest.class.st | 81 + .../BioSwissProtParserTest.class.st | 38 + repository/BioParsers-Tests/package.st | 1 + .../BioParsers/BioBlastContainerNode.class.st | 4 +- .../BioParsers/BioBlastHitNode.class.st | 4 +- .../BioParsers/BioBlastHspNode.class.st | 4 +- repository/BioParsers/BioBlastNode.class.st | 4 +- .../BioParsers/BioBlastParentNode.class.st | 4 +- .../BioParsers/BioBlastRootNode.class.st | 4 +- .../BioParsers/BioBlastStructureNode.class.st | 4 +- .../BioParsers/BioBlastValueNode.class.st | 4 +- .../BioParsers/BioEResultKeysParser.class.st | 4 +- .../BioParsers/BioEntrezResultParser.class.st | 4 +- .../BioEntrezXMLGBBasicParser.class.st | 4 +- .../BioEntrezXMLGBFullParser.class.st | 4 +- ...ioEntrezXMLGBSeqFeatureQualParser.class.st | 4 +- .../BioEntrezXMLGBSeqFullParser.class.st | 4 +- .../BioEntrezXMLGBSeqJournalParser.class.st | 4 +- .../BioEntrezXMLGBSeqParser.class.st | 4 +- .../BioEntrezXMLGenBankSeqParser.class.st | 4 
+- .../BioEntrezXMLGenSetParser.class.st | 4 +- .../BioParsers/BioFASTABasicParser.class.st | 4 +- .../BioParsers/BioFASTAMultiParser.class.st | 4 +- repository/BioParsers/BioFASTAParser.class.st | 4 +- .../BioGFF3CommentRecordNode.class.st | 34 + .../BioGFF3DirectiveListNode.class.st | 31 + .../BioParsers/BioGFF3DirectiveNode.class.st | 34 + repository/BioParsers/BioGFF3Feature.class.st | 250 +++ .../BioGFF3FeatureLineNode.class.st | 138 ++ .../BioGFF3FeatureListNode.class.st | 58 + repository/BioParsers/BioGFF3File.class.st | 272 ++++ .../BioParsers/BioGFF3GFF3FileNode.class.st | 51 + .../BioGFF3GFF3FileNodeVisitor.class.st | 9 + .../BioParsers/BioNCBIBlastSAXParser.class.st | 4 +- .../BioNCBIBlastSAXTokenizer.class.st | 4 +- .../BioParsers/BioNCBIXMLBlastParser.class.st | 4 +- repository/BioParsers/BioParser.class.st | 14 + repository/BioParsers/BioSAXParser.class.st | 4 +- .../TBioGFF3GFF3FileNodeVisitor.trait.st | 44 + 57 files changed, 4683 insertions(+), 105 deletions(-) delete mode 100644 repository/BioFormatters/BioGFF3Formatter.class.st create mode 100644 repository/BioParsers-Tests/BioAbstractFASTAParserTest.class.st create mode 100644 repository/BioParsers-Tests/BioAbstractParserTest.class.st create mode 100644 repository/BioParsers-Tests/BioAccessionParserTest.class.st create mode 100644 repository/BioParsers-Tests/BioCGCParserTest.class.st create mode 100644 repository/BioParsers-Tests/BioDNANucleotideParserTest.class.st create mode 100644 repository/BioParsers-Tests/BioDNASequenceParserTest.class.st create mode 100644 repository/BioParsers-Tests/BioDegenerateBaseParserTest.class.st create mode 100644 repository/BioParsers-Tests/BioEMBLParserTest.class.st create mode 100644 repository/BioParsers-Tests/BioEntrezXMLGenBankAccessionParserTest.class.st create mode 100644 repository/BioParsers-Tests/BioFASTAParserTest.class.st create mode 100644 repository/BioParsers-Tests/BioGFF3Test.class.st create mode 100644 
repository/BioParsers-Tests/BioGenBankParserTest.class.st create mode 100644 repository/BioParsers-Tests/BioGenIdParserTest.class.st create mode 100644 repository/BioParsers-Tests/BioMAFParserTest.class.st create mode 100644 repository/BioParsers-Tests/BioNCBIIdParserTest.class.st create mode 100644 repository/BioParsers-Tests/BioPhylipParserTest.class.st create mode 100644 repository/BioParsers-Tests/BioProteinParserTest.class.st create mode 100644 repository/BioParsers-Tests/BioSwissProtParserTest.class.st create mode 100644 repository/BioParsers-Tests/package.st create mode 100644 repository/BioParsers/BioGFF3CommentRecordNode.class.st create mode 100644 repository/BioParsers/BioGFF3DirectiveListNode.class.st create mode 100644 repository/BioParsers/BioGFF3DirectiveNode.class.st create mode 100644 repository/BioParsers/BioGFF3Feature.class.st create mode 100644 repository/BioParsers/BioGFF3FeatureLineNode.class.st create mode 100644 repository/BioParsers/BioGFF3FeatureListNode.class.st create mode 100644 repository/BioParsers/BioGFF3File.class.st create mode 100644 repository/BioParsers/BioGFF3GFF3FileNode.class.st create mode 100644 repository/BioParsers/BioGFF3GFF3FileNodeVisitor.class.st create mode 100644 repository/BioParsers/TBioGFF3GFF3FileNodeVisitor.trait.st diff --git a/repository/BaselineOfBioSmalltalk/BaselineOfBioSmalltalk.class.st b/repository/BaselineOfBioSmalltalk/BaselineOfBioSmalltalk.class.st index f0a57868..0af2294d 100644 --- a/repository/BaselineOfBioSmalltalk/BaselineOfBioSmalltalk.class.st +++ b/repository/BaselineOfBioSmalltalk/BaselineOfBioSmalltalk.class.st @@ -83,7 +83,7 @@ BaselineOfBioSmalltalk >> baselineCommonPackages: spec [ package: 'BioBenchmarks' with: [ spec requires: #('BioTools' ). ]; package: 'BioBlast' with: [ spec requires: #('BioWrappers' 'BioParsers' 'BioEntrez' ). ]; package: 'BioBlastSamples' with: [ spec requires: #('BioToolsSamples' ). 
]; - package: 'BioBlastTests' with: [ spec requires: #('BioTools-Tests' 'BioBlast' 'BioParserTests' ). ]; + package: 'BioBlastTests' with: [ spec requires: #('BioTools-Tests' 'BioBlast' 'BioParsers-Tests' ). ]; package: 'BioClassifier' with: [ spec requires: #('BioTools' ). ]; package: 'BioClassifierTests' with: [ spec requires: #('BioTools-Tests' 'BioClassifier' ). ]; package: 'BioEBI' with: [ spec requires: #('BioTools' 'BioWrappers' ). ]; @@ -97,7 +97,7 @@ BaselineOfBioSmalltalk >> baselineCommonPackages: spec [ package: 'BioNCBI' with: [ spec requires: #('BioTools' ). ]; package: 'BioNCBITests' with: [ spec requires: #('BioTools-Tests' 'BioNCBI' ). ]; package: 'BioNGS' with: [ spec requires: #('BioTools' 'BioWrappers' ). ]; - package: 'BioParserTests' with: [ spec requires: #('BioTools-Tests' 'BioParsers' ). ]; + package: 'BioParsers-Tests' with: [ spec requires: #('BioTools-Tests' 'BioParsers' ). ]; package: 'BioParsers' with: [ spec requires: #('BioWrappers' 'BioTools' ). ]; package: 'BioPharoCommon' with: [ spec requires: #('BioTools' ). ]; package: 'BioPharo4' with: [ spec requires: #('BioPharoCommon' ). ]; @@ -236,7 +236,7 @@ BaselineOfBioSmalltalk >> baselineTestsGroup: spec [ 'BioTools-Tests' 'BioBlastTests' 'BioWrapperTests' - 'BioParserTests' + 'BioParsers-Tests' 'BioEntrezTests' 'BioNCBITests' 'BioFormatterTests' diff --git a/repository/BioFormatters/BioGFF3Formatter.class.st b/repository/BioFormatters/BioGFF3Formatter.class.st deleted file mode 100644 index 535c6787..00000000 --- a/repository/BioFormatters/BioGFF3Formatter.class.st +++ /dev/null @@ -1,52 +0,0 @@ -" -The [GFF3 format](https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md) addresses the most common extensions to GFF, while preserving backward compatibility with previous formats. 
- -## Instance Variables - -- sequenceFeature: `BioSequenceFeature` - -" -Class { - #name : 'BioGFF3Formatter', - #superclass : 'BioSequenceFeatureFormatter', - #category : 'BioFormatters-Formatters', - #package : 'BioFormatters', - #tag : 'Formatters' -} - -{ #category : 'accessing' } -BioGFF3Formatter class >> identifiers [ - "Answer a of identifiers of the receiver" - - ^ #('GFF' 'GFF3') -] - -{ #category : 'converting' } -BioGFF3Formatter >> asString [ - " Answer a representation of the receiver " - - ^ String streamContents: [ : outStream | - outStream - nextPutAll: self sequenceFeature name; - nextPut: self delimiter; - nextPutAll: self sequenceFeature sourceTag; - nextPut: self delimiter; - nextPutAll: self sequenceFeature primaryTag; - nextPut: self delimiter; - nextPutAll: self sequenceFeature start asString; - nextPut: self delimiter; - nextPutAll: self sequenceFeature end asString; - nextPut: self delimiter; - nextPutAll: self sequenceFeature score asString; - nextPut: self delimiter; - nextPutAll: self sequenceFeature strand asString; - nextPut: self delimiter; - nextPutAll: self sequenceFeature tag asString ] -] - -{ #category : 'accessing' } -BioGFF3Formatter >> delimiter [ - " Answer a used to delimit fields between the receiver's elements " - - ^ Character tab -] diff --git a/repository/BioParsers-Tests/BioAbstractFASTAParserTest.class.st b/repository/BioParsers-Tests/BioAbstractFASTAParserTest.class.st new file mode 100644 index 00000000..1a8fa7d4 --- /dev/null +++ b/repository/BioParsers-Tests/BioAbstractFASTAParserTest.class.st @@ -0,0 +1,355 @@ +Class { + #name : 'BioAbstractFASTAParserTest', + #superclass : 'BioAbstractParserTest', + #category : 'BioParsers-Tests', + #package : 'BioParsers-Tests' +} + +{ #category : 'samples-single' } +BioAbstractFASTAParserTest >> fastaSeq01 [ + " From http://www.dalkescientific.com/writings/NBN/parsing.html " + + ^ '>YAL069W-1.334 Putative promoter sequence 
+CCACACCACACCCACACACCCACACACCACACCACACACCACACCACACCCACACACACA +CATCCTAACACTACCCTAACACAGCCCTAATCTAACCCTGGCCAACCTGTCTCTCAACTT +ACCCTCCATTACCCTGCCTCCACTCGTTACCCTGTCCCATTCAACCATACCACTCCGAAC +CACCATCCATCCCTCTACTTACTACCACTCACCCACCGTTACCCTCCAATTACCCATATC +CAACCCACTGCCACTTACCCTACCATTACCCTACCATCCACCATGACCTACTCACCATAC +TGTTCTTCTACCCACCATATTGAAACGCTAACAA +' +] + +{ #category : 'samples-single' } +BioAbstractFASTAParserTest >> fastaSeq02 [ + " From http://www.dalkescientific.com/writings/NBN/parsing.html " + + ^ '>YAL068C-7235.2170 Putative promoter sequence +TACGAGAATAATTTCTCATCATCCAGCTTTAACACAAAATTCGCACAGTTTTCGTTAAGA +GAACTTAACATTTTCTTATGACGTAAATGAAGTTTATATATAAATTTCCTTTTTATTGGA +TAATATGCCTATGCCGCATAATTTTTATATCTTTCTCCTAACAAAACATTCGCTTGTAAA +GTATTATATTTAGGAAAAAAATAATCGATAAAGGCTCATCCGAAGATCAGTTAGATTCTT +TTTGCAAGTCCTGAAGAAATTTTCACACTACTACTATAAAAAAAAAATATCATAAAAAGG +TACATTACGTGCAACCAAAAGTGTAAAATGATTGGTTGCAATGTTTCACCTAAATTACTT +' +] + +{ #category : 'samples-single' } +BioAbstractFASTAParserTest >> fastaSeq04 [ + " From http://www.dalkescientific.com/writings/NBN/parsing.html " + + ^ '>YAL068C-7235.2170 Putative promoter sequence +TACGAGAATAATTTCTCATCATCCAGCTTTAACACAAAATTCGCA +CAGTTTTCGTTAAGAGAACTTAACATTTTCTTATGACGTAAATGA +AGTTTATATATAAATTTCCTTTTTATTGGA +' +] + +{ #category : 'samples-single' } +BioAbstractFASTAParserTest >> fastaSeq05 [ + " From http://www.dalkescientific.com/writings/NBN/parsing.html " + + ^ '>YAL068C-7235.2170 Putative promoter sequence +TACGAGAATAATTTCTCATCATCCAGCTTTAACACAAAATTCGCACAGTTTTCGTTAAGA +GAACTTAACATTTTCTTATGACGTAAATGAAGTTTATATATAAATTTCCTTTTTATTGGA +' +] + +{ #category : 'samples-single' } +BioAbstractFASTAParserTest >> fastaSeq08 [ + " From http://www.citizendia.org/FASTA_format " + + ^ '>sp_ac|P02769_WOSIG0 \ID=ALBU_BOVIN \DE="Serum albumin precursor (Allergen Bos d 6) (BSA)" \NCBITAXID=9913 \MODRES=(1|Acetyl) \VARIANT=(196|A|T) \LENGTH=589 +RGVFRRDTHKSEIAHRFKDLGEEHFKGLVLIAFSQYLQQCPFDEHVKLVNELTEFAKTCV 
+ADESHAGCEKSLHTLFGDELCKVASLRETYGDMADCCEKQEPERNECFLSHKDDSPDLPK +LKPDPNTLCDEFKADEKKFWGKYLYEIARRHPYFYAPELLYYANKYNGVFQECCQAEDKG +ACLLPKIETMREKVLASSARQRLRCASIQKFGERALKAWSVARLSQKFPKAEFVEVTKLV +TDLTKVHKECCHGDLLECADDRADLAKYICDNQDTISSKLKECCDKPLLEKSHCIAEVEK +DAIPENLPPLTADFAEDKDVCKNYQEAKDAFLGSFLYEYSRRHPEYAVSVLLRLAKEYEA +TLEECCAKDDPHACYSTVFDKLKHLVDEPQNLIKQNCDQFEKLGEYGFQNALIVRYTRKV +PQVSTPTLVEVSRSLGKVGTRCCTKPESERMPCTEDYLSLILNRLCVLHEKTPVSEKVTK +CCTESLVNRRPCFSALTPDETYVPKAFDEKLFTFHADICTLPDTEKQIKKQTALVELLKH +KPKATEEQLKTVMENFVAFVDKCCAADDKEACFAVEGPKLVVSTQTALA +' +] + +{ #category : 'samples-single' } +BioAbstractFASTAParserTest >> fastaSeq09 [ + " Definition line from the BioPython manual " + + ^ '>gi|6273291|emb|AF191665.1|AF191665 +actgtcgat +atgctagct +' +] + +{ #category : 'samples-multi' } +BioAbstractFASTAParserTest >> multiFastaSeq01 [ + " From http://www.dalkescientific.com/writings/NBN/parsing.html + + thisContext receiver new multiFastaSeq01 + " + + ^ String streamContents: [ : stream | + stream + nextPutAll: self multiFastaSeq01Header01; cr; + nextPutAll: self multiFastaSeq01Body01; cr; + nextPutAll: self multiFastaSeq01Header02; cr; + nextPutAll: self multiFastaSeq01Body02 + ]. 
+ +] + +{ #category : 'samples-multi' } +BioAbstractFASTAParserTest >> multiFastaSeq01Body01 [ + " From http://www.dalkescientific.com/writings/NBN/parsing.html " + + ^ 'CCACACCACACCCACACACCCACACACCACACCACACACCACACCACACCCACACACACA +CATCCTAACACTACCCTAACACAGCCCTAATCTAACCCTGGCCAACCTGTCTCTCAACTT +ACCCTCCATTACCCTGCCTCCACTCGTTACCCTGTCCCATTCAACCATACCACTCCGAAC +CACCATCCATCCCTCTACTTACTACCACTCACCCACCGTTACCCTCCAATTACCCATATC +CAACCCACTGCCACTTACCCTACCATTACCCTACCATCCACCATGACCTACTCACCATAC +TGTTCTTCTACCCACCATATTGAAACGCTAACAA' +] + +{ #category : 'samples-multi' } +BioAbstractFASTAParserTest >> multiFastaSeq01Body02 [ + " From http://www.dalkescientific.com/writings/NBN/parsing.html " + + ^ 'TACGAGAATAATTTCTCATCATCCAGCTTTAACACAAAATTCGCACAGTTTTCGTTAAGA +GAACTTAACATTTTCTTATGACGTAAATGAAGTTTATATATAAATTTCCTTTTTATTGGA +TAATATGCCTATGCCGCATAATTTTTATATCTTTCTCCTAACAAAACATTCGCTTGTAAA +GTATTATATTTAGGAAAAAAATAATCGATAAAGGCTCATCCGAAGATCAGTTAGATTCTT +TTTGCAAGTCCTGAAGAAATTTTCACACTACTACTATAAAAAAAAAATATCATAAAAAGG +TACATTACGTGCAACCAAAAGTGTAAAATGATTGGTTGCAATGTTTCACCTAAATTACTT' +] + +{ #category : 'samples-multi' } +BioAbstractFASTAParserTest >> multiFastaSeq01Header01 [ + " From http://www.dalkescientific.com/writings/NBN/parsing.html " + + ^ 'YAL069W-1.334 Putative promoter sequence' +] + +{ #category : 'samples-multi' } +BioAbstractFASTAParserTest >> multiFastaSeq01Header02 [ + " From http://www.dalkescientific.com/writings/NBN/parsing.html " + + ^ 'YAL068C-7235.2170 Putative promoter sequence' +] + +{ #category : 'samples-multi' } +BioAbstractFASTAParserTest >> multiFastaSeq02 [ + " From http://www.dalkescientific.com/writings/NBN/parsing.html + + thisContext receiver new multiFastaSeq02 + " + + ^ String streamContents: [ : stream | + stream + nextPutAll: self multiFastaSeq02Header01; cr; + nextPutAll: self multiFastaSeq02Body01; cr; + nextPutAll: self multiFastaSeq02Header02; cr; + nextPutAll: self multiFastaSeq02Body02; cr; + nextPutAll: self multiFastaSeq02Header03; cr; + nextPutAll: self 
multiFastaSeq02Body03 + ]. +] + +{ #category : 'samples-multi' } +BioAbstractFASTAParserTest >> multiFastaSeq02Body01 [ + " From http://www.dalkescientific.com/writings/NBN/parsing.html " + + ^ 'TACGAGAATAATTTCTCATCATCCAGCTTTAACACAAAATTCGCA' +] + +{ #category : 'samples-multi' } +BioAbstractFASTAParserTest >> multiFastaSeq02Body02 [ + " From http://www.dalkescientific.com/writings/NBN/parsing.html " + + ^ 'CAGTTTTCGTTAAGAGAACTTAACATTTTCTTATGACGTAAATGA +AGTTTATATATAAATTTCCTTTTTATTGGA' +] + +{ #category : 'samples-multi' } +BioAbstractFASTAParserTest >> multiFastaSeq02Body03 [ + " From http://www.dalkescientific.com/writings/NBN/parsing.html " + + ^ 'GAACTTAACATTTTCTTATGACGTAAATGAAGTTTATATATAAATTTCCTTTTTATTGGA +TAATATGCCTATGCCGCATAATTTTTATATCTTTCTCCTAACAAAACATTCGCTTGTAAA' +] + +{ #category : 'samples-multi' } +BioAbstractFASTAParserTest >> multiFastaSeq02Header01 [ + " From http://www.dalkescientific.com/writings/NBN/parsing.html " + + ^ 'first sequence record' +] + +{ #category : 'samples-multi' } +BioAbstractFASTAParserTest >> multiFastaSeq02Header02 [ + " From http://www.dalkescientific.com/writings/NBN/parsing.html " + + ^ 'second sequence record' +] + +{ #category : 'samples-multi' } +BioAbstractFASTAParserTest >> multiFastaSeq02Header03 [ + " From http://www.dalkescientific.com/writings/NBN/parsing.html " + + ^ 'third sequence record' +] + +{ #category : 'samples-multi' } +BioAbstractFASTAParserTest >> multiFastaSeq02PlainText [ + " From http://www.dalkescientific.com/writings/NBN/parsing.html + + thisContext receiver new multiFastaSeq02PlainText + " + + ^ '>first sequence record +TACGAGAATAATTTCTCATCATCCAGCTTTAACACAAAATTCGCA +>second sequence record +CAGTTTTCGTTAAGAGAACTTAACATTTTCTTATGACGTAAATGA +AGTTTATATATAAATTTCCTTTTTATTGGA +>third sequence record +GAACTTAACATTTTCTTATGACGTAAATGAAGTTTATATATAAATTTCCTTTTTATTGGA +TAATATGCCTATGCCGCATAATTTTTATATCTTTCTCCTAACAAAACATTCGCTTGTAAA' +] + +{ #category : 'samples-multi' } +BioAbstractFASTAParserTest >> multiFastaSeq03 [ + 
" From http://www.citizendia.org/FASTA_format + + thisContext receiver new multiFastaSeq03 + " + + ^ String streamContents: [ : stream | + stream + nextPutAll: self multiFastaSeq03Header01; cr; + nextPutAll: self multiFastaSeq03Body01; cr; + nextPutAll: self multiFastaSeq03Header02; cr; + nextPutAll: self multiFastaSeq03Body02 ]. +] + +{ #category : 'samples-multi' } +BioAbstractFASTAParserTest >> multiFastaSeq03Body01 [ + " From http://www.citizendia.org/FASTA_format " + + ^ 'MTEITAAMVKELRESTGAGMMDCKNALSETNGDFDKAVQLLREKGLGKAAKKADRLAAEG +LVSVKVSDDFTIAAMRPSYLSYEDLDMTFVENEYKALVAELEKENEERRRLKDPNKPEHK +IPQFASRKQLSDAILKEAEEKIKEELKAQGKPEKIWDNIIPGKMNSFIADNSQLDSKLTL +MGQFYVMDDKKTVEQVIAEKEKEFGGKIKIVEFICFEVGEGLEKKTEDFAAEVAAQL' +] + +{ #category : 'samples-multi' } +BioAbstractFASTAParserTest >> multiFastaSeq03Body02 [ + " From http://www.citizendia.org/FASTA_format " + + ^ 'SATVSEINSETDFVAKNDQFIALTKDTTAHIQSNSLQSVEELHSSTINGVKFEEYLKSQI +ATIGENLVVRRFATLKAGANGVVNGYIHTNGRVGVVIAAACDSAEVASKSRDLLRQICMH' +] + +{ #category : 'samples-multi' } +BioAbstractFASTAParserTest >> multiFastaSeq03Header01 [ + " From http://www.citizendia.org/FASTA_format " + + ^ '>SEQUENCE_1' + +] + +{ #category : 'samples-multi' } +BioAbstractFASTAParserTest >> multiFastaSeq03Header02 [ + " From http://www.citizendia.org/FASTA_format " + + ^ '>SEQUENCE_2' +] + +{ #category : 'samples-multi' } +BioAbstractFASTAParserTest >> multiFastaSeq04 [ + " http://www.cbs.dtu.dk/services/NetGene2/fasta.php " + + ^ '>HSBGPG Human gene for bone gla protein (BGP) +GGCAGATTCCCCCTAGACCCGCCCGCACCATGGTCAGGCATGCCCCTCCTCATCGCTGGGCACAGCCCAGAGGGT +ATAAACAGTGCTGGAGGCTGGCGGGGCAGGCCAGCTGAGTCCTGAGCAGCAGCCCAGCGCAGCCACCGAGACACC +ATGAGAGCCCTCACACTCCTCGCCCTATTGGCCCTGGCCGCACTTTGCATCGCTGGCCAGGCAGGTGAGTGCCCC +CACCTCCCCTCAGGCCGCATTGCAGTGGGGGCTGAGAGGAGGAAGCACCATGGCCCACCTCTTCTCACCCCTTTG +GCTGGCAGTCCCTTTGCAGTCTAACCACCTTGTTGCAGGCTCAATCCATTTGCCCCAGCTCTGCCCTTGCAGAGG +GAGAGGAGGGAAGAGCAAGCTGCCCGAGACGCAGGGGAAGGAGGATGAGGGCCCTGGGGATGAGCTGGGGTGAAC 
+CAGGCTCCCTTTCCTTTGCAGGTGCGAAGCCCAGCGGTGCAGAGTCCAGCAAAGGTGCAGGTATGAGGATGGACC +TGATGGGTTCCTGGACCCTCCCCTCTCACCCTGGTCCCTCAGTCTCATTCCCCCACTCCTGCCACCTCCTGTCTG +GCCATCAGGAAGGCCAGCCTGCTCCCCACCTGATCCTCCCAAACCCAGAGCCACCTGATGCCTGCCCCTCTGCTC +CACAGCCTTTGTGTCCAAGCAGGAGGGCAGCGAGGTAGTGAAGAGACCCAGGCGCTACCTGTATCAATGGCTGGG +GTGAGAGAAAAGGCAGAGCTGGGCCAAGGCCCTGCCTCTCCGGGATGGTCTGTGGGGGAGCTGCAGCAGGGAGTG +GCCTCTCTGGGTTGTGGTGGGGGTACAGGCAGCCTGCCCTGGTGGGCACCCTGGAGCCCCATGTGTAGGGAGAGG +AGGGATGGGCATTTTGCACGGGGGCTGATGCCACCACGTCGGGTGTCTCAGAGCCCCAGTCCCCTACCCGGATCC +CCTGGAGCCCAGGAGGGAGGTGTGTGAGCTCAATCCGGACTGTGACGAGTTGGCTGACCACATCGGCTTTCAGGA +GGCCTATCGGCGCTTCTACGGCCCGGTCTAGGGTGTCGCTCTGCTGGCCTGGCCGGCAACCCCAGTTCTGCTCCT +CTCCAGGCACCCTTCTTTCCTCTTCCCCTTGCCCTTGCCCTGACCTCCCAGCCCTATGGATGTGGGGTCCCCATC +ATCCCAGCTGCTCCCAAATAAACTCCAGAAG +>HSGLTH1 Human theta 1-globin gene +CCACTGCACTCACCGCACCCGGCCAATTTTTGTGTTTTTAGTAGAGACTAAATACCATATAGTGAACACCTAAGA +CGGGGGGCCTTGGATCCAGGGCGATTCAGAGGGCCCCGGTCGGAGCTGTCGGAGATTGAGCGCGCGCGGTCCCGG +GATCTCCGACGAGGCCCTGGACCCCCGGGCGGCGAAGCTGCGGCGCGGCGCCCCCTGGAGGCCGCGGGACCCCTG +GCCGGTCCGCGCAGGCGCAGCGGGGTCGCAGGGCGCGGCGGGTTCCAGCGCGGGGATGGCGCTGTCCGCGGAGGA +CCGGGCGCTGGTGCGCGCCCTGTGGAAGAAGCTGGGCAGCAACGTCGGCGTCTACACGACAGAGGCCCTGGAAAG +GTGCGGCAGGCTGGGCGCCCCCGCCCCCAGGGGCCCTCCCTCCCCAAGCCCCCCGGACGCGCCTCACCCACGTTC +CTCTCGCAGGACCTTCCTGGCTTTCCCCGCCACGAAGACCTACTTCTCCCACCTGGACCTGAGCCCCGGCTCCTC +ACAAGTCAGAGCCCACGGCCAGAAGGTGGCGGACGCGCTGAGCCTCGCCGTGGAGCGCCTGGACGACCTACCCCA +CGCGCTGTCCGCGCTGAGCCACCTGCACGCGTGCCAGCTGCGAGTGGACCCGGCCAGCTTCCAGGTGAGCGGCTG +CCGTGCTGGGCCCCTGTCCCCGGGAGGGCCCCGGCGGGGTGGGTGCGGGGGGCGTGCGGGGCGGGTGCAGGCGAG +TGAGCCTTGAGCGCTCGCCGCAGCTCCTGGGCCACTGCCTGCTGGTAACCCTCGCCCGGCACTACCCCGGAGACT +TCAGCCCCGCGCTGCAGGCGTCGCTGGACAAGTTCCTGAGCCACGTTATCTCGGCGCTGGTTTCCGAGTACCGCT +GAACTGTGGGTGGGTGGCCGCGGGATCCCCAGGCGACCTTCCCCGTGTTTGAGTAAAGCCTCTCCCAGGAGCAGC +CTTCTTGCCGTGCTCTCTCGAGGTCAGGACGCGAGAGGAAGGCGC +' +] + +{ #category : 'samples-multi' } +BioAbstractFASTAParserTest >> multiFastaSeq05 [ + " 
http://quma.cdb.riken.jp/help/multiFastaHelp.html " + + ^ '>sequence1 +ACTCCCCGTGCGCGCCCGGCCCGTAGCGTCCTCGTCGCCGCCCCTCGTCTCGCAGCCGCAGCCCGCGTGG +ACGCTCTCGCCTGAGCGCCGCGGACTAGCCCGGGTGGCC +>sequence2 +CAGTCCGGCAGCGCCGGGGTTAAGCGGCCCAAGTAAACGTAGCGCAGCGATCGGCGCCGGAGATTCGCGA +ACCCGACACTCCGCGCCGCCCGCCGGCCAGGACCCGCGGCGCGATCGCGGCGCCGCGCTACAGCCAGCCT +CACTGGCGCGCGGGCGAGCGCACGGGCGCTC +>sequence3 +CACGACAGGCCCGCTGAGGCTTGTGCCAGACCTTGGAAACCTCAGGTATATACCTTTCCAGACGCGGGAT +CTCCCCTCCCC +>sequence4 +CAGCAGACATCTGAATGAAGAAGAGGGTGCCAGCGGGTATGAGGAGTGCATTATCGTTAATGGGAACTTC +AGTGACCAGTCCTCAGACACGAAGGATGCTCCCTCACCCCCAGTCTTGGAGGCAATCTGCACAGAGCCAG +TCTGCACACC' +] + +{ #category : 'samples-multi' } +BioAbstractFASTAParserTest >> multiFastaSeq06 [ + " From http://www.dalkescientific.com/writings/NBN/parsing.html + This is the multiFastaSeq01 with additional lines between records + thisContext receiver new multiFastaSeq06 + " + + ^ String streamContents: [ : stream | + stream + nextPutAll: self multiFastaSeq01Header01; cr; + nextPutAll: self multiFastaSeq01Body01; cr; cr; + nextPutAll: self multiFastaSeq01Header02; cr; + nextPutAll: self multiFastaSeq01Body02 ]. 
+ +] + +{ #category : 'samples-multi' } +BioAbstractFASTAParserTest >> multiFastaSeq07 [ + + ^ '>Sample sequence 1 +garkbdctymvhu + +>Sample sequence 2 +ctymvhgarkbda + +>Sample sequence 3 +ccccccccccga' + +] diff --git a/repository/BioParsers-Tests/BioAbstractParserTest.class.st b/repository/BioParsers-Tests/BioAbstractParserTest.class.st new file mode 100644 index 00000000..8fc4d8d7 --- /dev/null +++ b/repository/BioParsers-Tests/BioAbstractParserTest.class.st @@ -0,0 +1,22 @@ +Class { + #name : 'BioAbstractParserTest', + #superclass : 'BioAbstractTest', + #instVars : [ + 'parser', + 'parseResult' + ], + #category : 'BioParsers-Tests', + #package : 'BioParsers-Tests' +} + +{ #category : 'accessing' } +BioAbstractParserTest >> parser [ + + ^ parser +] + +{ #category : 'accessing' } +BioAbstractParserTest >> parserClass [ + + ^ BioParser +] diff --git a/repository/BioParsers-Tests/BioAccessionParserTest.class.st b/repository/BioParsers-Tests/BioAccessionParserTest.class.st new file mode 100644 index 00000000..915b5093 --- /dev/null +++ b/repository/BioParsers-Tests/BioAccessionParserTest.class.st @@ -0,0 +1,136 @@ +Class { + #name : 'BioAccessionParserTest', + #superclass : 'BioAbstractParserTest', + #category : 'BioParsers-Tests', + #package : 'BioParsers-Tests' +} + +{ #category : 'accessing' } +BioAccessionParserTest >> parserClass [ + " Private - See superimplementor's comment " + + ^ BioAccessionParser +] + +{ #category : 'accessing' } +BioAccessionParserTest >> setUp [ + + super setUp. + parser := self parserClass new +] + +{ #category : 'testing' } +BioAccessionParserTest >> testParseAccession01 [ + + parseResult := self parser parse: 'gi|555|emb|X65215.1|'. + self assert: (parseResult isKindOf: BioAccession). + self assert: parseResult name equals: 'X65215'. + self assert: parseResult version equals: '1'. + +] + +{ #category : 'testing' } +BioAccessionParserTest >> testParseAccession02 [ + + parseResult := self parser parse: 'gi|226437718|gb|AC150860.6|'. 
+ self assert: (parseResult isKindOf: BioAccession). + self assert: parseResult name equals: 'AC150860'. + self assert: parseResult version equals: '6'. + +] + +{ #category : 'testing' } +BioAccessionParserTest >> testParseAccession03 [ + + parseResult := self parser parse: 'gi|207524544|gb|AC226190.2|'. + self assert: (parseResult isKindOf: BioAccession). + self assert: parseResult name equals: 'AC226190'. + self assert: parseResult version equals: '2'. +] + +{ #category : 'testing' } +BioAccessionParserTest >> testParseAccession04 [ + + parseResult := self parser parse: 'gb|AC226190.2|'. + self assert: (parseResult isKindOf: BioAccession). + self assert: parseResult name equals: 'AC226190'. + self assert: parseResult version equals: '2'. +] + +{ #category : 'testing' } +BioAccessionParserTest >> testParseAccession05 [ + + parseResult := self parser parse: 'AC150530.4'. + self assert: (parseResult isKindOf: BioAccession). + self assert: parseResult name equals: 'AC150530'. + self assert: parseResult version equals: '4'. +] + +{ #category : 'testing' } +BioAccessionParserTest >> testParseAccession06 [ + + parseResult := self parser parse: 'AC150707'. + self assert: (parseResult isKindOf: BioAccession). + self assert: parseResult name equals: 'AC150707'. + self deny: parseResult hasVersion. +] + +{ #category : 'testing' } +BioAccessionParserTest >> testParseAccession07 [ + + parseResult := self parser parse: '>gi|198282148|ref|NC_011206.1| Acidithiobacillus ferrooxidans ATCC 53993 chromosome, complete genome'. + self assert: (parseResult isKindOf: BioAccession). + self assert: parseResult name equals: 'NC_011206'. + self assert: parseResult version equals: '1'. +] + +{ #category : 'testing' } +BioAccessionParserTest >> testParseAccession08 [ + + parseResult := self parser parse: '>gi|104773257|ref|NC_008054.1| Lactobacillus delbrueckii subsp. bulgaricus ATCC 11842, complete genome'. + self assert: (parseResult isKindOf: BioAccession). 
+ self assert: parseResult name equals: 'NC_008054'. + self assert: parseResult version equals: '1'. +] + +{ #category : 'testing' } +BioAccessionParserTest >> testTokenizeAccession01 [ + + self + assert: (BioParser tokenizeAccession: 'gi|555|emb|X65215.1|') + equals: #( 'X65215' '1' ). + self + assert: (BioParser tokenizeAccession: 'gi|226437718|gb|AC150860.6|') + equals: #( 'AC150860' '6' ). + self + assert: (BioParser tokenizeAccession: 'gi|207524544|gb|AC226190.2|') + equals: #( 'AC226190' '2' ). + self + assert: + (BioParser tokenizeAccession: 'gi|207524544|gb|AC226190.2345|') + equals: #( 'AC226190' '2345' ). + self + assert: (BioParser tokenizeAccession: 'gb|AC226190.2|') + equals: #( 'AC226190' '2' ). + self + assert: (BioParser tokenizeAccession: 'AC150530.4') + equals: #( 'AC150530' '4' ). + self + assert: (BioParser tokenizeAccession: 'AC150707') + equals: #( 'AC150707' ) +] + +{ #category : 'testing' } +BioAccessionParserTest >> testTokenizeAccession02 [ + + self assert: ( self parser tokenize: 'gi|555|emb|X65215.1|' ) = #('X65215' '1') . + self assert: ( self parser tokenize: 'gi|226437718|gb|AC150860.6|' ) = #('AC150860' '6'). + self assert: ( self parser tokenize: 'gi|207524544|gb|AC226190.2|' ) = #('AC226190' '2'). + self assert: ( self parser tokenize: 'gi|207524544|gb|AC226190.2345|' ) = #('AC226190' '2345'). + self assert: ( self parser tokenize: 'gb|AC226190.2|' ) = #('AC226190' '2'). + self assert: ( self parser tokenize: 'AC150530.4' ) = #('AC150530' '4'). + self assert: ( self parser tokenize: 'AC150707' ) = #('AC150707'). + + self assert: ( ( self parser tokenize: '>gi|198282148|ref|NC_011206.1| Acidithiobacillus ferrooxidans ATCC 53993 chromosome, complete genome' ) = #('NC_011206' '1') ). + self assert: ( ( self parser tokenize: '>gi|104773257|ref|NC_008054.1| Lactobacillus delbrueckii subsp. bulgaricus ATCC 11842, complete genome' ) = #('NC_008054' '1') ). 
+] diff --git a/repository/BioParsers-Tests/BioCGCParserTest.class.st b/repository/BioParsers-Tests/BioCGCParserTest.class.st new file mode 100644 index 00000000..22be415f --- /dev/null +++ b/repository/BioParsers-Tests/BioCGCParserTest.class.st @@ -0,0 +1,31 @@ +Class { + #name : 'BioCGCParserTest', + #superclass : 'BioAbstractParserTest', + #instVars : [ + 'cgcParser' + ], + #category : 'BioParsers-Tests', + #package : 'BioParsers-Tests' +} + +{ #category : 'testing' } +BioCGCParserTest >> cgcSeq01 [ + " From http://www.genomatix.de/online_help/help/sequence_formats.html " + + ^ 'ID AB000263 standard; RNA; PRI; 368 BP. +XX +AC AB000263; +XX +DE Homo sapiens mRNA for prepro cortistatin like peptide, complete cds. +XX +SQ Sequence 368 BP; +AB000263 Length: 368 Check: 4514 .. + 1 acaagatgcc attgtccccc ggcctcctgc tgctgctgct ctccggggcc acggccaccg + 61 ctgccctgcc cctggagggt ggccccaccg gccgagacag cgagcatatg caggaagcgg + 121 caggaataag gaaaagcagc ctcctgactt tcctcgcttg gtggtttgag tggacctccc + 181 aggccagtgc cgggcccctc ataggagagg aagctcggga ggtggccagg cggcaggaag + 241 gcgcaccccc ccagcaatcc gcgcgccggg acagaatgcc ctgcaggaac ttcttctgga + 301 agaccttctc ctcctgcaaa taaaacctca cccatgaatg ctcacgcaag tttaattaca + 361 gacctgaa +' +] diff --git a/repository/BioParsers-Tests/BioDNANucleotideParserTest.class.st b/repository/BioParsers-Tests/BioDNANucleotideParserTest.class.st new file mode 100644 index 00000000..7c955e89 --- /dev/null +++ b/repository/BioParsers-Tests/BioDNANucleotideParserTest.class.st @@ -0,0 +1,66 @@ +Class { + #name : 'BioDNANucleotideParserTest', + #superclass : 'BioAbstractParserTest', + #category : 'BioParsers-Tests', + #package : 'BioParsers-Tests' +} + +{ #category : 'running' } +BioDNANucleotideParserTest >> setUp [ + + super setUp. + parser := #dnaLetter asPParser. + +] + +{ #category : 'testing' } +BioDNANucleotideParserTest >> testDNALetterMatchA [ + + self assert: (parser matches: 'a'). + self assert: (parser matches: 'A'). 
+ +] + +{ #category : 'testing' } +BioDNANucleotideParserTest >> testDNALetterMatchC [ + + self assert: (parser matches: 'c'). + self assert: (parser matches: 'C'). + +] + +{ #category : 'testing' } +BioDNANucleotideParserTest >> testDNALetterMatchG [ + + self assert: (parser matches: 'g'). + self assert: (parser matches: 'G'). + +] + +{ #category : 'testing' } +BioDNANucleotideParserTest >> testDNALetterMatchN [ + + self assert: (parser matches: 'N'). + self assert: (parser matches: 'n'). +] + +{ #category : 'testing' } +BioDNANucleotideParserTest >> testDNALetterMatchT [ + + self assert: (parser matches: 't'). + self assert: (parser matches: 'T'). + +] + +{ #category : 'testing' } +BioDNANucleotideParserTest >> testDNANucelotideEmpty [ + + self deny: (parser matches: String empty) +] + +{ #category : 'testing' } +BioDNANucleotideParserTest >> testDNANucleotideCharacter [ + + self should: [ parser matches: $a ] raise: MessageNotUnderstood. + self should: [ parser matches: nil ] raise: MessageNotUnderstood. +] diff --git a/repository/BioParsers-Tests/BioDNASequenceParserTest.class.st b/repository/BioParsers-Tests/BioDNASequenceParserTest.class.st new file mode 100644 index 00000000..6fb3afe7 --- /dev/null +++ b/repository/BioParsers-Tests/BioDNASequenceParserTest.class.st @@ -0,0 +1,45 @@ +Class { + #name : 'BioDNASequenceParserTest', + #superclass : 'BioAbstractParserTest', + #category : 'BioParsers-Tests', + #package : 'BioParsers-Tests' +} + +{ #category : 'accessing' } +BioDNASequenceParserTest >> setUp [ + + super setUp. + parser := #dnaSequence asPParser. + +] + +{ #category : 'testing' } +BioDNASequenceParserTest >> testDNASeqMatchString [ + + self assert: (parser matches: 'actg'). + self assert: (parser matches: 'ACTG'). + +] + +{ #category : 'testing' } +BioDNASequenceParserTest >> testDNASeqMatches [ + + self assert: (parser matches: 'a'). + self assert: (parser matches: 'A'). + self assert: (parser matches: 'N'). + self assert: (parser matches: 'n'). 
+ +] + +{ #category : 'testing' } +BioDNASequenceParserTest >> testDNASeqNumber [ + + self deny: (parser matches: '8743'). +] + +{ #category : 'testing' } +BioDNASequenceParserTest >> testDNASeqParseEmpty [ + + self deny: (parser matches: String empty). + +] diff --git a/repository/BioParsers-Tests/BioDegenerateBaseParserTest.class.st b/repository/BioParsers-Tests/BioDegenerateBaseParserTest.class.st new file mode 100644 index 00000000..40834565 --- /dev/null +++ b/repository/BioParsers-Tests/BioDegenerateBaseParserTest.class.st @@ -0,0 +1,110 @@ +Class { + #name : 'BioDegenerateBaseParserTest', + #superclass : 'BioAbstractParserTest', + #category : 'BioParsers-Tests', + #package : 'BioParsers-Tests' +} + +{ #category : 'testing' } +BioDegenerateBaseParserTest >> testParseAmbiguousSequenceWithSeparators01 [ + + parseResult := self parserClass parseAmbiguousWithSeparators: 'AT[A/C]TA'. + + self assert: parseResult size equals: 5. + self assert: parseResult asString equals: 'ATMTA'. +] + +{ #category : 'testing' } +BioDegenerateBaseParserTest >> testParseAmbiguousSequenceWithSeparators02 [ + + parseResult := self parserClass parseAmbiguousWithSeparators: '[G/A]ACTGCA'. + + self assert: parseResult size equals: 7. + self assert: parseResult asString equals: 'RACTGCA'. + +] + +{ #category : 'testing' } +BioDegenerateBaseParserTest >> testParseAmbiguousSequenceWithSeparators03 [ + + parseResult := self parserClass parseAmbiguousWithSeparators: 'ACTGCA[T/C]'. + + self assert: parseResult size equals: 7. + self assert: parseResult asString equals: 'ACTGCAY' +] + +{ #category : 'testing' } +BioDegenerateBaseParserTest >> testParseAmbiguousSequenceWithoutSeparators01 [ + + parseResult := self parserClass parseAmbiguousWithoutSeparators: 'AT[AC]TA'. + + self assert: parseResult size equals: 5. + self assert: parseResult asString equals: 'ATMTA'. 
+]
+
+{ #category : 'testing' }
+BioDegenerateBaseParserTest >> testParseAmbiguousSequenceWithoutSeparators02 [
+	"A leading [GA] group (no slash) must come back as the single letter R: 7 elements, printed as RACTGCA."
+	parseResult := self parserClass parseAmbiguousWithoutSeparators: '[GA]ACTGCA'.
+
+	self assert: parseResult size equals: 7.
+	self assert: parseResult asString equals: 'RACTGCA'.
+
+]
+
+{ #category : 'testing' }
+BioDegenerateBaseParserTest >> testParseAmbiguousSequenceWithoutSeparators03 [
+	"A trailing [TC] group (no slash) must come back as the single letter Y: 7 elements, printed as ACTGCAY."
+	parseResult := self parserClass parseAmbiguousWithoutSeparators: 'ACTGCA[TC]'.
+
+	self assert: parseResult size equals: 7.
+	self assert: parseResult asString equals: 'ACTGCAY'
+]
+
+{ #category : 'testing' }
+BioDegenerateBaseParserTest >> testTokenizeAmbiguousSequenceWithoutSeparators [
+	self "Covers one and two slash-less bracket groups placed at the start, middle and end of the input."
+		assert: (self parserClass tokenizeAmbiguousWithoutSeparators: 'AT[AC]TA')
+		equals: 'ATMTA'.
+	self
+		assert: (self parserClass tokenizeAmbiguousWithoutSeparators: '[GA]ACTGCA')
+		equals: 'RACTGCA'.
+	self
+		assert: (self parserClass tokenizeAmbiguousWithoutSeparators: 'ACTGCA[TC]')
+		equals: 'ACTGCAY'.
+	self
+		assert: (self parserClass tokenizeAmbiguousWithoutSeparators: 'AT[AC]TA[CA]')
+		equals: 'ATMTAM'.
+	self
+		assert:
+			(self parserClass tokenizeAmbiguousWithoutSeparators: '[GA]ACT[AG]GCA')
+		equals: 'RACTRGCA'.
+	self
+		assert:
+			(self parserClass tokenizeAmbiguousWithoutSeparators: '[CT]ACTGCA[TC]')
+		equals: 'YACTGCAY'
+]
+
+{ #category : 'testing' }
+BioDegenerateBaseParserTest >> testTokenizeDelimitedAmbiguousSequence [
+
+	self assert: (self parserClass tokenizeAmbiguousWithSeparators: 'AT[A/C]TA') equals: 'ATMTA'.
+	self assert: (self parserClass tokenizeAmbiguousWithSeparators: '[G/A]ACTGCA') equals: 'RACTGCA'.
+	self assert: (self parserClass tokenizeAmbiguousWithSeparators: 'ACTGCA[T/C]') equals: 'ACTGCAY'.
+
+	self assert: (self parserClass tokenizeAmbiguousWithSeparators: 'AT[A/C]TA[C/A]') equals: 'ATMTAM'.
+	self assert: (self parserClass tokenizeAmbiguousWithSeparators: '[G/A]ACT[A/G]GCA') equals: 'RACTRGCA'.
+	self assert: (self parserClass tokenizeAmbiguousWithSeparators: '[C/T]ACTGCA[T/C]') equals: 'YACTGCAY'.
+]
+
+{ #category : 'testing' }
+BioDegenerateBaseParserTest >> testTokenizeResolveAmbiguousSequence [
+	self skip: 'All assertions in this test are currently commented out'. "Without this the empty body would be reported as a pass."
+"	self assert: (self parserClass tokenizeAmbiguousLettersToSeparators: 'ATMTA') equals: 'AT[A/C]TA'.
+	self assert: (self parserClass tokenizeAmbiguousLettersToSeparators: 'RACTGCA') equals: '[G/A]ACTGCA'.
+	self assert: (self parserClass tokenizeAmbiguousLettersToSeparators: 'ACTGCAY') equals: 'ACTGCA[T/C]'."
+
+"	self assert: (self parserClass tokenizeAmbiguousWithSeparators: 'AT[A/C]TA[C/A]') equals: 'ATMTAM'.
+	self assert: (self parserClass tokenizeAmbiguousWithSeparators: '[G/A]ACT[A/G]GCA') equals: 'RACTRGCA'.
+	self assert: (self parserClass tokenizeAmbiguousWithSeparators: '[C/T]ACTGCA[T/C]') equals: 'YACTGCAY'. "
+]
diff --git a/repository/BioParsers-Tests/BioEMBLParserTest.class.st b/repository/BioParsers-Tests/BioEMBLParserTest.class.st
new file mode 100644
index 00000000..f70b17c4
--- /dev/null
+++ b/repository/BioParsers-Tests/BioEMBLParserTest.class.st
@@ -0,0 +1,33 @@
+Class {
+	#name : 'BioEMBLParserTest',
+	#superclass : 'BioAbstractParserTest',
+	#instVars : [
+		'emblParser'
+	],
+	#category : 'BioParsers-Tests',
+	#package : 'BioParsers-Tests'
+}
+
+{ #category : 'testing' }
+BioEMBLParserTest >> emblSeq01 [
+	" From http://www.genomatix.de/online_help/help/sequence_formats.html "
+
+	^ 'An example sequence in EMBL format is: + +ID AB000263 standard; RNA; PRI; 368 BP. +XX +AC AB000263; +XX +DE Homo sapiens mRNA for prepro cortistatin like peptide, complete cds.
+XX +SQ Sequence 368 BP; + acaagatgcc attgtccccc ggcctcctgc tgctgctgct ctccggggcc acggccaccg 60 + ctgccctgcc cctggagggt ggccccaccg gccgagacag cgagcatatg caggaagcgg 120 + caggaataag gaaaagcagc ctcctgactt tcctcgcttg gtggtttgag tggacctccc 180 + aggccagtgc cgggcccctc ataggagagg aagctcggga ggtggccagg cggcaggaag 240 + gcgcaccccc ccagcaatcc gcgcgccggg acagaatgcc ctgcaggaac ttcttctgga 300 + agaccttctc ctcctgcaaa taaaacctca cccatgaatg ctcacgcaag tttaattaca 360 + gacctgaa 368 +// +' +] diff --git a/repository/BioParsers-Tests/BioEntrezXMLGenBankAccessionParserTest.class.st b/repository/BioParsers-Tests/BioEntrezXMLGenBankAccessionParserTest.class.st new file mode 100644 index 00000000..8c6f4492 --- /dev/null +++ b/repository/BioParsers-Tests/BioEntrezXMLGenBankAccessionParserTest.class.st @@ -0,0 +1,1445 @@ +Class { + #name : 'BioEntrezXMLGenBankAccessionParserTest', + #superclass : 'BioAbstractParserTest', + #category : 'BioParsers-Tests', + #package : 'BioParsers-Tests' +} + +{ #category : 'accessing' } +BioEntrezXMLGenBankAccessionParserTest >> gbSet01 [ + " GBSeq size = 1 " + + ^ ' + + + + HQ184032 + 16339 + double + DNA + circular + MAM + 07-JAN-2011 + 07-JAN-2011 + Bos taurus isolate Chi597 mitochondrion, complete genome + HQ184032 + HQ184032.1 + + gb|HQ184032.1| + gi|306977295 + + mitochondrion Bos taurus (cattle) + Bos taurus + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; Eutheria; Laurasiatheria; Cetartiodactyla; Ruminantia; Pecora; Bovidae; Bovinae; Bos + + + 1 + 1..16339 + + Bonfiglio,S. + Achilli,A. + Olivieri,A. + Negrini,R. + Colli,L. + Liotta,L. + Ajmone-Marsan,P. + Torroni,A. + Ferretti,L. + + The Enigmatic Origin of Bovine mtDNA Haplogroup R: Sporadic Interbreeding or an Independent Event of Bos primigenius Domestication in Italy? + PLoS ONE 5 (12), E15760 (2010) + + + doi + 10.1371/journal.pone.0015760 + + + 21209945 + Publication Status: Online-Only + + + 2 + 1..16339 + + Bonfiglio,S. + Achilli,A. + Olivieri,A. + Negrini,R. 
+ Colli,L. + Liotta,L. + Ajmone-Marsan,P. + Torroni,A. + Ferretti,L. + + Direct Submission + Submitted (25-AUG-2010) Dipartimento di Genetica e Microbiologia, University of Pavia, Via Ferrata, 1, Pavia 27100, Italy + + + + + source + 1..16339 + + + 1 + 16339 + HQ184032.1 + + + + + organism + Bos taurus + + + organelle + mitochondrion + + + mol_type + genomic DNA + + + isolate + Chi597 + + + db_xref + taxon:9913 + + + haplogroup + Q2 + + + country + Italy + + + note + breed: Chianina + + + + + D-loop + join(15793..16339,1..362) + + + 15793 + 16339 + HQ184032.1 + + + 1 + 362 + HQ184032.1 + + + join + + + tRNA + 364..430 + + + 364 + 430 + HQ184032.1 + + + + + product + tRNA-Phe + + + + + rRNA + 431..1386 + + + 431 + 1386 + HQ184032.1 + + + + + product + small subunit ribosomal RNA + + + + + tRNA + 1387..1453 + + + 1387 + 1453 + HQ184032.1 + + + + + product + tRNA-Val + + + + + rRNA + 1454..3024 + + + 1454 + 3024 + HQ184032.1 + + + + + product + large subunit ribosomal RNA + + + + + tRNA + 3025..3099 + + + 3025 + 3099 + HQ184032.1 + + + + + product + tRNA-Leu + + + note + codons recognized: UUR + + + + + gene + 3102..4057 + + + 3102 + 4057 + HQ184032.1 + + + + + gene + ND1 + + + + + CDS + 3102..4057 + + + 3102 + 4057 + HQ184032.1 + + + + + gene + ND1 + + + note + TAA stop codon is completed by the addition of 3' A residues to the mRNA + + + codon_start + 1 + + + transl_except + (pos:4056..4057,aa:TERM) + + + transl_table + 2 + + + product + NADH dehydrogenase subunit 1 + + + protein_id + ADN11801.1 + + + db_xref + GI:306977296 + + + translation + MFMINILMLIIPILLAVAFLTLVERKVLGYMQLRKGPNVVGPYGLLQPIADAIKLFIKEPLRPATSSASMFILAPIMALGLALTMWIPLPMPYPLINMNLGVLFILAMSSLAVYSILWSGWASNSKYALIGALRAVAQTISYEVTLAIILLSVLLMSGSFTLSTLITTQEQMWLILPAWPLAMMWFISTLAETNRAPFDLTEGESELVSGFNVEYAAGPFALFFMAEYANIIMMNIFTAILFLGTSHNPHMPELYTINFTIKSLLLTMSFLWIRASYPRFRYDQLMHLLWKNFLPLTLALCMWHVSLPILTSGIPPQT + + + + + tRNA + 4058..4126 + + + 4058 + 4126 + HQ184032.1 + + + + + product + tRNA-Ile + + + + + tRNA + 
complement(4124..4195) + + + 4195 + 4124 + + HQ184032.1 + + + + + product + tRNA-Gln + + + + + tRNA + 4198..4266 + + + 4198 + 4266 + HQ184032.1 + + + + + product + tRNA-Met + + + + + gene + 4267..5308 + + + 4267 + 5308 + HQ184032.1 + + + + + gene + ND2 + + + + + CDS + 4267..5308 + + + 4267 + 5308 + HQ184032.1 + + + + + gene + ND2 + + + note + TAA stop codon is completed by the addition of 3' A residues to the mRNA + + + codon_start + 1 + + + transl_except + (pos:5308,aa:TERM) + + + transl_table + 2 + + + product + NADH dehydrogenase subunit 2 + + + protein_id + ADN11802.1 + + + db_xref + GI:306977297 + + + translation + MNPIIFIIILLTIMLGTIIVMISSHWLLVWIGFEMNMLAIIPIMMKNHNPRATEASTKYFLTQSTASMLLMMAVIINLMFSGQWTVMKLFNPMASMLMTMALAMKLGMAPFHFWVPEVTQGIPLSSGLILLTWQKLAPMSVLYQIFPSINLNLILTLSVLSILIGGWGGLNQTQLRKIMAYSSIAHMGWMTAVLPYNPTMTLLNLIIYIIMTSTMFTMFMANSTTTTLSLSHTWNKTPIMTVLILATLLSMGGLPPLSGFMPKWMIIQEMTKNNSIILPTFMAITALLNLYFYMRLTYSTTLTMFPSTNNMKMKWQFPLMKKMTFLPTMVVLSTMMLPLTPMLSVLE + + + + + tRNA + 5309..5375 + + + 5309 + 5375 + HQ184032.1 + + + + + product + tRNA-Trp + + + + + tRNA + complement(5377..5445) + + + 5445 + 5377 + + HQ184032.1 + + + + + product + tRNA-Ala + + + + + tRNA + complement(5447..5519) + + + 5519 + 5447 + + HQ184032.1 + + + + + product + tRNA-Asn + + + + + rep_origin + 5520..5550 + + + 5520 + 5550 + HQ184032.1 + + + + + note + origin of L-strand replication + + + + + tRNA + complement(5552..5618) + + + 5618 + 5552 + + HQ184032.1 + + + + + product + tRNA-Cys + + + + + tRNA + complement(5619..5686) + + + 5686 + 5619 + + HQ184032.1 + + + + + product + tRNA-Tyr + + + + + gene + 5688..7232 + + + 5688 + 7232 + HQ184032.1 + + + + + gene + COX1 + + + + + CDS + 5688..7232 + + + 5688 + 7232 + HQ184032.1 + + + + + gene + COX1 + + + codon_start + 1 + + + transl_table + 2 + + + product + cytochrome c oxidase subunit I + + + protein_id + ADN11803.1 + + + db_xref + GI:306977298 + + + translation + 
MFINRWLFSTNHKDIGTLYLLFGAWAGMVGTALSLLIRAELGQPGTLLGDDQIYNVVVTAHAFVMIFFMVMPIMIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPSFLLLLASSMVEAGAGTGWTVYPPLAGNLAHAGASVDLTIFSLHLAGVSSILGAINFITTIINMKPPAMSQYQTPLFVWSVMITAVLLLLSLPVLAAGITMLLTDRNLNTTFFDPAGGGDPILYQHLFWFFGHPEVYILILPGFGMISHIVTYYSGKKEPFGYMGMVWAMMSIGFLGFIVWAHHMFTVGMDVDTRAYFTSATMIIAIPTGVKVFSWLATLHGGNIKWSPAMMWALGFIFLFTVGGLTGIVLANSSLDIVLHDTYYVVAHFHYVLSMGAVFAIMGGFVHWFPLFSGYTLNDTWAKIHFAIMFVGVNMTFFPQHFLGLSGMPRRYSDYPDAYTMWNTISSMGSFISLTAVMLMVFIIWEAFASKREVLTVDLTTTNLEWLNGCPPPYHTFEEPTYVNLK + + + + + tRNA + complement(7230..7300) + + + 7300 + 7230 + + HQ184032.1 + + + + + product + tRNA-Ser + + + note + codons recognized: UCN + + + + + tRNA + 7305..7373 + + + 7305 + 7373 + HQ184032.1 + + + + + product + tRNA-Asp + + + + + gene + 7375..8058 + + + 7375 + 8058 + HQ184032.1 + + + + + gene + COX2 + + + + + CDS + 7375..8058 + + + 7375 + 8058 + HQ184032.1 + + + + + gene + COX2 + + + codon_start + 1 + + + transl_table + 2 + + + product + cytochrome c oxidase subunit II + + + protein_id + ADN11804.1 + + + db_xref + GI:306977299 + + + translation + MAYPMQLGFQDATSPIMEELLHFHDHTLMIVFLISSLVLYIISLMLTTKLTHTSTMDAQEVETIWTILPAIILILIALPSLRILYMMDEINNPSLTVKTMGHQWYWSYEYTDYEDLSFDSYMIPTSELKPGELRLLEVDNRVVLPMEMTIRMLVSSEDVLHSWAVPSLGLKTDAIPGRLNQMTLMSSRPGLYYGQCSEICGSNHSFMPIVLELVPLKYFEKWSASML + + + + + tRNA + 8062..8128 + + + 8062 + 8128 + HQ184032.1 + + + + + product + tRNA-Lys + + + + + gene + 8130..8330 + + + 8130 + 8330 + HQ184032.1 + + + + + gene + ATP8 + + + + + CDS + 8130..8330 + + + 8130 + 8330 + HQ184032.1 + + + + + gene + ATP8 + + + codon_start + 1 + + + transl_table + 2 + + + product + ATP synthase F0 subunit 8 + + + protein_id + ADN11805.1 + + + db_xref + GI:306977300 + + + translation + MPQLDTSTWLTMILSMFLTLFIIFQLKVSKHNFYHNPELTPTKMLKQNTPWETKWTKIYLPLLLPL + + + + + gene + 8291..8971 + + + 8291 + 8971 + HQ184032.1 + + + + + gene + ATP6 + + + + + CDS + 8291..8971 + + + 8291 + 8971 + HQ184032.1 + + + + + gene + ATP6 + + + codon_start + 1 + + + transl_table + 2 + + + 
product + ATP synthase F0 subunit 6 + + + protein_id + ADN11806.1 + + + db_xref + GI:306977301 + + + translation + MNENLFTSFTTPVILGLPLVTLIVLFPSLLFPTSNRLVSNRFVTLQQWMLQLVSKQMMSIHNSKGQTWTLMLMSLILFIGSTNLLGLLPHSFTPTTQLSMNLGMAIPLWAGAVITGFRNKTKASLAHFLPQGTPTPLIPMLVIIETISLFIQPMALAVRLTANITAGHLLIHLIGGATLALMSISTTTALITFTILTLLTILEFAVAMIQAYVFTLLVSLYLHDNT + + + + + gene + 8971..9754 + + + 8971 + 9754 + HQ184032.1 + + + + + gene + COX3 + + + + + CDS + 8971..9754 + + + 8971 + 9754 + HQ184032.1 + + + + + gene + COX3 + + + note + TAA stop codon is completed by the addition of 3' A residues to the mRNA + + + codon_start + 1 + + + transl_except + (pos:9754,aa:TERM) + + + transl_table + 2 + + + product + cytochrome c oxidase subunit III + + + protein_id + ADN11807.1 + + + db_xref + GI:306977302 + + + translation + MTHQTHAYHMVNPSPWPLTGALSALLMTSGLTMWFHFNSMTLLMIGLTTNMLTMYQWWRDVIRESTFQGHHTPAVQKGLRYGMILFIISEVLFFTGFFWAFYHSSLAPTPELGGCWPPTGIHPLNPLEVPLLNTSVLLASGVSITWAHHSLMEGDRKHMLQALFITITLGVYFTLLQASEYYEAPFTISDGVYGSTFFVATGFHGLHVIIGSTFLIVCFFRQLKFHFTSNHHFGFEAAAWYWHFVDVVWLFLYVSIYWWGS + + + + + tRNA + 9755..9823 + + + 9755 + 9823 + HQ184032.1 + + + + + product + tRNA-Gly + + + + + gene + 9824..10170 + + + 9824 + 10170 + HQ184032.1 + + + + + gene + ND3 + + + + + CDS + 9824..10170 + + + 9824 + 10170 + HQ184032.1 + + + + + gene + ND3 + + + note + TAA stop codon is completed by the addition of 3' A residues to the mRNA + + + codon_start + 1 + + + transl_except + (pos:10169..10170,aa:TERM) + + + transl_table + 2 + + + product + NADH dehydrogenase subunit 3 + + + protein_id + ADN11808.1 + + + db_xref + GI:306977303 + + + translation + MNLMLALLTNFTLATLLVIIAFWLPQLNVYSEKTSPYECGFDPMGSARLPFSMKFFLVAITFLLFDLEIALLLPLPWASQTANLNTMLTMALFLIILLAVSLAYEWTQKGLEWTE + + + + + tRNA + 10171..10239 + + + 10171 + 10239 + HQ184032.1 + + + + + product + tRNA-Arg + + + + + gene + 10240..10536 + + + 10240 + 10536 + HQ184032.1 + + + + + gene + ND4L + + + + + CDS + 10240..10536 + + + 10240 + 10536 + HQ184032.1 + + + + + gene + ND4L + 
+ + codon_start + 1 + + + transl_table + 2 + + + product + NADH dehydrogenase subunit 4L + + + protein_id + ADN11809.1 + + + db_xref + GI:306977304 + + + translation + MSMVYMNIMMAFTVSLVGLLMYRSHLMSSLLCLEGMMLSLFVMAALTILNSHFTLASMMPIILLVFAACEAALGLSLLVMVSNTYGTDYVQNLNLLQC + + + + + gene + 10530..11907 + + + 10530 + 11907 + HQ184032.1 + + + + + gene + ND4 + + + + + CDS + 10530..11907 + + + 10530 + 11907 + HQ184032.1 + + + + + gene + ND4 + + + note + TAA stop codon is completed by the addition of 3' A residues to the mRNA + + + codon_start + 1 + + + transl_except + (pos:11907,aa:TERM) + + + transl_table + 2 + + + product + NADH dehydrogenase subunit 4 + + + protein_id + ADN11810.1 + + + db_xref + GI:306977305 + + + translation + MLKYIIPTIMLMPLTWLSKNNMIWVNSTAHSLLISFTSLLLMNQFGDNSLNFSLLFFSDSLSTPLLILTMWLLPLMLMASQHHLSKENLTRKKLFITMLISLQLFLIMTFTAMELILFYILFEATLVPTLIIITRWGNQTERLNAGLYFLFYTLAGSLPLLVALIYIQNTVGSLNFLMLQYWVQPVHNSWSNVFMWLACMMAFMVKMPLYGLHLWLPKAHVEAPIAGSMVLAAVLLKLGGYGMLRITLILNPMTDFMAYPFIMLSLWGMIMTSSICLRQTDLKSLIAYSSVSHMALVIVAILIQTPWSYMGATALMIAHGLTSSMLFCLANSNYERIHSRTMILARGLQTLLPLMATWWLLASLTNLALPPTINLIGELFVVMSTFSWSNITIILMGVNMVITALYSLYMLIMTQRGKYTYHINNISPSFTRENALMSLHILPLLLLTLNPKIILGPLY + + + + + tRNA + 11908..11977 + + + 11908 + 11977 + HQ184032.1 + + + + + product + tRNA-His + + + + + tRNA + 11978..12037 + + + 11978 + 12037 + HQ184032.1 + + + + + product + tRNA-Ser + + + note + codons recognized: AGY + + + + + tRNA + 12039..12109 + + + 12039 + 12109 + HQ184032.1 + + + + + product + tRNA-Leu + + + note + codons recognized: CUN + + + + + gene + 12110..13930 + + + 12110 + 13930 + HQ184032.1 + + + + + gene + ND5 + + + + + CDS + 12110..13930 + + + 12110 + 13930 + HQ184032.1 + + + + + gene + ND5 + + + codon_start + 1 + + + transl_table + 2 + + + product + NADH dehydrogenase subunit 5 + + + protein_id + ADN11811.1 + + + db_xref + GI:306977306 + + + translation + 
MNMFSSLSLVTLLLLTMPIMMMSFNTYKPSNYPLYVKTAISYAFITSMIPTMMFIHSGQELIISNWHWLTIQTLKLSLSFKMDYFSMMFIPVALFVTWSIMEFSMWYMHSDPNINKFFKYLLLFLITMLILVTANNLFQLFIGWEGVGIMSFLLIGWWYGRADANTAALQAILYNRIGDIGFILAMAWFLTNLNTWDLQQIFMLNPSDSNMPLIGLALAATGKSAQFGLHPWLPSAMEGPTPVSALLHSSTMVVAGIFLLIRFYPLTENNKYIQSITLCLGAITTLFTAMCALTQNDIKKIIAFSTSSQLGLMMVTIGINQPYLAFLHICTHAFFKAMLFMCSGSIIHSLNDEQDIRKMGGLFKAMPFTTTALIVGSLALTGMPFLTGFYSKDLIIEAANTSYTNAWALLMTLIATSFTAIYSTRIIFFALLGQPRFPTLVNINENNPLLINSIKRLLIGSLFAGYIISNNIPPTTIPQMTMPYYLKTTALIVTILGFILALEISNMTKNLKYHYPSNAFKFSTLLGYFPTIMHRLAPYMNLSMSQKSASSLLDLIWLEAILPKTISLAQMKASTLVTNQKGLIKLYFLSFLITILISMILFNFHE + + + + + gene + complement(13914..14441) + + + 14441 + 13914 + + HQ184032.1 + + + + + gene + ND6 + + + + + CDS + complement(13914..14441) + + + 14441 + 13914 + + HQ184032.1 + + + + + gene + ND6 + + + codon_start + 1 + + + transl_table + 2 + + + product + NADH dehydrogenase subunit 6 + + + protein_id + ADN11812.1 + + + db_xref + GI:306977307 + + + translation + MMLYIVFILSVIFVMGFVGFSSKPSPIYGGLGLIVSGGVGCGIVLNFGGSFLGLMVFLIYLGGMMVVFGYTTAMATEQYPEIWLSNKAVLGAFVTGLLMEFFMVYYVLKDKEVEVVFEFNGLGDWVIYDTGDSGFFSEEAMGIAALYSYGTWLVIVTGWSLLIGVVVIMEITRGN + + + + + tRNA + complement(14442..14510) + + + 14510 + 14442 + + HQ184032.1 + + + + + product + tRNA-Glu + + + + + gene + 14515..15654 + + + 14515 + 15654 + HQ184032.1 + + + + + gene + CYTB + + + + + CDS + 14515..15654 + + + 14515 + 15654 + HQ184032.1 + + + + + gene + CYTB + + + codon_start + 1 + + + transl_table + 2 + + + product + cytochrome b + + + protein_id + ADN11813.1 + + + db_xref + GI:306977308 + + + translation + MTNIRKSHPLMKIVNNAFIDLPAPSNISSWWNFGSLLGICLILQILTGLFLAMHYTSDTTTAFSSVTHICRDVNYGWIIRYMHANGASMFFICLYMHVGRGLYYGSYTFLETWNIGVILLLTVMATAFMGYVLPWGQMSFWGATVITNLLSAIPYIGTNLVEWIWGGFSVDKATLTRFFAFHFILPFIIMAIAMVHLLFLHETGSNNPTGISSDMDKIPFHPYYTIKDILGALLLILALMLLVLFAPDLLGDPDNYTPANPLNTPPHIKPEWYFLFAYAILRSIPNKLGGVLALAFSILILALIPLLHTSKQRSMMFRPLSQCLFWALVADLLTLTWIGGQPVEHPYITIGQLASVLYFLLILVLMPTAGTVENKLLKW + + + + + tRNA + 15659..15727 + + + 15659 + 
15727 + HQ184032.1 + + + + + product + tRNA-Thr + + + + + tRNA + complement(15727..15792) + + + 15792 + 15727 + + HQ184032.1 + + + + + product + tRNA-Pro + + + + + actaatggctaatcagcccatgctcacacataactgtgctgtcatacatttggtatttttttattttgggggatgcttggactcagctatggccgtcaaaggccctgacccggagcatctattgtagctggacttaactgcatcttgagcaccagcataatgataagcgtggacattacagtcaatggtcacaggacataaattatattatatatccccccttcataaaaatttcccccttaaatatctaccaccacttttaacagacttttccctagatacttatttaaatttttcacgctttcaatactcaatttagcactccaaacaaagtcaatatataaacgcaggccccccccccccgttgatgtagcttaacccaaagcaaggcactgaaaatgcctagatgagtctcccaactccataaacacataggtttggtcccagccttcctgttaactcttaataaacttacacatgcaagcatctacaccccagtgagaatgccctctaggttattaaaactaagaggagctggcatcaagcacacaccctgtagctcacgacgccttgcttaaccacacccccacgggaaacagcagtgacaaaaattaagccataaacgaaagtttgactaagttatattaattagggttggtaaatctcgtgccagccaccgcggtcatacgattaacccaagctaacaggagtacggcgtaaaacgtgttaaagcaccataccaaatagggttaaattctaactaagctgtaaaaagccatgattaaaataaaaataaatgacgaaagtgaccctacaatagccgacgcactatagctaagacccaaactgggattagataccccactatgcttagccctaaacacagataattacataaacaaaattattcgccagagtactactagcaacagcttaaaactcaaaggacttggcggtgctttatatccttctagaggagcctgttctataatcgataaaccccgataaacctcaccaattcttgctaatacagtctatataccgccatcttcagcaaaccctaaaaaggaaaaaaagtaagcgtaattatgatacataaaaacgttaggtcaaggtgtaacctatgaaatgggaagaaatgggctacattctctacaccaagagaatcaagcacgaaagttattatgaaaccaataaccaaaggaggatttagcagtaaactaagaatagagtgcttagttgaattaggccatgaagcacgcacacaccgcccgtcaccctcctcaaatagattcagtgcatctaaccctatttaaacgcactagctacatgagaggagacaagtcgtaacaaggtaagcatactggaaagtgtgcttggataaatcaagatatagcttaaacaaagcatccagtttacacctagaagacttcattcattatgaatatcttgaactagacctagcccaaagataccctctcgactaaacaaccaagatagaataaaacaaaacatttaatcccaatttaaagtataggagatagaaatctaagtacggcgctatagagaaagtaccgcaagggaacgatgaaagaaaaaaactaaaagtataaaaaagcaaagattaccccttgtaccttttgcataatgaattaactagtataagacttaacaaaatgaattttagctaagcagcccgaaaccagacgagctactcacaaacagtttaccaagaactaactcatctatgtggcaaaatagtgagaagatttgtaagtagaggtgacatgcctaacgagcctggtgatagctggttgtccagaaaatgaatctaag
ttcagctttaaagataccaaaaattcaaataaaccccactgtagctttaaaagttagtctaaaaaggtacagccttttagaaacggatacaaccttgactagagagtaaaatttaacactaccatagtaggcctaaaagcagccatcaattaagaaagcgttaaagctcaacaacaaaaattaaatagattccaacaacaaatgattaactcctagccccaatactggactaatctattatagaatagaagcaataatgttaatatgagtaacaagaaaaattttctccttgcataagtctaagtcagtgcctgataatactctgaccactaacagtcaataaaaataatccaacaataaacaatttattgattatactgttaacccaacacaggagtgcatctaaggaaagattaaaagaagtaaaaggaactcggcaaacacaaaccccgcctgtttaccaaaaacatcacctccagcattcccagtattggaggcattgcctgcccagtgacaactgtttaacggccgcggtatcctgaccgtgcaaaggtagcataatcatttgttctctaaataaggacttgtatgaatggccgcacgagggttttactgtctcttacttccaatcagtgaaattgaccttcccgtgaagaggcgggaatgcacaaataagacgagaagaccctatggagctttaactaaccaacccaaagagaataaatttaaccattaaggaataacaacaatctccatgagttggtagtttcggttggggtgacctcggagaataaaaaatcctccgagcgattttaaagactagacccacaagtcaaatcactctatcgctcattgatccaaaaacttgatcaacggaacaagttaccctagggataacagcgcaatcctattcaagagtccatatcgacaatagggtttacgacctcgatgttggatcaggacatcctgatggtgcaaccgctatcaaaggttcgtttgttcaacgattaaagtcctacgtgatctgagttcagaccggagtaatccaggtcggtttctatctattacgtatttctcccagtacgaaaggacaagagaaataaggccaactttaaatcaagcgccttaagacaaccaatgataacatctcaactgacaacacaaaaccctgccctagaacagggcttagttaaggtggcagagcccggtaattgcataaaacttaaacttttatatccagagattcaaatcctctccttaacaaaatgttcataattaacatcttaatactaattattcccatcctattggccgtagcattccttacgttagtggaacgaaaagttctaggctatatacaactccgaaaaggtccaaatgtcgtaggtccatatggcctactccaacccatcgccgatgcaatcaaacttttcattaaagaaccactacgacccgctacatcttcagcctcaatatttatcctagcacctatcatagctttaggcctagccttaaccatgtgaattcccctaccaataccctatcctcttatcaacataaacctaggagtcctatttattctagccatatcaagcctagccgtatactccattctctgatcaggctgagcttccaactcaaaatacgcactaatcggagccctacgagcagtagcacaaacaatctcatacgaagtaacgctagcaattatcctgttatcagtactcctaataagtgggtcctttaccctctccacattaattactacacaagaacaaatatggttaatcctcccagcatggcctctagcaataatatgatttatctcaacactagcagaaacaaaccgagctccatttgatttaactgaaggagaatcagagctagtctcgggcttcaacgtagaatatgcagcaggaccatttgccctcttcttcatagcagagtacgcaaatattatcataataaatatctttacagcaattttattcctaggaaca
tcccacaatccacacataccagaactctacacaatcaattttaccattaaatccctactgctcacaatatccttcctatgaatccgagcatcctaccctcgatttcgctatgaccaactaatacacttactatgaaaaaattttctacctctgacactagccctgtgcatgtgacacgtatccctacccatccttacatcaggcatcccaccacaaacataagaaatatgtctgacaaaagagttactttgatagagtaaataatagaggttcaaaccctcttatttctagaactataggaatcgaacctactcctaagaatccaaaactcttcgtgctcccaattacaccaaattctattagtaaggtcagctaattaagctatcgggcccataccccgaaaatgttggtttatatccttcccgtactaataaacccaattatctttattattattctactaaccattatactaggaactattattgtcataatcagttctcactgactacttgtctgaatcgggtttgaaataaatatactcgccatcatccccatcataataaaaaatcacaacccacgagctacagaagcatcaactaaatattttttgactcaatcaacagcctcaatactactaataatagccgtcatcattaacctaatattctcaggccaatgaaccgtaataaaactatttaacccaatagcctcaatacttataacgatagccctagctataaaactaggaatagccccatttcacttctgagtcccagaagtaacacagggcatccccctatcctcaggccttatcctactgacatgacaaaaactagcacctatatctgtactttaccaaatcttcccatcaattaacctaaacttaattctaaccctatcagttttatcaatcctaattggaggctgagggggactaaaccaaacacaactccgaaaaatcatagcctactcatcaatcgctcatataggctgaataacagcagtactaccatataaccccaccataacattgctaaacttaattatctatatcattataacttccaccatatttaccatatttatagccaattccaccaccactaccctgtcattatcacacacatgaaataaaacacccattataaccgtcctaattcttgccactctcctatccataggaggactccctcccctatctgggtttataccaaaatgaataatcatccaagagataacaaaaaataacagcatcattctacccactttcatagcaatcacagctctactaaacttatatttttatatacgactcacgtattctaccacactaacaatatttccctccacaaacaacataaaaataaaatgacaatttccccttatgaaaaaaataacttttctaccaacaatagtcgtattatctaccataatactaccactcacgccaatactatcagtgttagaataggaatttaggttaaacagaccaagagccttcaaagccctaagcaagtacaatttacttaattcctgataaggattgcaagactacaccttacatcaattgaatgcaaatcaaccactttaattaagctaaatcctcactagactggtgggctccacccccacgaaactttagttaacagctaaacaccctagttaactggcttcaatctacttctcccgccgcaagaaaaaaaaggcgggagaagccccggcagaattgaagctgcttctctgaatttgcaattcaacgtgtaaattcaccacagggcttggtaaaaagaggagtcaaacctctatctttagatttacagtctaatgctttgctcagccattttacccatgttcattaaccgctgactattctcaaccaaccataaagatattggtaccctttatctactatttggtgcttgggccggtatagtaggaacagctctaagccttctaattcgcgctgaattaggccaacccggaactctgctcggaga
cgaccaaatctacaacgtagttgtaaccgcacacgcatttgtaataatcttcttcatagtaataccaatcataattggaggattcggtaactgacttgttcccctaataattggtgctcccgatatagcatttccccgaataaataatataagcttctgactcctccctccctcattcctactactcctcgcatcctctatagttgaagctggggcaggaacaggctgaaccgtgtaccctcccttagcaggcaacctagcccatgcaggagcttcagtagatctaaccattttctctttacacttagcaggagtttcctcaattttaggagccatcaacttcattacaacaattatcaacataaagccccccgcaatgtcacaataccaaacccctctgttcgtatgatccgtaataattaccgccgtactactactactctcgctccctgtattagcagccggcatcacaatgctattaacagaccggaacctaaatacaaccttcttcgacccggcaggaggaggagaccctattctatatcaacacttattctgattctttggacaccccgaagtctatattttaatcttacctggatttggaataatctctcatatcgtgacctactactcaggaaaaaaagaaccattcggatatatgggaatagtttgggctataatgtcaatcggatttctaggtttcatcgtatgagcccaccatatattcactgtcggaatagacgtcgacacacgagcctacttcacatcagccactataattattgctattccaaccggggtaaaagtcttcagctgattggcaacacttcatggaggtaatatcaaatggtctcctgctataatgtgagccctaggctttattttcttatttacagtagggggtttaactggaattgtcttagccaactcttccctcgatattgttcttcacgacacatactacgttgtcgcacatttccactatgttttatcaataggagctgtatttgctattatagggggatttgttcattgattcccactattctcaggttatactctcaacgatacatgagccaaaatccacttcgcaattatatttgtaggcgtcaatataaccttcttcccacaacactttctaggactatctggcatgcctcgacgatactccgactacccagatgcatacacaatatgaaatactatctcatcaataggctcattcatttccctaacagcagttatactaatagttttcatcatctgagaagcatttgcatctaaacgagaagtcttgactgtagacttaaccacgacaaatctagaatgattaaacggatgccctccaccatatcacacatttgaagaacccacctatgttaacctaaaataagaaaggaaggaatcgaaccccctactattggtttcaagccaacatcataacctctatgtctctctcaataaacgaggtgttagtaaaacattatataattttgtcaaagttaagttacaagtgaaagtcctgtacacctcatatggcatatcccatacaactaggattccaagatgcaacatcaccaatcatagaagaactacttcactttcatgaccacacgctaataattgtcttcttaattagctcattagtactttacattatttcactaatactaacgacaaagctgacccatacaagcacgatagatgcacaagaagtagagacaatctgaaccattctgcccgccatcatcttaattctaattgctcttccttctttacgaattctatacataatagatgaaatcaataacccatctcttacagtaaaaaccataggacatcagtgatactgaagctatgagtatacagattatgaggacttaagcttcgactcctacataattccaacatcagaattaaagccaggggagctacgactattagaagtcgataatcgagttgtactaccaatagaaataacaatccgaatgttagtc
tcctctgaagacgtattacactcatgagctgtgccctctctaggactaaaaacagacgcaatcccaggccgtctaaaccaaataacccttatatcgtcccgtccaggcttatattacggtcaatgctcagaaatttgcgggtcaaaccacagtttcatacccattgtccttgagttagtcccactaaagtactttgaaaaatgatctgcgtcaatattataaaatcactaagaagctatatagcactaaccttttaagttagagattgagagccatatactctccttggtgacatgccgcaactagacacgtcaacatgactgacaatgatcttatcaatattcttgaccctttttatcatctttcaactaaaagtttcaaaacacaacttttatcacaatccagaactgacaccaacaaaaatattaaaacaaaacaccccttgagaaacaaaatgaacgaaaatttatttacctcttttactacccctgtaattttaggtctccctctcgtaacccttatcgtactattccccagcctactattcccaacatcaaaccgactagtaagcaatcgctttgtaaccctccaacaatgaatacttcaacttgtatcaaaacaaataatgagtatccacaattctaaaggacaaacatgaacattaatattaatatctctgatcctatttattggatcaacaaacctactaggcctattaccccattcattcacaccaacaacacaactatcaataaacctaggcatagccatccccctgtgagcaggagccgtaattacaggattccgcaataaaactaaagcatcacttgcccatttcttaccacaaggaacacccactccactaatcccaatactagtaattattgaaactatcagcctttttattcaacctatagccctcgccgtgcggttaacagctaacatcactgcaggacacctattaattcacctaatcggaggagctacacttgcactaataagcattagcactacaacagctctaattacattcaccattctaaccctactaacaattctagagtttgcagtagctataatccaagcctatgtattcactctcctagtcagcctatatctgcatgacaacacataatgacacaccaaactcatgcttatcatatagtaaacccaagcccttgacctcttacaggagctttgtctgccctcttaataacatccggcctaaccatgtgatttcactttaactcaatgaccctgctaataattggcctaacaacaaatatactaacaatataccaatgatgacgagatgttatccgagaaagcaccttccaagggcaccataccccagctgtccaaaaaggcctccgttatggaataattctttttattatctccgaagtactattctttaccggatttttctgagctttctaccactcaagcctcgcccccacccctgaactaggcggctgctgacccccaacaggcattcacccactaaaccccctagaagtcccactgctcaacacctctgtcctattggcttccggagtttctattacctgagcccatcatagtttaatagaaggggaccgaaagcatatattacaagccctatttatcaccatcacattaggagtctacttcacactactacaagcctcagaatactatgaagcaccttttactatctccgacggagtttacggctcaactttttttgtagccacaggcttccacggcctccacgtcatcattgggtccaccttcttaattgtctgcttcttccgccaattaaaatttcattttacttctaaccaccacttcggctttgaagccgctgcctgatactgacatttcgtagacgtagtctgacttttcctctatgtttctatctattgatgaggctcctattcttttagtattaactagtacagctgacttccaatcagctagtttcggtctagtccgaaaaagaataataaatttaatac
tagccctcctgaccaattttacactagccaccctactcgtcatcatcgcattctgacttccccaactaaatgtatactctgagaaaacaagcccatacgaatgtggatttgaccccataggatcagcccgccttcccttctctataaaattctttctggtagccatcacattcctcttatttgacctagaaattgcactcctcctaccactgccatgagcctcacaaacagcaaatctaaacacaatgcttaccatagccctcttcctaattatcctcctagctgtaagcctagcctatgagtgaactcaaaaaggactagaatgaaccgaatatggtacttagtttaaaataaaataaatgatttcgactcattagattatgatttaattcataattaccaaatgtctatagtatacataaacattataatagcattcacagtatctcttgtaggactactaatataccgatcccacctaatatcctcccttctatgcttagaaggaataatgctatccctattcgttatagcagccctaacaatcctcaactcacattttacattagctagcataatacctattatcctactagtcttcgcagcctgtgaagcagccctaggtctatctctactagtaatagtatcaaatacatatggtactgattatgtacaaaacctcaacttactccaatgctaaaatacattattccaacaattatacttatacccctaacctggttatcaaaaaataatataatttgggttaactccacagcacacagccttctaattagctttacaagcctcctcctcataaaccagtttggcgacaacagccttaatttttcactactatttttctccgactccctatccacaccactactaattttaaccatatggctcctccctctaatactaatagctagccaacatcatctatcaaaagaaaacctaacccgaaaaaaactatttattactatgctgatctcactacaactattcctaattataacctttaccgccatggaactaatcttattttatattctatttgaagcaacactagtcccaacactcattattatcacccgatgaggaaaccaaacagaacgcctaaacgccggactctatttcctattctatacactagctggctccttacccctattagtcgcactaatttatatccaaaacacagtaggatccctaaatttcctaatattacagtactgagtacaacctgttcacaactcttgatctaatgtcttcatatgactagcatgtataatagctttcatagtaaaaataccactatatggcctccacctttgactacctaaagctcacgtagaagcccccatcgcaggctccatagtccttgcagcagttctactaaaactaggggggtacggtatgctacgaatcacactaattctaaaccctatgaccgactttatagcatacccattcattatactctccctatgaggcataattataaccagctcaatctgcctccgtcaaacggacctaaaatcactcatcgcatactcctctgtaagccacatagcactcgttatcgtagccatccttatccagacaccttgaagctacataggagcaaccgcccttatgattgcccacggcctcacatcctccatacttttctgtctagcaaactcaaactacgaacgaatccacagccgaaccataattctagctcgaggcctacaaacgctccttccactaatagccacctgatgactactagcaagtctaaccaacttagctctacccccaacaatcaacttaattggagaactatttgtagtaatgtcaaccttttcatgatctaacattacaattattctaataggagtaaatatagtaatcaccgccctatattctctatacatgctaattataacccaacgaggaaaatatacctaccacattaataatatctcgccttcctttacacgggaaaatgcact
catatcattacacatcctacccctactactcctaaccctaaacccaaaaattattctaggacctctatactgtaaatatagtttaacaaaaacattagattgtgaatctaacaatagaaactcattaccttcttatttaccgaaaaagtatgcaagaactgctaattctatgctcccatatctaatagtatggctttttcgaacttttaaaggatagtagtttatccgttggtcttaggaaccaaaaaattggtgcaactccaaataaaagtaataaacatattctcctcactctcactagttactttactcttactaactatacccattataataataagctttaacacctacaaaccttccaactacccactctacgtaaaaacagctatctcatacgccttcattaccagcataattcccacaataatatttatccactcaggccaagaactaattatttcaaactgacactgactaaccatccaaactcttaaattatccctcagctttaaaatagactatttctcaataatatttatcccagtagcactattcgtcacatgatctattatagaattctcaatatgatatatacactcagaccccaatattaacaaattcttcaaatacctactcctattcctcattactatgctcatccttgtaaccgcaaacaacctcttccagctattcattggctgagaaggcgtcggaatcatatcatttctactcatcggatgatgatacggacgagcagatgcaaacacagcagccctacaagcaatcttatataaccgcatcggcgacattggtttcattttagcaatagcatggtttctaacaaatctcaatacctgagacctccaacagatcttcatactaaacccaagcgactcaaacatacccttgatcggactagcattagctgcaaccggaaaatccgcccaatttggcctccacccgtgacttccctctgcaatagaaggcccaactcccgtctcagcactactccattcaagcacaatagtggtagcaggtatcttcctactaatccgtttctatcccctcacagaaaacaataaatacatccaatctattacattatgcttaggagccattaccacactatttacagcaatatgcgccctcacccaaaatgacattaagaaaatcatcgccttctccacatccagtcaactgggccttataatagtaactattggcattaaccaaccttacctagctttcctccacatctgtacccacgcctttttcaaagctatactattcatatgctccggttccattattcacagcctaaacgacgaacaagatattcgaaaaataggaggcctatttaaagccatgccattcaccacaacagccctcattgttggcagtctcgcactaacaggaatacccttcctcacaggattctactccaaagacctaatcatcgaagccgccaacacgtcttataccaacgcctgagcccttctaataacattaattgccacctctttcacagctatttacagcacccgtattatttttttcgcacttctaggacaaccccgattccctaccctagttaatattaacgaaaacaacccccttctgatcaactctatcaaacgcttactaattggaagcctcttcgcaggatacatcatttccaacaatattcctccaacaacaattccccaaataactatgccctactacctaaaaacaacagccctaattgttacaatcctaggcttcatcttagccctagaaatcagtaatataactaaaaatctaaaatatcactacccctcaaacgccttcaagttctcaaccttgctagggtatttccccacaattatacatcgcctagctccatacataaatttatcaataagccaaaaatcagcatcctcccttctagacctaatctgactagaagccatcctaccaaaaaccatctcactcgcccaaataaaagcatctaccct
ggtcacaaaccaaaaaggcctgatcaaactatatttcctctccttcttaatcacaatccttatcagcataatcttatttaatttccacgagtaatttctataataaccacaacaccaattaataaagaccacccagttacaataactaatcaggtaccataactgtataaagccgcaatccctatggcctcttcactaaaaaacccagaatcccctgtatcataaatcacccaatcccctaaaccattaaactcaaacacaacctcaacttccttatcctttaatacataatagaccataaagaactccatcaacaagccagtaacaaatgcccctaaaacagccttattagaaagccaaatttcaggatactgttctgtagccatagccgttgtataaccaaaaactaccatcatacctcccaaataaattaaaaagaccatcaaccccaaaaaggatccaccaaaattcaatacaattccacagccaaccccaccactcacaattaaccctaaccccccataaataggtgaaggtttcgaagaaaaccccacaaaacctatcacgaaaataacgcttagaataaatacaatgtatagtatcattattcttacatggaatctaaccatgactaatgatatgaaaaaccatcgttgtcattcaactacaagaacactaatgactaacattcgaaagtcccacccactaataaaaattgtaaacaatgcattcatcgaccttccagccccatcaaacatttcatcatgatgaaatttcggttccctcctgggaatctgcctaatcctacaaatcctcacaggcctattcctagcaatacactacacatccgacacaacaacagcattctcctctgttacccatatctgccgagacgtgaactacggctgaatcatccgatacatacacgcaaacggagcttcaatgttttttatctgcttatatatgcacgtaggacgaggcttatattacgggtcttacacttttctagaaacatgaaatattggagtaatccttctgctcacagtaatagccacagcatttataggatacgtcctaccatgaggacaaatatcattctgaggagcaacagtcatcaccaacctcttatcagcaatcccatacatcggcacaaatttagtcgaatgaatctgaggcggattctcagtagacaaagcaacccttacccgattcttcgctttccattttatccttccatttatcatcatagcaattgccatagtccacctactattcctccacgaaacaggctccaacaatccaacaggaatttcctcagacatagacaaaatcccattccacccctactataccattaaggacatcttaggggccctcttactaattctagctctaatactactagtactattcgcacccgacctcctcggagacccagataactacaccccagccaatccactcaacacaccccctcacatcaaacccgagtgatacttcttatttgcatacgcaatcttacgatcaatccccaacaaactaggaggagtactagccctagccttctctatcctaattcttgctctaatccccctactacacacctccaaacaacgaagcataatattccgaccactcagccaatgcctattctgagccctagtagcagacctactgacactcacatgaattggaggacaaccagtcgaacacccatatatcaccatcggacaactagcatctgtcctatactttctcctcatcctagtgctaataccaacggccggcacagtcgaaaacaaattactaaaatgaagacaggtctttgtagtacatctaatatactggtcttgtaaaccagagaaggagaacaactaacctccctaagactcaaggaagaaactgcagtctcaccatcaacccccaaagctgaagttctatttaaactattccctgaacactattaatatagttccataaatacaaagagccttatcagt
attaaatttatcaaaaatcccaataactcaacacagaatttgcaccctaaccaaatattacaaacaccactagctaacataacacacccatacacagaccacagaatgaattacctaggcaaggggtaatgtacataacattaatgtaataaagacataatatgtatatagtacattaaattatatgccccatgcatataagcaagtacatgacctctatagcagtacataatacatataattattgactgtacatagtacattatgtcaaattcattcttgatagtatatctattatatattccttaccattagatcacgagcttaattaccatgccgcgtgaaaccagcaacccgctaggcagggatccctcttctcgctccgggcccataaaccgtgggggtcgctatccaatgaactttaccaggcatctggttctttcttcagggccatctcatctaaaacggtccattctttcctcttaaataagacatctcgatgg + +' +] + +{ #category : 'accessing' } +BioEntrezXMLGenBankAccessionParserTest >> parserClass [ + + ^ BioEntrezXMLGBBasicParser +] + +{ #category : 'accessing' } +BioEntrezXMLGenBankAccessionParserTest >> setUp [ + + super setUp. + parseResult := BioParser parseNcbiXmlGBSeq: self gbSet01. +] + +{ #category : 'accessing' } +BioEntrezXMLGenBankAccessionParserTest >> testParseAccession [ + + | record | + record := parseResult first. + self + assert: (record at: BioGBSeqCollection qualifierForAccessionWithVersion) + equals: 'HQ184032.1'. + +] + +{ #category : 'accessing' } +BioEntrezXMLGenBankAccessionParserTest >> testParseDefinition [ + + | record | + record := parseResult first. + self + assert: (record at: BioGBSeqCollection qualifierForDefinition) + equals: 'Bos taurus isolate Chi597 mitochondrion, complete genome'. + +] + +{ #category : 'accessing' } +BioEntrezXMLGenBankAccessionParserTest >> testParseOnlyOneRecord [ + + self assert: parseResult size equals: 1. + +] + +{ #category : 'accessing' } +BioEntrezXMLGenBankAccessionParserTest >> testParseReferenceJournal [ + + | record | + record := parseResult first. 
+ self + assert: (record at: BioGBSeqCollection qualifierForReferenceJournal) + equals: 'Submitted (25-AUG-2010) Dipartimento di Genetica e Microbiologia, University of Pavia, Via Ferrata, 1, Pavia 27100, Italy' + + + +] diff --git a/repository/BioParsers-Tests/BioFASTAParserTest.class.st b/repository/BioParsers-Tests/BioFASTAParserTest.class.st new file mode 100644 index 00000000..d0bc9871 --- /dev/null +++ b/repository/BioParsers-Tests/BioFASTAParserTest.class.st @@ -0,0 +1,515 @@ +Class { + #name : 'BioFASTAParserTest', + #superclass : 'BioAbstractFASTAParserTest', + #instVars : [ + 'fastaRecord' + ], + #category : 'BioParsers-Tests', + #package : 'BioParsers-Tests' +} + +{ #category : 'accessing' } +BioFASTAParserTest >> multiFastaSeq01PlainText [ + + ^ '>YAL069W-1.334 Putative promoter sequence +CCACACCACACCCACACACCCACACACCACACCACACACCACACCACACCCACACACACA +CATCCTAACACTACCCTAACACAGCCCTAATCTAACCCTGGCCAACCTGTCTCTCAACTT +ACCCTCCATTACCCTGCCTCCACTCGTTACCCTGTCCCATTCAACCATACCACTCCGAAC +CACCATCCATCCCTCTACTTACTACCACTCACCCACCGTTACCCTCCAATTACCCATATC +CAACCCACTGCCACTTACCCTACCATTACCCTACCATCCACCATGACCTACTCACCATAC +TGTTCTTCTACCCACCATATTGAAACGCTAACAA +>YAL068C-7235.2170 Putative promoter sequence +TACGAGAATAATTTCTCATCATCCAGCTTTAACACAAAATTCGCACAGTTTTCGTTAAGA +GAACTTAACATTTTCTTATGACGTAAATGAAGTTTATATATAAATTTCCTTTTTATTGGA +TAATATGCCTATGCCGCATAATTTTTATATCTTTCTCCTAACAAAACATTCGCTTGTAAA +GTATTATATTTAGGAAAAAAATAATCGATAAAGGCTCATCCGAAGATCAGTTAGATTCTT +TTTGCAAGTCCTGAAGAAATTTTCACACTACTACTATAAAAAAAAAATATCATAAAAAGG +TACATTACGTGCAACCAAAAGTGTAAAATGATTGGTTGCAATGTTTCACCTAAATTACTT' +] + +{ #category : 'testing-tokenizer' } +BioFASTAParserTest >> multiFastaSeq06PlainText [ + + ^ '>YAL069W-1.334 Putative promoter sequence +CCACACCACACCCACACACCCACACACCACACCACACACCACACCACACCCACACACACA +CATCCTAACACTACCCTAACACAGCCCTAATCTAACCCTGGCCAACCTGTCTCTCAACTT +ACCCTCCATTACCCTGCCTCCACTCGTTACCCTGTCCCATTCAACCATACCACTCCGAAC +CACCATCCATCCCTCTACTTACTACCACTCACCCACCGTTACCCTCCAATTACCCATATC 
+CAACCCACTGCCACTTACCCTACCATTACCCTACCATCCACCATGACCTACTCACCATAC +TGTTCTTCTACCCACCATATTGAAACGCTAACAA + +>YAL068C-7235.2170 Putative promoter sequence +TACGAGAATAATTTCTCATCATCCAGCTTTAACACAAAATTCGCACAGTTTTCGTTAAGA +GAACTTAACATTTTCTTATGACGTAAATGAAGTTTATATATAAATTTCCTTTTTATTGGA +TAATATGCCTATGCCGCATAATTTTTATATCTTTCTCCTAACAAAACATTCGCTTGTAAA +GTATTATATTTAGGAAAAAAATAATCGATAAAGGCTCATCCGAAGATCAGTTAGATTCTT +TTTGCAAGTCCTGAAGAAATTTTCACACTACTACTATAAAAAAAAAATATCATAAAAAGG +TACATTACGTGCAACCAAAAGTGTAAAATGATTGGTTGCAATGTTTCACCTAAATTACTT' +] + +{ #category : 'accessing' } +BioFASTAParserTest >> setUp [ + + super setUp. + parser := self parserClass new +] + +{ #category : 'testing-builder' } +BioFASTAParserTest >> testParseMultiFasta03 [ + + fastaRecord := BioFastaMultiRecord new. + fastaRecord + addFastaRecord: (BioFastaRecord named: self multiFastaSeq03Header01 sequence: self multiFastaSeq03Body01); + addFastaRecord: (BioFastaRecord named: self multiFastaSeq03Header02 sequence: self multiFastaSeq03Body02). + + parseResult := self parserClass parseMultiFasta: self multiFastaSeq03. + self assert: (parseResult isKindOf: BioFastaMultiRecord). + self deny: parseResult records isEmpty. + self assert: parseResult records size equals: 2. + self assert: parseResult sequenceNames asArray equals: + (Array + with: self multiFastaSeq03Header01 allButFirst + with: self multiFastaSeq03Header02 allButFirst). + self assert: parseResult sequenceStrings asArray equals: + (Array + with: self multiFastaSeq03Body01 asCondensedString + with: self multiFastaSeq03Body02 asCondensedString). + +] + +{ #category : 'testing-builder' } +BioFASTAParserTest >> testParseMultiFasta07 [ + fastaRecord := BioFastaMultiRecord new + addFastaRecord: (BioFastaRecord named: 'Sample sequence 1' sequence: 'garkbdctymvhu'); + addFastaRecord: (BioFastaRecord named: 'Sample sequence 2' sequence: 'ctymvhgarkbda'); + addFastaRecord: (BioFastaRecord named: 'Sample sequence 3' sequence: 'ccccccccccga'); + yourself. 
+ parseResult := self parserClass parseMultiFasta: self multiFastaSeq07. + self assert: (parseResult isKindOf: BioFastaMultiRecord). + self deny: parseResult sequences isEmpty. + self assert: parseResult sequences size equals: 3. + self assert: parseResult equals: fastaRecord +] + +{ #category : 'testing-builder' } +BioFASTAParserTest >> testParseMultiFastaThreeSequences [ + + | seqString | + + seqString := self multiFastaSeq02. + + fastaRecord := BioFastaMultiRecord new. + fastaRecord + addFastaRecord: (BioFastaRecord named: self multiFastaSeq02Header01 sequence: self multiFastaSeq02Body01); + addFastaRecord: (BioFastaRecord named: self multiFastaSeq02Header02 sequence: self multiFastaSeq02Body02); + addFastaRecord: (BioFastaRecord named: self multiFastaSeq02Header03 sequence: self multiFastaSeq02Body03). + + parseResult := self parserClass parseMultiFasta: self multiFastaSeq02PlainText. + + self assert: parseResult records size equals: 3. + +] + +{ #category : 'testing-builder' } +BioFASTAParserTest >> testParseMultiFastaTwoSequences [ + + fastaRecord := BioFastaMultiRecord new. + fastaRecord + addFastaRecord: (BioFastaRecord named: self multiFastaSeq01Header01 sequence: self multiFastaSeq01Body01); + addFastaRecord: (BioFastaRecord named: self multiFastaSeq01Header02 sequence: self multiFastaSeq01Body02). 
+ + parseResult := self parserClass parseMultiFasta: '>YAL069W-1.334 Putative promoter sequence +CCACACCACACCCACACACCCACACACCACACCACACACCACACCACACCCACACACACA +CATCCTAACACTACCCTAACACAGCCCTAATCTAACCCTGGCCAACCTGTCTCTCAACTT +ACCCTCCATTACCCTGCCTCCACTCGTTACCCTGTCCCATTCAACCATACCACTCCGAAC +CACCATCCATCCCTCTACTTACTACCACTCACCCACCGTTACCCTCCAATTACCCATATC +CAACCCACTGCCACTTACCCTACCATTACCCTACCATCCACCATGACCTACTCACCATAC +TGTTCTTCTACCCACCATATTGAAACGCTAACAA +>YAL068C-7235.2170 Putative promoter sequence +TACGAGAATAATTTCTCATCATCCAGCTTTAACACAAAATTCGCACAGTTTTCGTTAAGA +GAACTTAACATTTTCTTATGACGTAAATGAAGTTTATATATAAATTTCCTTTTTATTGGA +TAATATGCCTATGCCGCATAATTTTTATATCTTTCTCCTAACAAAACATTCGCTTGTAAA +GTATTATATTTAGGAAAAAAATAATCGATAAAGGCTCATCCGAAGATCAGTTAGATTCTT +TTTGCAAGTCCTGAAGAAATTTTCACACTACTACTATAAAAAAAAAATATCATAAAAAGG +TACATTACGTGCAACCAAAAGTGTAAAATGATTGGTTGCAATGTTTCACCTAAATTACTT'. + self assert: (parseResult isKindOf: BioFastaMultiRecord). + + self deny: parseResult sequences isEmpty. + self assert: parseResult sequences size equals: 2. + +] + +{ #category : 'testing-builder' } +BioFASTAParserTest >> testParseSingleFasta01 [ + + | seqSelectors parseResults | + seqSelectors := self class superclass selectorsInProtocol: 'samples-single'. + parseResults := seqSelectors collect: [ :sel | self parserClass parseFasta: (self perform: sel) ]. + + parseResults do: [ :pResult | self assert: pResult isFastaRecord ] +] + +{ #category : 'testing-builder' } +BioFASTAParserTest >> testParseSingleFasta02 [ + fastaRecord := BioFastaRecord + named: self multiFastaSeq01Header01 + sequence: self multiFastaSeq01Body01. + parseResult := self parserClass parseFasta: self fastaSeq01. + self assert: (parseResult isKindOf: BioFastaRecord). + + self assert: parseResult name equals: self multiFastaSeq01Header01. 
+ self + assert: parseResult sequence asString + equals: self multiFastaSeq01Body01 asCondensedString +] + +{ #category : 'testing-builder' } +BioFASTAParserTest >> testParseSingleFasta04 [ + | seqName seqString | + seqName := 'YAL068C-7235.2170 Putative promoter sequence'. + seqString := 'TACGAGAATAATTTCTCATCATCCAGCTTTAACACAAAATTCGCACAGTTTTCGTTAAGAGAACTTAACATTTTCTTATGACGTAAATGAAGTTTATATATAAATTTCCTTTTTATTGGA'. + fastaRecord := BioFastaRecord named: seqName sequence: seqString. + parseResult := self parserClass parseFasta: self fastaSeq04. + self assert: (parseResult isKindOf: BioFastaRecord). + self assert: parseResult equals: fastaRecord. + self assert: parseResult name equals: seqName. + self + assert: parseResult sequence asString + equals: seqString asCondensedString +] + +{ #category : 'testing-builder' } +BioFASTAParserTest >> testParseSingleFasta07 [ + + self + should: [ self parserClass parseFasta: '>gi|6273291|emb|AF191665.1|AF191665 +' ] + raise: Error. + +] + +{ #category : 'testing-builder' } +BioFASTAParserTest >> testParseSingleFastaProteinSequenceIntoRecord [ + + | seqName seqString | + seqName := '>sp_ac|P02769_WOSIG0 \ID=ALBU_BOVIN \DE="Serum albumin precursor (Allergen Bos d 6) (BSA)" \NCBITAXID=9913 \MODRES=(1|Acetyl) \VARIANT=(196|A|T) \LENGTH=589'. + seqString := 'RGVFRRDTHKSEIAHRFKDLGEEHFKGLVLIAFSQYLQQCPFDEHVKLVNELTEFAKTCVADESHAGCEKSLHTLFGDELCKVASLRETYGDMADCCEKQEPERNECFLSHKDDSPDLPKLKPDPNTLCDEFKADEKKFWGKYLYEIARRHPYFYAPELLYYANKYNGVFQECCQAEDKGACLLPKIETMREKVLASSARQRLRCASIQKFGERALKAWSVARLSQKFPKAEFVEVTKLVTDLTKVHKECCHGDLLECADDRADLAKYICDNQDTISSKLKECCDKPLLEKSHCIAEVEKDAIPENLPPLTADFAEDKDVCKNYQEAKDAFLGSFLYEYSRRHPEYAVSVLLRLAKEYEATLEECCAKDDPHACYSTVFDKLKHLVDEPQNLIKQNCDQFEKLGEYGFQNALIVRYTRKVPQVSTPTLVEVSRSLGKVGTRCCTKPESERMPCTEDYLSLILNRLCVLHEKTPVSEKVTKCCTESLVNRRPCFSALTPDETYVPKAFDEKLFTFHADICTLPDTEKQIKKQTALVELLKHKPKATEEQLKTVMENFVAFVDKCCAADDKEACFAVEGPKLVVSTQTALA'. + + fastaRecord := BioFastaRecord named: seqName sequence: seqString. 
+ + parseResult := self parserClass parseFasta: self fastaSeq08. + self assert: (parseResult isKindOf: BioFastaRecord). + +] + +{ #category : 'testing-builder' } +BioFASTAParserTest >> testParseSingleFastaProteinSequenceName [ + + | seqName seqString | + seqName := 'sp_ac|P02769_WOSIG0 \ID=ALBU_BOVIN \DE="Serum albumin precursor (Allergen Bos d 6) (BSA)" \NCBITAXID=9913 \MODRES=(1|Acetyl) \VARIANT=(196|A|T) \LENGTH=589'. + seqString := 'RGVFRRDTHKSEIAHRFKDLGEEHFKGLVLIAFSQYLQQCPFDEHVKLVNELTEFAKTCVADESHAGCEKSLHTLFGDELCKVASLRETYGDMADCCEKQEPERNECFLSHKDDSPDLPKLKPDPNTLCDEFKADEKKFWGKYLYEIARRHPYFYAPELLYYANKYNGVFQECCQAEDKGACLLPKIETMREKVLASSARQRLRCASIQKFGERALKAWSVARLSQKFPKAEFVEVTKLVTDLTKVHKECCHGDLLECADDRADLAKYICDNQDTISSKLKECCDKPLLEKSHCIAEVEKDAIPENLPPLTADFAEDKDVCKNYQEAKDAFLGSFLYEYSRRHPEYAVSVLLRLAKEYEATLEECCAKDDPHACYSTVFDKLKHLVDEPQNLIKQNCDQFEKLGEYGFQNALIVRYTRKVPQVSTPTLVEVSRSLGKVGTRCCTKPESERMPCTEDYLSLILNRLCVLHEKTPVSEKVTKCCTESLVNRRPCFSALTPDETYVPKAFDEKLFTFHADICTLPDTEKQIKKQTALVELLKHKPKATEEQLKTVMENFVAFVDKCCAADDKEACFAVEGPKLVVSTQTALA'. + + fastaRecord := BioFastaRecord named: seqName sequence: seqString. + + parseResult := self parserClass parseFasta: self fastaSeq08. + + self assert: parseResult name equals: seqName. + +] + +{ #category : 'testing-builder' } +BioFASTAParserTest >> testParseSingleFastaProteinSequenceString [ + + | seqName seqString | + seqName := 'sp_ac|P02769_WOSIG0 \ID=ALBU_BOVIN \DE="Serum albumin precursor (Allergen Bos d 6) (BSA)" \NCBITAXID=9913 \MODRES=(1|Acetyl) \VARIANT=(196|A|T) \LENGTH=589'. 
+ seqString := 'RGVFRRDTHKSEIAHRFKDLGEEHFKGLVLIAFSQYLQQCPFDEHVKLVNELTEFAKTCVADESHAGCEKSLHTLFGDELCKVASLRETYGDMADCCEKQEPERNECFLSHKDDSPDLPKLKPDPNTLCDEFKADEKKFWGKYLYEIARRHPYFYAPELLYYANKYNGVFQECCQAEDKGACLLPKIETMREKVLASSARQRLRCASIQKFGERALKAWSVARLSQKFPKAEFVEVTKLVTDLTKVHKECCHGDLLECADDRADLAKYICDNQDTISSKLKECCDKPLLEKSHCIAEVEKDAIPENLPPLTADFAEDKDVCKNYQEAKDAFLGSFLYEYSRRHPEYAVSVLLRLAKEYEATLEECCAKDDPHACYSTVFDKLKHLVDEPQNLIKQNCDQFEKLGEYGFQNALIVRYTRKVPQVSTPTLVEVSRSLGKVGTRCCTKPESERMPCTEDYLSLILNRLCVLHEKTPVSEKVTKCCTESLVNRRPCFSALTPDETYVPKAFDEKLFTFHADICTLPDTEKQIKKQTALVELLKHKPKATEEQLKTVMENFVAFVDKCCAADDKEACFAVEGPKLVVSTQTALA'. + + fastaRecord := BioFastaRecord named: seqName sequence: seqString. + + parseResult := self parserClass parseFasta: self fastaSeq08. + + self + assert: parseResult sequence asString + equals: seqString asCondensedString + +] + +{ #category : 'testing-builder' } +BioFASTAParserTest >> testParseThreeMultiFastaIntoRecord [ + + fastaRecord := BioFastaMultiRecord new. + fastaRecord + addFastaRecord: (BioFastaRecord named: self multiFastaSeq02Header01 sequence: self multiFastaSeq02Body01); + addFastaRecord: (BioFastaRecord named: self multiFastaSeq02Header02 sequence: self multiFastaSeq02Body02); + addFastaRecord: (BioFastaRecord named: self multiFastaSeq02Header03 sequence: self multiFastaSeq02Body03). + + parseResult := self parserClass parseMultiFasta: self multiFastaSeq02PlainText. + + self assert: parseResult sequenceNames equals: fastaRecord sequenceNames. + +] + +{ #category : 'testing-builder' } +BioFASTAParserTest >> testParseThreeMultiFastaSequenceNames [ + + fastaRecord := BioFastaMultiRecord new. + fastaRecord + addFastaRecord: (BioFastaRecord named: self multiFastaSeq02Header01 sequence: self multiFastaSeq02Body01); + addFastaRecord: (BioFastaRecord named: self multiFastaSeq02Header02 sequence: self multiFastaSeq02Body02); + addFastaRecord: (BioFastaRecord named: self multiFastaSeq02Header03 sequence: self multiFastaSeq02Body03). 
+ + parseResult := self parserClass parseMultiFasta: self multiFastaSeq02PlainText. + + self assert: parseResult equals: fastaRecord. + +] + +{ #category : 'testing-builder' } +BioFASTAParserTest >> testParseThreeMultiFastaSequenceStrings [ + + fastaRecord := BioFastaMultiRecord new. + fastaRecord + addFastaRecord: (BioFastaRecord named: self multiFastaSeq02Header01 sequence: self multiFastaSeq02Body01); + addFastaRecord: (BioFastaRecord named: self multiFastaSeq02Header02 sequence: self multiFastaSeq02Body02); + addFastaRecord: (BioFastaRecord named: self multiFastaSeq02Header03 sequence: self multiFastaSeq02Body03). + + parseResult := self parserClass parseMultiFasta: self multiFastaSeq02PlainText. + + self + assert: parseResult sequenceStrings asArray + equals: (Array + with: self multiFastaSeq02Body01 asCondensedString + with: self multiFastaSeq02Body02 asCondensedString + with: self multiFastaSeq02Body03 asCondensedString). + +] + +{ #category : 'testing-builder' } +BioFASTAParserTest >> testParseTwoMultiFastaIntoRecord [ + + fastaRecord := BioFastaMultiRecord new. + fastaRecord + addFastaRecord: (BioFastaRecord named: self multiFastaSeq01Header01 sequence: self multiFastaSeq01Body01); + addFastaRecord: (BioFastaRecord named: self multiFastaSeq01Header02 sequence: self multiFastaSeq01Body02). + + parseResult := self parserClass parseMultiFasta: self multiFastaSeq01PlainText. + + self + assert: parseResult sequenceNames + equals: fastaRecord sequenceNames. + + +] + +{ #category : 'testing-builder' } +BioFASTAParserTest >> testParseTwoMultiFastaSequenceNames [ + + fastaRecord := BioFastaMultiRecord new. + fastaRecord + addFastaRecord: (BioFastaRecord named: self multiFastaSeq01Header01 sequence: self multiFastaSeq01Body01); + addFastaRecord: (BioFastaRecord named: self multiFastaSeq01Header02 sequence: self multiFastaSeq01Body02). + + parseResult := self parserClass parseMultiFasta: self multiFastaSeq01PlainText. 
+ self + assert: (parseResult sequenceNames + bioHasEqualElements: (OrderedCollection + with: self multiFastaSeq01Header01 + with: self multiFastaSeq01Header02)). + +] + +{ #category : 'testing-builder' } +BioFASTAParserTest >> testParseTwoMultiFastaSequenceStrings [ + + fastaRecord := BioFastaMultiRecord new. + fastaRecord + addFastaRecord: (BioFastaRecord named: self multiFastaSeq01Header01 sequence: self multiFastaSeq01Body01); + addFastaRecord: (BioFastaRecord named: self multiFastaSeq01Header02 sequence: self multiFastaSeq01Body02). + + parseResult := self parserClass parseMultiFasta: self multiFastaSeq01PlainText. + self + assert: parseResult sequenceStrings asArray + equals: ( + Array + with: self multiFastaSeq01Body01 asCondensedString + with: self multiFastaSeq01Body02 asCondensedString). + +] + +{ #category : 'testing-tokenizer' } +BioFASTAParserTest >> testSingleFasta01 [ + + | seqSelectors parseResults | + seqSelectors := self class superclass selectorsInProtocol: + 'samples-single'. + parseResults := seqSelectors collect: [ :sel | + self parserClass tokenizeFasta: (self perform: sel) ]. + + parseResults do: [ :pResult | + self assert: pResult isCollection. + self assert: pResult size equals: 2 ] +] + +{ #category : 'testing-tokenizer' } +BioFASTAParserTest >> testSingleFasta02 [ + + self + assert: ((self parserClass tokenizeFasta: self fastaSeq01) + bioHasEqualElements: + #('YAL069W-1.334 Putative promoter sequence' 'CCACACCACACCCACACACCCACACACCACACCACACACCACACCACACCCACACACACACATCCTAACACTACCCTAACACAGCCCTAATCTAACCCTGGCCAACCTGTCTCTCAACTTACCCTCCATTACCCTGCCTCCACTCGTTACCCTGTCCCATTCAACCATACCACTCCGAACCACCATCCATCCCTCTACTTACTACCACTCACCCACCGTTACCCTCCAATTACCCATATCCAACCCACTGCCACTTACCCTACCATTACCCTACCATCCACCATGACCTACTCACCATACTGTTCTTCTACCCACCATATTGAAACGCTAACAA')). 
+] + +{ #category : 'testing-tokenizer' } +BioFASTAParserTest >> testSingleFasta03 [ + + self + assert: ((self parserClass tokenizeFasta: self fastaSeq02) + bioHasEqualElements: #('YAL068C-7235.2170 Putative promoter sequence' 'TACGAGAATAATTTCTCATCATCCAGCTTTAACACAAAATTCGCACAGTTTTCGTTAAGAGAACTTAACATTTTCTTATGACGTAAATGAAGTTTATATATAAATTTCCTTTTTATTGGATAATATGCCTATGCCGCATAATTTTTATATCTTTCTCCTAACAAAACATTCGCTTGTAAAGTATTATATTTAGGAAAAAAATAATCGATAAAGGCTCATCCGAAGATCAGTTAGATTCTTTTTGCAAGTCCTGAAGAAATTTTCACACTACTACTATAAAAAAAAAATATCATAAAAAGGTACATTACGTGCAACCAAAAGTGTAAAATGATTGGTTGCAATGTTTCACCTAAATTACTT')) +] + +{ #category : 'testing-tokenizer' } +BioFASTAParserTest >> testSingleFasta05 [ + + self + assert: ((self parserClass tokenizeFasta: self fastaSeq05) + bioHasEqualElements: #( + 'YAL068C-7235.2170 Putative promoter sequence' 'TACGAGAATAATTTCTCATCATCCAGCTTTAACACAAAATTCGCACAGTTTTCGTTAAGAGAACTTAACATTTTCTTATGACGTAAATGAAGTTTATATATAAATTTCCTTTTTATTGGA') ). +] + +{ #category : 'testing-tokenizer' } +BioFASTAParserTest >> testSingleFasta06 [ + + self + assert: ((self parserClass tokenizeFasta: self fastaSeq08) + bioHasEqualElements: #( + 'sp_ac|P02769_WOSIG0 \ID=ALBU_BOVIN \DE="Serum albumin precursor (Allergen Bos d 6) (BSA)" \NCBITAXID=9913 \MODRES=(1|Acetyl) \VARIANT=(196|A|T) \LENGTH=589' + 'RGVFRRDTHKSEIAHRFKDLGEEHFKGLVLIAFSQYLQQCPFDEHVKLVNELTEFAKTCVADESHAGCEKSLHTLFGDELCKVASLRETYGDMADCCEKQEPERNECFLSHKDDSPDLPKLKPDPNTLCDEFKADEKKFWGKYLYEIARRHPYFYAPELLYYANKYNGVFQECCQAEDKGACLLPKIETMREKVLASSARQRLRCASIQKFGERALKAWSVARLSQKFPKAEFVEVTKLVTDLTKVHKECCHGDLLECADDRADLAKYICDNQDTISSKLKECCDKPLLEKSHCIAEVEKDAIPENLPPLTADFAEDKDVCKNYQEAKDAFLGSFLYEYSRRHPEYAVSVLLRLAKEYEATLEECCAKDDPHACYSTVFDKLKHLVDEPQNLIKQNCDQFEKLGEYGFQNALIVRYTRKVPQVSTPTLVEVSRSLGKVGTRCCTKPESERMPCTEDYLSLILNRLCVLHEKTPVSEKVTKCCTESLVNRRPCFSALTPDETYVPKAFDEKLFTFHADICTLPDTEKQIKKQTALVELLKHKPKATEEQLKTVMENFVAFVDKCCAADDKEACFAVEGPKLVVSTQTALA') ). 
+] + +{ #category : 'testing-tokenizer' } +BioFASTAParserTest >> testSingleFasta07 [ + + self + assert: ((self parserClass tokenizeFasta: self fastaSeq09) + bioHasEqualElements: #('gi|6273291|emb|AF191665.1|AF191665' 'actgtcgatatgctagct') ) +] + +{ #category : 'testing-tokenizer' } +BioFASTAParserTest >> testSingleFastaWithReturnLine [ + + self + assert: ((self parserClass tokenizeFasta: self fastaSeq04) + bioHasEqualElements: #('YAL068C-7235.2170 Putative promoter sequence' 'TACGAGAATAATTTCTCATCATCCAGCTTTAACACAAAATTCGCACAGTTTTCGTTAAGAGAACTTAACATTTTCTTATGACGTAAATGAAGTTTATATATAAATTTCCTTTTTATTGGA')). +] + +{ #category : 'testing-tokenizer' } +BioFASTAParserTest >> testTokenizeMultiFasta01 [ + + self + assert: ((self parserClass tokenizeMultiFasta: self multiFastaSeq01PlainText) + bioHasEqualElements: #( + #('YAL069W-1.334 Putative promoter sequence' 'CCACACCACACCCACACACCCACACACCACACCACACACCACACCACACCCACACACACACATCCTAACACTACCCTAACACAGCCCTAATCTAACCCTGGCCAACCTGTCTCTCAACTTACCCTCCATTACCCTGCCTCCACTCGTTACCCTGTCCCATTCAACCATACCACTCCGAACCACCATCCATCCCTCTACTTACTACCACTCACCCACCGTTACCCTCCAATTACCCATATCCAACCCACTGCCACTTACCCTACCATTACCCTACCATCCACCATGACCTACTCACCATACTGTTCTTCTACCCACCATATTGAAACGCTAACAA') + #('YAL068C-7235.2170 Putative promoter sequence' 'TACGAGAATAATTTCTCATCATCCAGCTTTAACACAAAATTCGCACAGTTTTCGTTAAGAGAACTTAACATTTTCTTATGACGTAAATGAAGTTTATATATAAATTTCCTTTTTATTGGATAATATGCCTATGCCGCATAATTTTTATATCTTTCTCCTAACAAAACATTCGCTTGTAAAGTATTATATTTAGGAAAAAAATAATCGATAAAGGCTCATCCGAAGATCAGTTAGATTCTTTTTGCAAGTCCTGAAGAAATTTTCACACTACTACTATAAAAAAAAAATATCATAAAAAGGTACATTACGTGCAACCAAAAGTGTAAAATGATTGGTTGCAATGTTTCACCTAAATTACTT') ) ). 
+ + +] + +{ #category : 'testing-tokenizer' } +BioFASTAParserTest >> testTokenizeMultiFasta02 [ + + self assert: ( + (self parserClass tokenizeMultiFasta: self multiFastaSeq02PlainText) + bioHasEqualElements: #( + #('first sequence record' 'TACGAGAATAATTTCTCATCATCCAGCTTTAACACAAAATTCGCA') + #('second sequence record' 'CAGTTTTCGTTAAGAGAACTTAACATTTTCTTATGACGTAAATGAAGTTTATATATAAATTTCCTTTTTATTGGA') + #('third sequence record' 'GAACTTAACATTTTCTTATGACGTAAATGAAGTTTATATATAAATTTCCTTTTTATTGGATAATATGCCTATGCCGCATAATTTTTATATCTTTCTCCTAACAAAACATTCGCTTGTAAA') ) ). + + +] + +{ #category : 'testing-tokenizer' } +BioFASTAParserTest >> testTokenizeMultiFasta03 [ + + self assert: ( ( self parserClass tokenizeMultiFasta: self multiFastaSeq03 ) bioHasEqualElements: #( + #( 'SEQUENCE_1' 'MTEITAAMVKELRESTGAGMMDCKNALSETNGDFDKAVQLLREKGLGKAAKKADRLAAEGLVSVKVSDDFTIAAMRPSYLSYEDLDMTFVENEYKALVAELEKENEERRRLKDPNKPEHKIPQFASRKQLSDAILKEAEEKIKEELKAQGKPEKIWDNIIPGKMNSFIADNSQLDSKLTLMGQFYVMDDKKTVEQVIAEKEKEFGGKIKIVEFICFEVGEGLEKKTEDFAAEVAAQL') + #( 'SEQUENCE_2' 'SATVSEINSETDFVAKNDQFIALTKDTTAHIQSNSLQSVEELHSSTINGVKFEEYLKSQIATIGENLVVRRFATLKAGANGVVNGYIHTNGRVGVVIAAACDSAEVASKSRDLLRQICMH') ) ). 
+ + +] + +{ #category : 'testing-tokenizer' } +BioFASTAParserTest >> testTokenizeMultiFasta04 [ + + self assert: ( ( self parserClass tokenizeMultiFasta: self multiFastaSeq04 ) bioHasEqualElements: #( + #( 'HSBGPG Human gene for bone gla protein (BGP)' 'GGCAGATTCCCCCTAGACCCGCCCGCACCATGGTCAGGCATGCCCCTCCTCATCGCTGGGCACAGCCCAGAGGGTATAAACAGTGCTGGAGGCTGGCGGGGCAGGCCAGCTGAGTCCTGAGCAGCAGCCCAGCGCAGCCACCGAGACACCATGAGAGCCCTCACACTCCTCGCCCTATTGGCCCTGGCCGCACTTTGCATCGCTGGCCAGGCAGGTGAGTGCCCCCACCTCCCCTCAGGCCGCATTGCAGTGGGGGCTGAGAGGAGGAAGCACCATGGCCCACCTCTTCTCACCCCTTTGGCTGGCAGTCCCTTTGCAGTCTAACCACCTTGTTGCAGGCTCAATCCATTTGCCCCAGCTCTGCCCTTGCAGAGGGAGAGGAGGGAAGAGCAAGCTGCCCGAGACGCAGGGGAAGGAGGATGAGGGCCCTGGGGATGAGCTGGGGTGAACCAGGCTCCCTTTCCTTTGCAGGTGCGAAGCCCAGCGGTGCAGAGTCCAGCAAAGGTGCAGGTATGAGGATGGACCTGATGGGTTCCTGGACCCTCCCCTCTCACCCTGGTCCCTCAGTCTCATTCCCCCACTCCTGCCACCTCCTGTCTGGCCATCAGGAAGGCCAGCCTGCTCCCCACCTGATCCTCCCAAACCCAGAGCCACCTGATGCCTGCCCCTCTGCTCCACAGCCTTTGTGTCCAAGCAGGAGGGCAGCGAGGTAGTGAAGAGACCCAGGCGCTACCTGTATCAATGGCTGGGGTGAGAGAAAAGGCAGAGCTGGGCCAAGGCCCTGCCTCTCCGGGATGGTCTGTGGGGGAGCTGCAGCAGGGAGTGGCCTCTCTGGGTTGTGGTGGGGGTACAGGCAGCCTGCCCTGGTGGGCACCCTGGAGCCCCATGTGTAGGGAGAGGAGGGATGGGCATTTTGCACGGGGGCTGATGCCACCACGTCGGGTGTCTCAGAGCCCCAGTCCCCTACCCGGATCCCCTGGAGCCCAGGAGGGAGGTGTGTGAGCTCAATCCGGACTGTGACGAGTTGGCTGACCACATCGGCTTTCAGGAGGCCTATCGGCGCTTCTACGGCCCGGTCTAGGGTGTCGCTCTGCTGGCCTGGCCGGCAACCCCAGTTCTGCTCCTCTCCAGGCACCCTTCTTTCCTCTTCCCCTTGCCCTTGCCCTGACCTCCCAGCCCTATGGATGTGGGGTCCCCATCATCCCAGCTGCTCCCAAATAAACTCCAGAAG') + #( 'HSGLTH1 Human theta 1-globin gene' 
'CCACTGCACTCACCGCACCCGGCCAATTTTTGTGTTTTTAGTAGAGACTAAATACCATATAGTGAACACCTAAGACGGGGGGCCTTGGATCCAGGGCGATTCAGAGGGCCCCGGTCGGAGCTGTCGGAGATTGAGCGCGCGCGGTCCCGGGATCTCCGACGAGGCCCTGGACCCCCGGGCGGCGAAGCTGCGGCGCGGCGCCCCCTGGAGGCCGCGGGACCCCTGGCCGGTCCGCGCAGGCGCAGCGGGGTCGCAGGGCGCGGCGGGTTCCAGCGCGGGGATGGCGCTGTCCGCGGAGGACCGGGCGCTGGTGCGCGCCCTGTGGAAGAAGCTGGGCAGCAACGTCGGCGTCTACACGACAGAGGCCCTGGAAAGGTGCGGCAGGCTGGGCGCCCCCGCCCCCAGGGGCCCTCCCTCCCCAAGCCCCCCGGACGCGCCTCACCCACGTTCCTCTCGCAGGACCTTCCTGGCTTTCCCCGCCACGAAGACCTACTTCTCCCACCTGGACCTGAGCCCCGGCTCCTCACAAGTCAGAGCCCACGGCCAGAAGGTGGCGGACGCGCTGAGCCTCGCCGTGGAGCGCCTGGACGACCTACCCCACGCGCTGTCCGCGCTGAGCCACCTGCACGCGTGCCAGCTGCGAGTGGACCCGGCCAGCTTCCAGGTGAGCGGCTGCCGTGCTGGGCCCCTGTCCCCGGGAGGGCCCCGGCGGGGTGGGTGCGGGGGGCGTGCGGGGCGGGTGCAGGCGAGTGAGCCTTGAGCGCTCGCCGCAGCTCCTGGGCCACTGCCTGCTGGTAACCCTCGCCCGGCACTACCCCGGAGACTTCAGCCCCGCGCTGCAGGCGTCGCTGGACAAGTTCCTGAGCCACGTTATCTCGGCGCTGGTTTCCGAGTACCGCTGAACTGTGGGTGGGTGGCCGCGGGATCCCCAGGCGACCTTCCCCGTGTTTGAGTAAAGCCTCTCCCAGGAGCAGCCTTCTTGCCGTGCTCTCTCGAGGTCAGGACGCGAGAGGAAGGCGC' ) ) ). + +] + +{ #category : 'testing-tokenizer' } +BioFASTAParserTest >> testTokenizeMultiFasta05 [ + + self assert: ( + (self parserClass tokenizeMultiFasta: self multiFastaSeq05) + bioHasEqualElements: #( + #('sequence1' 'ACTCCCCGTGCGCGCCCGGCCCGTAGCGTCCTCGTCGCCGCCCCTCGTCTCGCAGCCGCAGCCCGCGTGGACGCTCTCGCCTGAGCGCCGCGGACTAGCCCGGGTGGCC') + #('sequence2' 'CAGTCCGGCAGCGCCGGGGTTAAGCGGCCCAAGTAAACGTAGCGCAGCGATCGGCGCCGGAGATTCGCGAACCCGACACTCCGCGCCGCCCGCCGGCCAGGACCCGCGGCGCGATCGCGGCGCCGCGCTACAGCCAGCCTCACTGGCGCGCGGGCGAGCGCACGGGCGCTC' ) + #('sequence3' 'CACGACAGGCCCGCTGAGGCTTGTGCCAGACCTTGGAAACCTCAGGTATATACCTTTCCAGACGCGGGATCTCCCCTCCCC') + #('sequence4' 'CAGCAGACATCTGAATGAAGAAGAGGGTGCCAGCGGGTATGAGGAGTGCATTATCGTTAATGGGAACTTCAGTGACCAGTCCTCAGACACGAAGGATGCTCCCTCACCCCCAGTCTTGGAGGCAATCTGCACAGAGCCAGTCTGCACACC') + )). 
+ + +] + +{ #category : 'testing-tokenizer' } +BioFASTAParserTest >> testTokenizeMultiFasta07 [ + + self + assert: ((self parserClass tokenizeMultiFasta: self multiFastaSeq07) + bioHasEqualElements: #( + #('Sample sequence 1' 'garkbdctymvhu') + #('Sample sequence 2' 'ctymvhgarkbda') + #('Sample sequence 3' 'ccccccccccga'))). + + +] + +{ #category : 'testing-tokenizer' } +BioFASTAParserTest >> testTokenizeMultiFastaWithAdditionalSeparatorBetweenRecords [ + + self + assert: ((self parserClass tokenizeMultiFasta: self multiFastaSeq06PlainText) + bioHasEqualElements: #( + #('YAL069W-1.334 Putative promoter sequence' 'CCACACCACACCCACACACCCACACACCACACCACACACCACACCACACCCACACACACACATCCTAACACTACCCTAACACAGCCCTAATCTAACCCTGGCCAACCTGTCTCTCAACTTACCCTCCATTACCCTGCCTCCACTCGTTACCCTGTCCCATTCAACCATACCACTCCGAACCACCATCCATCCCTCTACTTACTACCACTCACCCACCGTTACCCTCCAATTACCCATATCCAACCCACTGCCACTTACCCTACCATTACCCTACCATCCACCATGACCTACTCACCATACTGTTCTTCTACCCACCATATTGAAACGCTAACAA') + #('YAL068C-7235.2170 Putative promoter sequence' 'TACGAGAATAATTTCTCATCATCCAGCTTTAACACAAAATTCGCACAGTTTTCGTTAAGAGAACTTAACATTTTCTTATGACGTAAATGAAGTTTATATATAAATTTCCTTTTTATTGGATAATATGCCTATGCCGCATAATTTTTATATCTTTCTCCTAACAAAACATTCGCTTGTAAAGTATTATATTTAGGAAAAAAATAATCGATAAAGGCTCATCCGAAGATCAGTTAGATTCTTTTTGCAAGTCCTGAAGAAATTTTCACACTACTACTATAAAAAAAAAATATCATAAAAAGGTACATTACGTGCAACCAAAAGTGTAAAATGATTGGTTGCAATGTTTCACCTAAATTACTT') ) ). + + +] + +{ #category : 'testing-builder' } +BioFASTAParserTest >> testTokenizeSingleFastaDescription01 [ + + | seqHeader | + seqHeader := '>gi|198282148|ref|NC_011206.1| Acidithiobacillus ferrooxidans ATCC 53993 chromosome, complete genome'. + + parseResult := self parserClass tokenizeFastaDescription: seqHeader. + self assert: (parseResult isKindOf: Collection). 
+ self + assert: parseResult + equals: + 'Acidithiobacillus ferrooxidans ATCC 53993 chromosome, complete genome' +] + +{ #category : 'testing-builder' } +BioFASTAParserTest >> testTokenizeSingleFastaHeader01 [ + + | seqHeader | + seqHeader := '>gi|198282148|ref|NC_011206.1| Acidithiobacillus ferrooxidans ATCC 53993 chromosome, complete genome'. + + parseResult := self parserClass tokenizeFastaHeader: seqHeader. + self assert: (parseResult isKindOf: Collection). + self + assert: parseResult + equals: + #( '>gi' '198282148' 'ref' 'NC_011206.1' 'Acidithiobacillus ferrooxidans ATCC 53993 chromosome, complete genome' ) +] diff --git a/repository/BioParsers-Tests/BioGFF3Test.class.st b/repository/BioParsers-Tests/BioGFF3Test.class.st new file mode 100644 index 00000000..a162a963 --- /dev/null +++ b/repository/BioParsers-Tests/BioGFF3Test.class.st @@ -0,0 +1,315 @@ +Class { + #name : 'BioGFF3Test', + #superclass : 'BioAbstractParserTest', + #category : 'BioParsers-Tests', + #package : 'BioParsers-Tests' +} + +{ #category : 'accessing' } +BioGFF3Test >> baseDirectory [ + + ^ self testFilesFullDirectoryName / 'gff'. +] + +{ #category : 'accessing' } +BioGFF3Test >> hsaFile [ + + ^ self baseDirectory / 'hsa.gff3'. +] + +{ #category : 'accessing' } +BioGFF3Test >> rnoFile [ + + ^ self baseDirectory / 'rno.gff3'. +] + +{ #category : 'tests' } +BioGFF3Test >> testAsBioSequenceFeature [ + + | gff result f sf | + gff := '##gff-version 3 +chr1 . gene 1000 9000 . + . ID=gene1;Name=EDEN +'. + result := BioGFF3File fromString: gff. + f := result features first. + sf := f asBioSequenceFeature. + self assert: sf primaryTag equals: 'gene'. + self assert: sf sourceTag equals: '.'. + self assert: sf start equals: 1000. + self assert: sf end equals: 9000. + self assert: sf strand equals: '+'. + self assert: sf chromosome equals: 'chr1' +] + +{ #category : 'tests' } +BioGFF3Test >> testAttributesParsing [ + + | gff result f | + gff := '##gff-version 3 +chr1 . gene 1000 9000 . + . 
ID=gene1;Name=EDEN;Note=some%20note +'. + result := BioGFF3File fromString: gff. + f := result features first. + self assert: f id equals: 'gene1'. + self assert: f name equals: 'EDEN'. + self assert: (f attributeAt: 'Note') equals: 'some%20note' +] + +{ #category : 'tests' } +BioGFF3Test >> testBasicParse [ + + | gff result | + gff := '##gff-version 3 +ctg123 . gene 1000 9000 . + . ID=gene1;Name=EDEN +'. + result := BioGFF3File fromString: gff. + self assert: result featureCount equals: 1. + self assert: result features first type equals: 'gene'. + self assert: result features first name equals: 'EDEN'. + self assert: result features first id equals: 'gene1' +] + +{ #category : 'tests' } +BioGFF3Test >> testComments [ + + | gff result | + gff := '##gff-version 3 +# This is a comment +# Another comment +chr1 . gene 1000 9000 . + . ID=g1 +'. + result := BioGFF3File fromString: gff. + self assert: result featureCount equals: 1. + self assert: result features first id equals: 'g1' +] + +{ #category : 'tests' } +BioGFF3Test >> testDerivesFromRelationship [ + + | gff result parent children | + gff := '##gff-version 3 +chr1 . miRNA_primary_transcript 17369 17436 . - . ID=MI0022705;Name=hsa-mir-6859-1 +chr1 . miRNA 17409 17431 . - . ID=MIMAT1;Name=hsa-miR-6859-5p;Derives_from=MI0022705 +'. + result := BioGFF3File fromString: gff. + parent := result featureWithId: 'MI0022705'. + self assert: parent isNotNil. + children := result derivesFromOf: parent. + self assert: children size equals: 1. + self assert: children first name equals: 'hsa-miR-6859-5p' +] + +{ #category : 'tests' } +BioGFF3Test >> testFeatureFields [ + + | gff result f | + gff := '##gff-version 3 +chr1 . gene 1000 9000 . + . ID=gene1;Name=EDEN +'. + result := BioGFF3File fromString: gff. + f := result features first. + self assert: f seqid equals: 'chr1'. + self assert: f source equals: '.'. + self assert: f type equals: 'gene'. + self assert: f start equals: '1000'. + self assert: f end equals: '9000'. 
+ self assert: f score equals: '.'. + self assert: f strand equals: '+'. + self assert: f phase equals: '.' +] + +{ #category : 'tests' } +BioGFF3Test >> testFeatureTypeFiltering [ + + | gff result | + gff := '##gff-version 3 +chr1 . gene 1000 9000 . + . ID=g1 +chr1 . mRNA 1050 9000 . + . ID=m1;Parent=g1 +chr1 . exon 1300 1500 . + . Parent=m1 +chr1 . CDS 1201 1500 . + 0 ID=c1;Parent=m1 +'. + result := BioGFF3File fromString: gff. + self assert: result featureCount equals: 4. + self assert: (result featuresWithType: 'gene') size equals: 1. + self assert: (result featuresWithType: 'exon') size equals: 1. + self assert: (result featuresWithType: 'CDS') size equals: 1. + self + assert: result featureTypes + equals: #( 'CDS' 'exon' 'gene' 'mRNA' ) +] + +{ #category : 'tests' } +BioGFF3Test >> testFromFileFilteringSeqid [ + + | result | + result := BioGFF3File new + fromFile: self rnoFile + filteringSeqid: 'chr1'. + self assert: result featureCount > 0. + self assert: (result features allSatisfy: [ :f | f seqid = 'chr1' ]) +] + +{ #category : 'tests' } +BioGFF3Test >> testFromFileFilteringTypes [ + + | result | + result := BioGFF3File new + fromFile: self hsaFile + filteringTypes: (Set with: 'miRNA'). + self assert: result featureCount equals: 2883. + self assert: (result features allSatisfy: [ :f | f type = 'miRNA' ]) +] + +{ #category : 'tests' } +BioGFF3Test >> testFromFileMaxFeatures [ + + | result | + result := BioGFF3File new fromFile: self hsaFile maxFeatures: 100. + self assert: result featureCount equals: 100 +] + +{ #category : 'tests' } +BioGFF3Test >> testGffVersion [ + + | gff result | + gff := '##gff-version 3 +ctg123 . gene 1000 9000 . + . ID=gene1 +'. + result := BioGFF3File fromString: gff. + self assert: result gffVersion equals: '3' +] + +{ #category : 'tests' } +BioGFF3Test >> testGroupByType [ + + | gff result groups | + gff := '##gff-version 3 +chr1 . gene 1000 9000 . + . ID=g1 +chr1 . mRNA 1050 9000 . + . ID=m1 +chr1 . exon 1300 1500 . + . 
Parent=m1 +'. + result := BioGFF3File fromString: gff. + groups := result groupByType. + self assert: (groups at: 'gene') size equals: 1. + self assert: (groups at: 'mRNA') size equals: 1. + self assert: (groups at: 'exon') size equals: 1 +] + +{ #category : 'tests' } +BioGFF3Test >> testIntegerAccessors [ + + | gff result f | + gff := '##gff-version 3 +chr1 . exon 1000 9000 . + . Parent=mRNA1 +'. + result := BioGFF3File fromString: gff. + f := result features first. + self assert: f startInteger equals: 1000. + self assert: f endInteger equals: 9000. + self assert: f isForwardStrand. + self deny: f isReverseStrand +] + +{ #category : 'tests' } +BioGFF3Test >> testParentChildRelationship [ + + | gff result gene children | + gff := '##gff-version 3 +ctg123 . gene 1000 9000 . + . ID=gene00001;Name=EDEN +ctg123 . mRNA 1050 9000 . + . ID=mRNA00001;Parent=gene00001 +'. + result := BioGFF3File fromString: gff. + gene := result featureWithId: 'gene00001'. + self assert: gene isNotNil. + children := result childrenOf: gene. + self assert: children size equals: 1. + self assert: children first type equals: 'mRNA' +] + +{ #category : 'tests' } +BioGFF3Test >> testParseHsaGffFile [ + + | result | + result := BioGFF3File fromFile: self hsaFile. + self assert: result featureCount equals: 4801. + self + assert: result featureTypes + equals: #( 'miRNA' 'miRNA_primary_transcript' ). + self assert: result seqids size equals: 24. + self assert: (result featuresWithType: 'miRNA') size equals: 2883. + self + assert: (result featuresWithType: 'miRNA_primary_transcript') size + equals: 1918 +] + +{ #category : 'tests' } +BioGFF3Test >> testParseRnoGffFile [ + + | result | + result := BioGFF3File fromFile: self rnoFile. + self assert: result featureCount equals: 1323. + self + assert: result featureTypes + equals: #( 'miRNA' 'miRNA_primary_transcript' ) +] + +{ #category : 'tests' } +BioGFF3Test >> testReverseStrand [ + + | gff result f | + gff := '##gff-version 3 +chr1 . 
gene 1000 9000 . - . ID=g1 +'. + result := BioGFF3File fromString: gff. + f := result features first. + self assert: f isReverseStrand. + self deny: f isForwardStrand +] + +{ #category : 'tests' } +BioGFF3Test >> testScoreFloat [ + + | f | + f := BioGFF3Feature new score: '.'. + self assert: f scoreFloat equals: nil. + f score: '42.5'. + self assert: f scoreFloat equals: 42.5 +] + +{ #category : 'tests' } +BioGFF3Test >> testSeqidFiltering [ + + | gff result | + gff := '##gff-version 3 +chr1 . gene 1000 9000 . + . ID=g1 +chr2 . gene 2000 8000 . + . ID=g2 +'. + result := BioGFF3File fromString: gff. + self assert: (result featuresWithSeqid: 'chr1') size equals: 1. + self assert: (result featuresWithSeqid: 'chr2') size equals: 1. + self assert: result seqids equals: #( 'chr1' 'chr2' ) +] + +{ #category : 'tests' } +BioGFF3Test >> testStreamFeaturesFromFile [ + + | count | + count := 0. + BioGFF3File new + streamFeaturesFromFile: self hsaFile + block: [ :f | count := count + 1 ]. + self assert: count equals: 4801 +] + +{ #category : 'tests' } +BioGFF3Test >> testTypePredicates [ + + | f | + f := BioGFF3Feature new type: 'gene'. + self assert: f isOfGeneType. + f type: 'miRNA'. + self assert: f isMiRNAType. + self deny: f isMiRNAPrimaryTranscript. + f type: 'miRNA_primary_transcript'. 
+ self assert: f isMiRNAPrimaryTranscript +] diff --git a/repository/BioParsers-Tests/BioGenBankParserTest.class.st b/repository/BioParsers-Tests/BioGenBankParserTest.class.st new file mode 100644 index 00000000..223eaa20 --- /dev/null +++ b/repository/BioParsers-Tests/BioGenBankParserTest.class.st @@ -0,0 +1,51 @@ +Class { + #name : 'BioGenBankParserTest', + #superclass : 'BioAbstractParserTest', + #category : 'BioParsers-Tests', + #package : 'BioParsers-Tests' +} + +{ #category : 'testing' } +BioGenBankParserTest >> gbSeq01 [ + " From http://www.genomatix.de/online_help/help/sequence_formats.html " + + ^ 'LOCUS AB000263 368 bp mRNA linear PRI 05-FEB-1999 +DEFINITION Homo sapiens mRNA for prepro cortistatin like peptide, complete + cds. +ACCESSION AB000263 +ORIGIN + 1 acaagatgcc attgtccccc ggcctcctgc tgctgctgct ctccggggcc acggccaccg + 61 ctgccctgcc cctggagggt ggccccaccg gccgagacag cgagcatatg caggaagcgg + 121 caggaataag gaaaagcagc ctcctgactt tcctcgcttg gtggtttgag tggacctccc + 181 aggccagtgc cgggcccctc ataggagagg aagctcggga ggtggccagg cggcaggaag + 241 gcgcaccccc ccagcaatcc gcgcgccggg acagaatgcc ctgcaggaac ttcttctgga + 301 agaccttctc ctcctgcaaa taaaacctca cccatgaatg ctcacgcaag tttaattaca + 361 gacctgaa +// +' +] + +{ #category : 'accessing' } +BioGenBankParserTest >> setUp [ + + super setUp. + parser := BioGBParser new +] + +{ #category : 'testing' } +BioGenBankParserTest >> testGenBankTokenize01 [ + + self + assert: (BioParser tokenizeAccession: 'gb|AAM45611.1|AF384285_1') + equals: #( 'AAM45611' '1' ). 
+ +] + +{ #category : 'testing' } +BioGenBankParserTest >> testGenBankTokenize02 [ + + self + assert: (BioParser tokenizeLocus: 'gb|AAM45611.1|AF384285_1') + equals: 'AF384285_1' + +] diff --git a/repository/BioParsers-Tests/BioGenIdParserTest.class.st b/repository/BioParsers-Tests/BioGenIdParserTest.class.st new file mode 100644 index 00000000..58267d44 --- /dev/null +++ b/repository/BioParsers-Tests/BioGenIdParserTest.class.st @@ -0,0 +1,33 @@ +Class { + #name : 'BioGenIdParserTest', + #superclass : 'BioAbstractParserTest', + #category : 'BioParsers-Tests', + #package : 'BioParsers-Tests' +} + +{ #category : 'accessing' } +BioGenIdParserTest >> setUp [ + " Private - See superimplementor's comment " + + super setUp. + parser := BioGIParser new. + +] + +{ #category : 'testing' } +BioGenIdParserTest >> testTokenizeGenBankIdentifier01 [ + + self assert: (self parser tokenize: 'gi|32128012') equals: '32128012'. + self assert: (self parser tokenize: 'gi|152787') equals: '152787' +] + +{ #category : 'testing' } +BioGenIdParserTest >> testTokenizeGenBankIdentifier02 [ + + self + should: [ self parser tokenize: 'gi' ] + raise: Error. 
+ self + should: [ self parser tokenize: 'gi|' ] + raise: Error +] diff --git a/repository/BioParsers-Tests/BioMAFParserTest.class.st b/repository/BioParsers-Tests/BioMAFParserTest.class.st new file mode 100644 index 00000000..6d09755a --- /dev/null +++ b/repository/BioParsers-Tests/BioMAFParserTest.class.st @@ -0,0 +1,81 @@ +Class { + #name : 'BioMAFParserTest', + #superclass : 'BioAbstractFASTAParserTest', + #category : 'BioParsers-Tests', + #package : 'BioParsers-Tests' +} + +{ #category : 'sample-data' } +BioMAFParserTest >> mafSample01 [ + " From http://genome.ucsc.edu/FAQ/FAQformat.html " + + ^ '##maf version=1 scoring=tba.v8 +# tba.v8 (((human chimp) baboon) (mouse rat)) + +a score=23262.0 +s hg18.chr7 27578828 38 + 158545518 AAA-GGGAATGTTAACCAAATGA---ATTGTCTCTTACGGTG +s panTro1.chr6 28741140 38 + 161576975 AAA-GGGAATGTTAACCAAATGA---ATTGTCTCTTACGGTG +s baboon 116834 38 + 4622798 AAA-GGGAATGTTAACCAAATGA---GTTGTCTCTTATGGTG +s mm4.chr6 53215344 38 + 151104725 -AATGGGAATGTTAAGCAAACGA---ATTGTCTCTCAGTGTG +s rn3.chr4 81344243 40 + 187371129 -AA-GGGGATGCTAAGCCAATGAGTTGTTGTCTCTCAATGTG + +a score=5062.0 +s hg18.chr7 27699739 6 + 158545518 TAAAGA +s panTro1.chr6 28862317 6 + 161576975 TAAAGA +s baboon 241163 6 + 4622798 TAAAGA +s mm4.chr6 53303881 6 + 151104725 TAAAGA +s rn3.chr4 81444246 6 + 187371129 taagga + +a score=6636.0 +s hg18.chr7 27707221 13 + 158545518 gcagctgaaaaca +s panTro1.chr6 28869787 13 + 161576975 gcagctgaaaaca +s baboon 249182 13 + 4622798 gcagctgaaaaca +s mm4.chr6 53310102 13 + 151104725 ACAGCTGAAAATA' +] + +{ #category : 'sample-data' } +BioMAFParserTest >> mafSample02 [ + " From https://cgwb.nci.nih.gov/goldenPath/help/maf.html " + + ^ '##maf version=1 scoring=probability +#mblastz 8.91 02-Jan-2005 + +a score=0.128 +s human_hoxa 100 9 + 100257 ACA-TTACT +s horse_hoxa 120 10 - 98892 ACAATTGCT +s fugu_hoxa 88 8 + 90788 ACA--TGCT + +a score=0.071 +s human_unc 9077 8 + 10998 ACAGTATT +s horse_unc 4555 6 - 5099 ACA--ATT +s fugu_unc 4000 4 + 4038 
AC----TT' +] + +{ #category : 'accessing' } +BioMAFParserTest >> parserClass [ + + ^ BioMAFParser +] + +{ #category : 'accessing' } +BioMAFParserTest >> setUp [ + + super setUp. + parser := self parserClass new +] + +{ #category : 'testing' } +BioMAFParserTest >> testMultiSeqAlignment01 [ + +" parseResult := self parser parse: self mafSample01." + self assert: true + +] + +{ #category : 'testing' } +BioMAFParserTest >> testMultiSeqAlignment02 [ + + "parseResult := self parser parse: self mafSample02." + self assert: true + +] diff --git a/repository/BioParsers-Tests/BioNCBIIdParserTest.class.st b/repository/BioParsers-Tests/BioNCBIIdParserTest.class.st new file mode 100644 index 00000000..1033dc7b --- /dev/null +++ b/repository/BioParsers-Tests/BioNCBIIdParserTest.class.st @@ -0,0 +1,83 @@ +Class { + #name : 'BioNCBIIdParserTest', + #superclass : 'BioAbstractParserTest', + #category : 'BioParsers-Tests', + #package : 'BioParsers-Tests' +} + +{ #category : 'accessing' } +BioNCBIIdParserTest >> parserClass [ + " Private - See superimplementor's comment " + + ^ BioNCBIIdParser + +] + +{ #category : 'accessing' } +BioNCBIIdParserTest >> setUp [ + " Private - See superimplementor's comment " + + super setUp. + parser := self parserClass new. + +] + +{ #category : 'testing' } +BioNCBIIdParserTest >> testAllIdentifiers [ + + | identifiers | + + identifiers := self parserClass allIdentifiers. + self assert: (identifiers isKindOf: Collection). + self + assertCollection: identifiers + hasSameElements: #('pdb' 'bbs' 'gi' 'gnl' 'lcl' 'pat' 'pir' 'prf' 'sp' 'dbj' 'emb' 'gb' 'ref'). +] + +{ #category : 'testing' } +BioNCBIIdParserTest >> testClassForBBS [ + + self + assert: (self parserClass classFor: 'bbs') + equals: BioGIBackBoneIdParser. + +] + +{ #category : 'testing' } +BioNCBIIdParserTest >> testClassForDBJ [ + + self assert: (self parserClass classFor: 'dbj') equals: BioDDBJParser. 
+ +] + +{ #category : 'testing' } +BioNCBIIdParserTest >> testClassForEMB [ + + self assert: (self parserClass classFor: 'emb') equals: BioEMBLParser. + +] + +{ #category : 'testing' } +BioNCBIIdParserTest >> testClassForPDB [ + + self + assert: (self parserClass classFor: 'pdb') + equals: BioBrookhavenProtParser. +] + +{ #category : 'testing' } +BioNCBIIdParserTest >> testClassForRefSeq [ + + self + assert: (self parserClass classFor: 'ref') + equals: BioRefSeqParser. +] + +{ #category : 'testing' } +BioNCBIIdParserTest >> testClassForSwissProt [ + + self + assert: (self parserClass classFor: 'sp') + equals: BioSwissProtParser + +] diff --git a/repository/BioParsers-Tests/BioPhylipParserTest.class.st b/repository/BioParsers-Tests/BioPhylipParserTest.class.st new file mode 100644 index 00000000..2c2e750d --- /dev/null +++ b/repository/BioParsers-Tests/BioPhylipParserTest.class.st @@ -0,0 +1,254 @@ +Class { + #name : 'BioPhylipParserTest', + #superclass : 'BioAbstractParserTest', + #category : 'BioParsers-Tests', + #package : 'BioParsers-Tests' +} + +{ #category : 'testing' } +BioPhylipParserTest >> firstLineTokenizer [ + + ^ BioPhylipParser new firstLineTokenizer +] + +{ #category : 'testing' } +BioPhylipParserTest >> phylipInterleavedDNA [ + + ^ ' 6 13 +Archaeopt CGATGCTTAC CGCCGATGCT +HesperorniCGTTACTCGT TGTCGTTACT +BaluchitheTAATGTTAAT TGTTAATGTT +B. 
virginiTAATGTTCGT TGTTAATGTT +BrontosaurCAAAACCCAT CATCAAAACC +B.subtilisGGCAGCCAAT CACGGCAGCC + +TACCGCCGAT GCTTACCGC +CGTTGTCGTT ACTCGTTGT +AATTGTTAAT GTTAATTGT +CGTTGTTAAT GTTCGTTGT +CATCATCAAA ACCCATCAT +AATCACGGCA GCCAATCAC + +CCCCGCCCCC GCTTACCGC +CCCCGTCCCC ACTCGTTGT +CCCCGTCCCC GTTAATTGT +CCCCGTCCCC GTTCGTTGT +CCCCATCCCC ACCCATCAT +CCCCACCCCC GCCAATCAC +' +] + +{ #category : 'testing' } +BioPhylipParserTest >> phylipInterleavedProtein [ + + ^ ' 5 176 +cox2_leitaMAFILSFWMI FLLDSVIVLL SFVCFVCVWI CALLFSTVLL VSKLNNIYCT +cox2_crifaMAFILSFWMI FLIDAVIVLL SFVCFVCIWI CSLFFSSFLL VSKINNVYCT +cox2_bsaltMSFIISFWML FLIDSLIVLL SGAIFVCIWI CSLFFLCILF ICKLDYIFCS +cox2_trybbMSFILTFWMI FLMDSIIVLI SFSIFLSVWI CALIIATVLT VTKINNIYCT +cox2_tborrMLFFINQLLL LLVDTFVILE IFSLFVCVFI IVMYILFINY NIFLKNINVY + +WDFTASKFID VYWFTIGGMF SLGLLLRLCL LLYFGHLNFV SFDLCKVVGF +WDFTASKFID AYWFTIGGMF VLCLLLRLCL LLYFGCLNFV SFDLCKVVGF +WDFISAKFID LYWFTLGCLF IVCLLIRLCL LLYFSCLNFV CFDLCKCIGF +WDFISSKFID TYWFVLGMMF ILCLLLRLCL LLYFSCINFV SFDLCKVIGF +LDFIGSKYLD LYWFLIGIFF VIVLLIRLCL LLYYSWISLL IFDLCKIMGF + +QWYWVYFIFG ETTIFSNLIL ESDYMIGDLR LLQCNHVLTL LSLVIYKLWL +QWYWVYFIFG ETTIFSNLIL ESDYLIGDLR LLQCNHVLTL LSLVIYKLWL +QWYWVYFIFG ETTIFSNLIL ESDYLIGDLR LLQCNHVLTL LSLVIYKVWL +QWYWVYFLFG ETTIFSNLIL ESDYLIGDLR ILQCNHVLTL LSLVIYKLWV +QWYWIFFVFK ENVIFSNLLI ESDYWIGDLR LLQCNNTFNL ICLVVYKIWV + +SAVDVIHSFA ISSLGVKVEN LVAVMK +SAVDVIHSFA VSSLGIKVDC IPGRCN +SAIDVIHSFT LANLGIKVD? 
?PGRCN +SAVDVIHSFT ISSLGIKVEN PGRCNE +TSIDVIHSFT ISTLGIKIDC IPGRCN +' +] + +{ #category : 'testing' } +BioPhylipParserTest >> speciesDNALineTokenizer [ + + ^ BioPhylipParser new speciesDNALineTokenizer +] + +{ #category : 'testing' } +BioPhylipParserTest >> speciesDNANamedBlockTokenizer [ + + ^ BioPhylipParser new speciesDNANamedBlockTokenizer +] + +{ #category : 'testing' } +BioPhylipParserTest >> testTokenizeDNASpeciesBlock01 [ + " Parse a block of named PHYLIP DNA species lines and check the name and sequence of the first record only. NOTE(review): expectedResult is built but never compared against parseResult " + + | speciesBlock expectedResult firstRecord | + + speciesBlock := 'Archaeopt CGATGCTTAC CGC +HesperorniCGTTACTCGT TGT +BaluchitheTAATGTTAAT TGT +B. virginiTAATGTTCGT TGT +BrontosaurCAAAACCCAT CAT +B.subtilisGGCAGCCAAT CAC'. + expectedResult := #(#('Archaeopt ' 'CGATGCTTAC CGC' nil) #('Hesperorni' 'CGTTACTCGT TGT' nil) #('Baluchithe' 'TAATGTTAAT TGT' nil) #('B. virgini' 'TAATGTTCGT TGT' nil) #('Brontosaur' 'CAAAACCCAT CAT' nil)). + + parseResult := self speciesDNANamedBlockTokenizer parse: speciesBlock. + firstRecord := parseResult first. + + self assert: firstRecord first equals: 'Archaeopt '. + self assert: firstRecord second equals: 'CGATGCTTAC CGC'. + +] + +{ #category : 'testing' } +BioPhylipParserTest >> testTokenizeDNASpeciesBlock02 [ + " Parse a block of named PHYLIP DNA species lines whose second name is the short Hes; only the first record is checked. NOTE(review): expectedResult still lists Hesperorni and is never compared against parseResult " + + | speciesBlock expectedResult firstRecord | + + speciesBlock := 'Archaeopt CGATGCTTAC CGC +Hes CGTTACTCGT TGT +BaluchitheTAATGTTAAT TGT +B. virginiTAATGTTCGT TGT +BrontosaurCAAAACCCAT CAT +B.subtilisGGCAGCCAAT CAC'. + expectedResult := #( + #('Archaeopt ' 'CGATGCTTAC CGC') + #('Hesperorni' 'CGTTACTCGT TGT') + #('Baluchithe' 'TAATGTTAAT TGT') + #('B. virgini' 'TAATGTTCGT TGT') + #('Brontosaur' 'CAAAACCCAT CAT')). + + parseResult := self speciesDNANamedBlockTokenizer parse: speciesBlock. + firstRecord := parseResult first. + + self assert: firstRecord first equals: 'Archaeopt '. + self assert: firstRecord second equals: 'CGATGCTTAC CGC'. 
+ +] + +{ #category : 'testing' } +BioPhylipParserTest >> testTokenizeDNASpeciesLine01 [ + " A species line with a blank inside the sequence tokenizes to the name field and the sequence without that blank " + + | speciesLineBlock expectedResult | + + speciesLineBlock := 'Archaeopt CGATGCTTAC CGC'. + expectedResult := #('Archaeopt ' 'CGATGCTTACCGC'). + parseResult := self speciesDNALineTokenizer parse: speciesLineBlock. + + self assert: (parseResult bioHasEqualElements: expectedResult ). +] + +{ #category : 'testing' } +BioPhylipParserTest >> testTokenizeDNASpeciesLine02 [ + " A species line whose sequence has no inner blank tokenizes to the name field and the unchanged sequence " + + | speciesLineBlock expectedResult | + + speciesLineBlock := 'Archaeopt CGATGCTTACCGC'. + expectedResult := #('Archaeopt ' 'CGATGCTTACCGC'). + parseResult := self speciesDNALineTokenizer parse: speciesLineBlock. + + self assert: (parseResult bioHasEqualElements: expectedResult). + +] + +{ #category : 'testing' } +BioPhylipParserTest >> testTokenizeDNASpeciesLine03 [ + " A name field that contains a blank and is directly followed by the sequence is kept whole; the blank inside the sequence is dropped " + + | speciesLineBlock expectedResult | + + speciesLineBlock := 'B. virginiTAATGTTCGT TGT'. + expectedResult := #('B. virgini' 'TAATGTTCGTTGT'). + parseResult := self speciesDNALineTokenizer parse: speciesLineBlock. + + self assert: (parseResult bioHasEqualElements: expectedResult). + +] + +{ #category : 'testing' } +BioPhylipParserTest >> testTokenizeFirstLine01 [ + " The first line tokenizer answers the two numbers of the header line as strings " + + | firstLine | + + firstLine := '6 13 +'. + parseResult := self firstLineTokenizer parse: firstLine. + self assert: (parseResult bioHasEqualElements: #('6' '13') ). + +] + +{ #category : 'testing' } +BioPhylipParserTest >> testTokenizeFirstLine02 [ + " The first line tokenizer accepts a leading blank before the two header numbers and answers them as strings " + + | firstLine | + + firstLine := ' 6 13 
+ +] + +{ #category : 'testing' } +BioPhylipParserTest >> testTokenizeFirstLine03 [ + " Private - Answer a with a sample phylip DNA " + + | firstLine | + + firstLine := '6 13 +'. + parseResult := self firstLineTokenizer parse: firstLine. + self assert: (parseResult bioHasEqualElements: #('6' '13') ). + +] + +{ #category : 'testing' } +BioPhylipParserTest >> testTokenizeInterleavedDNA [ + " Private - Answer a with a sample phylip DNA " + + | phylipString | + phylipString := self phylipInterleavedDNA. + + parseResult := BioParser tokenizePhylipInterleavedDNA: phylipString. + self assert: parseResult size equals: 4. + self assert: parseResult first equals: 6. + self assert: parseResult second equals: 13. + self assert: (parseResult third bioHasEqualElements: + #( 'Archaeopt ' 'Hesperorni' 'Baluchithe' 'B. virgini' + 'Brontosaur' 'B.subtilis' )). + self assert: (parseResult fourth bioHasEqualElements: + #( 'CGATGCTTACCGCCGATGCTTACCGCCGATGCTTACCGCCCCCGCCCCCGCTTACCGC' + 'CGTTACTCGTTGTCGTTACTCGTTGTCGTTACTCGTTGTCCCCGTCCCCACTCGTTGT' + 'TAATGTTAATTGTTAATGTTAATTGTTAATGTTAATTGTCCCCGTCCCCGTTAATTGT' + 'TAATGTTCGTTGTTAATGTTCGTTGTTAATGTTCGTTGTCCCCGTCCCCGTTCGTTGT' + 'CAAAACCCATCATCAAAACCCATCATCAAAACCCATCATCCCCATCCCCACCCATCAT' + 'GGCAGCCAATCACGGCAGCCAATCACGGCAGCCAATCACCCCCACCCCCGCCAATCAC' )) +] + +{ #category : 'testing' } +BioPhylipParserTest >> testTokenizeInterleavedProtein [ + + | phylipString | + phylipString := self phylipInterleavedProtein. + parseResult := BioParser tokenizePhylipInterleavedProtein: phylipString. + + self assert: parseResult size equals: 4. + self assert: parseResult first equals: 5. + self assert: parseResult second equals: 176. + self assert: (parseResult third bioHasEqualElements: + #( 'cox2_leita' 'cox2_crifa' 'cox2_bsalt' 'cox2_trybb' + 'cox2_tborr' )). 
+ self assert: (parseResult fourth bioHasEqualElements: + #( 'MAFILSFWMIFLLDSVIVLLSFVCFVCVWICALLFSTVLLVSKLNNIYCTWDFTASKFIDVYWFTIGGMFSLGLLLRLCLLLYFGHLNFVSFDLCKVVGFQWYWVYFIFGETTIFSNLILESDYMIGDLRLLQCNHVLTLLSLVIYKLWLSAVDVIHSFAISSLGVKVENLVAVMK' + 'MAFILSFWMIFLIDAVIVLLSFVCFVCIWICSLFFSSFLLVSKINNVYCTWDFTASKFIDAYWFTIGGMFVLCLLLRLCLLLYFGCLNFVSFDLCKVVGFQWYWVYFIFGETTIFSNLILESDYLIGDLRLLQCNHVLTLLSLVIYKLWLSAVDVIHSFAVSSLGIKVDCIPGRCN' + 'MSFIISFWMLFLIDSLIVLLSGAIFVCIWICSLFFLCILFICKLDYIFCSWDFISAKFIDLYWFTLGCLFIVCLLIRLCLLLYFSCLNFVCFDLCKCIGFQWYWVYFIFGETTIFSNLILESDYLIGDLRLLQCNHVLTLLSLVIYKVWLSAIDVIHSFTLANLGIKVD??PGRCN' + 'MSFILTFWMIFLMDSIIVLISFSIFLSVWICALIIATVLTVTKINNIYCTWDFISSKFIDTYWFVLGMMFILCLLLRLCLLLYFSCINFVSFDLCKVIGFQWYWVYFLFGETTIFSNLILESDYLIGDLRILQCNHVLTLLSLVIYKLWVSAVDVIHSFTISSLGIKVENPGRCNE' + 'MLFFINQLLLLLVDTFVILEIFSLFVCVFIIVMYILFINYNIFLKNINVYLDFIGSKYLDLYWFLIGIFFVIVLLIRLCLLLYYSWISLLIFDLCKIMGFQWYWIFFVFKENVIFSNLLIESDYWIGDLRLLQCNNTFNLICLVVYKIWVTSIDVIHSFTISTLGIKIDCIPGRCN' )) +] diff --git a/repository/BioParsers-Tests/BioProteinParserTest.class.st b/repository/BioParsers-Tests/BioProteinParserTest.class.st new file mode 100644 index 00000000..08037634 --- /dev/null +++ b/repository/BioParsers-Tests/BioProteinParserTest.class.st @@ -0,0 +1,81 @@ +Class { + #name : 'BioProteinParserTest', + #superclass : 'BioAbstractParserTest', + #category : 'BioParsers-Tests', + #package : 'BioParsers-Tests' +} + +{ #category : 'accessing' } +BioProteinParserTest >> setUp [ + + super setUp. + parser := #proteinLetterGapped asPParser. +] + +{ #category : 'testing' } +BioProteinParserTest >> testProteinLetterMatches [ + + parser := #proteinLetterGapped asPParser. + + 'ACDEFGHIKLMNPQRSTVWYBXZJUO' do: [: letter | + self assert: (parser matches: (String with: letter))]. + 'acdefghiklmnpqrstvwybxzjuo' do: [: letter | + self assert: (parser matches: (String with: letter))]. + + self deny: (parser matches: ''). + self deny: (parser matches: '.'). + self assert: (parser matches: '?'). 
+ self assert: (parser matches: '-'). + + self should: [parser matches: $a] raise: MessageNotUnderstood. + self should: [parser matches: nil] raise: MessageNotUnderstood. +] + +{ #category : 'testing' } +BioProteinParserTest >> testProteinParseEmpty [ + + self deny: (parser matches: String empty). + +] + +{ #category : 'testing' } +BioProteinParserTest >> testProteinParseLowercaseSequence [ + + self assert: (parser matches: 'MNPQRSTVW' asLowercase). + +] + +{ #category : 'testing' } +BioProteinParserTest >> testProteinParseLowercaseSingleAminoacid [ + + self assert: (parser matches: 'p'). + +] + +{ #category : 'testing' } +BioProteinParserTest >> testProteinParseNumber [ + + self deny: (parser matches: '8743'). +] + +{ #category : 'testing' } +BioProteinParserTest >> testProteinParseSingleMismatch [ + + self assert: (parser matches: '-'). + self assert: (parser matches: '?'). + +] + +{ #category : 'testing' } +BioProteinParserTest >> testProteinParseUppercaseSequence [ + + self assert: (parser matches: 'MNPQRSTVW'). + +] + +{ #category : 'testing' } +BioProteinParserTest >> testProteinParseUppercaseSingleAminoacid [ + + self assert: (parser matches: 'P'). + +] diff --git a/repository/BioParsers-Tests/BioSwissProtParserTest.class.st b/repository/BioParsers-Tests/BioSwissProtParserTest.class.st new file mode 100644 index 00000000..49f5ae5a --- /dev/null +++ b/repository/BioParsers-Tests/BioSwissProtParserTest.class.st @@ -0,0 +1,38 @@ +Class { + #name : 'BioSwissProtParserTest', + #superclass : 'BioAbstractParserTest', + #category : 'BioParsers-Tests', + #package : 'BioParsers-Tests' +} + +{ #category : 'accessing' } +BioSwissProtParserTest >> setUp [ + + super setUp. + parser := BioSwissProtParser new. 
+] + +{ #category : 'testing' } +BioSwissProtParserTest >> testSwissProtTokenize01 [ + + self + assert: (BioParser tokenizeAccession: 'sp|P80487|HHP_THICU') + equals: #( 'P80487' ) +] + +{ #category : 'testing' } +BioSwissProtParserTest >> testSwissProtTokenize02 [ + + self + assert: (BioParser tokenizeAccession: 'sp|P80487.1|HHP_THICU') + equals: #( 'P80487' '1' ) +] + +{ #category : 'testing' } +BioSwissProtParserTest >> testSwissProtTokenize03 [ + + self + assert: + (BioParser tokenizeSwissProtEntryName: 'sp|Q9UWG2|RL3_METVA') + equals: 'RL3_METVA' +] diff --git a/repository/BioParsers-Tests/package.st b/repository/BioParsers-Tests/package.st new file mode 100644 index 00000000..db56e2fa --- /dev/null +++ b/repository/BioParsers-Tests/package.st @@ -0,0 +1 @@ +Package { #name : 'BioParsers-Tests' } diff --git a/repository/BioParsers/BioBlastContainerNode.class.st b/repository/BioParsers/BioBlastContainerNode.class.st index 7b30c887..4db3247a 100644 --- a/repository/BioParsers/BioBlastContainerNode.class.st +++ b/repository/BioParsers/BioBlastContainerNode.class.st @@ -4,9 +4,9 @@ Class { #instVars : [ 'nodes' ], - #category : 'BioParsers-Support', + #category : 'BioParsers-BLAST', #package : 'BioParsers', - #tag : 'Support' + #tag : 'BLAST' } { #category : 'comparing' } diff --git a/repository/BioParsers/BioBlastHitNode.class.st b/repository/BioParsers/BioBlastHitNode.class.st index ab950f79..8c25b653 100644 --- a/repository/BioParsers/BioBlastHitNode.class.st +++ b/repository/BioParsers/BioBlastHitNode.class.st @@ -1,9 +1,9 @@ Class { #name : 'BioBlastHitNode', #superclass : 'BioBlastValueNode', - #category : 'BioParsers-Support', + #category : 'BioParsers-BLAST', #package : 'BioParsers', - #tag : 'Support' + #tag : 'BLAST' } { #category : 'accessing' } diff --git a/repository/BioParsers/BioBlastHspNode.class.st b/repository/BioParsers/BioBlastHspNode.class.st index c3227770..9d1b0323 100644 --- a/repository/BioParsers/BioBlastHspNode.class.st +++ 
b/repository/BioParsers/BioBlastHspNode.class.st @@ -1,9 +1,9 @@ Class { #name : 'BioBlastHspNode', #superclass : 'BioBlastValueNode', - #category : 'BioParsers-Support', + #category : 'BioParsers-BLAST', #package : 'BioParsers', - #tag : 'Support' + #tag : 'BLAST' } { #category : 'accessing' } diff --git a/repository/BioParsers/BioBlastNode.class.st b/repository/BioParsers/BioBlastNode.class.st index 9552a827..cae8deb5 100644 --- a/repository/BioParsers/BioBlastNode.class.st +++ b/repository/BioParsers/BioBlastNode.class.st @@ -12,9 +12,9 @@ Class { #instVars : [ 'nodeName' ], - #category : 'BioParsers-Support', + #category : 'BioParsers-BLAST', #package : 'BioParsers', - #tag : 'Support' + #tag : 'BLAST' } { #category : 'accessing' } diff --git a/repository/BioParsers/BioBlastParentNode.class.st b/repository/BioParsers/BioBlastParentNode.class.st index 95461e13..5ac5d51a 100644 --- a/repository/BioParsers/BioBlastParentNode.class.st +++ b/repository/BioParsers/BioBlastParentNode.class.st @@ -1,9 +1,9 @@ Class { #name : 'BioBlastParentNode', #superclass : 'BioBlastContainerNode', - #category : 'BioParsers-Support', + #category : 'BioParsers-BLAST', #package : 'BioParsers', - #tag : 'Support' + #tag : 'BLAST' } { #category : 'instance creation' } diff --git a/repository/BioParsers/BioBlastRootNode.class.st b/repository/BioParsers/BioBlastRootNode.class.st index 753c43ac..f6b50799 100644 --- a/repository/BioParsers/BioBlastRootNode.class.st +++ b/repository/BioParsers/BioBlastRootNode.class.st @@ -1,9 +1,9 @@ Class { #name : 'BioBlastRootNode', #superclass : 'BioBlastParentNode', - #category : 'BioParsers-Support', + #category : 'BioParsers-BLAST', #package : 'BioParsers', - #tag : 'Support' + #tag : 'BLAST' } { #category : 'accessing' } diff --git a/repository/BioParsers/BioBlastStructureNode.class.st b/repository/BioParsers/BioBlastStructureNode.class.st index 07089a34..51e06b5c 100644 --- a/repository/BioParsers/BioBlastStructureNode.class.st +++ 
b/repository/BioParsers/BioBlastStructureNode.class.st @@ -4,9 +4,9 @@ The structure exists only to specify those nodes which are present in the XML an Class { #name : 'BioBlastStructureNode', #superclass : 'BioBlastContainerNode', - #category : 'BioParsers-Support', + #category : 'BioParsers-BLAST', #package : 'BioParsers', - #tag : 'Support' + #tag : 'BLAST' } { #category : 'accessing' } diff --git a/repository/BioParsers/BioBlastValueNode.class.st b/repository/BioParsers/BioBlastValueNode.class.st index 974bd135..bcc961ad 100644 --- a/repository/BioParsers/BioBlastValueNode.class.st +++ b/repository/BioParsers/BioBlastValueNode.class.st @@ -7,9 +7,9 @@ Class { #instVars : [ 'value' ], - #category : 'BioParsers-Support', + #category : 'BioParsers-BLAST', #package : 'BioParsers', - #tag : 'Support' + #tag : 'BLAST' } { #category : 'accessing' } diff --git a/repository/BioParsers/BioEResultKeysParser.class.st b/repository/BioParsers/BioEResultKeysParser.class.st index 1981ba08..c692a391 100644 --- a/repository/BioParsers/BioEResultKeysParser.class.st +++ b/repository/BioParsers/BioEResultKeysParser.class.st @@ -5,9 +5,9 @@ See http://www.ncbi.nlm.nih.gov/books/NBK25500/ for details (15/11/2011) Class { #name : 'BioEResultKeysParser', #superclass : 'BioEntrezResultParser', - #category : 'BioParsers-Core', + #category : 'BioParsers-Entrez', #package : 'BioParsers', - #tag : 'Core' + #tag : 'Entrez' } { #category : 'content' } diff --git a/repository/BioParsers/BioEntrezResultParser.class.st b/repository/BioParsers/BioEntrezResultParser.class.st index 79ee9845..acf50fa9 100644 --- a/repository/BioParsers/BioEntrezResultParser.class.st +++ b/repository/BioParsers/BioEntrezResultParser.class.st @@ -7,9 +7,9 @@ Instance Variables: Class { #name : 'BioEntrezResultParser', #superclass : 'BioSAXParser', - #category : 'BioParsers-Core', + #category : 'BioParsers-Entrez', #package : 'BioParsers', - #tag : 'Core' + #tag : 'Entrez' } { #category : 'content' } diff --git 
a/repository/BioParsers/BioEntrezXMLGBBasicParser.class.st b/repository/BioParsers/BioEntrezXMLGBBasicParser.class.st index cd6b0ed2..858a5978 100644 --- a/repository/BioParsers/BioEntrezXMLGBBasicParser.class.st +++ b/repository/BioParsers/BioEntrezXMLGBBasicParser.class.st @@ -4,9 +4,9 @@ This is a basic parser which can parse accession, definition and sequence nodes Class { #name : 'BioEntrezXMLGBBasicParser', #superclass : 'BioEntrezXMLGenBankSeqParser', - #category : 'BioParsers-Core', + #category : 'BioParsers-Entrez', #package : 'BioParsers', - #tag : 'Core' + #tag : 'Entrez' } { #category : 'accessing' } diff --git a/repository/BioParsers/BioEntrezXMLGBFullParser.class.st b/repository/BioParsers/BioEntrezXMLGBFullParser.class.st index 3b7a7b45..349a4516 100644 --- a/repository/BioParsers/BioEntrezXMLGBFullParser.class.st +++ b/repository/BioParsers/BioEntrezXMLGBFullParser.class.st @@ -1,9 +1,9 @@ Class { #name : 'BioEntrezXMLGBFullParser', #superclass : 'BioEntrezXMLGenBankSeqParser', - #category : 'BioParsers-Core', + #category : 'BioParsers-Entrez', #package : 'BioParsers', - #tag : 'Core' + #tag : 'Entrez' } { #category : 'accessing' } diff --git a/repository/BioParsers/BioEntrezXMLGBSeqFeatureQualParser.class.st b/repository/BioParsers/BioEntrezXMLGBSeqFeatureQualParser.class.st index 4a123255..45bcca48 100644 --- a/repository/BioParsers/BioEntrezXMLGBSeqFeatureQualParser.class.st +++ b/repository/BioParsers/BioEntrezXMLGBSeqFeatureQualParser.class.st @@ -38,9 +38,9 @@ fileRef := BioObject testFilesFullDirectoryName / 'GenBankTestFiles' / 'TestGBSe Class { #name : 'BioEntrezXMLGBSeqFeatureQualParser', #superclass : 'BioEntrezXMLGBSeqParser', - #category : 'BioParsers-XML', + #category : 'BioParsers-Entrez', #package : 'BioParsers', - #tag : 'XML' + #tag : 'Entrez' } { #category : 'accessing' } diff --git a/repository/BioParsers/BioEntrezXMLGBSeqFullParser.class.st b/repository/BioParsers/BioEntrezXMLGBSeqFullParser.class.st index cce00f7f..ca92e32d 
100644 --- a/repository/BioParsers/BioEntrezXMLGBSeqFullParser.class.st +++ b/repository/BioParsers/BioEntrezXMLGBSeqFullParser.class.st @@ -1,9 +1,9 @@ Class { #name : 'BioEntrezXMLGBSeqFullParser', #superclass : 'BioEntrezXMLGBSeqParser', - #category : 'BioParsers-XML', + #category : 'BioParsers-Entrez', #package : 'BioParsers', - #tag : 'XML' + #tag : 'Entrez' } { #category : 'accessing' } diff --git a/repository/BioParsers/BioEntrezXMLGBSeqJournalParser.class.st b/repository/BioParsers/BioEntrezXMLGBSeqJournalParser.class.st index d2b91438..9f7fd813 100644 --- a/repository/BioParsers/BioEntrezXMLGBSeqJournalParser.class.st +++ b/repository/BioParsers/BioEntrezXMLGBSeqJournalParser.class.st @@ -8,9 +8,9 @@ fileRef := BioObject testFilesFullDirectoryName / 'GenBankTestFiles' / 'TestGBSe Class { #name : 'BioEntrezXMLGBSeqJournalParser', #superclass : 'BioEntrezXMLGBSeqParser', - #category : 'BioParsers-XML', + #category : 'BioParsers-Entrez', #package : 'BioParsers', - #tag : 'XML' + #tag : 'Entrez' } { #category : 'accessing' } diff --git a/repository/BioParsers/BioEntrezXMLGBSeqParser.class.st b/repository/BioParsers/BioEntrezXMLGBSeqParser.class.st index 7049e35a..6c2483dc 100644 --- a/repository/BioParsers/BioEntrezXMLGBSeqParser.class.st +++ b/repository/BioParsers/BioEntrezXMLGBSeqParser.class.st @@ -25,9 +25,9 @@ Class { 'matches', 'records' ], - #category : 'BioParsers-XML', + #category : 'BioParsers-Entrez', #package : 'BioParsers', - #tag : 'XML' + #tag : 'Entrez' } { #category : 'accessing' } diff --git a/repository/BioParsers/BioEntrezXMLGenBankSeqParser.class.st b/repository/BioParsers/BioEntrezXMLGenBankSeqParser.class.st index 3d67ed70..a17b7862 100644 --- a/repository/BioParsers/BioEntrezXMLGenBankSeqParser.class.st +++ b/repository/BioParsers/BioEntrezXMLGenBankSeqParser.class.st @@ -7,9 +7,9 @@ Class { #instVars : [ 'eRecord' ], - #category : 'BioParsers-Core', + #category : 'BioParsers-Entrez', #package : 'BioParsers', - #tag : 'Core' + #tag : 
'Entrez' } { #category : 'accessing' } diff --git a/repository/BioParsers/BioEntrezXMLGenSetParser.class.st b/repository/BioParsers/BioEntrezXMLGenSetParser.class.st index fae3eb23..6f8ad224 100644 --- a/repository/BioParsers/BioEntrezXMLGenSetParser.class.st +++ b/repository/BioParsers/BioEntrezXMLGenSetParser.class.st @@ -4,9 +4,9 @@ Class { #instVars : [ 'eRecord' ], - #category : 'BioParsers-Core', + #category : 'BioParsers-Entrez', #package : 'BioParsers', - #tag : 'Core' + #tag : 'Entrez' } { #category : 'accessing' } diff --git a/repository/BioParsers/BioFASTABasicParser.class.st b/repository/BioParsers/BioFASTABasicParser.class.st index 8f6136dc..449fbff8 100644 --- a/repository/BioParsers/BioFASTABasicParser.class.st +++ b/repository/BioParsers/BioFASTABasicParser.class.st @@ -4,9 +4,9 @@ This class is not intended to be used directly Class { #name : 'BioFASTABasicParser', #superclass : 'BioIDParser', - #category : 'BioParsers-Core', + #category : 'BioParsers-FASTA', #package : 'BioParsers', - #tag : 'Core' + #tag : 'FASTA' } { #category : 'accessing private' } diff --git a/repository/BioParsers/BioFASTAMultiParser.class.st b/repository/BioParsers/BioFASTAMultiParser.class.st index fbb37e1d..e0ef1217 100644 --- a/repository/BioParsers/BioFASTAMultiParser.class.st +++ b/repository/BioParsers/BioFASTAMultiParser.class.st @@ -4,9 +4,9 @@ Parser for a FASTA file with several sequences. This class is not intended to be Class { #name : 'BioFASTAMultiParser', #superclass : 'BioFASTABasicParser', - #category : 'BioParsers-Core', + #category : 'BioParsers-FASTA', #package : 'BioParsers', - #tag : 'Core' + #tag : 'FASTA' } { #category : 'accessing private' } diff --git a/repository/BioParsers/BioFASTAParser.class.st b/repository/BioParsers/BioFASTAParser.class.st index 99760cbc..4ef7aa66 100644 --- a/repository/BioParsers/BioFASTAParser.class.st +++ b/repository/BioParsers/BioFASTAParser.class.st @@ -4,9 +4,9 @@ Parser for several FASTA file format elements. 
This class is not intended to be Class { #name : 'BioFASTAParser', #superclass : 'BioAbstractTextParser', - #category : 'BioParsers-Core', + #category : 'BioParsers-FASTA', #package : 'BioParsers', - #tag : 'Core' + #tag : 'FASTA' } { #category : 'accessing-parsers' } diff --git a/repository/BioParsers/BioGFF3CommentRecordNode.class.st b/repository/BioParsers/BioGFF3CommentRecordNode.class.st new file mode 100644 index 00000000..13326c7d --- /dev/null +++ b/repository/BioParsers/BioGFF3CommentRecordNode.class.st @@ -0,0 +1,34 @@ +Class { + #name : 'BioGFF3CommentRecordNode', + #superclass : 'BioGFF3GFF3FileNode', + #instVars : [ + 'text' + ], + #category : 'BioParsers-GFF3', + #package : 'BioParsers', + #tag : 'GFF3' +} + +{ #category : 'generated' } +BioGFF3CommentRecordNode >> acceptVisitor: aGFF3FileVisitor [ + + ^ aGFF3FileVisitor visitCommentRecord: self +] + +{ #category : 'generated' } +BioGFF3CommentRecordNode >> text [ + + ^ text +] + +{ #category : 'generated' } +BioGFF3CommentRecordNode >> text: aSmaCCToken [ + + text := aSmaCCToken +] + +{ #category : 'generated' } +BioGFF3CommentRecordNode >> tokenVariables [ + + ^ #( #text ) +] diff --git a/repository/BioParsers/BioGFF3DirectiveListNode.class.st b/repository/BioParsers/BioGFF3DirectiveListNode.class.st new file mode 100644 index 00000000..b705d52e --- /dev/null +++ b/repository/BioParsers/BioGFF3DirectiveListNode.class.st @@ -0,0 +1,31 @@ +Class { + #name : 'BioGFF3DirectiveListNode', + #superclass : 'BioGFF3GFF3FileNode', + #category : 'BioParsers-GFF3', + #package : 'BioParsers', + #tag : 'GFF3' +} + +{ #category : 'generated' } +BioGFF3DirectiveListNode >> acceptVisitor: aGFF3FileVisitor [ + + ^ aGFF3FileVisitor visitDirectiveList: self +] + +{ #category : 'generated' } +BioGFF3DirectiveListNode >> compositeNodeVariables [ + + ^ #( #directives ) +] + +{ #category : 'generated' } +BioGFF3DirectiveListNode >> directives [ + + ^ directives +] + +{ #category : 'generated-initialize-release' } 
+BioGFF3DirectiveListNode >> initialize [ + super initialize. + directives := OrderedCollection new: 2. +] diff --git a/repository/BioParsers/BioGFF3DirectiveNode.class.st b/repository/BioParsers/BioGFF3DirectiveNode.class.st new file mode 100644 index 00000000..aab2fce7 --- /dev/null +++ b/repository/BioParsers/BioGFF3DirectiveNode.class.st @@ -0,0 +1,34 @@ +Class { + #name : 'BioGFF3DirectiveNode', + #superclass : 'BioGFF3GFF3FileNode', + #instVars : [ + 'text' + ], + #category : 'BioParsers-GFF3', + #package : 'BioParsers', + #tag : 'GFF3' +} + +{ #category : 'generated' } +BioGFF3DirectiveNode >> acceptVisitor: aGFF3FileVisitor [ + + ^ aGFF3FileVisitor visitDirective: self +] + +{ #category : 'generated' } +BioGFF3DirectiveNode >> text [ + + ^ text +] + +{ #category : 'generated' } +BioGFF3DirectiveNode >> text: aSmaCCToken [ + + text := aSmaCCToken +] + +{ #category : 'generated' } +BioGFF3DirectiveNode >> tokenVariables [ + + ^ #( #text ) +] diff --git a/repository/BioParsers/BioGFF3Feature.class.st b/repository/BioParsers/BioGFF3Feature.class.st new file mode 100644 index 00000000..f36b6f6f --- /dev/null +++ b/repository/BioParsers/BioGFF3Feature.class.st @@ -0,0 +1,250 @@ +Class { + #name : 'BioGFF3Feature', + #superclass : 'Object', + #instVars : [ + 'seqid', + 'source', + 'type', + 'start', + 'end', + 'score', + 'strand', + 'phase', + 'attributes', + 'attributesDict' + ], + #category : 'BioParsers-GFF3', + #package : 'BioParsers', + #tag : 'GFF3' +} + +{ #category : 'as yet unclassified' } +BioGFF3Feature class >> fromFeatureLine: aNode [ + + ^ self new + seqid: aNode seqid value; + source: aNode _source value; + type: aNode type value; + start: aNode start value; + end: aNode end value; + score: aNode score value; + strand: aNode strand value; + phase: aNode phase value; + attributes: aNode _attributes value; + yourself +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> asBioSequenceFeature [ + + | sf | + sf := BioSequenceFeature new. 
+ sf primaryTag: self type. + sf sourceTag: self source. + sf start: self startInteger. + sf end: self endInteger. + sf strand: self strand. + sf score: self scoreFloat. + sf chromosome: self seqid. + sf frame: self phase. + self id ifNotNil: [ :i | sf addTag: 'ID' -> i ]. + self name ifNotNil: [ :n | sf addTag: 'Name' -> n ]. + ^ sf +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> attributeAt: key [ ^ self attributesDict at: key ifAbsent: [ nil ] +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> attributes [ ^ attributes +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> attributes: aString [ attributes := aString. attributesDict := nil +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> attributesDict [ + + ^ attributesDict ifNil: [ attributesDict := self parseAttributes ] +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> derivesFrom [ + + ^ self attributesDict at: 'Derives_from' ifAbsent: [ nil ] +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> end [ ^ end +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> end: aString [ end := aString +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> endInteger [ + + ^ end ifNotNil: [ end asInteger ] +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> hasAttribute: key [ ^ self attributesDict includesKey: key +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> id [ + + ^ self attributesDict at: 'ID' ifAbsent: [ nil ] +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> isForwardStrand [ ^ strand = (String with: $+) +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> isMiRNAPrimaryTranscript [ + + ^ self type = 'miRNA_primary_transcript' +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> isMiRNAType [ + + ^ self type = 'miRNA' +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> isOfCDSType [ + + ^ self type = 'CDS' +] + +{ #category : 'as yet unclassified' } 
+BioGFF3Feature >> isOfExonType [ + + ^ self type = 'exon' +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> isOfGeneType [ + + ^ self type = 'gene' +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> isOfMRNAType [ + + ^ self type = 'mRNA' +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> isReverseStrand [ ^ strand = (String with: $-) +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> isUnstranded [ ^ strand = (String with: $.) +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> name [ + + ^ self attributesDict at: 'Name' ifAbsent: [ nil ] +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> parentIds [ + "Answer an Array with the IDs found in the Parent attribute, or an empty Array when there is none. GFF3 allows several comma-separated parents, so the value is split on commas and each ID is trimmed." + | p | + p := self attributesDict at: 'Parent' ifAbsent: [ ^ #( ) ]. + p isString ifTrue: [ ^ ((p substrings: ',') collect: [ :each | each trimBoth ]) asArray ]. + ^ p asArray +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> parseAttributes [ + "Answer a Dictionary built from the semicolon-separated key=value pairs of the attributes column. Pairs without $= are ignored; a nil attributes column answers an empty Dictionary." + | dict pairs | + dict := Dictionary new. + pairs := (attributes ifNil: [ '' ]) splitOn: ';'. + pairs do: [ :pair | + | eqIdx k v | + eqIdx := pair indexOf: $=. + eqIdx > 0 ifTrue: [ + k := (pair copyFrom: 1 to: eqIdx - 1) trimBoth. + v := (pair copyFrom: eqIdx + 1 to: pair size) trimBoth. + dict at: k put: v ] ]. + ^ dict +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> phase [ ^ phase +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> phase: aString [ phase := aString +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> score [ ^ score +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> score: aString [ score := aString +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> scoreFloat [ + + ^ score = (String with: $.) 
+ ifTrue: [ nil ] + ifFalse: [ score asFloat ] +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> seqid [ ^ seqid +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> seqid: aString [ seqid := aString +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> source [ ^ source +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> source: aString [ source := aString +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> start [ ^ start +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> start: aString [ start := aString +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> startInteger [ + + ^ start ifNotNil: [ start asInteger ] +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> strand [ ^ strand +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> strand: aString [ strand := aString +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> type [ ^ type +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> type: aString [ type := aString +] diff --git a/repository/BioParsers/BioGFF3FeatureLineNode.class.st b/repository/BioParsers/BioGFF3FeatureLineNode.class.st new file mode 100644 index 00000000..3642a6d8 --- /dev/null +++ b/repository/BioParsers/BioGFF3FeatureLineNode.class.st @@ -0,0 +1,138 @@ +Class { + #name : 'BioGFF3FeatureLineNode', + #superclass : 'BioGFF3GFF3FileNode', + #instVars : [ + 'seqid', + '_source', + 'type', + 'start', + 'end', + 'score', + 'strand', + 'phase', + '_attributes' + ], + #category : 'BioParsers-GFF3', + #package : 'BioParsers', + #tag : 'GFF3' +} + +{ #category : 'generated' } +BioGFF3FeatureLineNode >> _attributes [ + + ^ _attributes +] + +{ #category : 'generated' } +BioGFF3FeatureLineNode >> _attributes: aSmaCCToken [ + + _attributes := aSmaCCToken +] + +{ #category : 'generated' } +BioGFF3FeatureLineNode >> _source [ + + ^ _source +] + +{ #category : 'generated' } +BioGFF3FeatureLineNode >> _source: aSmaCCToken [ + + _source := 
aSmaCCToken +] + +{ #category : 'generated' } +BioGFF3FeatureLineNode >> acceptVisitor: aGFF3FileVisitor [ + + ^ aGFF3FileVisitor visitFeatureLine: self +] + +{ #category : 'generated' } +BioGFF3FeatureLineNode >> end [ + + ^ end +] + +{ #category : 'generated' } +BioGFF3FeatureLineNode >> end: aSmaCCToken [ + + end := aSmaCCToken +] + +{ #category : 'generated' } +BioGFF3FeatureLineNode >> phase [ + + ^ phase +] + +{ #category : 'generated' } +BioGFF3FeatureLineNode >> phase: aSmaCCToken [ + + phase := aSmaCCToken +] + +{ #category : 'generated' } +BioGFF3FeatureLineNode >> score [ + + ^ score +] + +{ #category : 'generated' } +BioGFF3FeatureLineNode >> score: aSmaCCToken [ + + score := aSmaCCToken +] + +{ #category : 'generated' } +BioGFF3FeatureLineNode >> seqid [ + + ^ seqid +] + +{ #category : 'generated' } +BioGFF3FeatureLineNode >> seqid: aSmaCCToken [ + + seqid := aSmaCCToken +] + +{ #category : 'generated' } +BioGFF3FeatureLineNode >> start [ + + ^ start +] + +{ #category : 'generated' } +BioGFF3FeatureLineNode >> start: aSmaCCToken [ + + start := aSmaCCToken +] + +{ #category : 'generated' } +BioGFF3FeatureLineNode >> strand [ + + ^ strand +] + +{ #category : 'generated' } +BioGFF3FeatureLineNode >> strand: aSmaCCToken [ + + strand := aSmaCCToken +] + +{ #category : 'generated' } +BioGFF3FeatureLineNode >> tokenVariables [ + + ^ #( #seqid #_source #type #start #end #score #strand #phase #_attributes ) +] + +{ #category : 'generated' } +BioGFF3FeatureLineNode >> type [ + + ^ type +] + +{ #category : 'generated' } +BioGFF3FeatureLineNode >> type: aSmaCCToken [ + + type := aSmaCCToken +] diff --git a/repository/BioParsers/BioGFF3FeatureListNode.class.st b/repository/BioParsers/BioGFF3FeatureListNode.class.st new file mode 100644 index 00000000..3ef3deee --- /dev/null +++ b/repository/BioParsers/BioGFF3FeatureListNode.class.st @@ -0,0 +1,58 @@ +Class { + #name : 'BioGFF3FeatureListNode', + #superclass : 'BioGFF3GFF3FileNode', + #instVars : [ + 'lines', + 
'_comments' + ], + #category : 'BioParsers-GFF3', + #package : 'BioParsers', + #tag : 'GFF3' +} + +{ #category : 'generated' } +BioGFF3FeatureListNode >> _comments [ + + ^ _comments +] + +{ #category : 'generated' } +BioGFF3FeatureListNode >> _comments: anOrderedCollection [ + + self setParents: self _comments to: nil. + _comments := anOrderedCollection. + self setParents: self _comments to: self +] + +{ #category : 'generated' } +BioGFF3FeatureListNode >> acceptVisitor: aGFF3FileVisitor [ + + ^ aGFF3FileVisitor visitFeatureList: self +] + +{ #category : 'generated' } +BioGFF3FeatureListNode >> compositeNodeVariables [ + + ^ #( #lines #_comments ) +] + +{ #category : 'generated-initialize-release' } +BioGFF3FeatureListNode >> initialize [ + super initialize. + lines := OrderedCollection new: 2. + _comments := OrderedCollection new: 2. +] + +{ #category : 'generated' } +BioGFF3FeatureListNode >> lines [ + + ^ lines +] + +{ #category : 'generated' } +BioGFF3FeatureListNode >> lines: anOrderedCollection [ + + self setParents: self lines to: nil. + lines := anOrderedCollection. + self setParents: self lines to: self +] diff --git a/repository/BioParsers/BioGFF3File.class.st b/repository/BioParsers/BioGFF3File.class.st new file mode 100644 index 00000000..12516c23 --- /dev/null +++ b/repository/BioParsers/BioGFF3File.class.st @@ -0,0 +1,272 @@ +Class { + #name : 'BioGFF3File', + #superclass : 'Object', + #instVars : [ + 'directives', + 'features', + 'sourceDirective', + 'gffVersion' + ], + #category : 'BioParsers-GFF3', + #package : 'BioParsers', + #tag : 'GFF3' +} + +{ #category : 'as yet unclassified' } +BioGFF3File class >> fromFile: aFilename [ + + ^ self fromString: aFilename asFileReference contents +] + +{ #category : 'as yet unclassified' } +BioGFF3File class >> fromString: aString [ + + | parser node input | + input := (aString endsWith: String lf) + ifTrue: [ aString ] + ifFalse: [ aString , String lf ]. + parser := BioGFF3Parser on: (ReadStream on: input). 
+ node := parser parse. + ^ self new fromGFF3FileNode: node +] + +{ #category : 'properties' } +BioGFF3File >> childrenOf: aFeature [ + + | parentId | + parentId := aFeature id ifNil: [ ^ #( ) ]. + ^ self features select: [ :f | f parentIds includes: parentId ] +] + +{ #category : 'querying' } +BioGFF3File >> derivesFromOf: aFeature [ + + | anId | + anId := aFeature id ifNil: [ ^ #( ) ]. + ^ self features select: [ :f | f derivesFrom = anId ] +] + +{ #category : 'accessing' } +BioGFF3File >> directives [ + + ^ directives + ifNil: [ directives := OrderedCollection new ] +] + +{ #category : 'accessing' } +BioGFF3File >> directives: aCollection [ + + directives := aCollection +] + +{ #category : 'querying' } +BioGFF3File >> featureCount [ + + ^ self features size +] + +{ #category : 'querying' } +BioGFF3File >> featureTypes [ + + ^ (self features collect: #type) asSet asSortedCollection asArray +] + +{ #category : 'querying' } +BioGFF3File >> featureWithId: anId [ + + ^ self features detect: [ :f | f id = anId ] ifNone: [ nil ] +] + +{ #category : 'accessing' } +BioGFF3File >> features [ + + ^ features + ifNil: [ features := OrderedCollection new ] +] + +{ #category : 'accessing' } +BioGFF3File >> features: aCollection [ + + features := aCollection +] + +{ #category : 'querying' } +BioGFF3File >> featuresWithSeqid: aSeqid [ + + ^ self features select: [ :f | f seqid = aSeqid ] +] + +{ #category : 'querying' } +BioGFF3File >> featuresWithType: aType [ + + ^ self features select: [ :f | f type = aType ] +] + +{ #category : 'instance creation' } +BioGFF3File >> fromFile: aFilename filteringSeqid: aSeqid [ + "Parse only features for a specific seqid/chromosome. Memory-efficient." + + | stream line tabSeqid | + features := OrderedCollection new. + tabSeqid := aSeqid , (String with: Character tab). + stream := aFilename asFileReference readStream. + [ stream atEnd ] whileFalse: [ + line := stream nextLine. 
+ line ifNotEmpty: [ + line first = $# ifFalse: [ + (line beginsWith: tabSeqid) ifTrue: [ + (self parseFeatureLine: line) ifNotNil: [ :feature | features add: feature ] ] ] ] ]. + stream close. + self features: features. + ^ self +] + +{ #category : 'instance creation' } +BioGFF3File >> fromFile: aFilename filteringTypes: typeSet [ + "Parse only features whose type is in typeSet. Memory-efficient for large files. Lines that parseFeatureLine: rejects (it answers nil for fewer than 9 columns) are skipped instead of being stored as nil." + + | stream line | + features := OrderedCollection new. + stream := aFilename asFileReference readStream. + [ stream atEnd ] whileFalse: [ + line := stream nextLine. + line ifNotEmpty: [ + line first = $# ifFalse: [ + | fields type | + fields := line findTokens: String tab. + fields size >= 3 ifTrue: [ + type := fields at: 3. + (typeSet includes: type) ifTrue: [ + (self parseFeatureLine: line) ifNotNil: [ :feature | features add: feature ] ] ] ] ] ]. + stream close. + self features: features. + ^ self +] + +{ #category : 'instance creation' } +BioGFF3File >> fromFile: aFilename maxFeatures: maxCount [ + "Parse at most maxCount features. Useful for previews." + + | stream line count | + features := OrderedCollection new. + count := 0. + stream := aFilename asFileReference readStream. + [ stream atEnd or: [ count >= maxCount ] ] whileFalse: [ + line := stream nextLine. + line ifNotEmpty: [ + line first = $# ifFalse: [ + | feature | + feature := self parseFeatureLine: line. + feature ifNotNil: [ + features add: feature. + count := count + 1 ] ] ] ]. + stream close. + self features: features. + ^ self +] + +{ #category : 'instance creation' } +BioGFF3File >> fromGFF3FileNode: aNode [ + + | directiveTexts featuresList | + directiveTexts := aNode directives + ifNil: [ OrderedCollection new ] + ifNotNil: [ :dl | + dl directives + ifNil: [ OrderedCollection new ] + ifNotNil: [ :ds | + ds collect: [ :d | d text value ] ] ]. + self directives: directiveTexts. 
+ featuresList := aNode features + ifNil: [ OrderedCollection new ] + ifNotNil: [ :fl | + fl lines + ifNil: [ OrderedCollection new ] + ifNotNil: [ :lines | + lines collect: [ :line | + BioGFF3Feature fromFeatureLine: line ] ] ]. + self features: featuresList. + ^ self +] + +{ #category : 'accessing' } +BioGFF3File >> gffVersion [ + + gffVersion ifNil: [ + | directive | + directive := self directives + detect: [ :d | d beginsWith: '##gff-version' ] + ifNone: [ ^ nil ]. + gffVersion := (directive copyReplaceAll: '##gff-version' with: '') + trimBoth ]. + ^ gffVersion +] + +{ #category : 'querying' } +BioGFF3File >> groupByType [ + + | groups | + groups := Dictionary new. + self features do: [ :f | + (groups at: f type ifAbsentPut: [ OrderedCollection new ]) add: f ]. + ^ groups +] + +{ #category : 'parsing' } +BioGFF3File >> parseFeatureLine: aLine [ + + | fields | + fields := aLine findTokens: String tab. + fields size < 9 ifTrue: [ ^ nil ]. + ^ BioGFF3Feature new + seqid: fields first; + source: (fields at: 2); + type: (fields at: 3); + start: (fields at: 4); + end: (fields at: 5); + score: (fields at: 6); + strand: (fields at: 7); + phase: (fields at: 8); + attributes: (fields at: 9); + yourself +] + +{ #category : 'querying' } +BioGFF3File >> seqids [ + + ^ (self features collect: #seqid) asSet asSortedCollection asArray +] + +{ #category : 'accessing' } +BioGFF3File >> source [ + + ^ self directives + detect: [ :d | d beginsWith: '##source' ] + ifNone: [ nil ] +] + +{ #category : 'querying' } +BioGFF3File >> streamFeaturesFromFile: aFilename block: aBlock [ + "Evaluate aBlock for each feature parsed from file, without storing all in memory. + aBlock receives each BioGFF3Feature." + + | stream line count | + count := 0. + stream := aFilename asFileReference readStream. + [ stream atEnd ] whileFalse: [ + line := stream nextLine. 
+ line ifNotEmpty: [ + (line beginsWith: '##') + ifTrue: [ "skip directives" ] + ifFalse: [ + line first = $# + ifTrue: [ "skip comments" ] + ifFalse: [ + | feature | + feature := self parseFeatureLine: line. + feature ifNotNil: [ + aBlock value: feature. + count := count + 1 ] ] ] ] ]. + stream close. + ^ count +] diff --git a/repository/BioParsers/BioGFF3GFF3FileNode.class.st b/repository/BioParsers/BioGFF3GFF3FileNode.class.st new file mode 100644 index 00000000..2bcd8716 --- /dev/null +++ b/repository/BioParsers/BioGFF3GFF3FileNode.class.st @@ -0,0 +1,51 @@ +Class { + #name : 'BioGFF3GFF3FileNode', + #superclass : 'SmaCCParseNode', + #instVars : [ + 'directives', + 'features' + ], + #category : 'BioParsers-GFF3', + #package : 'BioParsers', + #tag : 'GFF3' +} + +{ #category : 'generated' } +BioGFF3GFF3FileNode >> acceptVisitor: aGFF3FileVisitor [ + + ^ aGFF3FileVisitor visitGFF3File: self +] + +{ #category : 'generated' } +BioGFF3GFF3FileNode >> directives [ + + ^ directives +] + +{ #category : 'generated' } +BioGFF3GFF3FileNode >> directives: aBioGFF3DirectiveListNode [ + + self directives notNil ifTrue: [ self directives parent: nil ]. + directives := aBioGFF3DirectiveListNode. + self directives notNil ifTrue: [ self directives parent: self ] +] + +{ #category : 'generated' } +BioGFF3GFF3FileNode >> features [ + + ^ features +] + +{ #category : 'generated' } +BioGFF3GFF3FileNode >> features: aBioGFF3FeatureListNode [ + + self features notNil ifTrue: [ self features parent: nil ]. + features := aBioGFF3FeatureListNode. 
+ self features notNil ifTrue: [ self features parent: self ] +] + +{ #category : 'generated' } +BioGFF3GFF3FileNode >> nodeVariables [ + + ^ #( #directives #features ) +] diff --git a/repository/BioParsers/BioGFF3GFF3FileNodeVisitor.class.st b/repository/BioParsers/BioGFF3GFF3FileNodeVisitor.class.st new file mode 100644 index 00000000..2038d900 --- /dev/null +++ b/repository/BioParsers/BioGFF3GFF3FileNodeVisitor.class.st @@ -0,0 +1,9 @@ +Class { + #name : 'BioGFF3GFF3FileNodeVisitor', + #superclass : 'Object', + #traits : 'TBioGFF3GFF3FileNodeVisitor', + #classTraits : 'TBioGFF3GFF3FileNodeVisitor classTrait', + #category : 'BioParsers-GFF3', + #package : 'BioParsers', + #tag : 'GFF3' +} diff --git a/repository/BioParsers/BioNCBIBlastSAXParser.class.st b/repository/BioParsers/BioNCBIBlastSAXParser.class.st index d136a377..516c8397 100644 --- a/repository/BioParsers/BioNCBIBlastSAXParser.class.st +++ b/repository/BioParsers/BioNCBIBlastSAXParser.class.st @@ -1,9 +1,9 @@ Class { #name : 'BioNCBIBlastSAXParser', #superclass : 'BioSAXParser', - #category : 'BioParsers-Core', + #category : 'BioParsers-XML', #package : 'BioParsers', - #tag : 'Core' + #tag : 'XML' } { #category : 'handling - content' } diff --git a/repository/BioParsers/BioNCBIBlastSAXTokenizer.class.st b/repository/BioParsers/BioNCBIBlastSAXTokenizer.class.st index f38a0dfe..bc56b797 100644 --- a/repository/BioParsers/BioNCBIBlastSAXTokenizer.class.st +++ b/repository/BioParsers/BioNCBIBlastSAXTokenizer.class.st @@ -1,9 +1,9 @@ Class { #name : 'BioNCBIBlastSAXTokenizer', #superclass : 'BioNCBIBlastSAXParser', - #category : 'BioParsers-Core', + #category : 'BioParsers-XML', #package : 'BioParsers', - #tag : 'Core' + #tag : 'XML' } { #category : 'handling - content' } diff --git a/repository/BioParsers/BioNCBIXMLBlastParser.class.st b/repository/BioParsers/BioNCBIXMLBlastParser.class.st index b9d71104..624f435f 100644 --- a/repository/BioParsers/BioNCBIXMLBlastParser.class.st +++ 
b/repository/BioParsers/BioNCBIXMLBlastParser.class.st @@ -1,9 +1,9 @@ Class { #name : 'BioNCBIXMLBlastParser', #superclass : 'BioXMLParser', - #category : 'BioParsers-XML', + #category : 'BioParsers-BLAST', #package : 'BioParsers', - #tag : 'XML' + #tag : 'BLAST' } { #category : 'initialize-release' } diff --git a/repository/BioParsers/BioParser.class.st b/repository/BioParsers/BioParser.class.st index b39429f5..4feb7ad4 100644 --- a/repository/BioParsers/BioParser.class.st +++ b/repository/BioParsers/BioParser.class.st @@ -106,6 +106,20 @@ BioParser class >> parseFullNcbiXmlGBSeq: aString [ ^ BioEntrezXMLGBSeqFullParser parse: aString ] +{ #category : 'parse-gff3' } +BioParser class >> parseGff3: aGffString [ + " Parse aGffString, a String with GFF3 contents, and answer a BioGFF3File instance " + + ^ BioGFF3File fromString: aGffString +] + +{ #category : 'parse-gff3' } +BioParser class >> parseGff3File: aGff3FilePath [ + " Parse the GFF3 file at aGff3FilePath and answer a BioGFF3File instance. fromFile: is a class-side message of BioGFF3File " + + ^ BioGFF3File fromFile: aGff3FilePath +] + { #category : 'parse-fasta' } BioParser class >> parseMultiFasta: aFastaString [ " Parser aFastaString representing a MultiFASTA sequence. 
diff --git a/repository/BioParsers/BioSAXParser.class.st b/repository/BioParsers/BioSAXParser.class.st index e7ab468d..4472ea5c 100644 --- a/repository/BioParsers/BioSAXParser.class.st +++ b/repository/BioParsers/BioSAXParser.class.st @@ -18,9 +18,9 @@ Class { 'selectedNodes', 'current' ], - #category : 'BioParsers-Core', + #category : 'BioParsers-XML', #package : 'BioParsers', - #tag : 'Core' + #tag : 'XML' } { #category : 'instance creation' } diff --git a/repository/BioParsers/TBioGFF3GFF3FileNodeVisitor.trait.st b/repository/BioParsers/TBioGFF3GFF3FileNodeVisitor.trait.st new file mode 100644 index 00000000..db1041bc --- /dev/null +++ b/repository/BioParsers/TBioGFF3GFF3FileNodeVisitor.trait.st @@ -0,0 +1,44 @@ +Trait { + #name : 'TBioGFF3GFF3FileNodeVisitor', + #traits : 'TSmaCCParseNodeVisitor', + #classTraits : 'TSmaCCParseNodeVisitor classTrait', + #category : 'BioParsers-GFF3', + #package : 'BioParsers', + #tag : 'GFF3' +} + +{ #category : 'generated' } +TBioGFF3GFF3FileNodeVisitor >> visitCommentRecord: aCommentRecord [ + + ^ self visitGFF3File: aCommentRecord +] + +{ #category : 'generated' } +TBioGFF3GFF3FileNodeVisitor >> visitDirective: aDirective [ + + ^ self visitGFF3File: aDirective +] + +{ #category : 'generated' } +TBioGFF3GFF3FileNodeVisitor >> visitDirectiveList: aDirectiveList [ + + ^ self visitGFF3File: aDirectiveList +] + +{ #category : 'generated' } +TBioGFF3GFF3FileNodeVisitor >> visitFeatureLine: aFeatureLine [ + + ^ self visitGFF3File: aFeatureLine +] + +{ #category : 'generated' } +TBioGFF3GFF3FileNodeVisitor >> visitFeatureList: aFeatureList [ + + ^ self visitGFF3File: aFeatureList +] + +{ #category : 'generated' } +TBioGFF3GFF3FileNodeVisitor >> visitGFF3File: aGFF3File [ + + ^ self visitSmaCCParseNode: aGFF3File +] From a3737bc74bda3beb0d37876944f76dc2f89bf075 Mon Sep 17 00:00:00 2001 From: Hernan Morales Date: Mon, 27 Apr 2026 17:43:42 -0300 Subject: [PATCH 2/2] Add PHYLIP SmaCC-based parser, first version. Add tests. 
--- .../BioPhylipParserTest.class.st | 606 +++++++++------ .../BioPhylipPetitParserTest.class.st | 254 ++++++ .../BioParsers/BioPhylipParser.class.st | 722 ++++++++++++++---- .../BioParsers/BioPhylipPetitParser.class.st | 218 ++++++ 4 files changed, 1394 insertions(+), 406 deletions(-) create mode 100644 repository/BioParsers-Tests/BioPhylipPetitParserTest.class.st create mode 100644 repository/BioParsers/BioPhylipPetitParser.class.st diff --git a/repository/BioParsers-Tests/BioPhylipParserTest.class.st b/repository/BioParsers-Tests/BioPhylipParserTest.class.st index 2c2e750d..55a66c27 100644 --- a/repository/BioParsers-Tests/BioPhylipParserTest.class.st +++ b/repository/BioParsers-Tests/BioPhylipParserTest.class.st @@ -5,250 +5,366 @@ Class { #package : 'BioParsers-Tests' } -{ #category : 'testing' } -BioPhylipParserTest >> firstLineTokenizer [ - - ^ BioPhylipParser new firstLineTokenizer -] - -{ #category : 'testing' } -BioPhylipParserTest >> phylipInterleavedDNA [ - - ^ ' 6 13 -Archaeopt CGATGCTTAC CGCCGATGCT -HesperorniCGTTACTCGT TGTCGTTACT -BaluchitheTAATGTTAAT TGTTAATGTT -B. 
virginiTAATGTTCGT TGTTAATGTT -BrontosaurCAAAACCCAT CATCAAAACC -B.subtilisGGCAGCCAAT CACGGCAGCC - -TACCGCCGAT GCTTACCGC -CGTTGTCGTT ACTCGTTGT -AATTGTTAAT GTTAATTGT -CGTTGTTAAT GTTCGTTGT -CATCATCAAA ACCCATCAT -AATCACGGCA GCCAATCAC - -CCCCGCCCCC GCTTACCGC -CCCCGTCCCC ACTCGTTGT -CCCCGTCCCC GTTAATTGT -CCCCGTCCCC GTTCGTTGT -CCCCATCCCC ACCCATCAT -CCCCACCCCC GCCAATCAC -' -] - -{ #category : 'testing' } -BioPhylipParserTest >> phylipInterleavedProtein [ - - ^ ' 5 176 -cox2_leitaMAFILSFWMI FLLDSVIVLL SFVCFVCVWI CALLFSTVLL VSKLNNIYCT -cox2_crifaMAFILSFWMI FLIDAVIVLL SFVCFVCIWI CSLFFSSFLL VSKINNVYCT -cox2_bsaltMSFIISFWML FLIDSLIVLL SGAIFVCIWI CSLFFLCILF ICKLDYIFCS -cox2_trybbMSFILTFWMI FLMDSIIVLI SFSIFLSVWI CALIIATVLT VTKINNIYCT -cox2_tborrMLFFINQLLL LLVDTFVILE IFSLFVCVFI IVMYILFINY NIFLKNINVY - -WDFTASKFID VYWFTIGGMF SLGLLLRLCL LLYFGHLNFV SFDLCKVVGF -WDFTASKFID AYWFTIGGMF VLCLLLRLCL LLYFGCLNFV SFDLCKVVGF -WDFISAKFID LYWFTLGCLF IVCLLIRLCL LLYFSCLNFV CFDLCKCIGF -WDFISSKFID TYWFVLGMMF ILCLLLRLCL LLYFSCINFV SFDLCKVIGF -LDFIGSKYLD LYWFLIGIFF VIVLLIRLCL LLYYSWISLL IFDLCKIMGF - -QWYWVYFIFG ETTIFSNLIL ESDYMIGDLR LLQCNHVLTL LSLVIYKLWL -QWYWVYFIFG ETTIFSNLIL ESDYLIGDLR LLQCNHVLTL LSLVIYKLWL -QWYWVYFIFG ETTIFSNLIL ESDYLIGDLR LLQCNHVLTL LSLVIYKVWL -QWYWVYFLFG ETTIFSNLIL ESDYLIGDLR ILQCNHVLTL LSLVIYKLWV -QWYWIFFVFK ENVIFSNLLI ESDYWIGDLR LLQCNNTFNL ICLVVYKIWV - -SAVDVIHSFA ISSLGVKVEN LVAVMK -SAVDVIHSFA VSSLGIKVDC IPGRCN -SAIDVIHSFT LANLGIKVD? 
?PGRCN -SAVDVIHSFT ISSLGIKVEN PGRCNE -TSIDVIHSFT ISTLGIKIDC IPGRCN -' -] - -{ #category : 'testing' } -BioPhylipParserTest >> speciesDNALineTokenizer [ - - ^ BioPhylipParser new speciesDNALineTokenizer -] - -{ #category : 'testing' } -BioPhylipParserTest >> speciesDNANamedBlockTokenizer [ - - ^ BioPhylipParser new speciesDNANamedBlockTokenizer -] - -{ #category : 'testing' } -BioPhylipParserTest >> testTokenizeDNASpeciesBlock01 [ - " Private - Answer a with a sample phylip DNA " - - | speciesBlock expectedResult firstRecord | - - speciesBlock := 'Archaeopt CGATGCTTAC CGC -HesperorniCGTTACTCGT TGT -BaluchitheTAATGTTAAT TGT -B. virginiTAATGTTCGT TGT -BrontosaurCAAAACCCAT CAT -B.subtilisGGCAGCCAAT CAC'. - expectedResult := #(#('Archaeopt ' 'CGATGCTTAC CGC' nil) #('Hesperorni' 'CGTTACTCGT TGT' nil) #('Baluchithe' 'TAATGTTAAT TGT' nil) #('B. virgini' 'TAATGTTCGT TGT' nil) #('Brontosaur' 'CAAAACCCAT CAT' nil)). - - parseResult := self speciesDNANamedBlockTokenizer parse: speciesBlock. - firstRecord := parseResult first. - - self assert: firstRecord first equals: 'Archaeopt '. - self assert: firstRecord second equals: 'CGATGCTTAC CGC'. - -] - -{ #category : 'testing' } -BioPhylipParserTest >> testTokenizeDNASpeciesBlock02 [ - " Private - Answer a with a sample phylip DNA " - - | speciesBlock expectedResult firstRecord | - - speciesBlock := 'Archaeopt CGATGCTTAC CGC -Hes CGTTACTCGT TGT -BaluchitheTAATGTTAAT TGT -B. virginiTAATGTTCGT TGT -BrontosaurCAAAACCCAT CAT -B.subtilisGGCAGCCAAT CAC'. - expectedResult := #( - #('Archaeopt ' 'CGATGCTTAC CGC') - #('Hesperorni' 'CGTTACTCGT TGT') - #('Baluchithe' 'TAATGTTAAT TGT') - #('B. virgini' 'TAATGTTCGT TGT') - #('Brontosaur' 'CAAAACCCAT CAT')). - - parseResult := self speciesDNANamedBlockTokenizer parse: speciesBlock. - firstRecord := parseResult first. - - self assert: firstRecord first equals: 'Archaeopt '. - self assert: firstRecord second equals: 'CGATGCTTAC CGC'. 
- -] - -{ #category : 'testing' } -BioPhylipParserTest >> testTokenizeDNASpeciesLine01 [ - " Private - Answer a with a sample phylip DNA " - - | speciesLineBlock expectedResult | - - speciesLineBlock := 'Archaeopt CGATGCTTAC CGC'. - expectedResult := #('Archaeopt ' 'CGATGCTTACCGC'). - parseResult := self speciesDNALineTokenizer parse: speciesLineBlock. - - self assert: (parseResult bioHasEqualElements: expectedResult ). -] - -{ #category : 'testing' } -BioPhylipParserTest >> testTokenizeDNASpeciesLine02 [ - " Private - Answer a with a sample phylip DNA " - - | speciesLineBlock expectedResult | - - speciesLineBlock := 'Archaeopt CGATGCTTACCGC'. - expectedResult := #('Archaeopt ' 'CGATGCTTACCGC'). - parseResult := self speciesDNALineTokenizer parse: speciesLineBlock. - - self assert: (parseResult bioHasEqualElements: expectedResult). - -] - -{ #category : 'testing' } -BioPhylipParserTest >> testTokenizeDNASpeciesLine03 [ - " Private - Answer a with a sample phylip DNA " - - | speciesLineBlock expectedResult | - - speciesLineBlock := 'B. virginiTAATGTTCGT TGT'. - expectedResult := #('B. virgini' 'TAATGTTCGTTGT'). - parseResult := self speciesDNALineTokenizer parse: speciesLineBlock. - - self assert: (parseResult bioHasEqualElements: expectedResult). - -] - -{ #category : 'testing' } -BioPhylipParserTest >> testTokenizeFirstLine01 [ - " Private - Answer a with a sample phylip DNA " - - | firstLine | - - firstLine := '6 13 -'. - parseResult := self firstLineTokenizer parse: firstLine. - self assert: (parseResult bioHasEqualElements: #('6' '13') ). - -] - -{ #category : 'testing' } -BioPhylipParserTest >> testTokenizeFirstLine02 [ - " Private - Answer a with a sample phylip DNA " - - | firstLine | - - firstLine := ' 6 13 -'. - parseResult := self firstLineTokenizer parse: firstLine. - self assert: (parseResult bioHasEqualElements: #('6' '13') ). 
- -] - -{ #category : 'testing' } -BioPhylipParserTest >> testTokenizeFirstLine03 [ - " Private - Answer a with a sample phylip DNA " - - | firstLine | - - firstLine := '6 13 -'. - parseResult := self firstLineTokenizer parse: firstLine. - self assert: (parseResult bioHasEqualElements: #('6' '13') ). - -] - -{ #category : 'testing' } -BioPhylipParserTest >> testTokenizeInterleavedDNA [ - " Private - Answer a with a sample phylip DNA " - - | phylipString | - phylipString := self phylipInterleavedDNA. - - parseResult := BioParser tokenizePhylipInterleavedDNA: phylipString. - self assert: parseResult size equals: 4. - self assert: parseResult first equals: 6. - self assert: parseResult second equals: 13. - self assert: (parseResult third bioHasEqualElements: - #( 'Archaeopt ' 'Hesperorni' 'Baluchithe' 'B. virgini' - 'Brontosaur' 'B.subtilis' )). - self assert: (parseResult fourth bioHasEqualElements: - #( 'CGATGCTTACCGCCGATGCTTACCGCCGATGCTTACCGCCCCCGCCCCCGCTTACCGC' - 'CGTTACTCGTTGTCGTTACTCGTTGTCGTTACTCGTTGTCCCCGTCCCCACTCGTTGT' - 'TAATGTTAATTGTTAATGTTAATTGTTAATGTTAATTGTCCCCGTCCCCGTTAATTGT' - 'TAATGTTCGTTGTTAATGTTCGTTGTTAATGTTCGTTGTCCCCGTCCCCGTTCGTTGT' - 'CAAAACCCATCATCAAAACCCATCATCAAAACCCATCATCCCCATCCCCACCCATCAT' - 'GGCAGCCAATCACGGCAGCCAATCACGGCAGCCAATCACCCCCACCCCCGCCAATCAC' )) -] - -{ #category : 'testing' } -BioPhylipParserTest >> testTokenizeInterleavedProtein [ - - | phylipString | - phylipString := self phylipInterleavedProtein. - parseResult := BioParser tokenizePhylipInterleavedProtein: phylipString. +{ #category : 'tests' } +BioPhylipParserTest >> testAmbiguousDNAAlphabetDetection [ + "Note: Ambiguous DNA codes like N, R, Y overlap with amino acid codes. + BioSmalltalk detects such sequences as protein alphabet by default. + This test verifies that sequences are still created correctly." + | phylip aln seq | + phylip := '2 10 +Seq1 AACGTGGNNA +Seq2 CCGTATGGNN +'. + aln := BioPhylipParser parseString: phylip. + seq := aln sequences first.
+ "Sequence is created and can be used regardless of alphabet detection" + self assert: seq size equals: 10. + self assert: (seq asString includesSubstring: 'NN') +] + +{ #category : 'tests' } +BioPhylipParserTest >> testAsPhylipStringRelaxed [ + + | phylip aln output | + phylip := '3 10 +Homo_sapiens AACGTGGCCA +Pan_troglodytes CCGTATGGCC +Gorilla GGCTTTGACC +'. + aln := BioPhylipParser parseString: phylip. + output := BioPhylipParser new asPhylipStringRelaxed: aln. + self assert: (output includesSubstring: '3 10'). + self assert: (output includesSubstring: 'Homo_sapiens'). + self assert: (output includesSubstring: 'AACGTGGCCA') +] + +{ #category : 'tests' } +BioPhylipParserTest >> testAutoDetectsInterleaved [ + + | phylip aln | + phylip := '2 20 +S1 ATGCTAGCTA +S2 CCGCTAGCTA +GCTAGCTAGC +GCTAGCTAGC +'. + aln := BioPhylipParser parseString: phylip. + self assert: aln size equals: 2. + self assert: aln numberOfBases equals: 20 +] + +{ #category : 'tests' } +BioPhylipParserTest >> testAutoDetectsSequential [ + + | phylip aln | + "Sequential: taxon data on consecutive lines, continuation lines have no name" + phylip := '2 20 +S1 ATGC +GCTAGCTAGCTAGCTA +S2 CCGC +TAGCTAGCTAGCTAGC +'. + aln := BioPhylipParser parseString: phylip. + self assert: aln size equals: 2. + self assert: aln numberOfBases equals: 20 +] + +{ #category : 'tests' } +BioPhylipParserTest >> testClassicStrict5Taxa [ + + | phylip aln | + phylip := '5 13 +Alpha AACGTGGCCACAT +Beta AAGGTCGCCACAC +Gamma CAGTTCGCCACAA +Delta GAGATTTCCGCCT +Epsilon GAGATCTCCGCCC +'. + aln := BioPhylipParser parseString: phylip. + self assert: aln size equals: 5. + self assert: aln numberOfBases equals: 13. + self assert: (aln sequenceNames includes: 'Alpha'). + self assert: (aln sequenceNames includes: 'Epsilon'). + self assert: aln sequences first asString equals: 'AACGTGGCCACAT'. 
+ self assert: aln sequences last asString equals: 'GAGATCTCCGCCC' +] + +{ #category : 'tests' } +BioPhylipParserTest >> testDNAAlphabetDetection [ + "The first sequence of an all-ACGT alignment must carry the unambiguous DNA alphabet." + | phylip aln seq | + phylip := '2 10 +Seq1 AACGTGGCCA +Seq2 CCGTATGGCA +'. + aln := BioPhylipParser parseString: phylip. + seq := aln sequences first. + self assert: (seq alphabet class name) equals: #BioIUPACUnambiguousDNA +] + +{ #category : 'tests' } +BioPhylipParserTest >> testGapsAndAmbiguityCodes [ + + | phylip aln | + phylip := '3 10 +Seq1 AAC-GG??TN +Seq2 CCN-AT???K +Seq3 GGRYY??-KM +'. + aln := BioPhylipParser parseString: phylip. + self assert: aln size equals: 3. + self assert: aln numberOfBases equals: 10. + self assert: aln sequences first asString equals: 'AAC-GG??TN' +] + +{ #category : 'tests' } +BioPhylipParserTest >> testInterleavedWithBlankLines [ + + | phylip aln | + phylip := '3 40 +Taxon1 ATGCTAGCTAGCTAGCTAGC +Taxon2 CCGCTAGCTAGCTAGCTAGC +Taxon3 GGGCTAGCTAGCTAGCTAGC + +TAGCTAGCTAGCTAGCTAGC +TAGCTAGCTAGCTAGCTAGC +TAGCTAGCTAGCTAGCTAGC +'. + aln := BioPhylipParser parseString: phylip. + self assert: aln size equals: 3. + self assert: aln numberOfBases equals: 40 +] + +{ #category : 'tests' } +BioPhylipParserTest >> testMinimalFile [ + + | phylip aln | + phylip := '2 1 +A T +B G +'. + aln := BioPhylipParser parseString: phylip. + self assert: aln size equals: 2. + self assert: aln numberOfBases equals: 1. + self assert: aln sequences first asString equals: 'T'. + self assert: aln sequences last asString equals: 'G' +] + +{ #category : 'tests' } +BioPhylipParserTest >> testParserProperties [ + + | phylip aln | + phylip := '3 10 +Taxon1 AACGTGGCCA +Taxon2 CCGTATGGCC +Taxon3 GGCTTTGACC +'. + aln := BioPhylipParser parseString: phylip. + self assert: aln size equals: 3. + self assert: aln numberOfBases equals: 10. + self assert: (aln sequenceNames includes: 'Taxon1'). 
+ self assert: (aln sequenceNames includes: 'Taxon3') +] + +{ #category : 'tests' } +BioPhylipParserTest >> testProteinAlphabetDetection [ + "The first sequence of an amino-acid alignment must carry the protein alphabet." + | phylip aln seq | + phylip := '2 15 +Human MVKQLEARKRPEQQE +Mouse MVKQLEARHRPEQQK +'. + aln := BioPhylipParser parseString: phylip. + seq := aln sequences first. + self assert: (seq alphabet class name) equals: #BioIUPACProtein +] + +{ #category : 'tests' } +BioPhylipParserTest >> testProteinSequences [ - self assert: parseResult size equals: 4. - self assert: parseResult first equals: 5. - self assert: parseResult second equals: 176. - self assert: (parseResult third bioHasEqualElements: - #( 'cox2_leita' 'cox2_crifa' 'cox2_bsalt' 'cox2_trybb' - 'cox2_tborr' )). - self assert: (parseResult fourth bioHasEqualElements: - #( 'MAFILSFWMIFLLDSVIVLLSFVCFVCVWICALLFSTVLLVSKLNNIYCTWDFTASKFIDVYWFTIGGMFSLGLLLRLCLLLYFGHLNFVSFDLCKVVGFQWYWVYFIFGETTIFSNLILESDYMIGDLRLLQCNHVLTLLSLVIYKLWLSAVDVIHSFAISSLGVKVENLVAVMK' - 'MAFILSFWMIFLIDAVIVLLSFVCFVCIWICSLFFSSFLLVSKINNVYCTWDFTASKFIDAYWFTIGGMFVLCLLLRLCLLLYFGCLNFVSFDLCKVVGFQWYWVYFIFGETTIFSNLILESDYLIGDLRLLQCNHVLTLLSLVIYKLWLSAVDVIHSFAVSSLGIKVDCIPGRCN' - 'MSFIISFWMLFLIDSLIVLLSGAIFVCIWICSLFFLCILFICKLDYIFCSWDFISAKFIDLYWFTLGCLFIVCLLIRLCLLLYFSCLNFVCFDLCKCIGFQWYWVYFIFGETTIFSNLILESDYLIGDLRLLQCNHVLTLLSLVIYKVWLSAIDVIHSFTLANLGIKVD??PGRCN' - 'MSFILTFWMIFLMDSIIVLISFSIFLSVWICALIIATVLTVTKINNIYCTWDFISSKFIDTYWFVLGMMFILCLLLRLCLLLYFSCINFVSFDLCKVIGFQWYWVYFLFGETTIFSNLILESDYLIGDLRILQCNHVLTLLSLVIYKLWVSAVDVIHSFTISSLGIKVENPGRCNE' - 'MLFFINQLLLLLVDTFVILEIFSLFVCVFIIVMYILFINYNIFLKNINVYLDFIGSKYLDLYWFLIGIFFVIVLLIRLCLLLYYSWISLLIFDLCKIMGFQWYWIFFVFKENVIFSNLLIESDYWIGDLRLLQCNNTFNLICLVVYKIWVTSIDVIHSFTISTLGIKIDCIPGRCN' )) + | phylip aln | + "Protein sequences with amino acid codes" + phylip := '3 10 +Human MVKQLEARKR +Mouse MVKQLEARHR +Chicken GGCTTTGACC +'. + aln := BioPhylipParser parseString: phylip. + self assert: aln size equals: 3. 
+ self assert: aln numberOfBases equals: 10. 
+ self assert: (aln sequenceNames includes: 'Human') +] + +{ #category : 'tests' } +BioPhylipParserTest >> testRNAAlphabetDetection [ + "The first sequence of an alignment written with U must carry the unambiguous RNA alphabet." + | phylip aln seq | + phylip := '2 10 +Seq1 AACGUGGUUU +Seq2 CCGUAUGGAU +'. + aln := BioPhylipParser parseString: phylip. + seq := aln sequences first. + self assert: (seq alphabet class name) equals: #BioIUPACUnambiguousRNA +] + +{ #category : 'tests' } +BioPhylipParserTest >> testRelaxedInterleaved [ + + | phylip aln | + phylip := '3 40 +Homo_sapiens ATGCTAGCTAGCTAGCTAGC +Pan_troglodytes CCGCTAGCTAGCTAGCTAGC +Gorilla_gorilla GGGCTAGCTAGCTAGCTAGC +TAGCTAGCTAGCTAGCTAGC +TAGCTAGCTAGCTAGCTAGC +TAGCTAGCTAGCTAGCTAGC +'. + aln := BioPhylipParser parseString: phylip. + self assert: aln size equals: 3. + self assert: aln numberOfBases equals: 40. + self assert: (aln sequenceNames includes: 'Homo_sapiens'). + self assert: (aln sequenceNames includes: 'Gorilla_gorilla') +] + +{ #category : 'tests' } +BioPhylipParserTest >> testRelaxedSequential [ + + | phylip aln | + phylip := '4 20 +Homo_sapiens AACGTGGCCACATACGTGGC +Pan_troglodytes AAGGTCGCCACACAAGGTCC +Gorilla_gorilla CAGTTCGCCACAACAGTTCC +Pongo_abelii GAGATTTCCGCCTGAGATTT +'. + aln := BioPhylipParser parseString: phylip. + self assert: aln size equals: 4. + self assert: aln numberOfBases equals: 20. + self assert: (aln sequenceNames includes: 'Homo_sapiens'). + self assert: (aln sequenceNames includes: 'Pongo_abelii') +] + +{ #category : 'tests' } +BioPhylipParserTest >> testRoundTripStrict [ + + | phylip aln output | + phylip := '3 10 +Taxon1 AACGTGGCCA +Taxon2 CCGTATGGCC +Taxon3 GGCTTTGACC +'. + aln := BioPhylipParser parseString: phylip. + output := BioPhylipParser new asPhylipStringStrict: aln. + self assert: (output includesSubstring: '3 10'). + self assert: (output includesSubstring: 'Taxon1'). 
+ self assert: (output includesSubstring: 'AACGTGGCCA') +] + +{ #category : 'tests' } +BioPhylipParserTest >> testSequentialWithWrapping [ + + | phylip aln | + phylip := '3 40 +Taxon1 ATGCTAGCTAGCTAGCTAGC +TAGCTAGCTAGCTAGCTAGC +Taxon2 CCGCTAGCTAGCTAGCTAGC +TAGCTAGCTAGCTAGCTAGC +Taxon3 GGGCTAGCTAGCTAGCTAGC +TAGCTAGCTAGCTAGCTAGC +'. + aln := BioPhylipParser parseString: phylip. + self assert: aln size equals: 3. + self assert: aln numberOfBases equals: 40. + self assert: aln sequences first asString size equals: 40 +] + +{ #category : 'tests' } +BioPhylipParserTest >> testSpacesInSequences [ + + | phylip aln | + phylip := '2 10 +Seq1 AACGTGGCCA +Seq2 CCGTATGGCC +'. + aln := BioPhylipParser parseString: phylip. + self assert: aln size equals: 2. + self assert: aln numberOfBases equals: 10. + self assert: aln sequences first asString equals: 'AACGTGGCCA' +] + +{ #category : 'tests' } +BioPhylipParserTest >> testStrictInterleaved [ + + | phylip aln | + phylip := '3 40 +Taxon1 ATGCTAGCTAGCTAGCTAGC +Taxon2 CCGCTAGCTAGCTAGCTAGC +Taxon3 GGGCTAGCTAGCTAGCTAGC +TAGCTAGCTAGCTAGCTAGC +TAGCTAGCTAGCTAGCTAGC +TAGCTAGCTAGCTAGCTAGC +'. + aln := BioPhylipParser parseString: phylip. + self assert: aln size equals: 3. + self assert: aln numberOfBases equals: 40. + self + assert: aln sequenceNames asArray + equals: #( 'Taxon1' 'Taxon2' 'Taxon3' ). + self + assert: aln sequences first asString + equals: 'ATGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGCTAGC' +] + +{ #category : 'tests' } +BioPhylipParserTest >> testStrictSequential [ + + | phylip aln | + phylip := '5 13 +Alpha AACGTGGCCACAT +Beta AAGGTCGCCACAC +Gamma CAGTTCGCCACAA +Delta GAGATTTCCGCCT +Epsilon GAGATCTCCGCCC +'. + aln := BioPhylipParser parseString: phylip. + self assert: aln size equals: 5. + self assert: aln numberOfBases equals: 13. + self assert: (aln sequenceNames includes: 'Alpha'). + self assert: (aln sequenceNames includes: 'Epsilon'). 
+ self assert: aln sequences first asString equals: 'AACGTGGCCACAT' +] + +{ #category : 'tests' } +BioPhylipParserTest >> testThreeBlockInterleaved [ + + | phylip aln | + phylip := '2 60 +S1 ATGCTAGCTAGCTAGCTAGCTAGCTAGCTA +S2 ATGCTAGCTAGCTAGCTAGCTAGCTAGCTA +GCTAGCTAGCTAGCTAGCTAGCTAGCTAGC +GCTAGCTAGCTAGCTAGCTAGCTAGCTAGC +'. + aln := BioPhylipParser parseString: phylip. + self assert: aln size equals: 2. + self assert: aln numberOfBases equals: 60. + self assert: aln sequences first asString size equals: 60 +] + +{ #category : 'tests' } +BioPhylipParserTest >> testValidatorRejectsWrongSeqLength [ + + | phylip | + phylip := '2 10 +Seq1 AACGTG +Seq2 CCGTAT +'. + self should: [ BioPhylipParser parseString: phylip ] raise: Error +] + +{ #category : 'tests' } +BioPhylipParserTest >> testValidatorRejectsWrongTaxaCount [ + + | phylip | + phylip := '3 10 +Seq1 AACGTGGCCA +Seq2 CCGTATGGCC +'. + self should: [ BioPhylipParser parseString: phylip ] raise: Error ] diff --git a/repository/BioParsers-Tests/BioPhylipPetitParserTest.class.st b/repository/BioParsers-Tests/BioPhylipPetitParserTest.class.st new file mode 100644 index 00000000..9975e9b2 --- /dev/null +++ b/repository/BioParsers-Tests/BioPhylipPetitParserTest.class.st @@ -0,0 +1,254 @@ +Class { + #name : 'BioPhylipPetitParserTest', + #superclass : 'BioAbstractParserTest', + #category : 'BioParsers-Tests', + #package : 'BioParsers-Tests' +} + +{ #category : 'testing' } +BioPhylipPetitParserTest >> firstLineTokenizer [ + + ^ BioPhylipParser new firstLineTokenizer +] + +{ #category : 'testing' } +BioPhylipPetitParserTest >> phylipInterleavedDNA [ + + ^ ' 6 13 +Archaeopt CGATGCTTAC CGCCGATGCT +HesperorniCGTTACTCGT TGTCGTTACT +BaluchitheTAATGTTAAT TGTTAATGTT +B. 
virginiTAATGTTCGT TGTTAATGTT +BrontosaurCAAAACCCAT CATCAAAACC +B.subtilisGGCAGCCAAT CACGGCAGCC + +TACCGCCGAT GCTTACCGC +CGTTGTCGTT ACTCGTTGT +AATTGTTAAT GTTAATTGT +CGTTGTTAAT GTTCGTTGT +CATCATCAAA ACCCATCAT +AATCACGGCA GCCAATCAC + +CCCCGCCCCC GCTTACCGC +CCCCGTCCCC ACTCGTTGT +CCCCGTCCCC GTTAATTGT +CCCCGTCCCC GTTCGTTGT +CCCCATCCCC ACCCATCAT +CCCCACCCCC GCCAATCAC +' +] + +{ #category : 'testing' } +BioPhylipPetitParserTest >> phylipInterleavedProtein [ + + ^ ' 5 176 +cox2_leitaMAFILSFWMI FLLDSVIVLL SFVCFVCVWI CALLFSTVLL VSKLNNIYCT +cox2_crifaMAFILSFWMI FLIDAVIVLL SFVCFVCIWI CSLFFSSFLL VSKINNVYCT +cox2_bsaltMSFIISFWML FLIDSLIVLL SGAIFVCIWI CSLFFLCILF ICKLDYIFCS +cox2_trybbMSFILTFWMI FLMDSIIVLI SFSIFLSVWI CALIIATVLT VTKINNIYCT +cox2_tborrMLFFINQLLL LLVDTFVILE IFSLFVCVFI IVMYILFINY NIFLKNINVY + +WDFTASKFID VYWFTIGGMF SLGLLLRLCL LLYFGHLNFV SFDLCKVVGF +WDFTASKFID AYWFTIGGMF VLCLLLRLCL LLYFGCLNFV SFDLCKVVGF +WDFISAKFID LYWFTLGCLF IVCLLIRLCL LLYFSCLNFV CFDLCKCIGF +WDFISSKFID TYWFVLGMMF ILCLLLRLCL LLYFSCINFV SFDLCKVIGF +LDFIGSKYLD LYWFLIGIFF VIVLLIRLCL LLYYSWISLL IFDLCKIMGF + +QWYWVYFIFG ETTIFSNLIL ESDYMIGDLR LLQCNHVLTL LSLVIYKLWL +QWYWVYFIFG ETTIFSNLIL ESDYLIGDLR LLQCNHVLTL LSLVIYKLWL +QWYWVYFIFG ETTIFSNLIL ESDYLIGDLR LLQCNHVLTL LSLVIYKVWL +QWYWVYFLFG ETTIFSNLIL ESDYLIGDLR ILQCNHVLTL LSLVIYKLWV +QWYWIFFVFK ENVIFSNLLI ESDYWIGDLR LLQCNNTFNL ICLVVYKIWV + +SAVDVIHSFA ISSLGVKVEN LVAVMK +SAVDVIHSFA VSSLGIKVDC IPGRCN +SAIDVIHSFT LANLGIKVD? 
?PGRCN +SAVDVIHSFT ISSLGIKVEN PGRCNE +TSIDVIHSFT ISTLGIKIDC IPGRCN +' +] + +{ #category : 'testing' } +BioPhylipPetitParserTest >> speciesDNALineTokenizer [ + "NOTE(review): this change set deletes BioPhylipParser>>speciesDNALineTokenizer; presumably the receiver should be the PetitParser-based parser class - confirm." + ^ BioPhylipParser new speciesDNALineTokenizer +] + +{ #category : 'testing' } +BioPhylipPetitParserTest >> speciesDNANamedBlockTokenizer [ + "NOTE(review): this change set deletes BioPhylipParser>>speciesDNANamedBlockTokenizer; presumably the receiver should be the PetitParser-based parser class - confirm." + ^ BioPhylipParser new speciesDNANamedBlockTokenizer +] + +{ #category : 'testing' } +BioPhylipPetitParserTest >> testTokenizeDNASpeciesBlock01 [ + " Tokenize a six-line DNA species block and check the name and sequence fields of the first record. NOTE(review): expectedResult is assigned but never asserted. " + + | speciesBlock expectedResult firstRecord | + + speciesBlock := 'Archaeopt CGATGCTTAC CGC +HesperorniCGTTACTCGT TGT +BaluchitheTAATGTTAAT TGT +B. virginiTAATGTTCGT TGT +BrontosaurCAAAACCCAT CAT +B.subtilisGGCAGCCAAT CAC'. + expectedResult := #(#('Archaeopt ' 'CGATGCTTAC CGC' nil) #('Hesperorni' 'CGTTACTCGT TGT' nil) #('Baluchithe' 'TAATGTTAAT TGT' nil) #('B. virgini' 'TAATGTTCGT TGT' nil) #('Brontosaur' 'CAAAACCCAT CAT' nil)). + + parseResult := self speciesDNANamedBlockTokenizer parse: speciesBlock. + firstRecord := parseResult first. + + self assert: firstRecord first equals: 'Archaeopt '. + self assert: firstRecord second equals: 'CGATGCTTAC CGC'. + +] + +{ #category : 'testing' } +BioPhylipPetitParserTest >> testTokenizeDNASpeciesBlock02 [ + " Tokenize a DNA species block whose second line carries the shorter name Hes, then check the first record. NOTE(review): expectedResult is assigned but never asserted and still lists Hesperorni. " + + | speciesBlock expectedResult firstRecord | + + speciesBlock := 'Archaeopt CGATGCTTAC CGC +Hes CGTTACTCGT TGT +BaluchitheTAATGTTAAT TGT +B. virginiTAATGTTCGT TGT +BrontosaurCAAAACCCAT CAT +B.subtilisGGCAGCCAAT CAC'. + expectedResult := #( + #('Archaeopt ' 'CGATGCTTAC CGC') + #('Hesperorni' 'CGTTACTCGT TGT') + #('Baluchithe' 'TAATGTTAAT TGT') + #('B. virgini' 'TAATGTTCGT TGT') + #('Brontosaur' 'CAAAACCCAT CAT')). + + parseResult := self speciesDNANamedBlockTokenizer parse: speciesBlock. + firstRecord := parseResult first. + + self assert: firstRecord first equals: 'Archaeopt '. + self assert: firstRecord second equals: 'CGATGCTTAC CGC'. 
+ +] + +{ #category : 'testing' } +BioPhylipPetitParserTest >> testTokenizeDNASpeciesLine01 [ + " Tokenize one species line; the blank inside the sequence is expected to be absent from the sequence token. " + + | speciesLineBlock expectedResult | + + speciesLineBlock := 'Archaeopt CGATGCTTAC CGC'. + expectedResult := #('Archaeopt ' 'CGATGCTTACCGC'). + parseResult := self speciesDNALineTokenizer parse: speciesLineBlock. + + self assert: (parseResult bioHasEqualElements: expectedResult ). +] + +{ #category : 'testing' } +BioPhylipPetitParserTest >> testTokenizeDNASpeciesLine02 [ + " Tokenize one species line whose sequence has no inner blank. " + + | speciesLineBlock expectedResult | + + speciesLineBlock := 'Archaeopt CGATGCTTACCGC'. + expectedResult := #('Archaeopt ' 'CGATGCTTACCGC'). + parseResult := self speciesDNALineTokenizer parse: speciesLineBlock. + + self assert: (parseResult bioHasEqualElements: expectedResult). + +] + +{ #category : 'testing' } +BioPhylipPetitParserTest >> testTokenizeDNASpeciesLine03 [ + " Tokenize one species line whose name (B. virgini) contains a dot and a blank. " + + | speciesLineBlock expectedResult | + + speciesLineBlock := 'B. virginiTAATGTTCGT TGT'. + expectedResult := #('B. virgini' 'TAATGTTCGTTGT'). + parseResult := self speciesDNALineTokenizer parse: speciesLineBlock. + + self assert: (parseResult bioHasEqualElements: expectedResult). + +] + +{ #category : 'testing' } +BioPhylipPetitParserTest >> testTokenizeFirstLine01 [ + " Tokenize the header line 6 13 into its two number strings. " + + | firstLine | + + firstLine := '6 13 +'. + parseResult := self firstLineTokenizer parse: firstLine. + self assert: (parseResult bioHasEqualElements: #('6' '13') ). + +] + +{ #category : 'testing' } +BioPhylipPetitParserTest >> testTokenizeFirstLine02 [ + " Tokenize a header line that starts with a blank into its two number strings. " + + | firstLine | + + firstLine := ' 6 13 +'. + parseResult := self firstLineTokenizer parse: firstLine. + self assert: (parseResult bioHasEqualElements: #('6' '13') ). 
+ +] + +{ #category : 'testing' } +BioPhylipPetitParserTest >> testTokenizeFirstLine03 [ + " Private - Answer a with a sample phylip DNA " + + | firstLine | + + firstLine := '6 13 +'. + parseResult := self firstLineTokenizer parse: firstLine. + self assert: (parseResult bioHasEqualElements: #('6' '13') ). + +] + +{ #category : 'testing' } +BioPhylipPetitParserTest >> testTokenizeInterleavedDNA [ + " Private - Answer a with a sample phylip DNA " + + | phylipString | + phylipString := self phylipInterleavedDNA. + + parseResult := BioParser tokenizePhylipInterleavedDNA: phylipString. + self assert: parseResult size equals: 4. + self assert: parseResult first equals: 6. + self assert: parseResult second equals: 13. + self assert: (parseResult third bioHasEqualElements: + #( 'Archaeopt ' 'Hesperorni' 'Baluchithe' 'B. virgini' + 'Brontosaur' 'B.subtilis' )). + self assert: (parseResult fourth bioHasEqualElements: + #( 'CGATGCTTACCGCCGATGCTTACCGCCGATGCTTACCGCCCCCGCCCCCGCTTACCGC' + 'CGTTACTCGTTGTCGTTACTCGTTGTCGTTACTCGTTGTCCCCGTCCCCACTCGTTGT' + 'TAATGTTAATTGTTAATGTTAATTGTTAATGTTAATTGTCCCCGTCCCCGTTAATTGT' + 'TAATGTTCGTTGTTAATGTTCGTTGTTAATGTTCGTTGTCCCCGTCCCCGTTCGTTGT' + 'CAAAACCCATCATCAAAACCCATCATCAAAACCCATCATCCCCATCCCCACCCATCAT' + 'GGCAGCCAATCACGGCAGCCAATCACGGCAGCCAATCACCCCCACCCCCGCCAATCAC' )) +] + +{ #category : 'testing' } +BioPhylipPetitParserTest >> testTokenizeInterleavedProtein [ + + | phylipString | + phylipString := self phylipInterleavedProtein. + parseResult := BioParser tokenizePhylipInterleavedProtein: phylipString. + + self assert: parseResult size equals: 4. + self assert: parseResult first equals: 5. + self assert: parseResult second equals: 176. + self assert: (parseResult third bioHasEqualElements: + #( 'cox2_leita' 'cox2_crifa' 'cox2_bsalt' 'cox2_trybb' + 'cox2_tborr' )). 
+ self assert: (parseResult fourth bioHasEqualElements: + #( 'MAFILSFWMIFLLDSVIVLLSFVCFVCVWICALLFSTVLLVSKLNNIYCTWDFTASKFIDVYWFTIGGMFSLGLLLRLCLLLYFGHLNFVSFDLCKVVGFQWYWVYFIFGETTIFSNLILESDYMIGDLRLLQCNHVLTLLSLVIYKLWLSAVDVIHSFAISSLGVKVENLVAVMK' + 'MAFILSFWMIFLIDAVIVLLSFVCFVCIWICSLFFSSFLLVSKINNVYCTWDFTASKFIDAYWFTIGGMFVLCLLLRLCLLLYFGCLNFVSFDLCKVVGFQWYWVYFIFGETTIFSNLILESDYLIGDLRLLQCNHVLTLLSLVIYKLWLSAVDVIHSFAVSSLGIKVDCIPGRCN' + 'MSFIISFWMLFLIDSLIVLLSGAIFVCIWICSLFFLCILFICKLDYIFCSWDFISAKFIDLYWFTLGCLFIVCLLIRLCLLLYFSCLNFVCFDLCKCIGFQWYWVYFIFGETTIFSNLILESDYLIGDLRLLQCNHVLTLLSLVIYKVWLSAIDVIHSFTLANLGIKVD??PGRCN' + 'MSFILTFWMIFLMDSIIVLISFSIFLSVWICALIIATVLTVTKINNIYCTWDFISSKFIDTYWFVLGMMFILCLLLRLCLLLYFSCINFVSFDLCKVIGFQWYWVYFLFGETTIFSNLILESDYLIGDLRILQCNHVLTLLSLVIYKLWVSAVDVIHSFTISSLGIKVENPGRCNE' + 'MLFFINQLLLLLVDTFVILEIFSLFVCVFIIVMYILFINYNIFLKNINVYLDFIGSKYLDLYWFLIGIFFVIVLLIRLCLLLYYSWISLLIFDLCKIMGFQWYWIFFVFKENVIFSNLLIESDYWIGDLRLLQCNNTFNLICLVVYKIWVTSIDVIHSFTISTLGIKIDCIPGRCN' )) +] diff --git a/repository/BioParsers/BioPhylipParser.class.st b/repository/BioParsers/BioPhylipParser.class.st index 46e0ce8a..bea56d9f 100644 --- a/repository/BioParsers/BioPhylipParser.class.st +++ b/repository/BioParsers/BioPhylipParser.class.st @@ -1,218 +1,618 @@ -" -Documentation taken from http://bioweb2.pasteur.fr/docs/phylip/doc/main.html#inputfiles - -" Class { #name : 'BioPhylipParser', - #superclass : 'BioAbstractTextParser', - #category : 'BioParsers-Core', + #superclass : 'BioObject', + #instVars : [ + 'numTaxa', + 'numChars', + 'taxaNames', + 'sequences', + 'isInterleaved', + 'isStrict', + 'currentLine', + 'lines', + 'errorLine' + ], + #category : 'BioParsers-PHYLIP', #package : 'BioParsers', - #tag : 'Core' + #tag : 'PHYLIP' } -{ #category : 'accessing-dna' } -BioPhylipParser >> buildDNAResults: aCollection [ - " Answer an identified object for the receiver's parsing output " +{ #category : 'as yet unclassified' } +BioPhylipParser class >> parseFile: aFileReference [ + ^ self new parseFile: 
aFileReference +] - | tokenized sequences | - - tokenized := self buildTokens: aCollection. - sequences := tokenized third - with: tokenized fourth - do: [: first : snd | BioSequence newAmbiguousDNA: snd named: first ]. - ^ BioPhylip new - numberOfTaxa: tokenized first; - numberOfCharacters: tokenized second; - sequences: sequences. +{ #category : 'as yet unclassified' } +BioPhylipParser class >> parseString: aString [ + ^ self new parseString: aString ] -{ #category : 'accessing-protein' } -BioPhylipParser >> buildProteinResults: aCollection [ - " Answer an identified object for the receiver's parsing output " +{ #category : 'converting' } +BioPhylipParser >> asAlignment [ + "Build a BioAlignment from the parsed data. + Uses BioSequence class>>newNamed:sequence: which auto-detects the alphabet." + | alignment | + alignment := BioAlignment new. + 1 to: numTaxa do: [ :i | + | seq | + seq := BioSequence newNamed: (taxaNames at: i) sequence: (sequences at: i) asUppercase. + alignment addFromSequence: seq ]. + ^ alignment +] - | tokenized sequences | - - tokenized := self buildTokens: aCollection. - sequences := tokenized third - with: tokenized fourth - do: [: first : snd | BioSequence newProtein: snd named: first ]. - ^ BioPhylip new - numberOfTaxa: tokenized first; - numberOfCharacters: tokenized second; - sequences: sequences. +{ #category : 'converting' } +BioPhylipParser >> asPhylipStringRelaxed: anAlignment [ + "Answer a relaxed PHYLIP string from anAlignment (variable-length names, sequential)." + + ^ String streamContents: [ :s | + s + nextPutAll: anAlignment size asString; + space; + nextPutAll: anAlignment numberOfBases asString; + cr. + anAlignment sequences do: [ :seq | + s + nextPutAll: (seq name ifNil: [ 'Unnamed' ]); + space; + nextPutAll: seq asString; + cr ] ] +] + +{ #category : 'converting' } +BioPhylipParser >> asPhylipStringStrict: anAlignment [ + "Answer a strict PHYLIP string from anAlignment (10-char names, sequential)." 
+ + ^ String streamContents: [ :s | + s + nextPutAll: anAlignment size asString; + space; + nextPutAll: anAlignment numberOfBases asString; + cr. + anAlignment sequences do: [ :seq | + | name padded | + name := seq name ifNil: [ 'Unnamed' ]. + padded := name size > 10 + ifTrue: [ name copyFrom: 1 to: 10 ] + ifFalse: [ + name + , + (String new: 10 - name size withAll: Character space) ]. + s + nextPutAll: padded; + nextPutAll: seq asString; + cr ] ] +] + +{ #category : 'private' } +BioPhylipParser >> cleanSequence: aString [ + "Remove whitespace from sequence data. PHYLIP allows spaces within sequences." + + ^ aString reject: [ :c | c isSeparator ] ] { #category : 'accessing' } -BioPhylipParser >> buildTokens: aCollection [ - " Answer a tokenized parsing aCollection " - - ^ Array - with: (self taxaNumberFrom: aCollection) - with: (aCollection first second asNumber) - with: (aCollection second collect: #first) - with: (self buildTokensFrom: aCollection). +BioPhylipParser >> currentLine [ + ^ currentLine +] + +{ #category : 'accessing' } +BioPhylipParser >> currentLine: anInt [ + + currentLine := anInt +] +{ #category : 'private' } +BioPhylipParser >> detectFormat [ + "Auto-detect strict/relaxed and sequential/interleaved, storing the answers in isStrict and isInterleaved. + Signals an Error when currentLine is past the last line. + The name is read with the relaxed rule (up to the first whitespace) so that an isStrict value left over from an earlier call cannot bias the detection." + + | firstDataLine nameField | + currentLine > lines size ifTrue: [ + Error signal: 'No data lines after header' ]. + firstDataLine := lines at: currentLine. + nameField := self extractNameFromLineRelaxed: firstDataLine. + isStrict := nameField size <= 10 and: [ + firstDataLine size >= 10 and: [ + (firstDataLine + copyFrom: 1 + to: (10 min: firstDataLine size)) trimBoth + = nameField ] ]. + isInterleaved := self detectInterleaved ] -{ #category : 'accessing-private' } -BioPhylipParser >> buildTokensBlock [ +{ #category : 'private' } +BioPhylipParser >> detectInterleaved [ + "Determine if the file is interleaved or sequential. + If the first taxon has numChars chars, it is sequential (or single-block). 
+ Otherwise, skip first N taxa lines, then check if the next N lines + are pure sequence (=> interleaved) or include a new taxon name (=> sequential)." + + | i firstSeqLen firstDataLineIdx consecutiveContLines | + firstDataLineIdx := currentLine. + [ + firstDataLineIdx <= lines size and: [ + (lines at: firstDataLineIdx) trimBoth isEmpty ] ] whileTrue: [ + firstDataLineIdx := firstDataLineIdx + 1 ]. + firstDataLineIdx > lines size ifTrue: [ ^ false ]. + firstSeqLen := (self extractSequenceFromLine: + (lines at: firstDataLineIdx)) size. + firstSeqLen >= numChars ifTrue: [ ^ false ]. + i := firstDataLineIdx. + numTaxa timesRepeat: [ + [ i <= lines size and: [ (lines at: i) trimBoth isEmpty ] ] + whileTrue: [ i := i + 1 ]. + i := i + 1 ]. + [ i <= lines size and: [ (lines at: i) trimBoth isEmpty ] ] + whileTrue: [ i := i + 1 ]. + i > lines size ifTrue: [ ^ false ]. + consecutiveContLines := 0. + [ i <= lines size and: [ consecutiveContLines < numTaxa ] ] + whileTrue: [ + | trimmed | + trimmed := (lines at: i) trimBoth. + trimmed isEmpty + ifTrue: [ i := i + 1 ] + ifFalse: [ + (self nextLineLooksLikeNewTaxon: (lines at: i)) ifTrue: [ + ^ false ]. + consecutiveContLines := consecutiveContLines + 1. + i := i + 1 ] ]. + ^ consecutiveContLines >= numTaxa +] + +{ #category : 'private' } +BioPhylipParser >> detectSequenceClass [ + "Answer the sequence class to use for creating sequences. + BioSequence newNamed:sequence: auto-detects the alphabet from the sequence content." + ^ BioSequence +] - ^ [: node | - OrderedCollection - with: node first - with: (node second collect: #allButLast) - with: (((node third reject: [: line | line first isEmpty ]) collect: #first) collect: #withoutBlanks ) ] +{ #category : 'private' } +BioPhylipParser >> detectStrictOrRelaxed [ + "Detect whether the file uses strict (10-char names) or relaxed (variable-length names) format." + + | firstDataLine nameField | + currentLine > lines size ifTrue: [ + isStrict := false. + ^ self ]. 
+ firstDataLine := lines at: currentLine. + nameField := self extractNameFromLineRelaxed: firstDataLine. + isStrict := nameField size <= 10 and: [ + firstDataLine size >= 10 and: [ + (firstDataLine + copyFrom: 1 + to: (10 min: firstDataLine size)) trimBoth + = nameField ] ] ] { #category : 'accessing' } -BioPhylipParser >> buildTokensFrom: aCollection [ +BioPhylipParser >> errorAt: lineNum message: aString [ + "Signal a parse error with line number context." - | taxaNumber collection seqIndex seqBlock | + self error: + 'Phylip parse error at line ' , lineNum asString , ': ' , aString +] - taxaNumber := self taxaNumberFrom: aCollection. - collection := self buildTokensFromFirstBlock: aCollection. - seqIndex := 1. - (seqBlock := aCollection third) doWithIndex: [:seq :index | - seqIndex = (taxaNumber + 1) - ifTrue: [seqIndex := 1]. - index <= seqBlock size - ifFalse: [ ^ collection ]. - collection - at: seqIndex - put: (String - streamContents: [:str | str - nextPutAll: (collection at: seqIndex); - nextPutAll: (seqBlock at: index)]). - seqIndex := seqIndex + 1]. - ^ collection +{ #category : 'private' } +BioPhylipParser >> extractNameFromLine: aLine [ + "Extract the taxon name from a data line. + In relaxed mode: name is everything before the first whitespace. + In strict mode: name is the first 10 characters (trimmed). + Returns the trimmed name string." + + | trimmed wsIdx | + isStrict ifNotNil: [ + isStrict ifTrue: [ + aLine size < 10 ifTrue: [ ^ aLine trimBoth ]. + ^ (aLine copyFrom: 1 to: 10) trimBoth ] ]. + "Relaxed or unknown: name ends at first whitespace" + trimmed := aLine trimBoth. + wsIdx := self indexOfFirstWhitespaceIn: trimmed. + wsIdx = 0 ifTrue: [ ^ trimmed ]. + ^ trimmed copyFrom: 1 to: wsIdx - 1 +] +{ #category : 'private' } +BioPhylipParser >> extractNameFromLineRelaxed: aLine [ + "Extract the taxon name assuming relaxed format (name ends at first whitespace)." + + | trimmed wsIdx | + trimmed := aLine trimBoth. 
+ wsIdx := self indexOfFirstWhitespaceIn: trimmed. + wsIdx = 0 ifTrue: [ ^ trimmed ]. + ^ trimmed copyFrom: 1 to: wsIdx - 1 +] + +{ #category : 'private' } +BioPhylipParser >> extractSequenceFromLine: aLine [ + "Extract the sequence portion from a data line (after the taxon name), with all separators removed. Answers an empty string when the line holds no sequence part. An unset (nil) isStrict is treated as relaxed, matching extractNameFromLine:." + + | nameSeq nameEnd | + isStrict == true ifTrue: [ + aLine size <= 10 ifTrue: [ ^ '' ]. + ^ self cleanSequence: (aLine copyFrom: 11 to: aLine size) ]. + "Relaxed or unknown: find end of name, rest is sequence" + nameSeq := aLine trimBoth. + nameEnd := self indexOfFirstWhitespaceIn: nameSeq. + nameEnd = 0 ifTrue: [ ^ '' ]. + ^ self cleanSequence: + (nameSeq copyFrom: nameEnd + 1 to: nameSeq size) ] { #category : 'accessing' } -BioPhylipParser >> buildTokensFromFirstBlock: aCollection [ +BioPhylipParser >> indexOfFirstWhitespaceIn: aString [ + "Answer the 1-based index of the first character of aString for which isSeparator answers true, + or 0 if none." - ^ aCollection second collect: [: seq | seq second withoutBlanks ] + 1 to: aString size do: [ :i | + (aString at: i) isSeparator ifTrue: [ ^ i ] ]. + ^ 0 ] -{ #category : 'accessing-dna' } -BioPhylipParser >> dnaInterleaveLineTokenizer [ +{ #category : 'initialization' } +BioPhylipParser >> initialize [ - ^ (self dnaInterleaveSequenceTokenizer , #newline asPParser) star + super initialize. + taxaNames := OrderedCollection new. + sequences := OrderedCollection new. + currentLine := 0. + lines := #( ). 
+ errorLine := 0 ] -{ #category : 'accessing-dna' } -BioPhylipParser >> dnaInterleaveSequenceTokenizer [ +{ #category : 'accessing' } +BioPhylipParser >> isInterleaved [ ^ isInterleaved +] - ^ #dnaLetter asPParser trimBlanks star flatten +{ #category : 'accessing' } +BioPhylipParser >> isInterleaved: aBool [ isInterleaved := aBool ] -{ #category : 'accessing-private' } -BioPhylipParser >> firstLineTokenizer [ - " Answer a Parser for parsing the first line of the format " - - ^ (#number asPParser / self parserForAnyButNumber) , - (self parserForAnyButNumber) , - #blank asPParser plus optional flatten , - #newline asPParser ==> [ : node | - node asOrderedCollection - removeAllSuchThat: [ : elem | elem allSatisfy: [ : e | e = Character space ] ]; - copyWithoutAll: { - Character lf asString . - Character cr asString } ] -] - -{ #category : 'accessing-dna' } -BioPhylipParser >> parseInterleavedDNA: aString [ - " Answer an object with the result of parsing aString with the receiver's parser " - - | parseResults | - - parseResults := self parseString: aString. - ^ self isSuccess - ifTrue: [ results := self buildDNAResults: parseResults ] - ifFalse: [ self signalInvalidObject: parseResults ]. +{ #category : 'testing' } +BioPhylipParser >> isSequenceChar: c [ + "Answer true if c is an IUPAC nucleotide code in upper or lower case, a gap (-), a question mark or a dot. NOTE(review): letters used only by amino acids (E, F, I, L, P, Q) are not in the literal - confirm this is intended." + + ^ 'ACGTURYNWSMKHDVBacgturynwsmkhbdvb-?.' includes: c ] -{ #category : 'accessing-protein' } -BioPhylipParser >> parseInterleavedProtein: aString [ - " Answer an object with the result of parsing aString with the receiver's parser " - - | parseResults | - - parseResults := self parseString: aString. - ^ self isSuccess - ifTrue: [ results := self buildProteinResults: parseResults ] - ifFalse: [ self signalInvalidObject: parseResults ]. 
+{ #category : 'accessing' } +BioPhylipParser >> isStrict [ ^ isStrict ] -{ #category : 'accessing-private' } -BioPhylipParser >> parserForAnyButNumber [ +{ #category : 'accessing' } +BioPhylipParser >> isStrict: aBool [ isStrict := aBool +] - ^ #digit asPParser negate plus , #number asPParser ==> [: n | n second ] +{ #category : 'private' } +BioPhylipParser >> lineStartsWithName: aLine [ + "Answer true if aLine starts a new taxon, false if it is blank or a pure sequence continuation line. + Strict mode: a trimmed line shorter than 10 chars always counts as a name line; otherwise its first 10 chars must not look like sequence. + Relaxed mode: the first separator must occur within the first 50 chars and the prefix before it must not + look like sequence (see looksLikeSequence:)." + + | trimmed wsIdx potentialName first10 | + trimmed := aLine trimBoth. + trimmed isEmpty ifTrue: [ ^ false ]. + isStrict ifTrue: [ + trimmed size < 10 ifTrue: [ ^ true ]. "Short lines in strict mode must be names" + first10 := (trimmed copyFrom: 1 to: 10) trimBoth. + ^ (self looksLikeSequence: first10) not ]. + "Relaxed: check for name + whitespace + sequence pattern" + wsIdx := self indexOfFirstWhitespaceIn: trimmed. + wsIdx = 0 ifTrue: [ ^ false ]. "No whitespace = pure sequence continuation" + wsIdx > 50 ifTrue: [ ^ false ]. "Whitespace too far in = sequence not name. NOTE(review): nextLineLooksLikeNewTaxon: and nextLineLooksLikeContinuation: use 30 for this limit; confirm 50 is intended" + potentialName := trimmed copyFrom: 1 to: wsIdx - 1. 
+ ^ (self looksLikeSequence: potentialName) not ] -{ #category : 'accessing-dna' } -BioPhylipParser >> speciesDNALineTokenizer [ - " Answer a Parser for parsing the species names line " - - ^ ((PP2PredicateObjectNode noneOf: self speciesFobiddenNames) times: 10) flatten , - self dnaInterleaveSequenceTokenizer +{ #category : 'accessing' } +BioPhylipParser >> lines: anArray [ lines := anArray ] -{ #category : 'accessing-dna' } -BioPhylipParser >> speciesDNANamedBlockTokenizer [ - " Answer a Parser for parsing the sequence blocks " - - ^ (self speciesDNALineTokenizer , #newline asPParser flatten) star +{ #category : 'private' } +BioPhylipParser >> looksLikeSequence: aString [ + "Answer true when more than 90% of the characters of aString satisfy isSequenceChar:. Answer false for an empty string." + + | seqChars totalChars ratio | + totalChars := aString size. + totalChars = 0 ifTrue: [ ^ false ]. + seqChars := 0. + aString do: [ :c | + (self isSequenceChar: c) ifTrue: [ seqChars := seqChars + 1 ] ]. + ratio := seqChars / totalChars. + ^ ratio > 0.9 ] -{ #category : 'accessing-private' } -BioPhylipParser >> speciesFobiddenNames [ - " Private - Answer a with receiver's not allowed Characters in a species name " - - ^ OrderedCollection new - add: Character cr; - add: Character lf; - add: $[; - add: $]; - add: $(; - add: $); - add: $:; - add: $;; - add: $,; - yourself +{ #category : 'private' } +BioPhylipParser >> nextLineLooksLikeContinuation: aLine [ + "Answer true if aLine looks like a sequence continuation (no name prefix). Blank lines answer true; in strict mode lines shorter than 10 chars also answer true." + + | trimmed wsIdx potentialName first10 | + trimmed := aLine trimBoth. + trimmed isEmpty ifTrue: [ ^ true ]. + isStrict ifTrue: [ + trimmed size >= 10 ifTrue: [ + first10 := (trimmed copyFrom: 1 to: 10) trimBoth. + first10 isEmpty ifTrue: [ ^ true ]. + ^ self looksLikeSequence: first10 ]. + ^ true ]. + "Relaxed: check if line starts with a name" + wsIdx := self indexOfFirstWhitespaceIn: trimmed. + wsIdx = 0 ifTrue: [ ^ true ]. + wsIdx > 30 ifTrue: [ ^ true ]. 
+ potentialName := trimmed copyFrom: 1 to: wsIdx - 1. + ^ self looksLikeSequence: potentialName +] + +{ #category : 'private' } +BioPhylipParser >> nextLineLooksLikeNewTaxon: aLine [ + "Check if aLine looks like a new taxon name + sequence (vs pure sequence continuation). + Answers true if it looks like a new taxon (=> interleaved). + Answers false if it is blank or looks like continuation (=> sequential)." + + | trimmed wsIdx potentialName first10 | + trimmed := aLine trimBoth. + trimmed isEmpty ifTrue: [ ^ false ]. + isStrict ifTrue: [ "Strict: first 10 chars should be a name (not sequence chars)" + trimmed size < 10 ifTrue: [ ^ (self looksLikeSequence: trimmed) not ]. "parenthesised: unary not binds tighter than the keyword message, so without the parentheses it would be sent to the string" + first10 := (trimmed copyFrom: 1 to: 10) trimBoth. + first10 isEmpty ifTrue: [ ^ false ]. + ^ (self looksLikeSequence: first10) not ]. + "Relaxed: name ends at first whitespace, check if it looks like a name" + wsIdx := self indexOfFirstWhitespaceIn: trimmed. + wsIdx = 0 ifTrue: [ ^ false ]. "No whitespace = pure sequence, not a name" + wsIdx > 30 ifTrue: [ ^ false ]. "Whitespace very far in = probably sequence not name" + potentialName := trimmed copyFrom: 1 to: wsIdx - 1. + ^ (self looksLikeSequence: potentialName) not ] { #category : 'accessing' } -BioPhylipParser >> taxaNumberFrom: aCollection [ +BioPhylipParser >> numChars [ ^ numChars +] - ^ aCollection first first asNumber +{ #category : 'accessing' } +BioPhylipParser >> numTaxa [ ^ numTaxa ] -{ #category : 'accessing-dna' } -BioPhylipParser >> tokenizeInterleavedDNA [ - " Private - Tokenize the receiver's epression as DNA data " +{ #category : 'parsing' } +BioPhylipParser >> parseAsInterleaved: aString [ + "Reinitialise the receiver, read the header of aString and parse its body as interleaved PHYLIP. Answer the result of asAlignment; parseHeader and validate signal Error on malformed input." + + self initialize. + lines := (aString lines select: [ :l | l notEmpty ]) asArray. + self parseHeader. + isInterleaved := true. + self detectStrictOrRelaxed. + self parseBodyInterleaved. + self validate. 
+ ^ self asAlignment +] - parser := - ( self firstLineTokenizer , - self speciesDNANamedBlockTokenizer , - self dnaInterleaveLineTokenizer ) ==> self buildTokensBlock. - ^ self tokenize. - - +{ #category : 'parsing' } +BioPhylipParser >> parseAsSequential: aString [ + "Reinitialise the receiver, read the header of aString and parse its body as sequential PHYLIP. Answer the result of asAlignment; parseHeader and validate signal Error on malformed input." + + self initialize. + lines := (aString lines select: [ :l | l notEmpty ]) asArray. + self parseHeader. + isInterleaved := false. + self detectStrictOrRelaxed. + self parseBodySequential. + self validate. + ^ self asAlignment ] -{ #category : 'accessing-protein' } -BioPhylipParser >> tokenizeInterleavedProtein [ - " Private - Tokenize the receiver's epression as Protein data " +{ #category : 'parsing' } +BioPhylipParser >> parseBody [ + "Parse the data body after the header. Dispatches on isInterleaved to parseBodyInterleaved or parseBodySequential." - parser := - self firstLineTokenizer , - (((PP2PredicateObjectNode noneOf: self speciesFobiddenNames) times: 10) flatten , - #proteinLetterGapped asPParser trimBlanks star flatten , - #newline asPParser) star , - (#proteinLetterGapped asPParser trimBlanks star flatten , #newline asPParser) star ==> self buildTokensBlock. - ^ self tokenize. - + isInterleaved + ifTrue: [ self parseBodyInterleaved ] + ifFalse: [ self parseBodySequential ] +] + +{ #category : 'parsing' } +BioPhylipParser >> parseBodyInterleaved [ + "Parse interleaved format: first block has N name+seq lines, subsequent blocks are seq-only. Blank lines before each of the numTaxa name lines are skipped; Error is signalled if the input ends before all names are read." + + | totalRead | + numTaxa timesRepeat: [ + [ + currentLine <= lines size and: [ + (lines at: currentLine) trimBoth isEmpty ] ] whileTrue: [ + currentLine := currentLine + 1 ]. + currentLine > lines size ifTrue: [ + Error signal: 'Unexpected end of input in interleaved block' ]. + taxaNames add: (self extractNameFromLine: (lines at: currentLine)). + sequences add: + (self extractSequenceFromLine: (lines at: currentLine)). + currentLine := currentLine + 1 ]. + totalRead := 0. 
+ [ currentLine <= lines size ] whileTrue: [ + | line trimmed | + line := lines at: currentLine. + trimmed := line trimBoth. + trimmed isEmpty + ifTrue: [ currentLine := currentLine + 1 ] + ifFalse: [ + | idx | + idx := totalRead \\ numTaxa. + idx := idx + 1. + sequences + at: idx + put: (sequences at: idx) , (self cleanSequence: trimmed). + totalRead := totalRead + 1. + currentLine := currentLine + 1 ] ] +] + +{ #category : 'parsing' } +BioPhylipParser >> parseBodySequential [ + "Parse sequential format: each taxon's data appears on consecutive lines. + A taxon starts with a name line, followed by optional continuation lines. + Name lines are detected with lineStartsWithName:, and only while fewer than numTaxa taxa have been read; + every other non-blank line is appended to the current taxon. NOTE(review): charsRead is updated below but never read in this method." + + | taxonIndex currentSeq charsRead | + taxonIndex := 0. + currentSeq := ''. + charsRead := 0. + [ currentLine <= lines size ] whileTrue: [ + | line trimmed name seq | + line := lines at: currentLine. + trimmed := line trimBoth. + trimmed isEmpty + ifTrue: [ currentLine := currentLine + 1 ] + ifFalse: [ + (taxonIndex < numTaxa and: [ self lineStartsWithName: line ]) + ifTrue: [ "New taxon line - save previous sequence if any" + taxonIndex > 0 ifTrue: [ sequences add: currentSeq ]. + name := self extractNameFromLine: line. + seq := self extractSequenceFromLine: line. + taxaNames add: name. + currentSeq := seq. + charsRead := seq size. + taxonIndex := taxonIndex + 1 ] + ifFalse: [ "Continuation of current taxon" + seq := self cleanSequence: trimmed. + currentSeq := currentSeq , seq. + charsRead := charsRead + seq size ]. + currentLine := currentLine + 1 ] ]. + "Add last sequence (skipped when it is empty)" + currentSeq ifNotEmpty: [ sequences add: currentSeq ] +] + +{ #category : 'parsing' } +BioPhylipParser >> parseFile: aFileReference [ + "Parse a PHYLIP format file. Answer whatever parseString: answers for the contents of aFileReference." 
+ + ^ self parseString: aFileReference asFileReference contents +] + +{ #category : 'parsing' } +BioPhylipParser >> parseHeader [ + "Read the taxa and character counts from the first line into numTaxa and numChars, then set currentLine to 2. Signals Error when there are no lines, when a count cannot be read, or when fewer than 2 taxa or 1 character are declared." + | header firstNum secondNum | + lines isEmpty ifTrue: [ + Error signal: 'Empty PHYLIP input' ]. "emptiness is tested on lines itself so the guard does not depend on the current value of currentLine" + header := lines first trimBoth. + firstNum := self readFirstNumber: header. + secondNum := self readSecondNumber: header. + firstNum ifNil: [ Error signal: 'Cannot read taxa count' ]. + secondNum ifNil: [ Error signal: 'Cannot read character count' ]. + firstNum < 2 ifTrue: [ Error signal: 'Need at least 2 taxa' ]. + secondNum < 1 ifTrue: [ Error signal: 'Need at least 1 character' ]. + numTaxa := firstNum. + numChars := secondNum. + currentLine := 2 +] + +{ #category : 'parsing' } +BioPhylipParser >> parseString: aString [ + "Parse a PHYLIP format string. Auto-detect sequential/interleaved. + Try sequential first; if that signals any Error the error is discarded and interleaved parsing is attempted instead." + + | aln | + aln := [ self parseAsSequential: aString ] + on: Error + do: [ nil ]. + aln ifNotNil: [ ^ aln ]. + ^ self parseAsInterleaved: aString +] + +{ #category : 'private' } +BioPhylipParser >> readFirstNumber: aString [ + "Answer the integer read from the text before the first space of aString (before the first tab when there is no space), or nil when there is no such delimiter or the conversion signals an Error." + + | idx | + idx := aString indexOf: Character space startingAt: 1. + idx = 0 ifTrue: [ + idx := aString indexOf: Character tab startingAt: 1 ]. + idx = 0 ifTrue: [ ^ nil ]. + ^ [ (aString copyFrom: 1 to: idx - 1) asInteger ] + on: Error + do: [ nil ] +] + +{ #category : 'private' } +BioPhylipParser >> readSecondNumber: aString [ + "Answer the integer read from the trimmed text after the first space of aString (after the first tab when there is no space), or nil when there is no such delimiter or the conversion signals an Error." + + | idx rest | + idx := aString indexOf: Character space startingAt: 1. + idx = 0 ifTrue: [ + idx := aString indexOf: Character tab startingAt: 1 ]. + idx = 0 ifTrue: [ ^ nil ]. + rest := (aString copyFrom: idx + 1 to: aString size) trimBoth. + ^ [ rest asInteger ] + on: Error + do: [ nil ] +] + +{ #category : 'initialization' } +BioPhylipParser >> resetForParse: aString [ + "Reset parser state and set up lines from aString." 
+ + self initialize. + lines := (aString lines select: [ :l | l notEmpty ]) asArray. + "currentLine keeps the value set by initialize here; parseHeader checks for empty input and then moves it past the header." + self parseHeader +] + +{ #category : 'accessing' } +BioPhylipParser >> sequences [ ^ sequences +] + +{ #category : 'accessing' } +BioPhylipParser >> taxaNames [ ^ taxaNames +] + +{ #category : 'private' } +BioPhylipParser >> tryParseInterleaved: aString [ + "Try to parse assuming interleaved format. Answer the result of asAlignment; on Error the handler below runs and the error is passed on." + + | savedState | + savedState := self deepCopy. + [ + self resetForParse: aString. + isInterleaved := true. + self detectStrictOrRelaxed. + self parseBodyInterleaved. + self validate. + ^ self asAlignment ] + on: Error + do: [ :ex | "Restore state and re-raise. NOTE(review): restoreTo: is not defined in the visible part of this class; confirm it exists" + savedState restoreTo: self. + ex pass ] +] + +{ #category : 'private' } +BioPhylipParser >> tryParseSequential: aString [ + "Try to parse assuming sequential format. Performs the same steps as parseAsSequential:." + + self initialize. + lines := (aString lines select: [ :l | l notEmpty ]) asArray. + self parseHeader. + isInterleaved := false. + self detectStrictOrRelaxed. + self parseBodySequential. + self validate. + ^ self asAlignment +] + +{ #category : 'accessing' } +BioPhylipParser >> validate [ + "Signal Error unless there are exactly numTaxa names and numTaxa sequences and every sequence has numChars characters." + + taxaNames size = numTaxa ifFalse: [ + Error signal: 'Expected ' , numTaxa asString , ' taxa but found ' + , taxaNames size asString ]. + sequences size = numTaxa ifFalse: [ + Error signal: + 'Expected ' , numTaxa asString , ' sequences but found ' + , sequences size asString ]. 
+ 1 to: numTaxa do: [ :i | + (sequences at: i) size = numChars ifFalse: [ + Error signal: 'Sequence for ' , (taxaNames at: i) , ' has ' + , (sequences at: i) size asString + , ' chars but header specifies ' , numChars asString ] ] ] diff --git a/repository/BioParsers/BioPhylipPetitParser.class.st b/repository/BioParsers/BioPhylipPetitParser.class.st new file mode 100644 index 00000000..140b5dd8 --- /dev/null +++ b/repository/BioParsers/BioPhylipPetitParser.class.st @@ -0,0 +1,218 @@ +" +Documentation taken from http://bioweb2.pasteur.fr/docs/phylip/doc/main.html#inputfiles + +" +Class { + #name : 'BioPhylipPetitParser', + #superclass : 'BioAbstractTextParser', + #category : 'BioParsers-Core', + #package : 'BioParsers', + #tag : 'Core' +} + +{ #category : 'accessing-dna' } +BioPhylipPetitParser >> buildDNAResults: aCollection [ + " Answer a BioPhylip holding the taxa count, the character count and one ambiguous-DNA BioSequence per taxon name. with:collect: gathers the created sequences; with:do: would answer its receiver, the names " + + | tokenized sequences | + + tokenized := self buildTokens: aCollection. + sequences := tokenized third + with: tokenized fourth + collect: [: first : snd | BioSequence newAmbiguousDNA: snd named: first ]. + ^ BioPhylip new + numberOfTaxa: tokenized first; + numberOfCharacters: tokenized second; + sequences: sequences. +] + +{ #category : 'accessing-protein' } +BioPhylipPetitParser >> buildProteinResults: aCollection [ + " Answer a BioPhylip holding the taxa count, the character count and one protein BioSequence per taxon name. with:collect: gathers the created sequences; with:do: would answer its receiver, the names " + + | tokenized sequences | + + tokenized := self buildTokens: aCollection. + sequences := tokenized third + with: tokenized fourth + collect: [: first : snd | BioSequence newProtein: snd named: first ]. + ^ BioPhylip new + numberOfTaxa: tokenized first; + numberOfCharacters: tokenized second; + sequences: sequences. 
+] + +{ #category : 'accessing' } +BioPhylipPetitParser >> buildTokens: aCollection [ + " Answer a four-element Array built from the parse output aCollection: taxa count, character count, taxon names, and the sequences assembled by buildTokensFrom: " + + ^ Array + with: (self taxaNumberFrom: aCollection) + with: (aCollection first second asNumber) + with: (aCollection second collect: #first) + with: (self buildTokensFrom: aCollection). + + +] + +{ #category : 'accessing-private' } +BioPhylipPetitParser >> buildTokensBlock [ + " Answer the block applied to the raw parse node. It keeps node first, drops the last element of every entry of node second, and from node third rejects entries whose first element is empty, keeping the first element of the rest without blanks " + ^ [: node | + OrderedCollection + with: node first + with: (node second collect: #allButLast) + with: (((node third reject: [: line | line first isEmpty ]) collect: #first) collect: #withoutBlanks ) ] +] + +{ #category : 'accessing' } +BioPhylipPetitParser >> buildTokensFrom: aCollection [ + " Answer the first-block sequences with each entry of the third element of aCollection appended in round-robin order, wrapping back to the first taxon after taxaNumber entries " + | taxaNumber collection seqIndex seqBlock | + + taxaNumber := self taxaNumberFrom: aCollection. + collection := self buildTokensFromFirstBlock: aCollection. + seqIndex := 1. + (seqBlock := aCollection third) doWithIndex: [:seq :index | + seqIndex = (taxaNumber + 1) + ifTrue: [seqIndex := 1]. + index <= seqBlock size + ifFalse: [ ^ collection ]. + collection + at: seqIndex + put: (String + streamContents: [:str | str + nextPutAll: (collection at: seqIndex); + nextPutAll: (seqBlock at: index)]). + seqIndex := seqIndex + 1]. 
+ ^ collection + +] + +{ #category : 'accessing' } +BioPhylipPetitParser >> buildTokensFromFirstBlock: aCollection [ + " Answer, for every entry of the second element of aCollection, its second element with blanks removed " + ^ aCollection second collect: [: seq | seq second withoutBlanks ] +] + +{ #category : 'accessing-dna' } +BioPhylipPetitParser >> dnaInterleaveLineTokenizer [ + " Answer a parser for zero or more repetitions of a DNA sequence run followed by a newline " + ^ (self dnaInterleaveSequenceTokenizer , #newline asPParser) star +] + +{ #category : 'accessing-dna' } +BioPhylipPetitParser >> dnaInterleaveSequenceTokenizer [ + " Answer a parser for zero or more #dnaLetter tokens with blanks trimmed, flattened into a single result " + ^ #dnaLetter asPParser trimBlanks star flatten +] + +{ #category : 'accessing-private' } +BioPhylipPetitParser >> firstLineTokenizer [ + " Answer a Parser for the first line of the format. Its action removes the all-space elements from the parsed node and answers a copy without the lf and cr strings " + + ^ (#number asPParser / self parserForAnyButNumber) , + (self parserForAnyButNumber) , + #blank asPParser plus optional flatten , + #newline asPParser ==> [ : node | + node asOrderedCollection + removeAllSuchThat: [ : elem | elem allSatisfy: [ : e | e = Character space ] ]; + copyWithoutAll: { + Character lf asString . + Character cr asString } ] +] + +{ #category : 'accessing-dna' } +BioPhylipPetitParser >> parseInterleavedDNA: aString [ + " Parse aString with the receiver's parser. On success store the result of buildDNAResults: in results and answer it; otherwise send signalInvalidObject: with the parse output " + + | parseResults | + + parseResults := self parseString: aString. + ^ self isSuccess + ifTrue: [ results := self buildDNAResults: parseResults ] + ifFalse: [ self signalInvalidObject: parseResults ]. 
+] + +{ #category : 'accessing-private' } +BioPhylipPetitParser >> parserForAnyButNumber [ + " Answer a parser that consumes one or more non-digit characters followed by a number and answers only the number " + ^ #digit asPParser negate plus , #number asPParser ==> [: n | n second ] +] + +{ #category : 'accessing-dna' } +BioPhylipPetitParser >> speciesDNALineTokenizer [ + " Answer a Parser for a species line: exactly 10 characters that are not in speciesFobiddenNames, followed by a DNA sequence run " + + ^ ((PP2PredicateObjectNode noneOf: self speciesFobiddenNames) times: 10) flatten , + self dnaInterleaveSequenceTokenizer +] + +{ #category : 'accessing-dna' } +BioPhylipPetitParser >> speciesDNANamedBlockTokenizer [ + " Answer a Parser for zero or more species lines, each followed by a newline " + + ^ (self speciesDNALineTokenizer , #newline asPParser flatten) star +] + +{ #category : 'accessing-private' } +BioPhylipPetitParser >> speciesFobiddenNames [ + " Private - Answer an OrderedCollection with the Characters that are not allowed in a species name " + + ^ OrderedCollection new + add: Character cr; + add: Character lf; + add: $[; + add: $]; + add: $(; + add: $); + add: $:; + add: $;; + add: $,; + yourself +] + +{ #category : 'accessing' } +BioPhylipPetitParser >> taxaNumberFrom: aCollection [ + " Answer the first element of the first element of aCollection converted with asNumber " + ^ aCollection first first asNumber +] + +{ #category : 'accessing-dna' } +BioPhylipPetitParser >> tokenizeInterleavedDNA [ + " Private - Tokenize the receiver's expression as DNA data " + + parser := + ( self firstLineTokenizer , + self speciesDNANamedBlockTokenizer , + self dnaInterleaveLineTokenizer ) ==> self buildTokensBlock. + ^ self tokenize. + + +] + +{ #category : 'accessing-protein' } +BioPhylipPetitParser >> tokenizeInterleavedProtein [ + " Private - Tokenize the receiver's expression as Protein data " + + parser := + self firstLineTokenizer , + (((PP2PredicateObjectNode noneOf: self speciesFobiddenNames) times: 10) flatten , + #proteinLetterGapped asPParser trimBlanks star flatten , + #newline asPParser) star , + (#proteinLetterGapped asPParser trimBlanks star flatten , #newline asPParser) star ==> self buildTokensBlock. + ^ self tokenize. + +]