From cee6bfd69205a753cd59ac13d196695a6a777538 Mon Sep 17 00:00:00 2001 From: Hernan Morales Date: Mon, 27 Apr 2026 15:38:56 -0300 Subject: [PATCH] Add GFF3 parser Repackaging BioParserTests -> BioParsers-Tests --- .../BaselineOfBioSmalltalk.class.st | 6 +- .../BioFormatters/BioGFF3Formatter.class.st | 52 - .../BioAbstractFASTAParserTest.class.st | 355 ++++ .../BioAbstractParserTest.class.st | 22 + .../BioAccessionParserTest.class.st | 136 ++ .../BioCGCParserTest.class.st | 31 + .../BioDNANucleotideParserTest.class.st | 66 + .../BioDNASequenceParserTest.class.st | 45 + .../BioDegenerateBaseParserTest.class.st | 110 ++ .../BioEMBLParserTest.class.st | 33 + ...trezXMLGenBankAccessionParserTest.class.st | 1445 +++++++++++++++++ .../BioFASTAParserTest.class.st | 515 ++++++ .../BioParsers-Tests/BioGFF3Test.class.st | 315 ++++ .../BioGenBankParserTest.class.st | 51 + .../BioGenIdParserTest.class.st | 33 + .../BioMAFParserTest.class.st | 81 + .../BioNCBIIdParserTest.class.st | 83 + .../BioPhylipParserTest.class.st | 254 +++ .../BioProteinParserTest.class.st | 81 + .../BioSwissProtParserTest.class.st | 38 + repository/BioParsers-Tests/package.st | 1 + .../BioParsers/BioBlastContainerNode.class.st | 4 +- .../BioParsers/BioBlastHitNode.class.st | 4 +- .../BioParsers/BioBlastHspNode.class.st | 4 +- repository/BioParsers/BioBlastNode.class.st | 4 +- .../BioParsers/BioBlastParentNode.class.st | 4 +- .../BioParsers/BioBlastRootNode.class.st | 4 +- .../BioParsers/BioBlastStructureNode.class.st | 4 +- .../BioParsers/BioBlastValueNode.class.st | 4 +- .../BioParsers/BioEResultKeysParser.class.st | 4 +- .../BioParsers/BioEntrezResultParser.class.st | 4 +- .../BioEntrezXMLGBBasicParser.class.st | 4 +- .../BioEntrezXMLGBFullParser.class.st | 4 +- ...ioEntrezXMLGBSeqFeatureQualParser.class.st | 4 +- .../BioEntrezXMLGBSeqFullParser.class.st | 4 +- .../BioEntrezXMLGBSeqJournalParser.class.st | 4 +- .../BioEntrezXMLGBSeqParser.class.st | 4 +- .../BioEntrezXMLGenBankSeqParser.class.st | 4 +-
.../BioEntrezXMLGenSetParser.class.st | 4 +- .../BioParsers/BioFASTABasicParser.class.st | 4 +- .../BioParsers/BioFASTAMultiParser.class.st | 4 +- repository/BioParsers/BioFASTAParser.class.st | 4 +- .../BioGFF3CommentRecordNode.class.st | 34 + .../BioGFF3DirectiveListNode.class.st | 31 + .../BioParsers/BioGFF3DirectiveNode.class.st | 34 + repository/BioParsers/BioGFF3Feature.class.st | 250 +++ .../BioGFF3FeatureLineNode.class.st | 138 ++ .../BioGFF3FeatureListNode.class.st | 58 + repository/BioParsers/BioGFF3File.class.st | 272 ++++ .../BioParsers/BioGFF3GFF3FileNode.class.st | 51 + .../BioGFF3GFF3FileNodeVisitor.class.st | 9 + .../BioParsers/BioNCBIBlastSAXParser.class.st | 4 +- .../BioNCBIBlastSAXTokenizer.class.st | 4 +- .../BioParsers/BioNCBIXMLBlastParser.class.st | 4 +- repository/BioParsers/BioParser.class.st | 14 + repository/BioParsers/BioSAXParser.class.st | 4 +- .../TBioGFF3GFF3FileNodeVisitor.trait.st | 44 + 57 files changed, 4683 insertions(+), 105 deletions(-) delete mode 100644 repository/BioFormatters/BioGFF3Formatter.class.st create mode 100644 repository/BioParsers-Tests/BioAbstractFASTAParserTest.class.st create mode 100644 repository/BioParsers-Tests/BioAbstractParserTest.class.st create mode 100644 repository/BioParsers-Tests/BioAccessionParserTest.class.st create mode 100644 repository/BioParsers-Tests/BioCGCParserTest.class.st create mode 100644 repository/BioParsers-Tests/BioDNANucleotideParserTest.class.st create mode 100644 repository/BioParsers-Tests/BioDNASequenceParserTest.class.st create mode 100644 repository/BioParsers-Tests/BioDegenerateBaseParserTest.class.st create mode 100644 repository/BioParsers-Tests/BioEMBLParserTest.class.st create mode 100644 repository/BioParsers-Tests/BioEntrezXMLGenBankAccessionParserTest.class.st create mode 100644 repository/BioParsers-Tests/BioFASTAParserTest.class.st create mode 100644 repository/BioParsers-Tests/BioGFF3Test.class.st create mode 100644 
repository/BioParsers-Tests/BioGenBankParserTest.class.st create mode 100644 repository/BioParsers-Tests/BioGenIdParserTest.class.st create mode 100644 repository/BioParsers-Tests/BioMAFParserTest.class.st create mode 100644 repository/BioParsers-Tests/BioNCBIIdParserTest.class.st create mode 100644 repository/BioParsers-Tests/BioPhylipParserTest.class.st create mode 100644 repository/BioParsers-Tests/BioProteinParserTest.class.st create mode 100644 repository/BioParsers-Tests/BioSwissProtParserTest.class.st create mode 100644 repository/BioParsers-Tests/package.st create mode 100644 repository/BioParsers/BioGFF3CommentRecordNode.class.st create mode 100644 repository/BioParsers/BioGFF3DirectiveListNode.class.st create mode 100644 repository/BioParsers/BioGFF3DirectiveNode.class.st create mode 100644 repository/BioParsers/BioGFF3Feature.class.st create mode 100644 repository/BioParsers/BioGFF3FeatureLineNode.class.st create mode 100644 repository/BioParsers/BioGFF3FeatureListNode.class.st create mode 100644 repository/BioParsers/BioGFF3File.class.st create mode 100644 repository/BioParsers/BioGFF3GFF3FileNode.class.st create mode 100644 repository/BioParsers/BioGFF3GFF3FileNodeVisitor.class.st create mode 100644 repository/BioParsers/TBioGFF3GFF3FileNodeVisitor.trait.st diff --git a/repository/BaselineOfBioSmalltalk/BaselineOfBioSmalltalk.class.st b/repository/BaselineOfBioSmalltalk/BaselineOfBioSmalltalk.class.st index f0a57868..0af2294d 100644 --- a/repository/BaselineOfBioSmalltalk/BaselineOfBioSmalltalk.class.st +++ b/repository/BaselineOfBioSmalltalk/BaselineOfBioSmalltalk.class.st @@ -83,7 +83,7 @@ BaselineOfBioSmalltalk >> baselineCommonPackages: spec [ package: 'BioBenchmarks' with: [ spec requires: #('BioTools' ). ]; package: 'BioBlast' with: [ spec requires: #('BioWrappers' 'BioParsers' 'BioEntrez' ). ]; package: 'BioBlastSamples' with: [ spec requires: #('BioToolsSamples' ). 
]; - package: 'BioBlastTests' with: [ spec requires: #('BioTools-Tests' 'BioBlast' 'BioParserTests' ). ]; + package: 'BioBlastTests' with: [ spec requires: #('BioTools-Tests' 'BioBlast' 'BioParsers-Tests' ). ]; package: 'BioClassifier' with: [ spec requires: #('BioTools' ). ]; package: 'BioClassifierTests' with: [ spec requires: #('BioTools-Tests' 'BioClassifier' ). ]; package: 'BioEBI' with: [ spec requires: #('BioTools' 'BioWrappers' ). ]; @@ -97,7 +97,7 @@ BaselineOfBioSmalltalk >> baselineCommonPackages: spec [ package: 'BioNCBI' with: [ spec requires: #('BioTools' ). ]; package: 'BioNCBITests' with: [ spec requires: #('BioTools-Tests' 'BioNCBI' ). ]; package: 'BioNGS' with: [ spec requires: #('BioTools' 'BioWrappers' ). ]; - package: 'BioParserTests' with: [ spec requires: #('BioTools-Tests' 'BioParsers' ). ]; + package: 'BioParsers-Tests' with: [ spec requires: #('BioTools-Tests' 'BioParsers' ). ]; package: 'BioParsers' with: [ spec requires: #('BioWrappers' 'BioTools' ). ]; package: 'BioPharoCommon' with: [ spec requires: #('BioTools' ). ]; package: 'BioPharo4' with: [ spec requires: #('BioPharoCommon' ). ]; @@ -236,7 +236,7 @@ BaselineOfBioSmalltalk >> baselineTestsGroup: spec [ 'BioTools-Tests' 'BioBlastTests' 'BioWrapperTests' - 'BioParserTests' + 'BioParsers-Tests' 'BioEntrezTests' 'BioNCBITests' 'BioFormatterTests' diff --git a/repository/BioFormatters/BioGFF3Formatter.class.st b/repository/BioFormatters/BioGFF3Formatter.class.st deleted file mode 100644 index 535c6787..00000000 --- a/repository/BioFormatters/BioGFF3Formatter.class.st +++ /dev/null @@ -1,52 +0,0 @@ -" -The [GFF3 format](https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md) addresses the most common extensions to GFF, while preserving backward compatibility with previous formats.
- -## Instance Variables - -- sequenceFeature: `BioSequenceFeature` - -" -Class { - #name : 'BioGFF3Formatter', - #superclass : 'BioSequenceFeatureFormatter', - #category : 'BioFormatters-Formatters', - #package : 'BioFormatters', - #tag : 'Formatters' -} - -{ #category : 'accessing' } -BioGFF3Formatter class >> identifiers [ - "Answer a of identifiers of the receiver" - - ^ #('GFF' 'GFF3') -] - -{ #category : 'converting' } -BioGFF3Formatter >> asString [ - " Answer a representation of the receiver " - - ^ String streamContents: [ : outStream | - outStream - nextPutAll: self sequenceFeature name; - nextPut: self delimiter; - nextPutAll: self sequenceFeature sourceTag; - nextPut: self delimiter; - nextPutAll: self sequenceFeature primaryTag; - nextPut: self delimiter; - nextPutAll: self sequenceFeature start asString; - nextPut: self delimiter; - nextPutAll: self sequenceFeature end asString; - nextPut: self delimiter; - nextPutAll: self sequenceFeature score asString; - nextPut: self delimiter; - nextPutAll: self sequenceFeature strand asString; - nextPut: self delimiter; - nextPutAll: self sequenceFeature tag asString ] -] - -{ #category : 'accessing' } -BioGFF3Formatter >> delimiter [ - " Answer a used to delimit fields between the receiver's elements " - - ^ Character tab -] diff --git a/repository/BioParsers-Tests/BioAbstractFASTAParserTest.class.st b/repository/BioParsers-Tests/BioAbstractFASTAParserTest.class.st new file mode 100644 index 00000000..1a8fa7d4 --- /dev/null +++ b/repository/BioParsers-Tests/BioAbstractFASTAParserTest.class.st @@ -0,0 +1,355 @@ +Class { + #name : 'BioAbstractFASTAParserTest', + #superclass : 'BioAbstractParserTest', + #category : 'BioParsers-Tests', + #package : 'BioParsers-Tests' +} + +{ #category : 'samples-single' } +BioAbstractFASTAParserTest >> fastaSeq01 [ + " From http://www.dalkescientific.com/writings/NBN/parsing.html " + + ^ '>YAL069W-1.334 Putative promoter sequence 
+CCACACCACACCCACACACCCACACACCACACCACACACCACACCACACCCACACACACA +CATCCTAACACTACCCTAACACAGCCCTAATCTAACCCTGGCCAACCTGTCTCTCAACTT +ACCCTCCATTACCCTGCCTCCACTCGTTACCCTGTCCCATTCAACCATACCACTCCGAAC +CACCATCCATCCCTCTACTTACTACCACTCACCCACCGTTACCCTCCAATTACCCATATC +CAACCCACTGCCACTTACCCTACCATTACCCTACCATCCACCATGACCTACTCACCATAC +TGTTCTTCTACCCACCATATTGAAACGCTAACAA +' +] + +{ #category : 'samples-single' } +BioAbstractFASTAParserTest >> fastaSeq02 [ + " From http://www.dalkescientific.com/writings/NBN/parsing.html " + + ^ '>YAL068C-7235.2170 Putative promoter sequence +TACGAGAATAATTTCTCATCATCCAGCTTTAACACAAAATTCGCACAGTTTTCGTTAAGA +GAACTTAACATTTTCTTATGACGTAAATGAAGTTTATATATAAATTTCCTTTTTATTGGA +TAATATGCCTATGCCGCATAATTTTTATATCTTTCTCCTAACAAAACATTCGCTTGTAAA +GTATTATATTTAGGAAAAAAATAATCGATAAAGGCTCATCCGAAGATCAGTTAGATTCTT +TTTGCAAGTCCTGAAGAAATTTTCACACTACTACTATAAAAAAAAAATATCATAAAAAGG +TACATTACGTGCAACCAAAAGTGTAAAATGATTGGTTGCAATGTTTCACCTAAATTACTT +' +] + +{ #category : 'samples-single' } +BioAbstractFASTAParserTest >> fastaSeq04 [ + " From http://www.dalkescientific.com/writings/NBN/parsing.html " + + ^ '>YAL068C-7235.2170 Putative promoter sequence +TACGAGAATAATTTCTCATCATCCAGCTTTAACACAAAATTCGCA +CAGTTTTCGTTAAGAGAACTTAACATTTTCTTATGACGTAAATGA +AGTTTATATATAAATTTCCTTTTTATTGGA +' +] + +{ #category : 'samples-single' } +BioAbstractFASTAParserTest >> fastaSeq05 [ + " From http://www.dalkescientific.com/writings/NBN/parsing.html " + + ^ '>YAL068C-7235.2170 Putative promoter sequence +TACGAGAATAATTTCTCATCATCCAGCTTTAACACAAAATTCGCACAGTTTTCGTTAAGA +GAACTTAACATTTTCTTATGACGTAAATGAAGTTTATATATAAATTTCCTTTTTATTGGA +' +] + +{ #category : 'samples-single' } +BioAbstractFASTAParserTest >> fastaSeq08 [ + " From http://www.citizendia.org/FASTA_format " + + ^ '>sp_ac|P02769_WOSIG0 \ID=ALBU_BOVIN \DE="Serum albumin precursor (Allergen Bos d 6) (BSA)" \NCBITAXID=9913 \MODRES=(1|Acetyl) \VARIANT=(196|A|T) \LENGTH=589 +RGVFRRDTHKSEIAHRFKDLGEEHFKGLVLIAFSQYLQQCPFDEHVKLVNELTEFAKTCV 
+ADESHAGCEKSLHTLFGDELCKVASLRETYGDMADCCEKQEPERNECFLSHKDDSPDLPK +LKPDPNTLCDEFKADEKKFWGKYLYEIARRHPYFYAPELLYYANKYNGVFQECCQAEDKG +ACLLPKIETMREKVLASSARQRLRCASIQKFGERALKAWSVARLSQKFPKAEFVEVTKLV +TDLTKVHKECCHGDLLECADDRADLAKYICDNQDTISSKLKECCDKPLLEKSHCIAEVEK +DAIPENLPPLTADFAEDKDVCKNYQEAKDAFLGSFLYEYSRRHPEYAVSVLLRLAKEYEA +TLEECCAKDDPHACYSTVFDKLKHLVDEPQNLIKQNCDQFEKLGEYGFQNALIVRYTRKV +PQVSTPTLVEVSRSLGKVGTRCCTKPESERMPCTEDYLSLILNRLCVLHEKTPVSEKVTK +CCTESLVNRRPCFSALTPDETYVPKAFDEKLFTFHADICTLPDTEKQIKKQTALVELLKH +KPKATEEQLKTVMENFVAFVDKCCAADDKEACFAVEGPKLVVSTQTALA +' +] + +{ #category : 'samples-single' } +BioAbstractFASTAParserTest >> fastaSeq09 [ + " Definition line from the BioPython manual " + + ^ '>gi|6273291|emb|AF191665.1|AF191665 +actgtcgat +atgctagct +' +] + +{ #category : 'samples-multi' } +BioAbstractFASTAParserTest >> multiFastaSeq01 [ + " From http://www.dalkescientific.com/writings/NBN/parsing.html + + thisContext receiver new multiFastaSeq01 + " + + ^ String streamContents: [ : stream | + stream + nextPutAll: self multiFastaSeq01Header01; cr; + nextPutAll: self multiFastaSeq01Body01; cr; + nextPutAll: self multiFastaSeq01Header02; cr; + nextPutAll: self multiFastaSeq01Body02 + ]. 
+ +] + +{ #category : 'samples-multi' } +BioAbstractFASTAParserTest >> multiFastaSeq01Body01 [ + " From http://www.dalkescientific.com/writings/NBN/parsing.html " + + ^ 'CCACACCACACCCACACACCCACACACCACACCACACACCACACCACACCCACACACACA +CATCCTAACACTACCCTAACACAGCCCTAATCTAACCCTGGCCAACCTGTCTCTCAACTT +ACCCTCCATTACCCTGCCTCCACTCGTTACCCTGTCCCATTCAACCATACCACTCCGAAC +CACCATCCATCCCTCTACTTACTACCACTCACCCACCGTTACCCTCCAATTACCCATATC +CAACCCACTGCCACTTACCCTACCATTACCCTACCATCCACCATGACCTACTCACCATAC +TGTTCTTCTACCCACCATATTGAAACGCTAACAA' +] + +{ #category : 'samples-multi' } +BioAbstractFASTAParserTest >> multiFastaSeq01Body02 [ + " From http://www.dalkescientific.com/writings/NBN/parsing.html " + + ^ 'TACGAGAATAATTTCTCATCATCCAGCTTTAACACAAAATTCGCACAGTTTTCGTTAAGA +GAACTTAACATTTTCTTATGACGTAAATGAAGTTTATATATAAATTTCCTTTTTATTGGA +TAATATGCCTATGCCGCATAATTTTTATATCTTTCTCCTAACAAAACATTCGCTTGTAAA +GTATTATATTTAGGAAAAAAATAATCGATAAAGGCTCATCCGAAGATCAGTTAGATTCTT +TTTGCAAGTCCTGAAGAAATTTTCACACTACTACTATAAAAAAAAAATATCATAAAAAGG +TACATTACGTGCAACCAAAAGTGTAAAATGATTGGTTGCAATGTTTCACCTAAATTACTT' +] + +{ #category : 'samples-multi' } +BioAbstractFASTAParserTest >> multiFastaSeq01Header01 [ + " From http://www.dalkescientific.com/writings/NBN/parsing.html " + + ^ 'YAL069W-1.334 Putative promoter sequence' +] + +{ #category : 'samples-multi' } +BioAbstractFASTAParserTest >> multiFastaSeq01Header02 [ + " From http://www.dalkescientific.com/writings/NBN/parsing.html " + + ^ 'YAL068C-7235.2170 Putative promoter sequence' +] + +{ #category : 'samples-multi' } +BioAbstractFASTAParserTest >> multiFastaSeq02 [ + " From http://www.dalkescientific.com/writings/NBN/parsing.html + + thisContext receiver new multiFastaSeq02 + " + + ^ String streamContents: [ : stream | + stream + nextPutAll: self multiFastaSeq02Header01; cr; + nextPutAll: self multiFastaSeq02Body01; cr; + nextPutAll: self multiFastaSeq02Header02; cr; + nextPutAll: self multiFastaSeq02Body02; cr; + nextPutAll: self multiFastaSeq02Header03; cr; + nextPutAll: self 
multiFastaSeq02Body03 + ]. +] + +{ #category : 'samples-multi' } +BioAbstractFASTAParserTest >> multiFastaSeq02Body01 [ + " From http://www.dalkescientific.com/writings/NBN/parsing.html " + + ^ 'TACGAGAATAATTTCTCATCATCCAGCTTTAACACAAAATTCGCA' +] + +{ #category : 'samples-multi' } +BioAbstractFASTAParserTest >> multiFastaSeq02Body02 [ + " From http://www.dalkescientific.com/writings/NBN/parsing.html " + + ^ 'CAGTTTTCGTTAAGAGAACTTAACATTTTCTTATGACGTAAATGA +AGTTTATATATAAATTTCCTTTTTATTGGA' +] + +{ #category : 'samples-multi' } +BioAbstractFASTAParserTest >> multiFastaSeq02Body03 [ + " From http://www.dalkescientific.com/writings/NBN/parsing.html " + + ^ 'GAACTTAACATTTTCTTATGACGTAAATGAAGTTTATATATAAATTTCCTTTTTATTGGA +TAATATGCCTATGCCGCATAATTTTTATATCTTTCTCCTAACAAAACATTCGCTTGTAAA' +] + +{ #category : 'samples-multi' } +BioAbstractFASTAParserTest >> multiFastaSeq02Header01 [ + " From http://www.dalkescientific.com/writings/NBN/parsing.html " + + ^ 'first sequence record' +] + +{ #category : 'samples-multi' } +BioAbstractFASTAParserTest >> multiFastaSeq02Header02 [ + " From http://www.dalkescientific.com/writings/NBN/parsing.html " + + ^ 'second sequence record' +] + +{ #category : 'samples-multi' } +BioAbstractFASTAParserTest >> multiFastaSeq02Header03 [ + " From http://www.dalkescientific.com/writings/NBN/parsing.html " + + ^ 'third sequence record' +] + +{ #category : 'samples-multi' } +BioAbstractFASTAParserTest >> multiFastaSeq02PlainText [ + " From http://www.dalkescientific.com/writings/NBN/parsing.html + + thisContext receiver new multiFastaSeq02PlainText + " + + ^ '>first sequence record +TACGAGAATAATTTCTCATCATCCAGCTTTAACACAAAATTCGCA +>second sequence record +CAGTTTTCGTTAAGAGAACTTAACATTTTCTTATGACGTAAATGA +AGTTTATATATAAATTTCCTTTTTATTGGA +>third sequence record +GAACTTAACATTTTCTTATGACGTAAATGAAGTTTATATATAAATTTCCTTTTTATTGGA +TAATATGCCTATGCCGCATAATTTTTATATCTTTCTCCTAACAAAACATTCGCTTGTAAA' +] + +{ #category : 'samples-multi' } +BioAbstractFASTAParserTest >> multiFastaSeq03 [ + 
" From http://www.citizendia.org/FASTA_format + + thisContext receiver new multiFastaSeq03 + " + + ^ String streamContents: [ : stream | + stream + nextPutAll: self multiFastaSeq03Header01; cr; + nextPutAll: self multiFastaSeq03Body01; cr; + nextPutAll: self multiFastaSeq03Header02; cr; + nextPutAll: self multiFastaSeq03Body02 ]. +] + +{ #category : 'samples-multi' } +BioAbstractFASTAParserTest >> multiFastaSeq03Body01 [ + " From http://www.citizendia.org/FASTA_format " + + ^ 'MTEITAAMVKELRESTGAGMMDCKNALSETNGDFDKAVQLLREKGLGKAAKKADRLAAEG +LVSVKVSDDFTIAAMRPSYLSYEDLDMTFVENEYKALVAELEKENEERRRLKDPNKPEHK +IPQFASRKQLSDAILKEAEEKIKEELKAQGKPEKIWDNIIPGKMNSFIADNSQLDSKLTL +MGQFYVMDDKKTVEQVIAEKEKEFGGKIKIVEFICFEVGEGLEKKTEDFAAEVAAQL' +] + +{ #category : 'samples-multi' } +BioAbstractFASTAParserTest >> multiFastaSeq03Body02 [ + " From http://www.citizendia.org/FASTA_format " + + ^ 'SATVSEINSETDFVAKNDQFIALTKDTTAHIQSNSLQSVEELHSSTINGVKFEEYLKSQI +ATIGENLVVRRFATLKAGANGVVNGYIHTNGRVGVVIAAACDSAEVASKSRDLLRQICMH' +] + +{ #category : 'samples-multi' } +BioAbstractFASTAParserTest >> multiFastaSeq03Header01 [ + " From http://www.citizendia.org/FASTA_format " + + ^ '>SEQUENCE_1' + +] + +{ #category : 'samples-multi' } +BioAbstractFASTAParserTest >> multiFastaSeq03Header02 [ + " From http://www.citizendia.org/FASTA_format " + + ^ '>SEQUENCE_2' +] + +{ #category : 'samples-multi' } +BioAbstractFASTAParserTest >> multiFastaSeq04 [ + " http://www.cbs.dtu.dk/services/NetGene2/fasta.php " + + ^ '>HSBGPG Human gene for bone gla protein (BGP) +GGCAGATTCCCCCTAGACCCGCCCGCACCATGGTCAGGCATGCCCCTCCTCATCGCTGGGCACAGCCCAGAGGGT +ATAAACAGTGCTGGAGGCTGGCGGGGCAGGCCAGCTGAGTCCTGAGCAGCAGCCCAGCGCAGCCACCGAGACACC +ATGAGAGCCCTCACACTCCTCGCCCTATTGGCCCTGGCCGCACTTTGCATCGCTGGCCAGGCAGGTGAGTGCCCC +CACCTCCCCTCAGGCCGCATTGCAGTGGGGGCTGAGAGGAGGAAGCACCATGGCCCACCTCTTCTCACCCCTTTG +GCTGGCAGTCCCTTTGCAGTCTAACCACCTTGTTGCAGGCTCAATCCATTTGCCCCAGCTCTGCCCTTGCAGAGG +GAGAGGAGGGAAGAGCAAGCTGCCCGAGACGCAGGGGAAGGAGGATGAGGGCCCTGGGGATGAGCTGGGGTGAAC 
+CAGGCTCCCTTTCCTTTGCAGGTGCGAAGCCCAGCGGTGCAGAGTCCAGCAAAGGTGCAGGTATGAGGATGGACC +TGATGGGTTCCTGGACCCTCCCCTCTCACCCTGGTCCCTCAGTCTCATTCCCCCACTCCTGCCACCTCCTGTCTG +GCCATCAGGAAGGCCAGCCTGCTCCCCACCTGATCCTCCCAAACCCAGAGCCACCTGATGCCTGCCCCTCTGCTC +CACAGCCTTTGTGTCCAAGCAGGAGGGCAGCGAGGTAGTGAAGAGACCCAGGCGCTACCTGTATCAATGGCTGGG +GTGAGAGAAAAGGCAGAGCTGGGCCAAGGCCCTGCCTCTCCGGGATGGTCTGTGGGGGAGCTGCAGCAGGGAGTG +GCCTCTCTGGGTTGTGGTGGGGGTACAGGCAGCCTGCCCTGGTGGGCACCCTGGAGCCCCATGTGTAGGGAGAGG +AGGGATGGGCATTTTGCACGGGGGCTGATGCCACCACGTCGGGTGTCTCAGAGCCCCAGTCCCCTACCCGGATCC +CCTGGAGCCCAGGAGGGAGGTGTGTGAGCTCAATCCGGACTGTGACGAGTTGGCTGACCACATCGGCTTTCAGGA +GGCCTATCGGCGCTTCTACGGCCCGGTCTAGGGTGTCGCTCTGCTGGCCTGGCCGGCAACCCCAGTTCTGCTCCT +CTCCAGGCACCCTTCTTTCCTCTTCCCCTTGCCCTTGCCCTGACCTCCCAGCCCTATGGATGTGGGGTCCCCATC +ATCCCAGCTGCTCCCAAATAAACTCCAGAAG +>HSGLTH1 Human theta 1-globin gene +CCACTGCACTCACCGCACCCGGCCAATTTTTGTGTTTTTAGTAGAGACTAAATACCATATAGTGAACACCTAAGA +CGGGGGGCCTTGGATCCAGGGCGATTCAGAGGGCCCCGGTCGGAGCTGTCGGAGATTGAGCGCGCGCGGTCCCGG +GATCTCCGACGAGGCCCTGGACCCCCGGGCGGCGAAGCTGCGGCGCGGCGCCCCCTGGAGGCCGCGGGACCCCTG +GCCGGTCCGCGCAGGCGCAGCGGGGTCGCAGGGCGCGGCGGGTTCCAGCGCGGGGATGGCGCTGTCCGCGGAGGA +CCGGGCGCTGGTGCGCGCCCTGTGGAAGAAGCTGGGCAGCAACGTCGGCGTCTACACGACAGAGGCCCTGGAAAG +GTGCGGCAGGCTGGGCGCCCCCGCCCCCAGGGGCCCTCCCTCCCCAAGCCCCCCGGACGCGCCTCACCCACGTTC +CTCTCGCAGGACCTTCCTGGCTTTCCCCGCCACGAAGACCTACTTCTCCCACCTGGACCTGAGCCCCGGCTCCTC +ACAAGTCAGAGCCCACGGCCAGAAGGTGGCGGACGCGCTGAGCCTCGCCGTGGAGCGCCTGGACGACCTACCCCA +CGCGCTGTCCGCGCTGAGCCACCTGCACGCGTGCCAGCTGCGAGTGGACCCGGCCAGCTTCCAGGTGAGCGGCTG +CCGTGCTGGGCCCCTGTCCCCGGGAGGGCCCCGGCGGGGTGGGTGCGGGGGGCGTGCGGGGCGGGTGCAGGCGAG +TGAGCCTTGAGCGCTCGCCGCAGCTCCTGGGCCACTGCCTGCTGGTAACCCTCGCCCGGCACTACCCCGGAGACT +TCAGCCCCGCGCTGCAGGCGTCGCTGGACAAGTTCCTGAGCCACGTTATCTCGGCGCTGGTTTCCGAGTACCGCT +GAACTGTGGGTGGGTGGCCGCGGGATCCCCAGGCGACCTTCCCCGTGTTTGAGTAAAGCCTCTCCCAGGAGCAGC +CTTCTTGCCGTGCTCTCTCGAGGTCAGGACGCGAGAGGAAGGCGC +' +] + +{ #category : 'samples-multi' } +BioAbstractFASTAParserTest >> multiFastaSeq05 [ + " 
http://quma.cdb.riken.jp/help/multiFastaHelp.html " + + ^ '>sequence1 +ACTCCCCGTGCGCGCCCGGCCCGTAGCGTCCTCGTCGCCGCCCCTCGTCTCGCAGCCGCAGCCCGCGTGG +ACGCTCTCGCCTGAGCGCCGCGGACTAGCCCGGGTGGCC +>sequence2 +CAGTCCGGCAGCGCCGGGGTTAAGCGGCCCAAGTAAACGTAGCGCAGCGATCGGCGCCGGAGATTCGCGA +ACCCGACACTCCGCGCCGCCCGCCGGCCAGGACCCGCGGCGCGATCGCGGCGCCGCGCTACAGCCAGCCT +CACTGGCGCGCGGGCGAGCGCACGGGCGCTC +>sequence3 +CACGACAGGCCCGCTGAGGCTTGTGCCAGACCTTGGAAACCTCAGGTATATACCTTTCCAGACGCGGGAT +CTCCCCTCCCC +>sequence4 +CAGCAGACATCTGAATGAAGAAGAGGGTGCCAGCGGGTATGAGGAGTGCATTATCGTTAATGGGAACTTC +AGTGACCAGTCCTCAGACACGAAGGATGCTCCCTCACCCCCAGTCTTGGAGGCAATCTGCACAGAGCCAG +TCTGCACACC' +] + +{ #category : 'samples-multi' } +BioAbstractFASTAParserTest >> multiFastaSeq06 [ + " From http://www.dalkescientific.com/writings/NBN/parsing.html + This is the multiFastaSeq01 with additional lines between records + thisContext receiver new multiFastaSeq06 + " + + ^ String streamContents: [ : stream | + stream + nextPutAll: self multiFastaSeq01Header01; cr; + nextPutAll: self multiFastaSeq01Body01; cr; cr; + nextPutAll: self multiFastaSeq01Header02; cr; + nextPutAll: self multiFastaSeq01Body02 ]. 
+ +] + +{ #category : 'samples-multi' } +BioAbstractFASTAParserTest >> multiFastaSeq07 [ + + ^ '>Sample sequence 1 +garkbdctymvhu + +>Sample sequence 2 +ctymvhgarkbda + +>Sample sequence 3 +ccccccccccga' + +] diff --git a/repository/BioParsers-Tests/BioAbstractParserTest.class.st b/repository/BioParsers-Tests/BioAbstractParserTest.class.st new file mode 100644 index 00000000..8fc4d8d7 --- /dev/null +++ b/repository/BioParsers-Tests/BioAbstractParserTest.class.st @@ -0,0 +1,22 @@ +Class { + #name : 'BioAbstractParserTest', + #superclass : 'BioAbstractTest', + #instVars : [ + 'parser', + 'parseResult' + ], + #category : 'BioParsers-Tests', + #package : 'BioParsers-Tests' +} + +{ #category : 'accessing' } +BioAbstractParserTest >> parser [ + + ^ parser +] + +{ #category : 'accessing' } +BioAbstractParserTest >> parserClass [ + + ^ BioParser +] diff --git a/repository/BioParsers-Tests/BioAccessionParserTest.class.st b/repository/BioParsers-Tests/BioAccessionParserTest.class.st new file mode 100644 index 00000000..915b5093 --- /dev/null +++ b/repository/BioParsers-Tests/BioAccessionParserTest.class.st @@ -0,0 +1,136 @@ +Class { + #name : 'BioAccessionParserTest', + #superclass : 'BioAbstractParserTest', + #category : 'BioParsers-Tests', + #package : 'BioParsers-Tests' +} + +{ #category : 'accessing' } +BioAccessionParserTest >> parserClass [ + " Private - See superimplementor's comment " + + ^ BioAccessionParser +] + +{ #category : 'accessing' } +BioAccessionParserTest >> setUp [ + + super setUp. + parser := self parserClass new +] + +{ #category : 'testing' } +BioAccessionParserTest >> testParseAccession01 [ + + parseResult := self parser parse: 'gi|555|emb|X65215.1|'. + self assert: (parseResult isKindOf: BioAccession). + self assert: parseResult name equals: 'X65215'. + self assert: parseResult version equals: '1'. + +] + +{ #category : 'testing' } +BioAccessionParserTest >> testParseAccession02 [ + + parseResult := self parser parse: 'gi|226437718|gb|AC150860.6|'. 
+ self assert: (parseResult isKindOf: BioAccession). + self assert: parseResult name equals: 'AC150860'. + self assert: parseResult version equals: '6'. + +] + +{ #category : 'testing' } +BioAccessionParserTest >> testParseAccession03 [ + + parseResult := self parser parse: 'gi|207524544|gb|AC226190.2|'. + self assert: (parseResult isKindOf: BioAccession). + self assert: parseResult name equals: 'AC226190'. + self assert: parseResult version equals: '2'. +] + +{ #category : 'testing' } +BioAccessionParserTest >> testParseAccession04 [ + + parseResult := self parser parse: 'gb|AC226190.2|'. + self assert: (parseResult isKindOf: BioAccession). + self assert: parseResult name equals: 'AC226190'. + self assert: parseResult version equals: '2'. +] + +{ #category : 'testing' } +BioAccessionParserTest >> testParseAccession05 [ + + parseResult := self parser parse: 'AC150530.4'. + self assert: (parseResult isKindOf: BioAccession). + self assert: parseResult name equals: 'AC150530'. + self assert: parseResult version equals: '4'. +] + +{ #category : 'testing' } +BioAccessionParserTest >> testParseAccession06 [ + + parseResult := self parser parse: 'AC150707'. + self assert: (parseResult isKindOf: BioAccession). + self assert: parseResult name equals: 'AC150707'. + self deny: parseResult hasVersion. +] + +{ #category : 'testing' } +BioAccessionParserTest >> testParseAccession07 [ + + parseResult := self parser parse: '>gi|198282148|ref|NC_011206.1| Acidithiobacillus ferrooxidans ATCC 53993 chromosome, complete genome'. + self assert: (parseResult isKindOf: BioAccession). + self assert: parseResult name equals: 'NC_011206'. + self assert: parseResult version equals: '1'. +] + +{ #category : 'testing' } +BioAccessionParserTest >> testParseAccession08 [ + + parseResult := self parser parse: '>gi|104773257|ref|NC_008054.1| Lactobacillus delbrueckii subsp. bulgaricus ATCC 11842, complete genome'. + self assert: (parseResult isKindOf: BioAccession). 
+ self assert: parseResult name equals: 'NC_008054'. + self assert: parseResult version equals: '1'. +] + +{ #category : 'testing' } +BioAccessionParserTest >> testTokenizeAccession01 [ + + self + assert: (BioParser tokenizeAccession: 'gi|555|emb|X65215.1|') + equals: #( 'X65215' '1' ). + self + assert: (BioParser tokenizeAccession: 'gi|226437718|gb|AC150860.6|') + equals: #( 'AC150860' '6' ). + self + assert: (BioParser tokenizeAccession: 'gi|207524544|gb|AC226190.2|') + equals: #( 'AC226190' '2' ). + self + assert: + (BioParser tokenizeAccession: 'gi|207524544|gb|AC226190.2345|') + equals: #( 'AC226190' '2345' ). + self + assert: (BioParser tokenizeAccession: 'gb|AC226190.2|') + equals: #( 'AC226190' '2' ). + self + assert: (BioParser tokenizeAccession: 'AC150530.4') + equals: #( 'AC150530' '4' ). + self + assert: (BioParser tokenizeAccession: 'AC150707') + equals: #( 'AC150707' ) +] + +{ #category : 'testing' } +BioAccessionParserTest >> testTokenizeAccession02 [ + + self assert: ( self parser tokenize: 'gi|555|emb|X65215.1|' ) = #('X65215' '1') . + self assert: ( self parser tokenize: 'gi|226437718|gb|AC150860.6|' ) = #('AC150860' '6'). + self assert: ( self parser tokenize: 'gi|207524544|gb|AC226190.2|' ) = #('AC226190' '2'). + self assert: ( self parser tokenize: 'gi|207524544|gb|AC226190.2345|' ) = #('AC226190' '2345'). + self assert: ( self parser tokenize: 'gb|AC226190.2|' ) = #('AC226190' '2'). + self assert: ( self parser tokenize: 'AC150530.4' ) = #('AC150530' '4'). + self assert: ( self parser tokenize: 'AC150707' ) = #('AC150707'). + + self assert: ( ( self parser tokenize: '>gi|198282148|ref|NC_011206.1| Acidithiobacillus ferrooxidans ATCC 53993 chromosome, complete genome' ) = #('NC_011206' '1') ). + self assert: ( ( self parser tokenize: '>gi|104773257|ref|NC_008054.1| Lactobacillus delbrueckii subsp. bulgaricus ATCC 11842, complete genome' ) = #('NC_008054' '1') ). 
+] diff --git a/repository/BioParsers-Tests/BioCGCParserTest.class.st b/repository/BioParsers-Tests/BioCGCParserTest.class.st new file mode 100644 index 00000000..22be415f --- /dev/null +++ b/repository/BioParsers-Tests/BioCGCParserTest.class.st @@ -0,0 +1,31 @@ +Class { + #name : 'BioCGCParserTest', + #superclass : 'BioAbstractParserTest', + #instVars : [ + 'cgcParser' + ], + #category : 'BioParsers-Tests', + #package : 'BioParsers-Tests' +} + +{ #category : 'testing' } +BioCGCParserTest >> cgcSeq01 [ + " From http://www.genomatix.de/online_help/help/sequence_formats.html " + + ^ 'ID AB000263 standard; RNA; PRI; 368 BP. +XX +AC AB000263; +XX +DE Homo sapiens mRNA for prepro cortistatin like peptide, complete cds. +XX +SQ Sequence 368 BP; +AB000263 Length: 368 Check: 4514 .. + 1 acaagatgcc attgtccccc ggcctcctgc tgctgctgct ctccggggcc acggccaccg + 61 ctgccctgcc cctggagggt ggccccaccg gccgagacag cgagcatatg caggaagcgg + 121 caggaataag gaaaagcagc ctcctgactt tcctcgcttg gtggtttgag tggacctccc + 181 aggccagtgc cgggcccctc ataggagagg aagctcggga ggtggccagg cggcaggaag + 241 gcgcaccccc ccagcaatcc gcgcgccggg acagaatgcc ctgcaggaac ttcttctgga + 301 agaccttctc ctcctgcaaa taaaacctca cccatgaatg ctcacgcaag tttaattaca + 361 gacctgaa +' +] diff --git a/repository/BioParsers-Tests/BioDNANucleotideParserTest.class.st b/repository/BioParsers-Tests/BioDNANucleotideParserTest.class.st new file mode 100644 index 00000000..7c955e89 --- /dev/null +++ b/repository/BioParsers-Tests/BioDNANucleotideParserTest.class.st @@ -0,0 +1,66 @@ +Class { + #name : 'BioDNANucleotideParserTest', + #superclass : 'BioAbstractParserTest', + #category : 'BioParsers-Tests', + #package : 'BioParsers-Tests' +} + +{ #category : 'running' } +BioDNANucleotideParserTest >> setUp [ + + super setUp. + parser := #dnaLetter asPParser. + +] + +{ #category : 'testing' } +BioDNANucleotideParserTest >> testDNALetterMatchA [ + + self assert: (parser matches: 'a'). + self assert: (parser matches: 'A'). 
+ +] + +{ #category : 'testing' } +BioDNANucleotideParserTest >> testDNALetterMatchC [ + + self assert: (parser matches: 'c'). + self assert: (parser matches: 'C'). + +] + +{ #category : 'testing' } +BioDNANucleotideParserTest >> testDNALetterMatchG [ + + self assert: (parser matches: 'g'). + self assert: (parser matches: 'G'). + +] + +{ #category : 'testing' } +BioDNANucleotideParserTest >> testDNALetterMatchN [ + + self assert: (parser matches: 'N'). + self assert: (parser matches: 'n'). +] + +{ #category : 'testing' } +BioDNANucleotideParserTest >> testDNALetterMatchT [ + + self assert: (parser matches: 't'). + self assert: (parser matches: 'T'). + +] + +{ #category : 'testing' } +BioDNANucleotideParserTest >> testDNANucelotideEmpty [ + + self deny: (parser matches: String empty) +] + +{ #category : 'testing' } +BioDNANucleotideParserTest >> testDNANucleotideCharacter [ + + self should: [ parser matches: $a ] raise: MessageNotUnderstood. + self should: [ parser matches: nil ] raise: MessageNotUnderstood. +] diff --git a/repository/BioParsers-Tests/BioDNASequenceParserTest.class.st b/repository/BioParsers-Tests/BioDNASequenceParserTest.class.st new file mode 100644 index 00000000..6fb3afe7 --- /dev/null +++ b/repository/BioParsers-Tests/BioDNASequenceParserTest.class.st @@ -0,0 +1,45 @@ +Class { + #name : 'BioDNASequenceParserTest', + #superclass : 'BioAbstractParserTest', + #category : 'BioParsers-Tests', + #package : 'BioParsers-Tests' +} + +{ #category : 'accessing' } +BioDNASequenceParserTest >> setUp [ + + super setUp. + parser := #dnaSequence asPParser. + +] + +{ #category : 'testing' } +BioDNASequenceParserTest >> testDNASeqMatchString [ + + self assert: (parser matches: 'actg'). + self assert: (parser matches: 'ACTG'). + +] + +{ #category : 'testing' } +BioDNASequenceParserTest >> testDNASeqMatches [ + + self assert: (parser matches: 'a'). + self assert: (parser matches: 'A'). + self assert: (parser matches: 'N'). + self assert: (parser matches: 'n'). 
+ +] + +{ #category : 'testing' } +BioDNASequenceParserTest >> testDNASeqNumber [ + + self deny: (parser matches: '8743'). +] + +{ #category : 'testing' } +BioDNASequenceParserTest >> testDNASeqParseEmpty [ + + self deny: (parser matches: String empty). + +] diff --git a/repository/BioParsers-Tests/BioDegenerateBaseParserTest.class.st b/repository/BioParsers-Tests/BioDegenerateBaseParserTest.class.st new file mode 100644 index 00000000..40834565 --- /dev/null +++ b/repository/BioParsers-Tests/BioDegenerateBaseParserTest.class.st @@ -0,0 +1,110 @@ +Class { + #name : 'BioDegenerateBaseParserTest', + #superclass : 'BioAbstractParserTest', + #category : 'BioParsers-Tests', + #package : 'BioParsers-Tests' +} + +{ #category : 'testing' } +BioDegenerateBaseParserTest >> testParseAmbiguousSequenceWithSeparators01 [ + + parseResult := self parserClass parseAmbiguousWithSeparators: 'AT[A/C]TA'. + + self assert: parseResult size equals: 5. + self assert: parseResult asString equals: 'ATMTA'. +] + +{ #category : 'testing' } +BioDegenerateBaseParserTest >> testParseAmbiguousSequenceWithSeparators02 [ + + parseResult := self parserClass parseAmbiguousWithSeparators: '[G/A]ACTGCA'. + + self assert: parseResult size equals: 7. + self assert: parseResult asString equals: 'RACTGCA'. + +] + +{ #category : 'testing' } +BioDegenerateBaseParserTest >> testParseAmbiguousSequenceWithSeparators03 [ + + parseResult := self parserClass parseAmbiguousWithSeparators: 'ACTGCA[T/C]'. + + self assert: parseResult size equals: 7. + self assert: parseResult asString equals: 'ACTGCAY' +] + +{ #category : 'testing' } +BioDegenerateBaseParserTest >> testParseAmbiguousSequenceWithoutSeparators01 [ + + parseResult := self parserClass parseAmbiguousWithoutSeparators: 'AT[AC]TA'. + + self assert: parseResult size equals: 5. + self assert: parseResult asString equals: 'ATMTA'. 
+] + +{ #category : 'testing' } +BioDegenerateBaseParserTest >> testParseAmbiguousSequenceWithoutSeparators02 [ + "A leading group without a slash, [GA], is expected to parse to R, giving 7 elements." + parseResult := self parserClass parseAmbiguousWithoutSeparators: '[GA]ACTGCA'. + + self assert: parseResult size equals: 7. + self assert: parseResult asString equals: 'RACTGCA'. + +] + +{ #category : 'testing' } +BioDegenerateBaseParserTest >> testParseAmbiguousSequenceWithoutSeparators03 [ + "A trailing group without a slash, [TC], is expected to parse to Y, giving 7 elements." + parseResult := self parserClass parseAmbiguousWithoutSeparators: 'ACTGCA[TC]'. + + self assert: parseResult size equals: 7. + self assert: parseResult asString equals: 'ACTGCAY' +] + +{ #category : 'testing' } +BioDegenerateBaseParserTest >> testTokenizeAmbiguousSequenceWithoutSeparators [ "tokenizeAmbiguousWithoutSeparators: is expected to replace every bracket group (no slash) by one ambiguity letter; inputs hold one or two groups." + self + assert: (self parserClass tokenizeAmbiguousWithoutSeparators: 'AT[AC]TA') + equals: 'ATMTA'. + self + assert: (self parserClass tokenizeAmbiguousWithoutSeparators: '[GA]ACTGCA') + equals: 'RACTGCA'. + self + assert: (self parserClass tokenizeAmbiguousWithoutSeparators: 'ACTGCA[TC]') + equals: 'ACTGCAY'. + self + assert: (self parserClass tokenizeAmbiguousWithoutSeparators: 'AT[AC]TA[CA]') + equals: 'ATMTAM'. + self + assert: + (self parserClass tokenizeAmbiguousWithoutSeparators: '[GA]ACT[AG]GCA') + equals: 'RACTRGCA'. + self + assert: + (self parserClass tokenizeAmbiguousWithoutSeparators: '[CT]ACTGCA[TC]') + equals: 'YACTGCAY' +] + +{ #category : 'testing' } +BioDegenerateBaseParserTest >> testTokenizeDelimitedAmbiguousSequence [ + "tokenizeAmbiguousWithSeparators: is expected to replace every slash-separated bracket group by one ambiguity letter." + self assert: (self parserClass tokenizeAmbiguousWithSeparators: 'AT[A/C]TA') equals: 'ATMTA'. + self assert: (self parserClass tokenizeAmbiguousWithSeparators: '[G/A]ACTGCA') equals: 'RACTGCA'. + self assert: (self parserClass tokenizeAmbiguousWithSeparators: 'ACTGCA[T/C]') equals: 'ACTGCAY'. + + self assert: (self parserClass tokenizeAmbiguousWithSeparators: 'AT[A/C]TA[C/A]') equals: 'ATMTAM'. + self assert: (self parserClass tokenizeAmbiguousWithSeparators: '[G/A]ACT[A/G]GCA') equals: 'RACTRGCA'.
+ self assert: (self parserClass tokenizeAmbiguousWithSeparators: '[C/T]ACTGCA[T/C]') equals: 'YACTGCAY'. +] + +{ #category : 'testing' } +BioDegenerateBaseParserTest >> testTokenizeResolveAmbiguousSequence [ "Placeholder: every assertion below is commented out, so the test reports itself as skipped instead of passing without checking anything." + self skip: 'All assertions are commented out; nothing is verified yet'. +" self assert: (self parserClass tokenizeAmbiguousLettersToSeparators: 'ATMTA') equals: 'AT[A/C]TA'. + self assert: (self parserClass tokenizeAmbiguousLettersToSeparators: 'RACTGCA') equals: '[G/A]ACTGCA'. + self assert: (self parserClass tokenizeAmbiguousLettersToSeparators: 'ACTGCAY') equals: 'ACTGCA[T/C]'." + +" self assert: (self parserClass tokenizeAmbiguousWithSeparators: 'AT[A/C]TA[C/A]') equals: 'ATMTAM'. + self assert: (self parserClass tokenizeAmbiguousWithSeparators: '[G/A]ACT[A/G]GCA') equals: 'RACTRGCA'. + self assert: (self parserClass tokenizeAmbiguousWithSeparators: '[C/T]ACTGCA[T/C]') equals: 'YACTGCAY'. " +] diff --git a/repository/BioParsers-Tests/BioEMBLParserTest.class.st b/repository/BioParsers-Tests/BioEMBLParserTest.class.st new file mode 100644 index 00000000..f70b17c4 --- /dev/null +++ b/repository/BioParsers-Tests/BioEMBLParserTest.class.st @@ -0,0 +1,33 @@ +Class { + #name : 'BioEMBLParserTest', + #superclass : 'BioAbstractParserTest', + #instVars : [ + 'emblParser' + ], + #category : 'BioParsers-Tests', + #package : 'BioParsers-Tests' +} + +{ #category : 'accessing' } +BioEMBLParserTest >> emblSeq01 [ + " Answer a sample EMBL entry (ID AB000263) as a String. From http://www.genomatix.de/online_help/help/sequence_formats.html NOTE(review): the literal begins with an introductory sentence rather than with the ID line; confirm this is intended. " + + ^ 'An example sequence in EMBL format is: + +ID AB000263 standard; RNA; PRI; 368 BP. +XX +AC AB000263; +XX +DE Homo sapiens mRNA for prepro cortistatin like peptide, complete cds.
+XX +SQ Sequence 368 BP; + acaagatgcc attgtccccc ggcctcctgc tgctgctgct ctccggggcc acggccaccg 60 + ctgccctgcc cctggagggt ggccccaccg gccgagacag cgagcatatg caggaagcgg 120 + caggaataag gaaaagcagc ctcctgactt tcctcgcttg gtggtttgag tggacctccc 180 + aggccagtgc cgggcccctc ataggagagg aagctcggga ggtggccagg cggcaggaag 240 + gcgcaccccc ccagcaatcc gcgcgccggg acagaatgcc ctgcaggaac ttcttctgga 300 + agaccttctc ctcctgcaaa taaaacctca cccatgaatg ctcacgcaag tttaattaca 360 + gacctgaa 368 +// +' +] diff --git a/repository/BioParsers-Tests/BioEntrezXMLGenBankAccessionParserTest.class.st b/repository/BioParsers-Tests/BioEntrezXMLGenBankAccessionParserTest.class.st new file mode 100644 index 00000000..8c6f4492 --- /dev/null +++ b/repository/BioParsers-Tests/BioEntrezXMLGenBankAccessionParserTest.class.st @@ -0,0 +1,1445 @@ +Class { + #name : 'BioEntrezXMLGenBankAccessionParserTest', + #superclass : 'BioAbstractParserTest', + #category : 'BioParsers-Tests', + #package : 'BioParsers-Tests' +} + +{ #category : 'accessing' } +BioEntrezXMLGenBankAccessionParserTest >> gbSet01 [ + " GBSeq size = 1 " + + ^ ' + + + + HQ184032 + 16339 + double + DNA + circular + MAM + 07-JAN-2011 + 07-JAN-2011 + Bos taurus isolate Chi597 mitochondrion, complete genome + HQ184032 + HQ184032.1 + + gb|HQ184032.1| + gi|306977295 + + mitochondrion Bos taurus (cattle) + Bos taurus + Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi; Mammalia; Eutheria; Laurasiatheria; Cetartiodactyla; Ruminantia; Pecora; Bovidae; Bovinae; Bos + + + 1 + 1..16339 + + Bonfiglio,S. + Achilli,A. + Olivieri,A. + Negrini,R. + Colli,L. + Liotta,L. + Ajmone-Marsan,P. + Torroni,A. + Ferretti,L. + + The Enigmatic Origin of Bovine mtDNA Haplogroup R: Sporadic Interbreeding or an Independent Event of Bos primigenius Domestication in Italy? + PLoS ONE 5 (12), E15760 (2010) + + + doi + 10.1371/journal.pone.0015760 + + + 21209945 + Publication Status: Online-Only + + + 2 + 1..16339 + + Bonfiglio,S. + Achilli,A. + Olivieri,A. + Negrini,R. 
+ Colli,L. + Liotta,L. + Ajmone-Marsan,P. + Torroni,A. + Ferretti,L. + + Direct Submission + Submitted (25-AUG-2010) Dipartimento di Genetica e Microbiologia, University of Pavia, Via Ferrata, 1, Pavia 27100, Italy + + + + + source + 1..16339 + + + 1 + 16339 + HQ184032.1 + + + + + organism + Bos taurus + + + organelle + mitochondrion + + + mol_type + genomic DNA + + + isolate + Chi597 + + + db_xref + taxon:9913 + + + haplogroup + Q2 + + + country + Italy + + + note + breed: Chianina + + + + + D-loop + join(15793..16339,1..362) + + + 15793 + 16339 + HQ184032.1 + + + 1 + 362 + HQ184032.1 + + + join + + + tRNA + 364..430 + + + 364 + 430 + HQ184032.1 + + + + + product + tRNA-Phe + + + + + rRNA + 431..1386 + + + 431 + 1386 + HQ184032.1 + + + + + product + small subunit ribosomal RNA + + + + + tRNA + 1387..1453 + + + 1387 + 1453 + HQ184032.1 + + + + + product + tRNA-Val + + + + + rRNA + 1454..3024 + + + 1454 + 3024 + HQ184032.1 + + + + + product + large subunit ribosomal RNA + + + + + tRNA + 3025..3099 + + + 3025 + 3099 + HQ184032.1 + + + + + product + tRNA-Leu + + + note + codons recognized: UUR + + + + + gene + 3102..4057 + + + 3102 + 4057 + HQ184032.1 + + + + + gene + ND1 + + + + + CDS + 3102..4057 + + + 3102 + 4057 + HQ184032.1 + + + + + gene + ND1 + + + note + TAA stop codon is completed by the addition of 3' A residues to the mRNA + + + codon_start + 1 + + + transl_except + (pos:4056..4057,aa:TERM) + + + transl_table + 2 + + + product + NADH dehydrogenase subunit 1 + + + protein_id + ADN11801.1 + + + db_xref + GI:306977296 + + + translation + MFMINILMLIIPILLAVAFLTLVERKVLGYMQLRKGPNVVGPYGLLQPIADAIKLFIKEPLRPATSSASMFILAPIMALGLALTMWIPLPMPYPLINMNLGVLFILAMSSLAVYSILWSGWASNSKYALIGALRAVAQTISYEVTLAIILLSVLLMSGSFTLSTLITTQEQMWLILPAWPLAMMWFISTLAETNRAPFDLTEGESELVSGFNVEYAAGPFALFFMAEYANIIMMNIFTAILFLGTSHNPHMPELYTINFTIKSLLLTMSFLWIRASYPRFRYDQLMHLLWKNFLPLTLALCMWHVSLPILTSGIPPQT + + + + + tRNA + 4058..4126 + + + 4058 + 4126 + HQ184032.1 + + + + + product + tRNA-Ile + + + + + tRNA + 
complement(4124..4195) + + + 4195 + 4124 + + HQ184032.1 + + + + + product + tRNA-Gln + + + + + tRNA + 4198..4266 + + + 4198 + 4266 + HQ184032.1 + + + + + product + tRNA-Met + + + + + gene + 4267..5308 + + + 4267 + 5308 + HQ184032.1 + + + + + gene + ND2 + + + + + CDS + 4267..5308 + + + 4267 + 5308 + HQ184032.1 + + + + + gene + ND2 + + + note + TAA stop codon is completed by the addition of 3' A residues to the mRNA + + + codon_start + 1 + + + transl_except + (pos:5308,aa:TERM) + + + transl_table + 2 + + + product + NADH dehydrogenase subunit 2 + + + protein_id + ADN11802.1 + + + db_xref + GI:306977297 + + + translation + MNPIIFIIILLTIMLGTIIVMISSHWLLVWIGFEMNMLAIIPIMMKNHNPRATEASTKYFLTQSTASMLLMMAVIINLMFSGQWTVMKLFNPMASMLMTMALAMKLGMAPFHFWVPEVTQGIPLSSGLILLTWQKLAPMSVLYQIFPSINLNLILTLSVLSILIGGWGGLNQTQLRKIMAYSSIAHMGWMTAVLPYNPTMTLLNLIIYIIMTSTMFTMFMANSTTTTLSLSHTWNKTPIMTVLILATLLSMGGLPPLSGFMPKWMIIQEMTKNNSIILPTFMAITALLNLYFYMRLTYSTTLTMFPSTNNMKMKWQFPLMKKMTFLPTMVVLSTMMLPLTPMLSVLE + + + + + tRNA + 5309..5375 + + + 5309 + 5375 + HQ184032.1 + + + + + product + tRNA-Trp + + + + + tRNA + complement(5377..5445) + + + 5445 + 5377 + + HQ184032.1 + + + + + product + tRNA-Ala + + + + + tRNA + complement(5447..5519) + + + 5519 + 5447 + + HQ184032.1 + + + + + product + tRNA-Asn + + + + + rep_origin + 5520..5550 + + + 5520 + 5550 + HQ184032.1 + + + + + note + origin of L-strand replication + + + + + tRNA + complement(5552..5618) + + + 5618 + 5552 + + HQ184032.1 + + + + + product + tRNA-Cys + + + + + tRNA + complement(5619..5686) + + + 5686 + 5619 + + HQ184032.1 + + + + + product + tRNA-Tyr + + + + + gene + 5688..7232 + + + 5688 + 7232 + HQ184032.1 + + + + + gene + COX1 + + + + + CDS + 5688..7232 + + + 5688 + 7232 + HQ184032.1 + + + + + gene + COX1 + + + codon_start + 1 + + + transl_table + 2 + + + product + cytochrome c oxidase subunit I + + + protein_id + ADN11803.1 + + + db_xref + GI:306977298 + + + translation + 
MFINRWLFSTNHKDIGTLYLLFGAWAGMVGTALSLLIRAELGQPGTLLGDDQIYNVVVTAHAFVMIFFMVMPIMIGGFGNWLVPLMIGAPDMAFPRMNNMSFWLLPPSFLLLLASSMVEAGAGTGWTVYPPLAGNLAHAGASVDLTIFSLHLAGVSSILGAINFITTIINMKPPAMSQYQTPLFVWSVMITAVLLLLSLPVLAAGITMLLTDRNLNTTFFDPAGGGDPILYQHLFWFFGHPEVYILILPGFGMISHIVTYYSGKKEPFGYMGMVWAMMSIGFLGFIVWAHHMFTVGMDVDTRAYFTSATMIIAIPTGVKVFSWLATLHGGNIKWSPAMMWALGFIFLFTVGGLTGIVLANSSLDIVLHDTYYVVAHFHYVLSMGAVFAIMGGFVHWFPLFSGYTLNDTWAKIHFAIMFVGVNMTFFPQHFLGLSGMPRRYSDYPDAYTMWNTISSMGSFISLTAVMLMVFIIWEAFASKREVLTVDLTTTNLEWLNGCPPPYHTFEEPTYVNLK + + + + + tRNA + complement(7230..7300) + + + 7300 + 7230 + + HQ184032.1 + + + + + product + tRNA-Ser + + + note + codons recognized: UCN + + + + + tRNA + 7305..7373 + + + 7305 + 7373 + HQ184032.1 + + + + + product + tRNA-Asp + + + + + gene + 7375..8058 + + + 7375 + 8058 + HQ184032.1 + + + + + gene + COX2 + + + + + CDS + 7375..8058 + + + 7375 + 8058 + HQ184032.1 + + + + + gene + COX2 + + + codon_start + 1 + + + transl_table + 2 + + + product + cytochrome c oxidase subunit II + + + protein_id + ADN11804.1 + + + db_xref + GI:306977299 + + + translation + MAYPMQLGFQDATSPIMEELLHFHDHTLMIVFLISSLVLYIISLMLTTKLTHTSTMDAQEVETIWTILPAIILILIALPSLRILYMMDEINNPSLTVKTMGHQWYWSYEYTDYEDLSFDSYMIPTSELKPGELRLLEVDNRVVLPMEMTIRMLVSSEDVLHSWAVPSLGLKTDAIPGRLNQMTLMSSRPGLYYGQCSEICGSNHSFMPIVLELVPLKYFEKWSASML + + + + + tRNA + 8062..8128 + + + 8062 + 8128 + HQ184032.1 + + + + + product + tRNA-Lys + + + + + gene + 8130..8330 + + + 8130 + 8330 + HQ184032.1 + + + + + gene + ATP8 + + + + + CDS + 8130..8330 + + + 8130 + 8330 + HQ184032.1 + + + + + gene + ATP8 + + + codon_start + 1 + + + transl_table + 2 + + + product + ATP synthase F0 subunit 8 + + + protein_id + ADN11805.1 + + + db_xref + GI:306977300 + + + translation + MPQLDTSTWLTMILSMFLTLFIIFQLKVSKHNFYHNPELTPTKMLKQNTPWETKWTKIYLPLLLPL + + + + + gene + 8291..8971 + + + 8291 + 8971 + HQ184032.1 + + + + + gene + ATP6 + + + + + CDS + 8291..8971 + + + 8291 + 8971 + HQ184032.1 + + + + + gene + ATP6 + + + codon_start + 1 + + + transl_table + 2 + + + 
product + ATP synthase F0 subunit 6 + + + protein_id + ADN11806.1 + + + db_xref + GI:306977301 + + + translation + MNENLFTSFTTPVILGLPLVTLIVLFPSLLFPTSNRLVSNRFVTLQQWMLQLVSKQMMSIHNSKGQTWTLMLMSLILFIGSTNLLGLLPHSFTPTTQLSMNLGMAIPLWAGAVITGFRNKTKASLAHFLPQGTPTPLIPMLVIIETISLFIQPMALAVRLTANITAGHLLIHLIGGATLALMSISTTTALITFTILTLLTILEFAVAMIQAYVFTLLVSLYLHDNT + + + + + gene + 8971..9754 + + + 8971 + 9754 + HQ184032.1 + + + + + gene + COX3 + + + + + CDS + 8971..9754 + + + 8971 + 9754 + HQ184032.1 + + + + + gene + COX3 + + + note + TAA stop codon is completed by the addition of 3' A residues to the mRNA + + + codon_start + 1 + + + transl_except + (pos:9754,aa:TERM) + + + transl_table + 2 + + + product + cytochrome c oxidase subunit III + + + protein_id + ADN11807.1 + + + db_xref + GI:306977302 + + + translation + MTHQTHAYHMVNPSPWPLTGALSALLMTSGLTMWFHFNSMTLLMIGLTTNMLTMYQWWRDVIRESTFQGHHTPAVQKGLRYGMILFIISEVLFFTGFFWAFYHSSLAPTPELGGCWPPTGIHPLNPLEVPLLNTSVLLASGVSITWAHHSLMEGDRKHMLQALFITITLGVYFTLLQASEYYEAPFTISDGVYGSTFFVATGFHGLHVIIGSTFLIVCFFRQLKFHFTSNHHFGFEAAAWYWHFVDVVWLFLYVSIYWWGS + + + + + tRNA + 9755..9823 + + + 9755 + 9823 + HQ184032.1 + + + + + product + tRNA-Gly + + + + + gene + 9824..10170 + + + 9824 + 10170 + HQ184032.1 + + + + + gene + ND3 + + + + + CDS + 9824..10170 + + + 9824 + 10170 + HQ184032.1 + + + + + gene + ND3 + + + note + TAA stop codon is completed by the addition of 3' A residues to the mRNA + + + codon_start + 1 + + + transl_except + (pos:10169..10170,aa:TERM) + + + transl_table + 2 + + + product + NADH dehydrogenase subunit 3 + + + protein_id + ADN11808.1 + + + db_xref + GI:306977303 + + + translation + MNLMLALLTNFTLATLLVIIAFWLPQLNVYSEKTSPYECGFDPMGSARLPFSMKFFLVAITFLLFDLEIALLLPLPWASQTANLNTMLTMALFLIILLAVSLAYEWTQKGLEWTE + + + + + tRNA + 10171..10239 + + + 10171 + 10239 + HQ184032.1 + + + + + product + tRNA-Arg + + + + + gene + 10240..10536 + + + 10240 + 10536 + HQ184032.1 + + + + + gene + ND4L + + + + + CDS + 10240..10536 + + + 10240 + 10536 + HQ184032.1 + + + + + gene + ND4L + 
+ + codon_start + 1 + + + transl_table + 2 + + + product + NADH dehydrogenase subunit 4L + + + protein_id + ADN11809.1 + + + db_xref + GI:306977304 + + + translation + MSMVYMNIMMAFTVSLVGLLMYRSHLMSSLLCLEGMMLSLFVMAALTILNSHFTLASMMPIILLVFAACEAALGLSLLVMVSNTYGTDYVQNLNLLQC + + + + + gene + 10530..11907 + + + 10530 + 11907 + HQ184032.1 + + + + + gene + ND4 + + + + + CDS + 10530..11907 + + + 10530 + 11907 + HQ184032.1 + + + + + gene + ND4 + + + note + TAA stop codon is completed by the addition of 3' A residues to the mRNA + + + codon_start + 1 + + + transl_except + (pos:11907,aa:TERM) + + + transl_table + 2 + + + product + NADH dehydrogenase subunit 4 + + + protein_id + ADN11810.1 + + + db_xref + GI:306977305 + + + translation + MLKYIIPTIMLMPLTWLSKNNMIWVNSTAHSLLISFTSLLLMNQFGDNSLNFSLLFFSDSLSTPLLILTMWLLPLMLMASQHHLSKENLTRKKLFITMLISLQLFLIMTFTAMELILFYILFEATLVPTLIIITRWGNQTERLNAGLYFLFYTLAGSLPLLVALIYIQNTVGSLNFLMLQYWVQPVHNSWSNVFMWLACMMAFMVKMPLYGLHLWLPKAHVEAPIAGSMVLAAVLLKLGGYGMLRITLILNPMTDFMAYPFIMLSLWGMIMTSSICLRQTDLKSLIAYSSVSHMALVIVAILIQTPWSYMGATALMIAHGLTSSMLFCLANSNYERIHSRTMILARGLQTLLPLMATWWLLASLTNLALPPTINLIGELFVVMSTFSWSNITIILMGVNMVITALYSLYMLIMTQRGKYTYHINNISPSFTRENALMSLHILPLLLLTLNPKIILGPLY + + + + + tRNA + 11908..11977 + + + 11908 + 11977 + HQ184032.1 + + + + + product + tRNA-His + + + + + tRNA + 11978..12037 + + + 11978 + 12037 + HQ184032.1 + + + + + product + tRNA-Ser + + + note + codons recognized: AGY + + + + + tRNA + 12039..12109 + + + 12039 + 12109 + HQ184032.1 + + + + + product + tRNA-Leu + + + note + codons recognized: CUN + + + + + gene + 12110..13930 + + + 12110 + 13930 + HQ184032.1 + + + + + gene + ND5 + + + + + CDS + 12110..13930 + + + 12110 + 13930 + HQ184032.1 + + + + + gene + ND5 + + + codon_start + 1 + + + transl_table + 2 + + + product + NADH dehydrogenase subunit 5 + + + protein_id + ADN11811.1 + + + db_xref + GI:306977306 + + + translation + 
MNMFSSLSLVTLLLLTMPIMMMSFNTYKPSNYPLYVKTAISYAFITSMIPTMMFIHSGQELIISNWHWLTIQTLKLSLSFKMDYFSMMFIPVALFVTWSIMEFSMWYMHSDPNINKFFKYLLLFLITMLILVTANNLFQLFIGWEGVGIMSFLLIGWWYGRADANTAALQAILYNRIGDIGFILAMAWFLTNLNTWDLQQIFMLNPSDSNMPLIGLALAATGKSAQFGLHPWLPSAMEGPTPVSALLHSSTMVVAGIFLLIRFYPLTENNKYIQSITLCLGAITTLFTAMCALTQNDIKKIIAFSTSSQLGLMMVTIGINQPYLAFLHICTHAFFKAMLFMCSGSIIHSLNDEQDIRKMGGLFKAMPFTTTALIVGSLALTGMPFLTGFYSKDLIIEAANTSYTNAWALLMTLIATSFTAIYSTRIIFFALLGQPRFPTLVNINENNPLLINSIKRLLIGSLFAGYIISNNIPPTTIPQMTMPYYLKTTALIVTILGFILALEISNMTKNLKYHYPSNAFKFSTLLGYFPTIMHRLAPYMNLSMSQKSASSLLDLIWLEAILPKTISLAQMKASTLVTNQKGLIKLYFLSFLITILISMILFNFHE + + + + + gene + complement(13914..14441) + + + 14441 + 13914 + + HQ184032.1 + + + + + gene + ND6 + + + + + CDS + complement(13914..14441) + + + 14441 + 13914 + + HQ184032.1 + + + + + gene + ND6 + + + codon_start + 1 + + + transl_table + 2 + + + product + NADH dehydrogenase subunit 6 + + + protein_id + ADN11812.1 + + + db_xref + GI:306977307 + + + translation + MMLYIVFILSVIFVMGFVGFSSKPSPIYGGLGLIVSGGVGCGIVLNFGGSFLGLMVFLIYLGGMMVVFGYTTAMATEQYPEIWLSNKAVLGAFVTGLLMEFFMVYYVLKDKEVEVVFEFNGLGDWVIYDTGDSGFFSEEAMGIAALYSYGTWLVIVTGWSLLIGVVVIMEITRGN + + + + + tRNA + complement(14442..14510) + + + 14510 + 14442 + + HQ184032.1 + + + + + product + tRNA-Glu + + + + + gene + 14515..15654 + + + 14515 + 15654 + HQ184032.1 + + + + + gene + CYTB + + + + + CDS + 14515..15654 + + + 14515 + 15654 + HQ184032.1 + + + + + gene + CYTB + + + codon_start + 1 + + + transl_table + 2 + + + product + cytochrome b + + + protein_id + ADN11813.1 + + + db_xref + GI:306977308 + + + translation + MTNIRKSHPLMKIVNNAFIDLPAPSNISSWWNFGSLLGICLILQILTGLFLAMHYTSDTTTAFSSVTHICRDVNYGWIIRYMHANGASMFFICLYMHVGRGLYYGSYTFLETWNIGVILLLTVMATAFMGYVLPWGQMSFWGATVITNLLSAIPYIGTNLVEWIWGGFSVDKATLTRFFAFHFILPFIIMAIAMVHLLFLHETGSNNPTGISSDMDKIPFHPYYTIKDILGALLLILALMLLVLFAPDLLGDPDNYTPANPLNTPPHIKPEWYFLFAYAILRSIPNKLGGVLALAFSILILALIPLLHTSKQRSMMFRPLSQCLFWALVADLLTLTWIGGQPVEHPYITIGQLASVLYFLLILVLMPTAGTVENKLLKW + + + + + tRNA + 15659..15727 + + + 15659 + 
15727 + HQ184032.1 + + + + + product + tRNA-Thr + + + + + tRNA + complement(15727..15792) + + + 15792 + 15727 + + HQ184032.1 + + + + + product + tRNA-Pro + + + + + actaatggctaatcagcccatgctcacacataactgtgctgtcatacatttggtatttttttattttgggggatgcttggactcagctatggccgtcaaaggccctgacccggagcatctattgtagctggacttaactgcatcttgagcaccagcataatgataagcgtggacattacagtcaatggtcacaggacataaattatattatatatccccccttcataaaaatttcccccttaaatatctaccaccacttttaacagacttttccctagatacttatttaaatttttcacgctttcaatactcaatttagcactccaaacaaagtcaatatataaacgcaggccccccccccccgttgatgtagcttaacccaaagcaaggcactgaaaatgcctagatgagtctcccaactccataaacacataggtttggtcccagccttcctgttaactcttaataaacttacacatgcaagcatctacaccccagtgagaatgccctctaggttattaaaactaagaggagctggcatcaagcacacaccctgtagctcacgacgccttgcttaaccacacccccacgggaaacagcagtgacaaaaattaagccataaacgaaagtttgactaagttatattaattagggttggtaaatctcgtgccagccaccgcggtcatacgattaacccaagctaacaggagtacggcgtaaaacgtgttaaagcaccataccaaatagggttaaattctaactaagctgtaaaaagccatgattaaaataaaaataaatgacgaaagtgaccctacaatagccgacgcactatagctaagacccaaactgggattagataccccactatgcttagccctaaacacagataattacataaacaaaattattcgccagagtactactagcaacagcttaaaactcaaaggacttggcggtgctttatatccttctagaggagcctgttctataatcgataaaccccgataaacctcaccaattcttgctaatacagtctatataccgccatcttcagcaaaccctaaaaaggaaaaaaagtaagcgtaattatgatacataaaaacgttaggtcaaggtgtaacctatgaaatgggaagaaatgggctacattctctacaccaagagaatcaagcacgaaagttattatgaaaccaataaccaaaggaggatttagcagtaaactaagaatagagtgcttagttgaattaggccatgaagcacgcacacaccgcccgtcaccctcctcaaatagattcagtgcatctaaccctatttaaacgcactagctacatgagaggagacaagtcgtaacaaggtaagcatactggaaagtgtgcttggataaatcaagatatagcttaaacaaagcatccagtttacacctagaagacttcattcattatgaatatcttgaactagacctagcccaaagataccctctcgactaaacaaccaagatagaataaaacaaaacatttaatcccaatttaaagtataggagatagaaatctaagtacggcgctatagagaaagtaccgcaagggaacgatgaaagaaaaaaactaaaagtataaaaaagcaaagattaccccttgtaccttttgcataatgaattaactagtataagacttaacaaaatgaattttagctaagcagcccgaaaccagacgagctactcacaaacagtttaccaagaactaactcatctatgtggcaaaatagtgagaagatttgtaagtagaggtgacatgcctaacgagcctggtgatagctggttgtccagaaaatgaatctaag
ttcagctttaaagataccaaaaattcaaataaaccccactgtagctttaaaagttagtctaaaaaggtacagccttttagaaacggatacaaccttgactagagagtaaaatttaacactaccatagtaggcctaaaagcagccatcaattaagaaagcgttaaagctcaacaacaaaaattaaatagattccaacaacaaatgattaactcctagccccaatactggactaatctattatagaatagaagcaataatgttaatatgagtaacaagaaaaattttctccttgcataagtctaagtcagtgcctgataatactctgaccactaacagtcaataaaaataatccaacaataaacaatttattgattatactgttaacccaacacaggagtgcatctaaggaaagattaaaagaagtaaaaggaactcggcaaacacaaaccccgcctgtttaccaaaaacatcacctccagcattcccagtattggaggcattgcctgcccagtgacaactgtttaacggccgcggtatcctgaccgtgcaaaggtagcataatcatttgttctctaaataaggacttgtatgaatggccgcacgagggttttactgtctcttacttccaatcagtgaaattgaccttcccgtgaagaggcgggaatgcacaaataagacgagaagaccctatggagctttaactaaccaacccaaagagaataaatttaaccattaaggaataacaacaatctccatgagttggtagtttcggttggggtgacctcggagaataaaaaatcctccgagcgattttaaagactagacccacaagtcaaatcactctatcgctcattgatccaaaaacttgatcaacggaacaagttaccctagggataacagcgcaatcctattcaagagtccatatcgacaatagggtttacgacctcgatgttggatcaggacatcctgatggtgcaaccgctatcaaaggttcgtttgttcaacgattaaagtcctacgtgatctgagttcagaccggagtaatccaggtcggtttctatctattacgtatttctcccagtacgaaaggacaagagaaataaggccaactttaaatcaagcgccttaagacaaccaatgataacatctcaactgacaacacaaaaccctgccctagaacagggcttagttaaggtggcagagcccggtaattgcataaaacttaaacttttatatccagagattcaaatcctctccttaacaaaatgttcataattaacatcttaatactaattattcccatcctattggccgtagcattccttacgttagtggaacgaaaagttctaggctatatacaactccgaaaaggtccaaatgtcgtaggtccatatggcctactccaacccatcgccgatgcaatcaaacttttcattaaagaaccactacgacccgctacatcttcagcctcaatatttatcctagcacctatcatagctttaggcctagccttaaccatgtgaattcccctaccaataccctatcctcttatcaacataaacctaggagtcctatttattctagccatatcaagcctagccgtatactccattctctgatcaggctgagcttccaactcaaaatacgcactaatcggagccctacgagcagtagcacaaacaatctcatacgaagtaacgctagcaattatcctgttatcagtactcctaataagtgggtcctttaccctctccacattaattactacacaagaacaaatatggttaatcctcccagcatggcctctagcaataatatgatttatctcaacactagcagaaacaaaccgagctccatttgatttaactgaaggagaatcagagctagtctcgggcttcaacgtagaatatgcagcaggaccatttgccctcttcttcatagcagagtacgcaaatattatcataataaatatctttacagcaattttattcctaggaaca
tcccacaatccacacataccagaactctacacaatcaattttaccattaaatccctactgctcacaatatccttcctatgaatccgagcatcctaccctcgatttcgctatgaccaactaatacacttactatgaaaaaattttctacctctgacactagccctgtgcatgtgacacgtatccctacccatccttacatcaggcatcccaccacaaacataagaaatatgtctgacaaaagagttactttgatagagtaaataatagaggttcaaaccctcttatttctagaactataggaatcgaacctactcctaagaatccaaaactcttcgtgctcccaattacaccaaattctattagtaaggtcagctaattaagctatcgggcccataccccgaaaatgttggtttatatccttcccgtactaataaacccaattatctttattattattctactaaccattatactaggaactattattgtcataatcagttctcactgactacttgtctgaatcgggtttgaaataaatatactcgccatcatccccatcataataaaaaatcacaacccacgagctacagaagcatcaactaaatattttttgactcaatcaacagcctcaatactactaataatagccgtcatcattaacctaatattctcaggccaatgaaccgtaataaaactatttaacccaatagcctcaatacttataacgatagccctagctataaaactaggaatagccccatttcacttctgagtcccagaagtaacacagggcatccccctatcctcaggccttatcctactgacatgacaaaaactagcacctatatctgtactttaccaaatcttcccatcaattaacctaaacttaattctaaccctatcagttttatcaatcctaattggaggctgagggggactaaaccaaacacaactccgaaaaatcatagcctactcatcaatcgctcatataggctgaataacagcagtactaccatataaccccaccataacattgctaaacttaattatctatatcattataacttccaccatatttaccatatttatagccaattccaccaccactaccctgtcattatcacacacatgaaataaaacacccattataaccgtcctaattcttgccactctcctatccataggaggactccctcccctatctgggtttataccaaaatgaataatcatccaagagataacaaaaaataacagcatcattctacccactttcatagcaatcacagctctactaaacttatatttttatatacgactcacgtattctaccacactaacaatatttccctccacaaacaacataaaaataaaatgacaatttccccttatgaaaaaaataacttttctaccaacaatagtcgtattatctaccataatactaccactcacgccaatactatcagtgttagaataggaatttaggttaaacagaccaagagccttcaaagccctaagcaagtacaatttacttaattcctgataaggattgcaagactacaccttacatcaattgaatgcaaatcaaccactttaattaagctaaatcctcactagactggtgggctccacccccacgaaactttagttaacagctaaacaccctagttaactggcttcaatctacttctcccgccgcaagaaaaaaaaggcgggagaagccccggcagaattgaagctgcttctctgaatttgcaattcaacgtgtaaattcaccacagggcttggtaaaaagaggagtcaaacctctatctttagatttacagtctaatgctttgctcagccattttacccatgttcattaaccgctgactattctcaaccaaccataaagatattggtaccctttatctactatttggtgcttgggccggtatagtaggaacagctctaagccttctaattcgcgctgaattaggccaacccggaactctgctcggaga
cgaccaaatctacaacgtagttgtaaccgcacacgcatttgtaataatcttcttcatagtaataccaatcataattggaggattcggtaactgacttgttcccctaataattggtgctcccgatatagcatttccccgaataaataatataagcttctgactcctccctccctcattcctactactcctcgcatcctctatagttgaagctggggcaggaacaggctgaaccgtgtaccctcccttagcaggcaacctagcccatgcaggagcttcagtagatctaaccattttctctttacacttagcaggagtttcctcaattttaggagccatcaacttcattacaacaattatcaacataaagccccccgcaatgtcacaataccaaacccctctgttcgtatgatccgtaataattaccgccgtactactactactctcgctccctgtattagcagccggcatcacaatgctattaacagaccggaacctaaatacaaccttcttcgacccggcaggaggaggagaccctattctatatcaacacttattctgattctttggacaccccgaagtctatattttaatcttacctggatttggaataatctctcatatcgtgacctactactcaggaaaaaaagaaccattcggatatatgggaatagtttgggctataatgtcaatcggatttctaggtttcatcgtatgagcccaccatatattcactgtcggaatagacgtcgacacacgagcctacttcacatcagccactataattattgctattccaaccggggtaaaagtcttcagctgattggcaacacttcatggaggtaatatcaaatggtctcctgctataatgtgagccctaggctttattttcttatttacagtagggggtttaactggaattgtcttagccaactcttccctcgatattgttcttcacgacacatactacgttgtcgcacatttccactatgttttatcaataggagctgtatttgctattatagggggatttgttcattgattcccactattctcaggttatactctcaacgatacatgagccaaaatccacttcgcaattatatttgtaggcgtcaatataaccttcttcccacaacactttctaggactatctggcatgcctcgacgatactccgactacccagatgcatacacaatatgaaatactatctcatcaataggctcattcatttccctaacagcagttatactaatagttttcatcatctgagaagcatttgcatctaaacgagaagtcttgactgtagacttaaccacgacaaatctagaatgattaaacggatgccctccaccatatcacacatttgaagaacccacctatgttaacctaaaataagaaaggaaggaatcgaaccccctactattggtttcaagccaacatcataacctctatgtctctctcaataaacgaggtgttagtaaaacattatataattttgtcaaagttaagttacaagtgaaagtcctgtacacctcatatggcatatcccatacaactaggattccaagatgcaacatcaccaatcatagaagaactacttcactttcatgaccacacgctaataattgtcttcttaattagctcattagtactttacattatttcactaatactaacgacaaagctgacccatacaagcacgatagatgcacaagaagtagagacaatctgaaccattctgcccgccatcatcttaattctaattgctcttccttctttacgaattctatacataatagatgaaatcaataacccatctcttacagtaaaaaccataggacatcagtgatactgaagctatgagtatacagattatgaggacttaagcttcgactcctacataattccaacatcagaattaaagccaggggagctacgactattagaagtcgataatcgagttgtactaccaatagaaataacaatccgaatgttagtc
tcctctgaagacgtattacactcatgagctgtgccctctctaggactaaaaacagacgcaatcccaggccgtctaaaccaaataacccttatatcgtcccgtccaggcttatattacggtcaatgctcagaaatttgcgggtcaaaccacagtttcatacccattgtccttgagttagtcccactaaagtactttgaaaaatgatctgcgtcaatattataaaatcactaagaagctatatagcactaaccttttaagttagagattgagagccatatactctccttggtgacatgccgcaactagacacgtcaacatgactgacaatgatcttatcaatattcttgaccctttttatcatctttcaactaaaagtttcaaaacacaacttttatcacaatccagaactgacaccaacaaaaatattaaaacaaaacaccccttgagaaacaaaatgaacgaaaatttatttacctcttttactacccctgtaattttaggtctccctctcgtaacccttatcgtactattccccagcctactattcccaacatcaaaccgactagtaagcaatcgctttgtaaccctccaacaatgaatacttcaacttgtatcaaaacaaataatgagtatccacaattctaaaggacaaacatgaacattaatattaatatctctgatcctatttattggatcaacaaacctactaggcctattaccccattcattcacaccaacaacacaactatcaataaacctaggcatagccatccccctgtgagcaggagccgtaattacaggattccgcaataaaactaaagcatcacttgcccatttcttaccacaaggaacacccactccactaatcccaatactagtaattattgaaactatcagcctttttattcaacctatagccctcgccgtgcggttaacagctaacatcactgcaggacacctattaattcacctaatcggaggagctacacttgcactaataagcattagcactacaacagctctaattacattcaccattctaaccctactaacaattctagagtttgcagtagctataatccaagcctatgtattcactctcctagtcagcctatatctgcatgacaacacataatgacacaccaaactcatgcttatcatatagtaaacccaagcccttgacctcttacaggagctttgtctgccctcttaataacatccggcctaaccatgtgatttcactttaactcaatgaccctgctaataattggcctaacaacaaatatactaacaatataccaatgatgacgagatgttatccgagaaagcaccttccaagggcaccataccccagctgtccaaaaaggcctccgttatggaataattctttttattatctccgaagtactattctttaccggatttttctgagctttctaccactcaagcctcgcccccacccctgaactaggcggctgctgacccccaacaggcattcacccactaaaccccctagaagtcccactgctcaacacctctgtcctattggcttccggagtttctattacctgagcccatcatagtttaatagaaggggaccgaaagcatatattacaagccctatttatcaccatcacattaggagtctacttcacactactacaagcctcagaatactatgaagcaccttttactatctccgacggagtttacggctcaactttttttgtagccacaggcttccacggcctccacgtcatcattgggtccaccttcttaattgtctgcttcttccgccaattaaaatttcattttacttctaaccaccacttcggctttgaagccgctgcctgatactgacatttcgtagacgtagtctgacttttcctctatgtttctatctattgatgaggctcctattcttttagtattaactagtacagctgacttccaatcagctagtttcggtctagtccgaaaaagaataataaatttaatac
tagccctcctgaccaattttacactagccaccctactcgtcatcatcgcattctgacttccccaactaaatgtatactctgagaaaacaagcccatacgaatgtggatttgaccccataggatcagcccgccttcccttctctataaaattctttctggtagccatcacattcctcttatttgacctagaaattgcactcctcctaccactgccatgagcctcacaaacagcaaatctaaacacaatgcttaccatagccctcttcctaattatcctcctagctgtaagcctagcctatgagtgaactcaaaaaggactagaatgaaccgaatatggtacttagtttaaaataaaataaatgatttcgactcattagattatgatttaattcataattaccaaatgtctatagtatacataaacattataatagcattcacagtatctcttgtaggactactaatataccgatcccacctaatatcctcccttctatgcttagaaggaataatgctatccctattcgttatagcagccctaacaatcctcaactcacattttacattagctagcataatacctattatcctactagtcttcgcagcctgtgaagcagccctaggtctatctctactagtaatagtatcaaatacatatggtactgattatgtacaaaacctcaacttactccaatgctaaaatacattattccaacaattatacttatacccctaacctggttatcaaaaaataatataatttgggttaactccacagcacacagccttctaattagctttacaagcctcctcctcataaaccagtttggcgacaacagccttaatttttcactactatttttctccgactccctatccacaccactactaattttaaccatatggctcctccctctaatactaatagctagccaacatcatctatcaaaagaaaacctaacccgaaaaaaactatttattactatgctgatctcactacaactattcctaattataacctttaccgccatggaactaatcttattttatattctatttgaagcaacactagtcccaacactcattattatcacccgatgaggaaaccaaacagaacgcctaaacgccggactctatttcctattctatacactagctggctccttacccctattagtcgcactaatttatatccaaaacacagtaggatccctaaatttcctaatattacagtactgagtacaacctgttcacaactcttgatctaatgtcttcatatgactagcatgtataatagctttcatagtaaaaataccactatatggcctccacctttgactacctaaagctcacgtagaagcccccatcgcaggctccatagtccttgcagcagttctactaaaactaggggggtacggtatgctacgaatcacactaattctaaaccctatgaccgactttatagcatacccattcattatactctccctatgaggcataattataaccagctcaatctgcctccgtcaaacggacctaaaatcactcatcgcatactcctctgtaagccacatagcactcgttatcgtagccatccttatccagacaccttgaagctacataggagcaaccgcccttatgattgcccacggcctcacatcctccatacttttctgtctagcaaactcaaactacgaacgaatccacagccgaaccataattctagctcgaggcctacaaacgctccttccactaatagccacctgatgactactagcaagtctaaccaacttagctctacccccaacaatcaacttaattggagaactatttgtagtaatgtcaaccttttcatgatctaacattacaattattctaataggagtaaatatagtaatcaccgccctatattctctatacatgctaattataacccaacgaggaaaatatacctaccacattaataatatctcgccttcctttacacgggaaaatgcact
catatcattacacatcctacccctactactcctaaccctaaacccaaaaattattctaggacctctatactgtaaatatagtttaacaaaaacattagattgtgaatctaacaatagaaactcattaccttcttatttaccgaaaaagtatgcaagaactgctaattctatgctcccatatctaatagtatggctttttcgaacttttaaaggatagtagtttatccgttggtcttaggaaccaaaaaattggtgcaactccaaataaaagtaataaacatattctcctcactctcactagttactttactcttactaactatacccattataataataagctttaacacctacaaaccttccaactacccactctacgtaaaaacagctatctcatacgccttcattaccagcataattcccacaataatatttatccactcaggccaagaactaattatttcaaactgacactgactaaccatccaaactcttaaattatccctcagctttaaaatagactatttctcaataatatttatcccagtagcactattcgtcacatgatctattatagaattctcaatatgatatatacactcagaccccaatattaacaaattcttcaaatacctactcctattcctcattactatgctcatccttgtaaccgcaaacaacctcttccagctattcattggctgagaaggcgtcggaatcatatcatttctactcatcggatgatgatacggacgagcagatgcaaacacagcagccctacaagcaatcttatataaccgcatcggcgacattggtttcattttagcaatagcatggtttctaacaaatctcaatacctgagacctccaacagatcttcatactaaacccaagcgactcaaacatacccttgatcggactagcattagctgcaaccggaaaatccgcccaatttggcctccacccgtgacttccctctgcaatagaaggcccaactcccgtctcagcactactccattcaagcacaatagtggtagcaggtatcttcctactaatccgtttctatcccctcacagaaaacaataaatacatccaatctattacattatgcttaggagccattaccacactatttacagcaatatgcgccctcacccaaaatgacattaagaaaatcatcgccttctccacatccagtcaactgggccttataatagtaactattggcattaaccaaccttacctagctttcctccacatctgtacccacgcctttttcaaagctatactattcatatgctccggttccattattcacagcctaaacgacgaacaagatattcgaaaaataggaggcctatttaaagccatgccattcaccacaacagccctcattgttggcagtctcgcactaacaggaatacccttcctcacaggattctactccaaagacctaatcatcgaagccgccaacacgtcttataccaacgcctgagcccttctaataacattaattgccacctctttcacagctatttacagcacccgtattatttttttcgcacttctaggacaaccccgattccctaccctagttaatattaacgaaaacaacccccttctgatcaactctatcaaacgcttactaattggaagcctcttcgcaggatacatcatttccaacaatattcctccaacaacaattccccaaataactatgccctactacctaaaaacaacagccctaattgttacaatcctaggcttcatcttagccctagaaatcagtaatataactaaaaatctaaaatatcactacccctcaaacgccttcaagttctcaaccttgctagggtatttccccacaattatacatcgcctagctccatacataaatttatcaataagccaaaaatcagcatcctcccttctagacctaatctgactagaagccatcctaccaaaaaccatctcactcgcccaaataaaagcatctaccct
ggtcacaaaccaaaaaggcctgatcaaactatatttcctctccttcttaatcacaatccttatcagcataatcttatttaatttccacgagtaatttctataataaccacaacaccaattaataaagaccacccagttacaataactaatcaggtaccataactgtataaagccgcaatccctatggcctcttcactaaaaaacccagaatcccctgtatcataaatcacccaatcccctaaaccattaaactcaaacacaacctcaacttccttatcctttaatacataatagaccataaagaactccatcaacaagccagtaacaaatgcccctaaaacagccttattagaaagccaaatttcaggatactgttctgtagccatagccgttgtataaccaaaaactaccatcatacctcccaaataaattaaaaagaccatcaaccccaaaaaggatccaccaaaattcaatacaattccacagccaaccccaccactcacaattaaccctaaccccccataaataggtgaaggtttcgaagaaaaccccacaaaacctatcacgaaaataacgcttagaataaatacaatgtatagtatcattattcttacatggaatctaaccatgactaatgatatgaaaaaccatcgttgtcattcaactacaagaacactaatgactaacattcgaaagtcccacccactaataaaaattgtaaacaatgcattcatcgaccttccagccccatcaaacatttcatcatgatgaaatttcggttccctcctgggaatctgcctaatcctacaaatcctcacaggcctattcctagcaatacactacacatccgacacaacaacagcattctcctctgttacccatatctgccgagacgtgaactacggctgaatcatccgatacatacacgcaaacggagcttcaatgttttttatctgcttatatatgcacgtaggacgaggcttatattacgggtcttacacttttctagaaacatgaaatattggagtaatccttctgctcacagtaatagccacagcatttataggatacgtcctaccatgaggacaaatatcattctgaggagcaacagtcatcaccaacctcttatcagcaatcccatacatcggcacaaatttagtcgaatgaatctgaggcggattctcagtagacaaagcaacccttacccgattcttcgctttccattttatccttccatttatcatcatagcaattgccatagtccacctactattcctccacgaaacaggctccaacaatccaacaggaatttcctcagacatagacaaaatcccattccacccctactataccattaaggacatcttaggggccctcttactaattctagctctaatactactagtactattcgcacccgacctcctcggagacccagataactacaccccagccaatccactcaacacaccccctcacatcaaacccgagtgatacttcttatttgcatacgcaatcttacgatcaatccccaacaaactaggaggagtactagccctagccttctctatcctaattcttgctctaatccccctactacacacctccaaacaacgaagcataatattccgaccactcagccaatgcctattctgagccctagtagcagacctactgacactcacatgaattggaggacaaccagtcgaacacccatatatcaccatcggacaactagcatctgtcctatactttctcctcatcctagtgctaataccaacggccggcacagtcgaaaacaaattactaaaatgaagacaggtctttgtagtacatctaatatactggtcttgtaaaccagagaaggagaacaactaacctccctaagactcaaggaagaaactgcagtctcaccatcaacccccaaagctgaagttctatttaaactattccctgaacactattaatatagttccataaatacaaagagccttatcagt
attaaatttatcaaaaatcccaataactcaacacagaatttgcaccctaaccaaatattacaaacaccactagctaacataacacacccatacacagaccacagaatgaattacctaggcaaggggtaatgtacataacattaatgtaataaagacataatatgtatatagtacattaaattatatgccccatgcatataagcaagtacatgacctctatagcagtacataatacatataattattgactgtacatagtacattatgtcaaattcattcttgatagtatatctattatatattccttaccattagatcacgagcttaattaccatgccgcgtgaaaccagcaacccgctaggcagggatccctcttctcgctccgggcccataaaccgtgggggtcgctatccaatgaactttaccaggcatctggttctttcttcagggccatctcatctaaaacggtccattctttcctcttaaataagacatctcgatgg + +' +] + +{ #category : 'accessing' } +BioEntrezXMLGenBankAccessionParserTest >> parserClass [ + + ^ BioEntrezXMLGBBasicParser +] + +{ #category : 'accessing' } +BioEntrezXMLGenBankAccessionParserTest >> setUp [ + + super setUp. + parseResult := BioParser parseNcbiXmlGBSeq: self gbSet01. +] + +{ #category : 'accessing' } +BioEntrezXMLGenBankAccessionParserTest >> testParseAccession [ + + | record | + record := parseResult first. + self + assert: (record at: BioGBSeqCollection qualifierForAccessionWithVersion) + equals: 'HQ184032.1'. + +] + +{ #category : 'accessing' } +BioEntrezXMLGenBankAccessionParserTest >> testParseDefinition [ + + | record | + record := parseResult first. + self + assert: (record at: BioGBSeqCollection qualifierForDefinition) + equals: 'Bos taurus isolate Chi597 mitochondrion, complete genome'. + +] + +{ #category : 'accessing' } +BioEntrezXMLGenBankAccessionParserTest >> testParseOnlyOneRecord [ + + self assert: parseResult size equals: 1. + +] + +{ #category : 'accessing' } +BioEntrezXMLGenBankAccessionParserTest >> testParseReferenceJournal [ + + | record | + record := parseResult first. 
+ self + assert: (record at: BioGBSeqCollection qualifierForReferenceJournal) + equals: 'Submitted (25-AUG-2010) Dipartimento di Genetica e Microbiologia, University of Pavia, Via Ferrata, 1, Pavia 27100, Italy' + + + +] diff --git a/repository/BioParsers-Tests/BioFASTAParserTest.class.st b/repository/BioParsers-Tests/BioFASTAParserTest.class.st new file mode 100644 index 00000000..d0bc9871 --- /dev/null +++ b/repository/BioParsers-Tests/BioFASTAParserTest.class.st @@ -0,0 +1,515 @@ +Class { + #name : 'BioFASTAParserTest', + #superclass : 'BioAbstractFASTAParserTest', + #instVars : [ + 'fastaRecord' + ], + #category : 'BioParsers-Tests', + #package : 'BioParsers-Tests' +} + +{ #category : 'accessing' } +BioFASTAParserTest >> multiFastaSeq01PlainText [ + + ^ '>YAL069W-1.334 Putative promoter sequence +CCACACCACACCCACACACCCACACACCACACCACACACCACACCACACCCACACACACA +CATCCTAACACTACCCTAACACAGCCCTAATCTAACCCTGGCCAACCTGTCTCTCAACTT +ACCCTCCATTACCCTGCCTCCACTCGTTACCCTGTCCCATTCAACCATACCACTCCGAAC +CACCATCCATCCCTCTACTTACTACCACTCACCCACCGTTACCCTCCAATTACCCATATC +CAACCCACTGCCACTTACCCTACCATTACCCTACCATCCACCATGACCTACTCACCATAC +TGTTCTTCTACCCACCATATTGAAACGCTAACAA +>YAL068C-7235.2170 Putative promoter sequence +TACGAGAATAATTTCTCATCATCCAGCTTTAACACAAAATTCGCACAGTTTTCGTTAAGA +GAACTTAACATTTTCTTATGACGTAAATGAAGTTTATATATAAATTTCCTTTTTATTGGA +TAATATGCCTATGCCGCATAATTTTTATATCTTTCTCCTAACAAAACATTCGCTTGTAAA +GTATTATATTTAGGAAAAAAATAATCGATAAAGGCTCATCCGAAGATCAGTTAGATTCTT +TTTGCAAGTCCTGAAGAAATTTTCACACTACTACTATAAAAAAAAAATATCATAAAAAGG +TACATTACGTGCAACCAAAAGTGTAAAATGATTGGTTGCAATGTTTCACCTAAATTACTT' +] + +{ #category : 'testing-tokenizer' } +BioFASTAParserTest >> multiFastaSeq06PlainText [ + + ^ '>YAL069W-1.334 Putative promoter sequence +CCACACCACACCCACACACCCACACACCACACCACACACCACACCACACCCACACACACA +CATCCTAACACTACCCTAACACAGCCCTAATCTAACCCTGGCCAACCTGTCTCTCAACTT +ACCCTCCATTACCCTGCCTCCACTCGTTACCCTGTCCCATTCAACCATACCACTCCGAAC +CACCATCCATCCCTCTACTTACTACCACTCACCCACCGTTACCCTCCAATTACCCATATC 
+CAACCCACTGCCACTTACCCTACCATTACCCTACCATCCACCATGACCTACTCACCATAC +TGTTCTTCTACCCACCATATTGAAACGCTAACAA + +>YAL068C-7235.2170 Putative promoter sequence +TACGAGAATAATTTCTCATCATCCAGCTTTAACACAAAATTCGCACAGTTTTCGTTAAGA +GAACTTAACATTTTCTTATGACGTAAATGAAGTTTATATATAAATTTCCTTTTTATTGGA +TAATATGCCTATGCCGCATAATTTTTATATCTTTCTCCTAACAAAACATTCGCTTGTAAA +GTATTATATTTAGGAAAAAAATAATCGATAAAGGCTCATCCGAAGATCAGTTAGATTCTT +TTTGCAAGTCCTGAAGAAATTTTCACACTACTACTATAAAAAAAAAATATCATAAAAAGG +TACATTACGTGCAACCAAAAGTGTAAAATGATTGGTTGCAATGTTTCACCTAAATTACTT' +] + +{ #category : 'accessing' } +BioFASTAParserTest >> setUp [ + + super setUp. + parser := self parserClass new +] + +{ #category : 'testing-builder' } +BioFASTAParserTest >> testParseMultiFasta03 [ + + fastaRecord := BioFastaMultiRecord new. + fastaRecord + addFastaRecord: (BioFastaRecord named: self multiFastaSeq03Header01 sequence: self multiFastaSeq03Body01); + addFastaRecord: (BioFastaRecord named: self multiFastaSeq03Header02 sequence: self multiFastaSeq03Body02). + + parseResult := self parserClass parseMultiFasta: self multiFastaSeq03. + self assert: (parseResult isKindOf: BioFastaMultiRecord). + self deny: parseResult records isEmpty. + self assert: parseResult records size equals: 2. + self assert: parseResult sequenceNames asArray equals: + (Array + with: self multiFastaSeq03Header01 allButFirst + with: self multiFastaSeq03Header02 allButFirst). + self assert: parseResult sequenceStrings asArray equals: + (Array + with: self multiFastaSeq03Body01 asCondensedString + with: self multiFastaSeq03Body02 asCondensedString). + +] + +{ #category : 'testing-builder' } +BioFASTAParserTest >> testParseMultiFasta07 [ + fastaRecord := BioFastaMultiRecord new + addFastaRecord: (BioFastaRecord named: 'Sample sequence 1' sequence: 'garkbdctymvhu'); + addFastaRecord: (BioFastaRecord named: 'Sample sequence 2' sequence: 'ctymvhgarkbda'); + addFastaRecord: (BioFastaRecord named: 'Sample sequence 3' sequence: 'ccccccccccga'); + yourself. 
+ parseResult := self parserClass parseMultiFasta: self multiFastaSeq07. + self assert: (parseResult isKindOf: BioFastaMultiRecord). + self deny: parseResult sequences isEmpty. + self assert: parseResult sequences size equals: 3. + self assert: parseResult equals: fastaRecord +] + +{ #category : 'testing-builder' } +BioFASTAParserTest >> testParseMultiFastaThreeSequences [ + + | seqString | + + seqString := self multiFastaSeq02. + + fastaRecord := BioFastaMultiRecord new. + fastaRecord + addFastaRecord: (BioFastaRecord named: self multiFastaSeq02Header01 sequence: self multiFastaSeq02Body01); + addFastaRecord: (BioFastaRecord named: self multiFastaSeq02Header02 sequence: self multiFastaSeq02Body02); + addFastaRecord: (BioFastaRecord named: self multiFastaSeq02Header03 sequence: self multiFastaSeq02Body03). + + parseResult := self parserClass parseMultiFasta: self multiFastaSeq02PlainText. + + self assert: parseResult records size equals: 3. + +] + +{ #category : 'testing-builder' } +BioFASTAParserTest >> testParseMultiFastaTwoSequences [ + + fastaRecord := BioFastaMultiRecord new. + fastaRecord + addFastaRecord: (BioFastaRecord named: self multiFastaSeq01Header01 sequence: self multiFastaSeq01Body01); + addFastaRecord: (BioFastaRecord named: self multiFastaSeq01Header02 sequence: self multiFastaSeq01Body02). 
+ + parseResult := self parserClass parseMultiFasta: '>YAL069W-1.334 Putative promoter sequence +CCACACCACACCCACACACCCACACACCACACCACACACCACACCACACCCACACACACA +CATCCTAACACTACCCTAACACAGCCCTAATCTAACCCTGGCCAACCTGTCTCTCAACTT +ACCCTCCATTACCCTGCCTCCACTCGTTACCCTGTCCCATTCAACCATACCACTCCGAAC +CACCATCCATCCCTCTACTTACTACCACTCACCCACCGTTACCCTCCAATTACCCATATC +CAACCCACTGCCACTTACCCTACCATTACCCTACCATCCACCATGACCTACTCACCATAC +TGTTCTTCTACCCACCATATTGAAACGCTAACAA +>YAL068C-7235.2170 Putative promoter sequence +TACGAGAATAATTTCTCATCATCCAGCTTTAACACAAAATTCGCACAGTTTTCGTTAAGA +GAACTTAACATTTTCTTATGACGTAAATGAAGTTTATATATAAATTTCCTTTTTATTGGA +TAATATGCCTATGCCGCATAATTTTTATATCTTTCTCCTAACAAAACATTCGCTTGTAAA +GTATTATATTTAGGAAAAAAATAATCGATAAAGGCTCATCCGAAGATCAGTTAGATTCTT +TTTGCAAGTCCTGAAGAAATTTTCACACTACTACTATAAAAAAAAAATATCATAAAAAGG +TACATTACGTGCAACCAAAAGTGTAAAATGATTGGTTGCAATGTTTCACCTAAATTACTT'. + self assert: (parseResult isKindOf: BioFastaMultiRecord). + + self deny: parseResult sequences isEmpty. + self assert: parseResult sequences size equals: 2. + +] + +{ #category : 'testing-builder' } +BioFASTAParserTest >> testParseSingleFasta01 [ + + | seqSelectors parseResults | + seqSelectors := self class superclass selectorsInProtocol: 'samples-single'. + parseResults := seqSelectors collect: [ :sel | self parserClass parseFasta: (self perform: sel) ]. + + parseResults do: [ :pResult | self assert: pResult isFastaRecord ] +] + +{ #category : 'testing-builder' } +BioFASTAParserTest >> testParseSingleFasta02 [ + fastaRecord := BioFastaRecord + named: self multiFastaSeq01Header01 + sequence: self multiFastaSeq01Body01. + parseResult := self parserClass parseFasta: self fastaSeq01. + self assert: (parseResult isKindOf: BioFastaRecord). + + self assert: parseResult name equals: self multiFastaSeq01Header01. 
+ self + assert: parseResult sequence asString + equals: self multiFastaSeq01Body01 asCondensedString +] + +{ #category : 'testing-builder' } +BioFASTAParserTest >> testParseSingleFasta04 [ + | seqName seqString | + seqName := 'YAL068C-7235.2170 Putative promoter sequence'. + seqString := 'TACGAGAATAATTTCTCATCATCCAGCTTTAACACAAAATTCGCACAGTTTTCGTTAAGAGAACTTAACATTTTCTTATGACGTAAATGAAGTTTATATATAAATTTCCTTTTTATTGGA'. + fastaRecord := BioFastaRecord named: seqName sequence: seqString. + parseResult := self parserClass parseFasta: self fastaSeq04. + self assert: (parseResult isKindOf: BioFastaRecord). + self assert: parseResult equals: fastaRecord. + self assert: parseResult name equals: seqName. + self + assert: parseResult sequence asString + equals: seqString asCondensedString +] + +{ #category : 'testing-builder' } +BioFASTAParserTest >> testParseSingleFasta07 [ + + self + should: [ self parserClass parseFasta: '>gi|6273291|emb|AF191665.1|AF191665 +' ] + raise: Error. + +] + +{ #category : 'testing-builder' } +BioFASTAParserTest >> testParseSingleFastaProteinSequenceIntoRecord [ + + | seqName seqString | + seqName := '>sp_ac|P02769_WOSIG0 \ID=ALBU_BOVIN \DE="Serum albumin precursor (Allergen Bos d 6) (BSA)" \NCBITAXID=9913 \MODRES=(1|Acetyl) \VARIANT=(196|A|T) \LENGTH=589'. + seqString := 'RGVFRRDTHKSEIAHRFKDLGEEHFKGLVLIAFSQYLQQCPFDEHVKLVNELTEFAKTCVADESHAGCEKSLHTLFGDELCKVASLRETYGDMADCCEKQEPERNECFLSHKDDSPDLPKLKPDPNTLCDEFKADEKKFWGKYLYEIARRHPYFYAPELLYYANKYNGVFQECCQAEDKGACLLPKIETMREKVLASSARQRLRCASIQKFGERALKAWSVARLSQKFPKAEFVEVTKLVTDLTKVHKECCHGDLLECADDRADLAKYICDNQDTISSKLKECCDKPLLEKSHCIAEVEKDAIPENLPPLTADFAEDKDVCKNYQEAKDAFLGSFLYEYSRRHPEYAVSVLLRLAKEYEATLEECCAKDDPHACYSTVFDKLKHLVDEPQNLIKQNCDQFEKLGEYGFQNALIVRYTRKVPQVSTPTLVEVSRSLGKVGTRCCTKPESERMPCTEDYLSLILNRLCVLHEKTPVSEKVTKCCTESLVNRRPCFSALTPDETYVPKAFDEKLFTFHADICTLPDTEKQIKKQTALVELLKHKPKATEEQLKTVMENFVAFVDKCCAADDKEACFAVEGPKLVVSTQTALA'. + + fastaRecord := BioFastaRecord named: seqName sequence: seqString. 
+ + parseResult := self parserClass parseFasta: self fastaSeq08. + self assert: (parseResult isKindOf: BioFastaRecord). + +] + +{ #category : 'testing-builder' } +BioFASTAParserTest >> testParseSingleFastaProteinSequenceName [ + + | seqName seqString | + seqName := 'sp_ac|P02769_WOSIG0 \ID=ALBU_BOVIN \DE="Serum albumin precursor (Allergen Bos d 6) (BSA)" \NCBITAXID=9913 \MODRES=(1|Acetyl) \VARIANT=(196|A|T) \LENGTH=589'. + seqString := 'RGVFRRDTHKSEIAHRFKDLGEEHFKGLVLIAFSQYLQQCPFDEHVKLVNELTEFAKTCVADESHAGCEKSLHTLFGDELCKVASLRETYGDMADCCEKQEPERNECFLSHKDDSPDLPKLKPDPNTLCDEFKADEKKFWGKYLYEIARRHPYFYAPELLYYANKYNGVFQECCQAEDKGACLLPKIETMREKVLASSARQRLRCASIQKFGERALKAWSVARLSQKFPKAEFVEVTKLVTDLTKVHKECCHGDLLECADDRADLAKYICDNQDTISSKLKECCDKPLLEKSHCIAEVEKDAIPENLPPLTADFAEDKDVCKNYQEAKDAFLGSFLYEYSRRHPEYAVSVLLRLAKEYEATLEECCAKDDPHACYSTVFDKLKHLVDEPQNLIKQNCDQFEKLGEYGFQNALIVRYTRKVPQVSTPTLVEVSRSLGKVGTRCCTKPESERMPCTEDYLSLILNRLCVLHEKTPVSEKVTKCCTESLVNRRPCFSALTPDETYVPKAFDEKLFTFHADICTLPDTEKQIKKQTALVELLKHKPKATEEQLKTVMENFVAFVDKCCAADDKEACFAVEGPKLVVSTQTALA'. + + fastaRecord := BioFastaRecord named: seqName sequence: seqString. + + parseResult := self parserClass parseFasta: self fastaSeq08. + + self assert: parseResult name equals: seqName. + +] + +{ #category : 'testing-builder' } +BioFASTAParserTest >> testParseSingleFastaProteinSequenceString [ + + | seqName seqString | + seqName := 'sp_ac|P02769_WOSIG0 \ID=ALBU_BOVIN \DE="Serum albumin precursor (Allergen Bos d 6) (BSA)" \NCBITAXID=9913 \MODRES=(1|Acetyl) \VARIANT=(196|A|T) \LENGTH=589'. 
+ seqString := 'RGVFRRDTHKSEIAHRFKDLGEEHFKGLVLIAFSQYLQQCPFDEHVKLVNELTEFAKTCVADESHAGCEKSLHTLFGDELCKVASLRETYGDMADCCEKQEPERNECFLSHKDDSPDLPKLKPDPNTLCDEFKADEKKFWGKYLYEIARRHPYFYAPELLYYANKYNGVFQECCQAEDKGACLLPKIETMREKVLASSARQRLRCASIQKFGERALKAWSVARLSQKFPKAEFVEVTKLVTDLTKVHKECCHGDLLECADDRADLAKYICDNQDTISSKLKECCDKPLLEKSHCIAEVEKDAIPENLPPLTADFAEDKDVCKNYQEAKDAFLGSFLYEYSRRHPEYAVSVLLRLAKEYEATLEECCAKDDPHACYSTVFDKLKHLVDEPQNLIKQNCDQFEKLGEYGFQNALIVRYTRKVPQVSTPTLVEVSRSLGKVGTRCCTKPESERMPCTEDYLSLILNRLCVLHEKTPVSEKVTKCCTESLVNRRPCFSALTPDETYVPKAFDEKLFTFHADICTLPDTEKQIKKQTALVELLKHKPKATEEQLKTVMENFVAFVDKCCAADDKEACFAVEGPKLVVSTQTALA'. + + fastaRecord := BioFastaRecord named: seqName sequence: seqString. + + parseResult := self parserClass parseFasta: self fastaSeq08. + + self + assert: parseResult sequence asString + equals: seqString asCondensedString + +] + +{ #category : 'testing-builder' } +BioFASTAParserTest >> testParseThreeMultiFastaIntoRecord [ + + fastaRecord := BioFastaMultiRecord new. + fastaRecord + addFastaRecord: (BioFastaRecord named: self multiFastaSeq02Header01 sequence: self multiFastaSeq02Body01); + addFastaRecord: (BioFastaRecord named: self multiFastaSeq02Header02 sequence: self multiFastaSeq02Body02); + addFastaRecord: (BioFastaRecord named: self multiFastaSeq02Header03 sequence: self multiFastaSeq02Body03). + + parseResult := self parserClass parseMultiFasta: self multiFastaSeq02PlainText. + + self assert: parseResult sequenceNames equals: fastaRecord sequenceNames. + +] + +{ #category : 'testing-builder' } +BioFASTAParserTest >> testParseThreeMultiFastaSequenceNames [ + + fastaRecord := BioFastaMultiRecord new. + fastaRecord + addFastaRecord: (BioFastaRecord named: self multiFastaSeq02Header01 sequence: self multiFastaSeq02Body01); + addFastaRecord: (BioFastaRecord named: self multiFastaSeq02Header02 sequence: self multiFastaSeq02Body02); + addFastaRecord: (BioFastaRecord named: self multiFastaSeq02Header03 sequence: self multiFastaSeq02Body03). 
+ + parseResult := self parserClass parseMultiFasta: self multiFastaSeq02PlainText. + + self assert: parseResult equals: fastaRecord. + +] + +{ #category : 'testing-builder' } +BioFASTAParserTest >> testParseThreeMultiFastaSequenceStrings [ + + fastaRecord := BioFastaMultiRecord new. + fastaRecord + addFastaRecord: (BioFastaRecord named: self multiFastaSeq02Header01 sequence: self multiFastaSeq02Body01); + addFastaRecord: (BioFastaRecord named: self multiFastaSeq02Header02 sequence: self multiFastaSeq02Body02); + addFastaRecord: (BioFastaRecord named: self multiFastaSeq02Header03 sequence: self multiFastaSeq02Body03). + + parseResult := self parserClass parseMultiFasta: self multiFastaSeq02PlainText. + + self + assert: parseResult sequenceStrings asArray + equals: (Array + with: self multiFastaSeq02Body01 asCondensedString + with: self multiFastaSeq02Body02 asCondensedString + with: self multiFastaSeq02Body03 asCondensedString). + +] + +{ #category : 'testing-builder' } +BioFASTAParserTest >> testParseTwoMultiFastaIntoRecord [ + + fastaRecord := BioFastaMultiRecord new. + fastaRecord + addFastaRecord: (BioFastaRecord named: self multiFastaSeq01Header01 sequence: self multiFastaSeq01Body01); + addFastaRecord: (BioFastaRecord named: self multiFastaSeq01Header02 sequence: self multiFastaSeq01Body02). + + parseResult := self parserClass parseMultiFasta: self multiFastaSeq01PlainText. + + self + assert: parseResult sequenceNames + equals: fastaRecord sequenceNames. + + +] + +{ #category : 'testing-builder' } +BioFASTAParserTest >> testParseTwoMultiFastaSequenceNames [ + + fastaRecord := BioFastaMultiRecord new. + fastaRecord + addFastaRecord: (BioFastaRecord named: self multiFastaSeq01Header01 sequence: self multiFastaSeq01Body01); + addFastaRecord: (BioFastaRecord named: self multiFastaSeq01Header02 sequence: self multiFastaSeq01Body02). + + parseResult := self parserClass parseMultiFasta: self multiFastaSeq01PlainText. 
+ self + assert: (parseResult sequenceNames + bioHasEqualElements: (OrderedCollection + with: self multiFastaSeq01Header01 + with: self multiFastaSeq01Header02)). + +] + +{ #category : 'testing-builder' } +BioFASTAParserTest >> testParseTwoMultiFastaSequenceStrings [ + + fastaRecord := BioFastaMultiRecord new. + fastaRecord + addFastaRecord: (BioFastaRecord named: self multiFastaSeq01Header01 sequence: self multiFastaSeq01Body01); + addFastaRecord: (BioFastaRecord named: self multiFastaSeq01Header02 sequence: self multiFastaSeq01Body02). + + parseResult := self parserClass parseMultiFasta: self multiFastaSeq01PlainText. + self + assert: parseResult sequenceStrings asArray + equals: ( + Array + with: self multiFastaSeq01Body01 asCondensedString + with: self multiFastaSeq01Body02 asCondensedString). + +] + +{ #category : 'testing-tokenizer' } +BioFASTAParserTest >> testSingleFasta01 [ + + | seqSelectors parseResults | + seqSelectors := self class superclass selectorsInProtocol: + 'samples-single'. + parseResults := seqSelectors collect: [ :sel | + self parserClass tokenizeFasta: (self perform: sel) ]. + + parseResults do: [ :pResult | + self assert: pResult isCollection. + self assert: pResult size equals: 2 ] +] + +{ #category : 'testing-tokenizer' } +BioFASTAParserTest >> testSingleFasta02 [ + + self + assert: ((self parserClass tokenizeFasta: self fastaSeq01) + bioHasEqualElements: + #('YAL069W-1.334 Putative promoter sequence' 'CCACACCACACCCACACACCCACACACCACACCACACACCACACCACACCCACACACACACATCCTAACACTACCCTAACACAGCCCTAATCTAACCCTGGCCAACCTGTCTCTCAACTTACCCTCCATTACCCTGCCTCCACTCGTTACCCTGTCCCATTCAACCATACCACTCCGAACCACCATCCATCCCTCTACTTACTACCACTCACCCACCGTTACCCTCCAATTACCCATATCCAACCCACTGCCACTTACCCTACCATTACCCTACCATCCACCATGACCTACTCACCATACTGTTCTTCTACCCACCATATTGAAACGCTAACAA')). 
+] + +{ #category : 'testing-tokenizer' } +BioFASTAParserTest >> testSingleFasta03 [ + + self + assert: ((self parserClass tokenizeFasta: self fastaSeq02) + bioHasEqualElements: #('YAL068C-7235.2170 Putative promoter sequence' 'TACGAGAATAATTTCTCATCATCCAGCTTTAACACAAAATTCGCACAGTTTTCGTTAAGAGAACTTAACATTTTCTTATGACGTAAATGAAGTTTATATATAAATTTCCTTTTTATTGGATAATATGCCTATGCCGCATAATTTTTATATCTTTCTCCTAACAAAACATTCGCTTGTAAAGTATTATATTTAGGAAAAAAATAATCGATAAAGGCTCATCCGAAGATCAGTTAGATTCTTTTTGCAAGTCCTGAAGAAATTTTCACACTACTACTATAAAAAAAAAATATCATAAAAAGGTACATTACGTGCAACCAAAAGTGTAAAATGATTGGTTGCAATGTTTCACCTAAATTACTT')) +] + +{ #category : 'testing-tokenizer' } +BioFASTAParserTest >> testSingleFasta05 [ + + self + assert: ((self parserClass tokenizeFasta: self fastaSeq05) + bioHasEqualElements: #( + 'YAL068C-7235.2170 Putative promoter sequence' 'TACGAGAATAATTTCTCATCATCCAGCTTTAACACAAAATTCGCACAGTTTTCGTTAAGAGAACTTAACATTTTCTTATGACGTAAATGAAGTTTATATATAAATTTCCTTTTTATTGGA') ). +] + +{ #category : 'testing-tokenizer' } +BioFASTAParserTest >> testSingleFasta06 [ + + self + assert: ((self parserClass tokenizeFasta: self fastaSeq08) + bioHasEqualElements: #( + 'sp_ac|P02769_WOSIG0 \ID=ALBU_BOVIN \DE="Serum albumin precursor (Allergen Bos d 6) (BSA)" \NCBITAXID=9913 \MODRES=(1|Acetyl) \VARIANT=(196|A|T) \LENGTH=589' + 'RGVFRRDTHKSEIAHRFKDLGEEHFKGLVLIAFSQYLQQCPFDEHVKLVNELTEFAKTCVADESHAGCEKSLHTLFGDELCKVASLRETYGDMADCCEKQEPERNECFLSHKDDSPDLPKLKPDPNTLCDEFKADEKKFWGKYLYEIARRHPYFYAPELLYYANKYNGVFQECCQAEDKGACLLPKIETMREKVLASSARQRLRCASIQKFGERALKAWSVARLSQKFPKAEFVEVTKLVTDLTKVHKECCHGDLLECADDRADLAKYICDNQDTISSKLKECCDKPLLEKSHCIAEVEKDAIPENLPPLTADFAEDKDVCKNYQEAKDAFLGSFLYEYSRRHPEYAVSVLLRLAKEYEATLEECCAKDDPHACYSTVFDKLKHLVDEPQNLIKQNCDQFEKLGEYGFQNALIVRYTRKVPQVSTPTLVEVSRSLGKVGTRCCTKPESERMPCTEDYLSLILNRLCVLHEKTPVSEKVTKCCTESLVNRRPCFSALTPDETYVPKAFDEKLFTFHADICTLPDTEKQIKKQTALVELLKHKPKATEEQLKTVMENFVAFVDKCCAADDKEACFAVEGPKLVVSTQTALA') ). 
+] + +{ #category : 'testing-tokenizer' } +BioFASTAParserTest >> testSingleFasta07 [ + + self + assert: ((self parserClass tokenizeFasta: self fastaSeq09) + bioHasEqualElements: #('gi|6273291|emb|AF191665.1|AF191665' 'actgtcgatatgctagct') ) +] + +{ #category : 'testing-tokenizer' } +BioFASTAParserTest >> testSingleFastaWithReturnLine [ + + self + assert: ((self parserClass tokenizeFasta: self fastaSeq04) + bioHasEqualElements: #('YAL068C-7235.2170 Putative promoter sequence' 'TACGAGAATAATTTCTCATCATCCAGCTTTAACACAAAATTCGCACAGTTTTCGTTAAGAGAACTTAACATTTTCTTATGACGTAAATGAAGTTTATATATAAATTTCCTTTTTATTGGA')). +] + +{ #category : 'testing-tokenizer' } +BioFASTAParserTest >> testTokenizeMultiFasta01 [ + + self + assert: ((self parserClass tokenizeMultiFasta: self multiFastaSeq01PlainText) + bioHasEqualElements: #( + #('YAL069W-1.334 Putative promoter sequence' 'CCACACCACACCCACACACCCACACACCACACCACACACCACACCACACCCACACACACACATCCTAACACTACCCTAACACAGCCCTAATCTAACCCTGGCCAACCTGTCTCTCAACTTACCCTCCATTACCCTGCCTCCACTCGTTACCCTGTCCCATTCAACCATACCACTCCGAACCACCATCCATCCCTCTACTTACTACCACTCACCCACCGTTACCCTCCAATTACCCATATCCAACCCACTGCCACTTACCCTACCATTACCCTACCATCCACCATGACCTACTCACCATACTGTTCTTCTACCCACCATATTGAAACGCTAACAA') + #('YAL068C-7235.2170 Putative promoter sequence' 'TACGAGAATAATTTCTCATCATCCAGCTTTAACACAAAATTCGCACAGTTTTCGTTAAGAGAACTTAACATTTTCTTATGACGTAAATGAAGTTTATATATAAATTTCCTTTTTATTGGATAATATGCCTATGCCGCATAATTTTTATATCTTTCTCCTAACAAAACATTCGCTTGTAAAGTATTATATTTAGGAAAAAAATAATCGATAAAGGCTCATCCGAAGATCAGTTAGATTCTTTTTGCAAGTCCTGAAGAAATTTTCACACTACTACTATAAAAAAAAAATATCATAAAAAGGTACATTACGTGCAACCAAAAGTGTAAAATGATTGGTTGCAATGTTTCACCTAAATTACTT') ) ). 
+ + +] + +{ #category : 'testing-tokenizer' } +BioFASTAParserTest >> testTokenizeMultiFasta02 [ + + self assert: ( + (self parserClass tokenizeMultiFasta: self multiFastaSeq02PlainText) + bioHasEqualElements: #( + #('first sequence record' 'TACGAGAATAATTTCTCATCATCCAGCTTTAACACAAAATTCGCA') + #('second sequence record' 'CAGTTTTCGTTAAGAGAACTTAACATTTTCTTATGACGTAAATGAAGTTTATATATAAATTTCCTTTTTATTGGA') + #('third sequence record' 'GAACTTAACATTTTCTTATGACGTAAATGAAGTTTATATATAAATTTCCTTTTTATTGGATAATATGCCTATGCCGCATAATTTTTATATCTTTCTCCTAACAAAACATTCGCTTGTAAA') ) ). + + +] + +{ #category : 'testing-tokenizer' } +BioFASTAParserTest >> testTokenizeMultiFasta03 [ + + self assert: ( ( self parserClass tokenizeMultiFasta: self multiFastaSeq03 ) bioHasEqualElements: #( + #( 'SEQUENCE_1' 'MTEITAAMVKELRESTGAGMMDCKNALSETNGDFDKAVQLLREKGLGKAAKKADRLAAEGLVSVKVSDDFTIAAMRPSYLSYEDLDMTFVENEYKALVAELEKENEERRRLKDPNKPEHKIPQFASRKQLSDAILKEAEEKIKEELKAQGKPEKIWDNIIPGKMNSFIADNSQLDSKLTLMGQFYVMDDKKTVEQVIAEKEKEFGGKIKIVEFICFEVGEGLEKKTEDFAAEVAAQL') + #( 'SEQUENCE_2' 'SATVSEINSETDFVAKNDQFIALTKDTTAHIQSNSLQSVEELHSSTINGVKFEEYLKSQIATIGENLVVRRFATLKAGANGVVNGYIHTNGRVGVVIAAACDSAEVASKSRDLLRQICMH') ) ). 
+ + +] + +{ #category : 'testing-tokenizer' } +BioFASTAParserTest >> testTokenizeMultiFasta04 [ + + self assert: ( ( self parserClass tokenizeMultiFasta: self multiFastaSeq04 ) bioHasEqualElements: #( + #( 'HSBGPG Human gene for bone gla protein (BGP)' 'GGCAGATTCCCCCTAGACCCGCCCGCACCATGGTCAGGCATGCCCCTCCTCATCGCTGGGCACAGCCCAGAGGGTATAAACAGTGCTGGAGGCTGGCGGGGCAGGCCAGCTGAGTCCTGAGCAGCAGCCCAGCGCAGCCACCGAGACACCATGAGAGCCCTCACACTCCTCGCCCTATTGGCCCTGGCCGCACTTTGCATCGCTGGCCAGGCAGGTGAGTGCCCCCACCTCCCCTCAGGCCGCATTGCAGTGGGGGCTGAGAGGAGGAAGCACCATGGCCCACCTCTTCTCACCCCTTTGGCTGGCAGTCCCTTTGCAGTCTAACCACCTTGTTGCAGGCTCAATCCATTTGCCCCAGCTCTGCCCTTGCAGAGGGAGAGGAGGGAAGAGCAAGCTGCCCGAGACGCAGGGGAAGGAGGATGAGGGCCCTGGGGATGAGCTGGGGTGAACCAGGCTCCCTTTCCTTTGCAGGTGCGAAGCCCAGCGGTGCAGAGTCCAGCAAAGGTGCAGGTATGAGGATGGACCTGATGGGTTCCTGGACCCTCCCCTCTCACCCTGGTCCCTCAGTCTCATTCCCCCACTCCTGCCACCTCCTGTCTGGCCATCAGGAAGGCCAGCCTGCTCCCCACCTGATCCTCCCAAACCCAGAGCCACCTGATGCCTGCCCCTCTGCTCCACAGCCTTTGTGTCCAAGCAGGAGGGCAGCGAGGTAGTGAAGAGACCCAGGCGCTACCTGTATCAATGGCTGGGGTGAGAGAAAAGGCAGAGCTGGGCCAAGGCCCTGCCTCTCCGGGATGGTCTGTGGGGGAGCTGCAGCAGGGAGTGGCCTCTCTGGGTTGTGGTGGGGGTACAGGCAGCCTGCCCTGGTGGGCACCCTGGAGCCCCATGTGTAGGGAGAGGAGGGATGGGCATTTTGCACGGGGGCTGATGCCACCACGTCGGGTGTCTCAGAGCCCCAGTCCCCTACCCGGATCCCCTGGAGCCCAGGAGGGAGGTGTGTGAGCTCAATCCGGACTGTGACGAGTTGGCTGACCACATCGGCTTTCAGGAGGCCTATCGGCGCTTCTACGGCCCGGTCTAGGGTGTCGCTCTGCTGGCCTGGCCGGCAACCCCAGTTCTGCTCCTCTCCAGGCACCCTTCTTTCCTCTTCCCCTTGCCCTTGCCCTGACCTCCCAGCCCTATGGATGTGGGGTCCCCATCATCCCAGCTGCTCCCAAATAAACTCCAGAAG') + #( 'HSGLTH1 Human theta 1-globin gene' 
'CCACTGCACTCACCGCACCCGGCCAATTTTTGTGTTTTTAGTAGAGACTAAATACCATATAGTGAACACCTAAGACGGGGGGCCTTGGATCCAGGGCGATTCAGAGGGCCCCGGTCGGAGCTGTCGGAGATTGAGCGCGCGCGGTCCCGGGATCTCCGACGAGGCCCTGGACCCCCGGGCGGCGAAGCTGCGGCGCGGCGCCCCCTGGAGGCCGCGGGACCCCTGGCCGGTCCGCGCAGGCGCAGCGGGGTCGCAGGGCGCGGCGGGTTCCAGCGCGGGGATGGCGCTGTCCGCGGAGGACCGGGCGCTGGTGCGCGCCCTGTGGAAGAAGCTGGGCAGCAACGTCGGCGTCTACACGACAGAGGCCCTGGAAAGGTGCGGCAGGCTGGGCGCCCCCGCCCCCAGGGGCCCTCCCTCCCCAAGCCCCCCGGACGCGCCTCACCCACGTTCCTCTCGCAGGACCTTCCTGGCTTTCCCCGCCACGAAGACCTACTTCTCCCACCTGGACCTGAGCCCCGGCTCCTCACAAGTCAGAGCCCACGGCCAGAAGGTGGCGGACGCGCTGAGCCTCGCCGTGGAGCGCCTGGACGACCTACCCCACGCGCTGTCCGCGCTGAGCCACCTGCACGCGTGCCAGCTGCGAGTGGACCCGGCCAGCTTCCAGGTGAGCGGCTGCCGTGCTGGGCCCCTGTCCCCGGGAGGGCCCCGGCGGGGTGGGTGCGGGGGGCGTGCGGGGCGGGTGCAGGCGAGTGAGCCTTGAGCGCTCGCCGCAGCTCCTGGGCCACTGCCTGCTGGTAACCCTCGCCCGGCACTACCCCGGAGACTTCAGCCCCGCGCTGCAGGCGTCGCTGGACAAGTTCCTGAGCCACGTTATCTCGGCGCTGGTTTCCGAGTACCGCTGAACTGTGGGTGGGTGGCCGCGGGATCCCCAGGCGACCTTCCCCGTGTTTGAGTAAAGCCTCTCCCAGGAGCAGCCTTCTTGCCGTGCTCTCTCGAGGTCAGGACGCGAGAGGAAGGCGC' ) ) ). + +] + +{ #category : 'testing-tokenizer' } +BioFASTAParserTest >> testTokenizeMultiFasta05 [ + + self assert: ( + (self parserClass tokenizeMultiFasta: self multiFastaSeq05) + bioHasEqualElements: #( + #('sequence1' 'ACTCCCCGTGCGCGCCCGGCCCGTAGCGTCCTCGTCGCCGCCCCTCGTCTCGCAGCCGCAGCCCGCGTGGACGCTCTCGCCTGAGCGCCGCGGACTAGCCCGGGTGGCC') + #('sequence2' 'CAGTCCGGCAGCGCCGGGGTTAAGCGGCCCAAGTAAACGTAGCGCAGCGATCGGCGCCGGAGATTCGCGAACCCGACACTCCGCGCCGCCCGCCGGCCAGGACCCGCGGCGCGATCGCGGCGCCGCGCTACAGCCAGCCTCACTGGCGCGCGGGCGAGCGCACGGGCGCTC' ) + #('sequence3' 'CACGACAGGCCCGCTGAGGCTTGTGCCAGACCTTGGAAACCTCAGGTATATACCTTTCCAGACGCGGGATCTCCCCTCCCC') + #('sequence4' 'CAGCAGACATCTGAATGAAGAAGAGGGTGCCAGCGGGTATGAGGAGTGCATTATCGTTAATGGGAACTTCAGTGACCAGTCCTCAGACACGAAGGATGCTCCCTCACCCCCAGTCTTGGAGGCAATCTGCACAGAGCCAGTCTGCACACC') + )). 
+ + +] + +{ #category : 'testing-tokenizer' } +BioFASTAParserTest >> testTokenizeMultiFasta07 [ + + self + assert: ((self parserClass tokenizeMultiFasta: self multiFastaSeq07) + bioHasEqualElements: #( + #('Sample sequence 1' 'garkbdctymvhu') + #('Sample sequence 2' 'ctymvhgarkbda') + #('Sample sequence 3' 'ccccccccccga'))). + + +] + +{ #category : 'testing-tokenizer' } +BioFASTAParserTest >> testTokenizeMultiFastaWithAdditionalSeparatorBetweenRecords [ + + self + assert: ((self parserClass tokenizeMultiFasta: self multiFastaSeq06PlainText) + bioHasEqualElements: #( + #('YAL069W-1.334 Putative promoter sequence' 'CCACACCACACCCACACACCCACACACCACACCACACACCACACCACACCCACACACACACATCCTAACACTACCCTAACACAGCCCTAATCTAACCCTGGCCAACCTGTCTCTCAACTTACCCTCCATTACCCTGCCTCCACTCGTTACCCTGTCCCATTCAACCATACCACTCCGAACCACCATCCATCCCTCTACTTACTACCACTCACCCACCGTTACCCTCCAATTACCCATATCCAACCCACTGCCACTTACCCTACCATTACCCTACCATCCACCATGACCTACTCACCATACTGTTCTTCTACCCACCATATTGAAACGCTAACAA') + #('YAL068C-7235.2170 Putative promoter sequence' 'TACGAGAATAATTTCTCATCATCCAGCTTTAACACAAAATTCGCACAGTTTTCGTTAAGAGAACTTAACATTTTCTTATGACGTAAATGAAGTTTATATATAAATTTCCTTTTTATTGGATAATATGCCTATGCCGCATAATTTTTATATCTTTCTCCTAACAAAACATTCGCTTGTAAAGTATTATATTTAGGAAAAAAATAATCGATAAAGGCTCATCCGAAGATCAGTTAGATTCTTTTTGCAAGTCCTGAAGAAATTTTCACACTACTACTATAAAAAAAAAATATCATAAAAAGGTACATTACGTGCAACCAAAAGTGTAAAATGATTGGTTGCAATGTTTCACCTAAATTACTT') ) ). + + +] + +{ #category : 'testing-builder' } +BioFASTAParserTest >> testTokenizeSingleFastaDescription01 [ + + | seqHeader | + seqHeader := '>gi|198282148|ref|NC_011206.1| Acidithiobacillus ferrooxidans ATCC 53993 chromosome, complete genome'. + + parseResult := self parserClass tokenizeFastaDescription: seqHeader. + self assert: (parseResult isKindOf: Collection). 
+ self + assert: parseResult + equals: + 'Acidithiobacillus ferrooxidans ATCC 53993 chromosome, complete genome' +] + +{ #category : 'testing-builder' } +BioFASTAParserTest >> testTokenizeSingleFastaHeader01 [ + + | seqHeader | + seqHeader := '>gi|198282148|ref|NC_011206.1| Acidithiobacillus ferrooxidans ATCC 53993 chromosome, complete genome'. + + parseResult := self parserClass tokenizeFastaHeader: seqHeader. + self assert: (parseResult isKindOf: Collection). + self + assert: parseResult + equals: + #( '>gi' '198282148' 'ref' 'NC_011206.1' 'Acidithiobacillus ferrooxidans ATCC 53993 chromosome, complete genome' ) +] diff --git a/repository/BioParsers-Tests/BioGFF3Test.class.st b/repository/BioParsers-Tests/BioGFF3Test.class.st new file mode 100644 index 00000000..a162a963 --- /dev/null +++ b/repository/BioParsers-Tests/BioGFF3Test.class.st @@ -0,0 +1,315 @@ +Class { + #name : 'BioGFF3Test', + #superclass : 'BioAbstractParserTest', + #category : 'BioParsers-Tests', + #package : 'BioParsers-Tests' +} + +{ #category : 'accessing' } +BioGFF3Test >> baseDirectory [ + + ^ self testFilesFullDirectoryName / 'gff'. +] + +{ #category : 'accessing' } +BioGFF3Test >> hsaFile [ + + ^ self baseDirectory / 'hsa.gff3'. +] + +{ #category : 'accessing' } +BioGFF3Test >> rnoFile [ + + ^ self baseDirectory / 'rno.gff3'. +] + +{ #category : 'tests' } +BioGFF3Test >> testAsBioSequenceFeature [ + + | gff result f sf | + gff := '##gff-version 3 +chr1 . gene 1000 9000 . + . ID=gene1;Name=EDEN +'. + result := BioGFF3File fromString: gff. + f := result features first. + sf := f asBioSequenceFeature. + self assert: sf primaryTag equals: 'gene'. + self assert: sf sourceTag equals: '.'. + self assert: sf start equals: 1000. + self assert: sf end equals: 9000. + self assert: sf strand equals: '+'. + self assert: sf chromosome equals: 'chr1' +] + +{ #category : 'tests' } +BioGFF3Test >> testAttributesParsing [ + + | gff result f | + gff := '##gff-version 3 +chr1 . gene 1000 9000 . + . 
ID=gene1;Name=EDEN;Note=some%20note +'. + result := BioGFF3File fromString: gff. + f := result features first. + self assert: f id equals: 'gene1'. + self assert: f name equals: 'EDEN'. + self assert: (f attributeAt: 'Note') equals: 'some%20note' +] + +{ #category : 'tests' } +BioGFF3Test >> testBasicParse [ + + | gff result | + gff := '##gff-version 3 +ctg123 . gene 1000 9000 . + . ID=gene1;Name=EDEN +'. + result := BioGFF3File fromString: gff. + self assert: result featureCount equals: 1. + self assert: result features first type equals: 'gene'. + self assert: result features first name equals: 'EDEN'. + self assert: result features first id equals: 'gene1' +] + +{ #category : 'tests' } +BioGFF3Test >> testComments [ + + | gff result | + gff := '##gff-version 3 +# This is a comment +# Another comment +chr1 . gene 1000 9000 . + . ID=g1 +'. + result := BioGFF3File fromString: gff. + self assert: result featureCount equals: 1. + self assert: result features first id equals: 'g1' +] + +{ #category : 'tests' } +BioGFF3Test >> testDerivesFromRelationship [ + + | gff result parent children | + gff := '##gff-version 3 +chr1 . miRNA_primary_transcript 17369 17436 . - . ID=MI0022705;Name=hsa-mir-6859-1 +chr1 . miRNA 17409 17431 . - . ID=MIMAT1;Name=hsa-miR-6859-5p;Derives_from=MI0022705 +'. + result := BioGFF3File fromString: gff. + parent := result featureWithId: 'MI0022705'. + self assert: parent isNotNil. + children := result derivesFromOf: parent. + self assert: children size equals: 1. + self assert: children first name equals: 'hsa-miR-6859-5p' +] + +{ #category : 'tests' } +BioGFF3Test >> testFeatureFields [ + + | gff result f | + gff := '##gff-version 3 +chr1 . gene 1000 9000 . + . ID=gene1;Name=EDEN +'. + result := BioGFF3File fromString: gff. + f := result features first. + self assert: f seqid equals: 'chr1'. + self assert: f source equals: '.'. + self assert: f type equals: 'gene'. + self assert: f start equals: '1000'. + self assert: f end equals: '9000'. 
+ self assert: f score equals: '.'. + self assert: f strand equals: '+'. + self assert: f phase equals: '.' +] + +{ #category : 'tests' } +BioGFF3Test >> testFeatureTypeFiltering [ + + | gff result | + gff := '##gff-version 3 +chr1 . gene 1000 9000 . + . ID=g1 +chr1 . mRNA 1050 9000 . + . ID=m1;Parent=g1 +chr1 . exon 1300 1500 . + . Parent=m1 +chr1 . CDS 1201 1500 . + 0 ID=c1;Parent=m1 +'. + result := BioGFF3File fromString: gff. + self assert: result featureCount equals: 4. + self assert: (result featuresWithType: 'gene') size equals: 1. + self assert: (result featuresWithType: 'exon') size equals: 1. + self assert: (result featuresWithType: 'CDS') size equals: 1. + self + assert: result featureTypes + equals: #( 'CDS' 'exon' 'gene' 'mRNA' ) +] + +{ #category : 'tests' } +BioGFF3Test >> testFromFileFilteringSeqid [ + + | result | + result := BioGFF3File new + fromFile: self rnoFile + filteringSeqid: 'chr1'. + self assert: result featureCount > 0. + self assert: (result features allSatisfy: [ :f | f seqid = 'chr1' ]) +] + +{ #category : 'tests' } +BioGFF3Test >> testFromFileFilteringTypes [ + + | result | + result := BioGFF3File new + fromFile: self hsaFile + filteringTypes: (Set with: 'miRNA'). + self assert: result featureCount equals: 2883. + self assert: (result features allSatisfy: [ :f | f type = 'miRNA' ]) +] + +{ #category : 'tests' } +BioGFF3Test >> testFromFileMaxFeatures [ + + | result | + result := BioGFF3File new fromFile: self hsaFile maxFeatures: 100. + self assert: result featureCount equals: 100 +] + +{ #category : 'tests' } +BioGFF3Test >> testGffVersion [ + + | gff result | + gff := '##gff-version 3 +ctg123 . gene 1000 9000 . + . ID=gene1 +'. + result := BioGFF3File fromString: gff. + self assert: result gffVersion equals: '3' +] + +{ #category : 'tests' } +BioGFF3Test >> testGroupByType [ + + | gff result groups | + gff := '##gff-version 3 +chr1 . gene 1000 9000 . + . ID=g1 +chr1 . mRNA 1050 9000 . + . ID=m1 +chr1 . exon 1300 1500 . + . 
Parent=m1 +'. + result := BioGFF3File fromString: gff. + groups := result groupByType. + self assert: (groups at: 'gene') size equals: 1. + self assert: (groups at: 'mRNA') size equals: 1. + self assert: (groups at: 'exon') size equals: 1 +] + +{ #category : 'tests' } +BioGFF3Test >> testIntegerAccessors [ + + | gff result f | + gff := '##gff-version 3 +chr1 . exon 1000 9000 . + . Parent=mRNA1 +'. + result := BioGFF3File fromString: gff. + f := result features first. + self assert: f startInteger equals: 1000. + self assert: f endInteger equals: 9000. + self assert: f isForwardStrand. + self deny: f isReverseStrand +] + +{ #category : 'tests' } +BioGFF3Test >> testParentChildRelationship [ + + | gff result gene children | + gff := '##gff-version 3 +ctg123 . gene 1000 9000 . + . ID=gene00001;Name=EDEN +ctg123 . mRNA 1050 9000 . + . ID=mRNA00001;Parent=gene00001 +'. + result := BioGFF3File fromString: gff. + gene := result featureWithId: 'gene00001'. + self assert: gene isNotNil. + children := result childrenOf: gene. + self assert: children size equals: 1. + self assert: children first type equals: 'mRNA' +] + +{ #category : 'tests' } +BioGFF3Test >> testParseHsaGffFile [ + + | result | + result := BioGFF3File fromFile: self hsaFile. + self assert: result featureCount equals: 4801. + self + assert: result featureTypes + equals: #( 'miRNA' 'miRNA_primary_transcript' ). + self assert: result seqids size equals: 24. + self assert: (result featuresWithType: 'miRNA') size equals: 2883. + self + assert: (result featuresWithType: 'miRNA_primary_transcript') size + equals: 1918 +] + +{ #category : 'tests' } +BioGFF3Test >> testParseRnoGffFile [ + + | result | + result := BioGFF3File fromFile: self rnoFile. + self assert: result featureCount equals: 1323. + self + assert: result featureTypes + equals: #( 'miRNA' 'miRNA_primary_transcript' ) +] + +{ #category : 'tests' } +BioGFF3Test >> testReverseStrand [ + + | gff result f | + gff := '##gff-version 3 +chr1 . 
gene 1000 9000 . - . ID=g1 +'. + result := BioGFF3File fromString: gff. + f := result features first. + self assert: f isReverseStrand. + self deny: f isForwardStrand +] + +{ #category : 'tests' } +BioGFF3Test >> testScoreFloat [ + + | f | + f := BioGFF3Feature new score: '.'. + self assert: f scoreFloat equals: nil. + f score: '42.5'. + self assert: f scoreFloat equals: 42.5 +] + +{ #category : 'tests' } +BioGFF3Test >> testSeqidFiltering [ + + | gff result | + gff := '##gff-version 3 +chr1 . gene 1000 9000 . + . ID=g1 +chr2 . gene 2000 8000 . + . ID=g2 +'. + result := BioGFF3File fromString: gff. + self assert: (result featuresWithSeqid: 'chr1') size equals: 1. + self assert: (result featuresWithSeqid: 'chr2') size equals: 1. + self assert: result seqids equals: #( 'chr1' 'chr2' ) +] + +{ #category : 'tests' } +BioGFF3Test >> testStreamFeaturesFromFile [ + + | count | + count := 0. + BioGFF3File new + streamFeaturesFromFile: self hsaFile + block: [ :f | count := count + 1 ]. + self assert: count equals: 4801 +] + +{ #category : 'tests' } +BioGFF3Test >> testTypePredicates [ + + | f | + f := BioGFF3Feature new type: 'gene'. + self assert: f isOfGeneType. + f type: 'miRNA'. + self assert: f isMiRNAType. + self deny: f isMiRNAPrimaryTranscript. + f type: 'miRNA_primary_transcript'. 
+ self assert: f isMiRNAPrimaryTranscript +] diff --git a/repository/BioParsers-Tests/BioGenBankParserTest.class.st b/repository/BioParsers-Tests/BioGenBankParserTest.class.st new file mode 100644 index 00000000..223eaa20 --- /dev/null +++ b/repository/BioParsers-Tests/BioGenBankParserTest.class.st @@ -0,0 +1,51 @@ +Class { + #name : 'BioGenBankParserTest', + #superclass : 'BioAbstractParserTest', + #category : 'BioParsers-Tests', + #package : 'BioParsers-Tests' +} + +{ #category : 'testing' } +BioGenBankParserTest >> gbSeq01 [ + " From http://www.genomatix.de/online_help/help/sequence_formats.html " + + ^ 'LOCUS AB000263 368 bp mRNA linear PRI 05-FEB-1999 +DEFINITION Homo sapiens mRNA for prepro cortistatin like peptide, complete + cds. +ACCESSION AB000263 +ORIGIN + 1 acaagatgcc attgtccccc ggcctcctgc tgctgctgct ctccggggcc acggccaccg + 61 ctgccctgcc cctggagggt ggccccaccg gccgagacag cgagcatatg caggaagcgg + 121 caggaataag gaaaagcagc ctcctgactt tcctcgcttg gtggtttgag tggacctccc + 181 aggccagtgc cgggcccctc ataggagagg aagctcggga ggtggccagg cggcaggaag + 241 gcgcaccccc ccagcaatcc gcgcgccggg acagaatgcc ctgcaggaac ttcttctgga + 301 agaccttctc ctcctgcaaa taaaacctca cccatgaatg ctcacgcaag tttaattaca + 361 gacctgaa +// +' +] + +{ #category : 'accessing' } +BioGenBankParserTest >> setUp [ + + super setUp. + parser := BioGBParser new +] + +{ #category : 'testing' } +BioGenBankParserTest >> testGenBankTokenize01 [ + + self + assert: (BioParser tokenizeAccession: 'gb|AAM45611.1|AF384285_1') + equals: #( 'AAM45611' '1' ). 
+ +] + +{ #category : 'testing' } +BioGenBankParserTest >> testGenBankTokenize02 [ + + self + assert: (BioParser tokenizeLocus: 'gb|AAM45611.1|AF384285_1') + equals: 'AF384285_1' + +] diff --git a/repository/BioParsers-Tests/BioGenIdParserTest.class.st b/repository/BioParsers-Tests/BioGenIdParserTest.class.st new file mode 100644 index 00000000..58267d44 --- /dev/null +++ b/repository/BioParsers-Tests/BioGenIdParserTest.class.st @@ -0,0 +1,33 @@ +Class { + #name : 'BioGenIdParserTest', + #superclass : 'BioAbstractParserTest', + #category : 'BioParsers-Tests', + #package : 'BioParsers-Tests' +} + +{ #category : 'accessing' } +BioGenIdParserTest >> setUp [ + " Private - See superimplementor's comment " + + super setUp. + parser := BioGIParser new. + +] + +{ #category : 'testing' } +BioGenIdParserTest >> testTokenizeGenBankIdentifier01 [ + + self assert: (self parser tokenize: 'gi|32128012') equals: '32128012'. + self assert: (self parser tokenize: 'gi|152787') equals: '152787' +] + +{ #category : 'testing' } +BioGenIdParserTest >> testTokenizeGenBankIdentifier02 [ + + self + should: [ self parser tokenize: 'gi' ] + raise: Error. 
+ self + should: [ self parser tokenize: 'gi|' ] + raise: Error +] diff --git a/repository/BioParsers-Tests/BioMAFParserTest.class.st b/repository/BioParsers-Tests/BioMAFParserTest.class.st new file mode 100644 index 00000000..6d09755a --- /dev/null +++ b/repository/BioParsers-Tests/BioMAFParserTest.class.st @@ -0,0 +1,81 @@ +Class { + #name : 'BioMAFParserTest', + #superclass : 'BioAbstractFASTAParserTest', + #category : 'BioParsers-Tests', + #package : 'BioParsers-Tests' +} + +{ #category : 'sample-data' } +BioMAFParserTest >> mafSample01 [ + " From http://genome.ucsc.edu/FAQ/FAQformat.html " + + ^ '##maf version=1 scoring=tba.v8 +# tba.v8 (((human chimp) baboon) (mouse rat)) + +a score=23262.0 +s hg18.chr7 27578828 38 + 158545518 AAA-GGGAATGTTAACCAAATGA---ATTGTCTCTTACGGTG +s panTro1.chr6 28741140 38 + 161576975 AAA-GGGAATGTTAACCAAATGA---ATTGTCTCTTACGGTG +s baboon 116834 38 + 4622798 AAA-GGGAATGTTAACCAAATGA---GTTGTCTCTTATGGTG +s mm4.chr6 53215344 38 + 151104725 -AATGGGAATGTTAAGCAAACGA---ATTGTCTCTCAGTGTG +s rn3.chr4 81344243 40 + 187371129 -AA-GGGGATGCTAAGCCAATGAGTTGTTGTCTCTCAATGTG + +a score=5062.0 +s hg18.chr7 27699739 6 + 158545518 TAAAGA +s panTro1.chr6 28862317 6 + 161576975 TAAAGA +s baboon 241163 6 + 4622798 TAAAGA +s mm4.chr6 53303881 6 + 151104725 TAAAGA +s rn3.chr4 81444246 6 + 187371129 taagga + +a score=6636.0 +s hg18.chr7 27707221 13 + 158545518 gcagctgaaaaca +s panTro1.chr6 28869787 13 + 161576975 gcagctgaaaaca +s baboon 249182 13 + 4622798 gcagctgaaaaca +s mm4.chr6 53310102 13 + 151104725 ACAGCTGAAAATA' +] + +{ #category : 'sample-data' } +BioMAFParserTest >> mafSample02 [ + " From https://cgwb.nci.nih.gov/goldenPath/help/maf.html " + + ^ '##maf version=1 scoring=probability +#mblastz 8.91 02-Jan-2005 + +a score=0.128 +s human_hoxa 100 9 + 100257 ACA-TTACT +s horse_hoxa 120 10 - 98892 ACAATTGCT +s fugu_hoxa 88 8 + 90788 ACA--TGCT + +a score=0.071 +s human_unc 9077 8 + 10998 ACAGTATT +s horse_unc 4555 6 - 5099 ACA--ATT +s fugu_unc 4000 4 + 4038 
AC----TT' +] + +{ #category : 'accessing' } +BioMAFParserTest >> parserClass [ + + ^ BioMAFParser +] + +{ #category : 'accessing' } +BioMAFParserTest >> setUp [ + + super setUp. + parser := self parserClass new +] + +{ #category : 'testing' } +BioMAFParserTest >> testMultiSeqAlignment01 [ + +" parseResult := self parser parse: self mafSample01." + self assert: true + +] + +{ #category : 'testing' } +BioMAFParserTest >> testMultiSeqAlignment02 [ + + "parseResult := self parser parse: self mafSample02." + self assert: true + +] diff --git a/repository/BioParsers-Tests/BioNCBIIdParserTest.class.st b/repository/BioParsers-Tests/BioNCBIIdParserTest.class.st new file mode 100644 index 00000000..1033dc7b --- /dev/null +++ b/repository/BioParsers-Tests/BioNCBIIdParserTest.class.st @@ -0,0 +1,83 @@ +Class { + #name : 'BioNCBIIdParserTest', + #superclass : 'BioAbstractParserTest', + #category : 'BioParsers-Tests', + #package : 'BioParsers-Tests' +} + +{ #category : 'accessing' } +BioNCBIIdParserTest >> parserClass [ + " Private - See superimplementor's comment " + + ^ BioNCBIIdParser + +] + +{ #category : 'accessing' } +BioNCBIIdParserTest >> setUp [ + " Private - See superimplementor's comment " + + super setUp. + parser := self parserClass new. + +] + +{ #category : 'testing' } +BioNCBIIdParserTest >> testAllIdentifiers [ + + | identifiers | + + identifiers := self parserClass allIdentifiers. + self assert: (identifiers isKindOf: Collection). + self + assertCollection: identifiers + hasSameElements: #('pdb' 'bbs' 'gi' 'gnl' 'lcl' 'pat' 'pir' 'prf' 'sp' 'dbj' 'emb' 'gb' 'ref'). +] + +{ #category : 'testing' } +BioNCBIIdParserTest >> testClassForBBS [ + + self + assert: (self parserClass classFor: 'bbs') + equals: BioGIBackBoneIdParser. + +] + +{ #category : 'testing' } +BioNCBIIdParserTest >> testClassForDBJ [ + + self assert: (self parserClass classFor: 'dbj') equals: BioDDBJParser. 
+ +] + +{ #category : 'testing' } +BioNCBIIdParserTest >> testClassForEMB [ + + self assert: (self parserClass classFor: 'emb') equals: BioEMBLParser. + +] + +{ #category : 'testing' } +BioNCBIIdParserTest >> testClassForPDB [ + + self + assert: (self parserClass classFor: 'pdb') + equals: BioBrookhavenProtParser. +] + +{ #category : 'testing' } +BioNCBIIdParserTest >> testClassForRefSeq [ + + self + assert: (self parserClass classFor: 'ref') + equals: BioRefSeqParser. +] + +{ #category : 'testing' } +BioNCBIIdParserTest >> testClassForSwissProt [ + + self + assert: (self parserClass classFor: 'sp') + equals: BioSwissProtParser + +] diff --git a/repository/BioParsers-Tests/BioPhylipParserTest.class.st b/repository/BioParsers-Tests/BioPhylipParserTest.class.st new file mode 100644 index 00000000..2c2e750d --- /dev/null +++ b/repository/BioParsers-Tests/BioPhylipParserTest.class.st @@ -0,0 +1,254 @@ +Class { + #name : 'BioPhylipParserTest', + #superclass : 'BioAbstractParserTest', + #category : 'BioParsers-Tests', + #package : 'BioParsers-Tests' +} + +{ #category : 'testing' } +BioPhylipParserTest >> firstLineTokenizer [ + + ^ BioPhylipParser new firstLineTokenizer +] + +{ #category : 'testing' } +BioPhylipParserTest >> phylipInterleavedDNA [ + + ^ ' 6 13 +Archaeopt CGATGCTTAC CGCCGATGCT +HesperorniCGTTACTCGT TGTCGTTACT +BaluchitheTAATGTTAAT TGTTAATGTT +B. 
virginiTAATGTTCGT TGTTAATGTT +BrontosaurCAAAACCCAT CATCAAAACC +B.subtilisGGCAGCCAAT CACGGCAGCC + +TACCGCCGAT GCTTACCGC +CGTTGTCGTT ACTCGTTGT +AATTGTTAAT GTTAATTGT +CGTTGTTAAT GTTCGTTGT +CATCATCAAA ACCCATCAT +AATCACGGCA GCCAATCAC + +CCCCGCCCCC GCTTACCGC +CCCCGTCCCC ACTCGTTGT +CCCCGTCCCC GTTAATTGT +CCCCGTCCCC GTTCGTTGT +CCCCATCCCC ACCCATCAT +CCCCACCCCC GCCAATCAC +' +] + +{ #category : 'testing' } +BioPhylipParserTest >> phylipInterleavedProtein [ + + ^ ' 5 176 +cox2_leitaMAFILSFWMI FLLDSVIVLL SFVCFVCVWI CALLFSTVLL VSKLNNIYCT +cox2_crifaMAFILSFWMI FLIDAVIVLL SFVCFVCIWI CSLFFSSFLL VSKINNVYCT +cox2_bsaltMSFIISFWML FLIDSLIVLL SGAIFVCIWI CSLFFLCILF ICKLDYIFCS +cox2_trybbMSFILTFWMI FLMDSIIVLI SFSIFLSVWI CALIIATVLT VTKINNIYCT +cox2_tborrMLFFINQLLL LLVDTFVILE IFSLFVCVFI IVMYILFINY NIFLKNINVY + +WDFTASKFID VYWFTIGGMF SLGLLLRLCL LLYFGHLNFV SFDLCKVVGF +WDFTASKFID AYWFTIGGMF VLCLLLRLCL LLYFGCLNFV SFDLCKVVGF +WDFISAKFID LYWFTLGCLF IVCLLIRLCL LLYFSCLNFV CFDLCKCIGF +WDFISSKFID TYWFVLGMMF ILCLLLRLCL LLYFSCINFV SFDLCKVIGF +LDFIGSKYLD LYWFLIGIFF VIVLLIRLCL LLYYSWISLL IFDLCKIMGF + +QWYWVYFIFG ETTIFSNLIL ESDYMIGDLR LLQCNHVLTL LSLVIYKLWL +QWYWVYFIFG ETTIFSNLIL ESDYLIGDLR LLQCNHVLTL LSLVIYKLWL +QWYWVYFIFG ETTIFSNLIL ESDYLIGDLR LLQCNHVLTL LSLVIYKVWL +QWYWVYFLFG ETTIFSNLIL ESDYLIGDLR ILQCNHVLTL LSLVIYKLWV +QWYWIFFVFK ENVIFSNLLI ESDYWIGDLR LLQCNNTFNL ICLVVYKIWV + +SAVDVIHSFA ISSLGVKVEN LVAVMK +SAVDVIHSFA VSSLGIKVDC IPGRCN +SAIDVIHSFT LANLGIKVD? 
?PGRCN +SAVDVIHSFT ISSLGIKVEN PGRCNE +TSIDVIHSFT ISTLGIKIDC IPGRCN +' +] + +{ #category : 'testing' } +BioPhylipParserTest >> speciesDNALineTokenizer [ + + ^ BioPhylipParser new speciesDNALineTokenizer +] + +{ #category : 'testing' } +BioPhylipParserTest >> speciesDNANamedBlockTokenizer [ + + ^ BioPhylipParser new speciesDNANamedBlockTokenizer +] + +{ #category : 'testing' } +BioPhylipParserTest >> testTokenizeDNASpeciesBlock01 [ + " Private - Answer a with a sample phylip DNA " + + | speciesBlock expectedResult firstRecord | + + speciesBlock := 'Archaeopt CGATGCTTAC CGC +HesperorniCGTTACTCGT TGT +BaluchitheTAATGTTAAT TGT +B. virginiTAATGTTCGT TGT +BrontosaurCAAAACCCAT CAT +B.subtilisGGCAGCCAAT CAC'. + expectedResult := #(#('Archaeopt ' 'CGATGCTTAC CGC' nil) #('Hesperorni' 'CGTTACTCGT TGT' nil) #('Baluchithe' 'TAATGTTAAT TGT' nil) #('B. virgini' 'TAATGTTCGT TGT' nil) #('Brontosaur' 'CAAAACCCAT CAT' nil)). + + parseResult := self speciesDNANamedBlockTokenizer parse: speciesBlock. + firstRecord := parseResult first. + + self assert: firstRecord first equals: 'Archaeopt '. + self assert: firstRecord second equals: 'CGATGCTTAC CGC'. + +] + +{ #category : 'testing' } +BioPhylipParserTest >> testTokenizeDNASpeciesBlock02 [ + " Private - Answer a with a sample phylip DNA " + + | speciesBlock expectedResult firstRecord | + + speciesBlock := 'Archaeopt CGATGCTTAC CGC +Hes CGTTACTCGT TGT +BaluchitheTAATGTTAAT TGT +B. virginiTAATGTTCGT TGT +BrontosaurCAAAACCCAT CAT +B.subtilisGGCAGCCAAT CAC'. + expectedResult := #( + #('Archaeopt ' 'CGATGCTTAC CGC') + #('Hesperorni' 'CGTTACTCGT TGT') + #('Baluchithe' 'TAATGTTAAT TGT') + #('B. virgini' 'TAATGTTCGT TGT') + #('Brontosaur' 'CAAAACCCAT CAT')). + + parseResult := self speciesDNANamedBlockTokenizer parse: speciesBlock. + firstRecord := parseResult first. + + self assert: firstRecord first equals: 'Archaeopt '. + self assert: firstRecord second equals: 'CGATGCTTAC CGC'. 
+ +] + +{ #category : 'testing' } +BioPhylipParserTest >> testTokenizeDNASpeciesLine01 [ + " Private - Answer a with a sample phylip DNA " + + | speciesLineBlock expectedResult | + + speciesLineBlock := 'Archaeopt CGATGCTTAC CGC'. + expectedResult := #('Archaeopt ' 'CGATGCTTACCGC'). + parseResult := self speciesDNALineTokenizer parse: speciesLineBlock. + + self assert: (parseResult bioHasEqualElements: expectedResult ). +] + +{ #category : 'testing' } +BioPhylipParserTest >> testTokenizeDNASpeciesLine02 [ + " Private - Answer a with a sample phylip DNA " + + | speciesLineBlock expectedResult | + + speciesLineBlock := 'Archaeopt CGATGCTTACCGC'. + expectedResult := #('Archaeopt ' 'CGATGCTTACCGC'). + parseResult := self speciesDNALineTokenizer parse: speciesLineBlock. + + self assert: (parseResult bioHasEqualElements: expectedResult). + +] + +{ #category : 'testing' } +BioPhylipParserTest >> testTokenizeDNASpeciesLine03 [ + " Private - Answer a with a sample phylip DNA " + + | speciesLineBlock expectedResult | + + speciesLineBlock := 'B. virginiTAATGTTCGT TGT'. + expectedResult := #('B. virgini' 'TAATGTTCGTTGT'). + parseResult := self speciesDNALineTokenizer parse: speciesLineBlock. + + self assert: (parseResult bioHasEqualElements: expectedResult). + +] + +{ #category : 'testing' } +BioPhylipParserTest >> testTokenizeFirstLine01 [ + " Private - Answer a with a sample phylip DNA " + + | firstLine | + + firstLine := '6 13 +'. + parseResult := self firstLineTokenizer parse: firstLine. + self assert: (parseResult bioHasEqualElements: #('6' '13') ). + +] + +{ #category : 'testing' } +BioPhylipParserTest >> testTokenizeFirstLine02 [ + " Private - Answer a with a sample phylip DNA " + + | firstLine | + + firstLine := ' 6 13 +'. + parseResult := self firstLineTokenizer parse: firstLine. + self assert: (parseResult bioHasEqualElements: #('6' '13') ). 
+ +] + +{ #category : 'testing' } +BioPhylipParserTest >> testTokenizeFirstLine03 [ + " Private - Answer a with a sample phylip DNA " + + | firstLine | + + firstLine := '6 13 +'. + parseResult := self firstLineTokenizer parse: firstLine. + self assert: (parseResult bioHasEqualElements: #('6' '13') ). + +] + +{ #category : 'testing' } +BioPhylipParserTest >> testTokenizeInterleavedDNA [ + " Private - Answer a with a sample phylip DNA " + + | phylipString | + phylipString := self phylipInterleavedDNA. + + parseResult := BioParser tokenizePhylipInterleavedDNA: phylipString. + self assert: parseResult size equals: 4. + self assert: parseResult first equals: 6. + self assert: parseResult second equals: 13. + self assert: (parseResult third bioHasEqualElements: + #( 'Archaeopt ' 'Hesperorni' 'Baluchithe' 'B. virgini' + 'Brontosaur' 'B.subtilis' )). + self assert: (parseResult fourth bioHasEqualElements: + #( 'CGATGCTTACCGCCGATGCTTACCGCCGATGCTTACCGCCCCCGCCCCCGCTTACCGC' + 'CGTTACTCGTTGTCGTTACTCGTTGTCGTTACTCGTTGTCCCCGTCCCCACTCGTTGT' + 'TAATGTTAATTGTTAATGTTAATTGTTAATGTTAATTGTCCCCGTCCCCGTTAATTGT' + 'TAATGTTCGTTGTTAATGTTCGTTGTTAATGTTCGTTGTCCCCGTCCCCGTTCGTTGT' + 'CAAAACCCATCATCAAAACCCATCATCAAAACCCATCATCCCCATCCCCACCCATCAT' + 'GGCAGCCAATCACGGCAGCCAATCACGGCAGCCAATCACCCCCACCCCCGCCAATCAC' )) +] + +{ #category : 'testing' } +BioPhylipParserTest >> testTokenizeInterleavedProtein [ + + | phylipString | + phylipString := self phylipInterleavedProtein. + parseResult := BioParser tokenizePhylipInterleavedProtein: phylipString. + + self assert: parseResult size equals: 4. + self assert: parseResult first equals: 5. + self assert: parseResult second equals: 176. + self assert: (parseResult third bioHasEqualElements: + #( 'cox2_leita' 'cox2_crifa' 'cox2_bsalt' 'cox2_trybb' + 'cox2_tborr' )). 
+ self assert: (parseResult fourth bioHasEqualElements: + #( 'MAFILSFWMIFLLDSVIVLLSFVCFVCVWICALLFSTVLLVSKLNNIYCTWDFTASKFIDVYWFTIGGMFSLGLLLRLCLLLYFGHLNFVSFDLCKVVGFQWYWVYFIFGETTIFSNLILESDYMIGDLRLLQCNHVLTLLSLVIYKLWLSAVDVIHSFAISSLGVKVENLVAVMK' + 'MAFILSFWMIFLIDAVIVLLSFVCFVCIWICSLFFSSFLLVSKINNVYCTWDFTASKFIDAYWFTIGGMFVLCLLLRLCLLLYFGCLNFVSFDLCKVVGFQWYWVYFIFGETTIFSNLILESDYLIGDLRLLQCNHVLTLLSLVIYKLWLSAVDVIHSFAVSSLGIKVDCIPGRCN' + 'MSFIISFWMLFLIDSLIVLLSGAIFVCIWICSLFFLCILFICKLDYIFCSWDFISAKFIDLYWFTLGCLFIVCLLIRLCLLLYFSCLNFVCFDLCKCIGFQWYWVYFIFGETTIFSNLILESDYLIGDLRLLQCNHVLTLLSLVIYKVWLSAIDVIHSFTLANLGIKVD??PGRCN' + 'MSFILTFWMIFLMDSIIVLISFSIFLSVWICALIIATVLTVTKINNIYCTWDFISSKFIDTYWFVLGMMFILCLLLRLCLLLYFSCINFVSFDLCKVIGFQWYWVYFLFGETTIFSNLILESDYLIGDLRILQCNHVLTLLSLVIYKLWVSAVDVIHSFTISSLGIKVENPGRCNE' + 'MLFFINQLLLLLVDTFVILEIFSLFVCVFIIVMYILFINYNIFLKNINVYLDFIGSKYLDLYWFLIGIFFVIVLLIRLCLLLYYSWISLLIFDLCKIMGFQWYWIFFVFKENVIFSNLLIESDYWIGDLRLLQCNNTFNLICLVVYKIWVTSIDVIHSFTISTLGIKIDCIPGRCN' )) +] diff --git a/repository/BioParsers-Tests/BioProteinParserTest.class.st b/repository/BioParsers-Tests/BioProteinParserTest.class.st new file mode 100644 index 00000000..08037634 --- /dev/null +++ b/repository/BioParsers-Tests/BioProteinParserTest.class.st @@ -0,0 +1,81 @@ +Class { + #name : 'BioProteinParserTest', + #superclass : 'BioAbstractParserTest', + #category : 'BioParsers-Tests', + #package : 'BioParsers-Tests' +} + +{ #category : 'accessing' } +BioProteinParserTest >> setUp [ + + super setUp. + parser := #proteinLetterGapped asPParser. +] + +{ #category : 'testing' } +BioProteinParserTest >> testProteinLetterMatches [ + + parser := #proteinLetterGapped asPParser. + + 'ACDEFGHIKLMNPQRSTVWYBXZJUO' do: [: letter | + self assert: (parser matches: (String with: letter))]. + 'acdefghiklmnpqrstvwybxzjuo' do: [: letter | + self assert: (parser matches: (String with: letter))]. + + self deny: (parser matches: ''). + self deny: (parser matches: '.'). + self assert: (parser matches: '?'). 
+ self assert: (parser matches: '-'). + + self should: [parser matches: $a] raise: MessageNotUnderstood. + self should: [parser matches: nil] raise: MessageNotUnderstood. +] + +{ #category : 'testing' } +BioProteinParserTest >> testProteinParseEmpty [ + + self deny: (parser matches: String empty). + +] + +{ #category : 'testing' } +BioProteinParserTest >> testProteinParseLowercaseSequence [ + + self assert: (parser matches: 'MNPQRSTVW' asLowercase). + +] + +{ #category : 'testing' } +BioProteinParserTest >> testProteinParseLowercaseSingleAminoacid [ + + self assert: (parser matches: 'p'). + +] + +{ #category : 'testing' } +BioProteinParserTest >> testProteinParseNumber [ + + self deny: (parser matches: '8743'). +] + +{ #category : 'testing' } +BioProteinParserTest >> testProteinParseSingleMismatch [ + + self assert: (parser matches: '-'). + self assert: (parser matches: '?'). + +] + +{ #category : 'testing' } +BioProteinParserTest >> testProteinParseUppercaseSequence [ + + self assert: (parser matches: 'MNPQRSTVW'). + +] + +{ #category : 'testing' } +BioProteinParserTest >> testProteinParseUppercaseSingleAminoacid [ + + self assert: (parser matches: 'P'). + +] diff --git a/repository/BioParsers-Tests/BioSwissProtParserTest.class.st b/repository/BioParsers-Tests/BioSwissProtParserTest.class.st new file mode 100644 index 00000000..49f5ae5a --- /dev/null +++ b/repository/BioParsers-Tests/BioSwissProtParserTest.class.st @@ -0,0 +1,38 @@ +Class { + #name : 'BioSwissProtParserTest', + #superclass : 'BioAbstractParserTest', + #category : 'BioParsers-Tests', + #package : 'BioParsers-Tests' +} + +{ #category : 'accessing' } +BioSwissProtParserTest >> setUp [ + + super setUp. + parser := BioSwissProtParser new. 
+] + +{ #category : 'testing' } +BioSwissProtParserTest >> testSwissProtTokenize01 [ + + self + assert: (BioParser tokenizeAccession: 'sp|P80487|HHP_THICU') + equals: #( 'P80487' ) +] + +{ #category : 'testing' } +BioSwissProtParserTest >> testSwissProtTokenize02 [ + + self + assert: (BioParser tokenizeAccession: 'sp|P80487.1|HHP_THICU') + equals: #( 'P80487' '1' ) +] + +{ #category : 'testing' } +BioSwissProtParserTest >> testSwissProtTokenize03 [ + + self + assert: + (BioParser tokenizeSwissProtEntryName: 'sp|Q9UWG2|RL3_METVA') + equals: 'RL3_METVA' +] diff --git a/repository/BioParsers-Tests/package.st b/repository/BioParsers-Tests/package.st new file mode 100644 index 00000000..db56e2fa --- /dev/null +++ b/repository/BioParsers-Tests/package.st @@ -0,0 +1 @@ +Package { #name : 'BioParsers-Tests' } diff --git a/repository/BioParsers/BioBlastContainerNode.class.st b/repository/BioParsers/BioBlastContainerNode.class.st index 7b30c887..4db3247a 100644 --- a/repository/BioParsers/BioBlastContainerNode.class.st +++ b/repository/BioParsers/BioBlastContainerNode.class.st @@ -4,9 +4,9 @@ Class { #instVars : [ 'nodes' ], - #category : 'BioParsers-Support', + #category : 'BioParsers-BLAST', #package : 'BioParsers', - #tag : 'Support' + #tag : 'BLAST' } { #category : 'comparing' } diff --git a/repository/BioParsers/BioBlastHitNode.class.st b/repository/BioParsers/BioBlastHitNode.class.st index ab950f79..8c25b653 100644 --- a/repository/BioParsers/BioBlastHitNode.class.st +++ b/repository/BioParsers/BioBlastHitNode.class.st @@ -1,9 +1,9 @@ Class { #name : 'BioBlastHitNode', #superclass : 'BioBlastValueNode', - #category : 'BioParsers-Support', + #category : 'BioParsers-BLAST', #package : 'BioParsers', - #tag : 'Support' + #tag : 'BLAST' } { #category : 'accessing' } diff --git a/repository/BioParsers/BioBlastHspNode.class.st b/repository/BioParsers/BioBlastHspNode.class.st index c3227770..9d1b0323 100644 --- a/repository/BioParsers/BioBlastHspNode.class.st +++ 
b/repository/BioParsers/BioBlastHspNode.class.st @@ -1,9 +1,9 @@ Class { #name : 'BioBlastHspNode', #superclass : 'BioBlastValueNode', - #category : 'BioParsers-Support', + #category : 'BioParsers-BLAST', #package : 'BioParsers', - #tag : 'Support' + #tag : 'BLAST' } { #category : 'accessing' } diff --git a/repository/BioParsers/BioBlastNode.class.st b/repository/BioParsers/BioBlastNode.class.st index 9552a827..cae8deb5 100644 --- a/repository/BioParsers/BioBlastNode.class.st +++ b/repository/BioParsers/BioBlastNode.class.st @@ -12,9 +12,9 @@ Class { #instVars : [ 'nodeName' ], - #category : 'BioParsers-Support', + #category : 'BioParsers-BLAST', #package : 'BioParsers', - #tag : 'Support' + #tag : 'BLAST' } { #category : 'accessing' } diff --git a/repository/BioParsers/BioBlastParentNode.class.st b/repository/BioParsers/BioBlastParentNode.class.st index 95461e13..5ac5d51a 100644 --- a/repository/BioParsers/BioBlastParentNode.class.st +++ b/repository/BioParsers/BioBlastParentNode.class.st @@ -1,9 +1,9 @@ Class { #name : 'BioBlastParentNode', #superclass : 'BioBlastContainerNode', - #category : 'BioParsers-Support', + #category : 'BioParsers-BLAST', #package : 'BioParsers', - #tag : 'Support' + #tag : 'BLAST' } { #category : 'instance creation' } diff --git a/repository/BioParsers/BioBlastRootNode.class.st b/repository/BioParsers/BioBlastRootNode.class.st index 753c43ac..f6b50799 100644 --- a/repository/BioParsers/BioBlastRootNode.class.st +++ b/repository/BioParsers/BioBlastRootNode.class.st @@ -1,9 +1,9 @@ Class { #name : 'BioBlastRootNode', #superclass : 'BioBlastParentNode', - #category : 'BioParsers-Support', + #category : 'BioParsers-BLAST', #package : 'BioParsers', - #tag : 'Support' + #tag : 'BLAST' } { #category : 'accessing' } diff --git a/repository/BioParsers/BioBlastStructureNode.class.st b/repository/BioParsers/BioBlastStructureNode.class.st index 07089a34..51e06b5c 100644 --- a/repository/BioParsers/BioBlastStructureNode.class.st +++ 
b/repository/BioParsers/BioBlastStructureNode.class.st @@ -4,9 +4,9 @@ The structure exists only to specify those nodes which are present in the XML an Class { #name : 'BioBlastStructureNode', #superclass : 'BioBlastContainerNode', - #category : 'BioParsers-Support', + #category : 'BioParsers-BLAST', #package : 'BioParsers', - #tag : 'Support' + #tag : 'BLAST' } { #category : 'accessing' } diff --git a/repository/BioParsers/BioBlastValueNode.class.st b/repository/BioParsers/BioBlastValueNode.class.st index 974bd135..bcc961ad 100644 --- a/repository/BioParsers/BioBlastValueNode.class.st +++ b/repository/BioParsers/BioBlastValueNode.class.st @@ -7,9 +7,9 @@ Class { #instVars : [ 'value' ], - #category : 'BioParsers-Support', + #category : 'BioParsers-BLAST', #package : 'BioParsers', - #tag : 'Support' + #tag : 'BLAST' } { #category : 'accessing' } diff --git a/repository/BioParsers/BioEResultKeysParser.class.st b/repository/BioParsers/BioEResultKeysParser.class.st index 1981ba08..c692a391 100644 --- a/repository/BioParsers/BioEResultKeysParser.class.st +++ b/repository/BioParsers/BioEResultKeysParser.class.st @@ -5,9 +5,9 @@ See http://www.ncbi.nlm.nih.gov/books/NBK25500/ for details (15/11/2011) Class { #name : 'BioEResultKeysParser', #superclass : 'BioEntrezResultParser', - #category : 'BioParsers-Core', + #category : 'BioParsers-Entrez', #package : 'BioParsers', - #tag : 'Core' + #tag : 'Entrez' } { #category : 'content' } diff --git a/repository/BioParsers/BioEntrezResultParser.class.st b/repository/BioParsers/BioEntrezResultParser.class.st index 79ee9845..acf50fa9 100644 --- a/repository/BioParsers/BioEntrezResultParser.class.st +++ b/repository/BioParsers/BioEntrezResultParser.class.st @@ -7,9 +7,9 @@ Instance Variables: Class { #name : 'BioEntrezResultParser', #superclass : 'BioSAXParser', - #category : 'BioParsers-Core', + #category : 'BioParsers-Entrez', #package : 'BioParsers', - #tag : 'Core' + #tag : 'Entrez' } { #category : 'content' } diff --git 
a/repository/BioParsers/BioEntrezXMLGBBasicParser.class.st b/repository/BioParsers/BioEntrezXMLGBBasicParser.class.st index cd6b0ed2..858a5978 100644 --- a/repository/BioParsers/BioEntrezXMLGBBasicParser.class.st +++ b/repository/BioParsers/BioEntrezXMLGBBasicParser.class.st @@ -4,9 +4,9 @@ This is a basic parser which can parse accession, definition and sequence nodes Class { #name : 'BioEntrezXMLGBBasicParser', #superclass : 'BioEntrezXMLGenBankSeqParser', - #category : 'BioParsers-Core', + #category : 'BioParsers-Entrez', #package : 'BioParsers', - #tag : 'Core' + #tag : 'Entrez' } { #category : 'accessing' } diff --git a/repository/BioParsers/BioEntrezXMLGBFullParser.class.st b/repository/BioParsers/BioEntrezXMLGBFullParser.class.st index 3b7a7b45..349a4516 100644 --- a/repository/BioParsers/BioEntrezXMLGBFullParser.class.st +++ b/repository/BioParsers/BioEntrezXMLGBFullParser.class.st @@ -1,9 +1,9 @@ Class { #name : 'BioEntrezXMLGBFullParser', #superclass : 'BioEntrezXMLGenBankSeqParser', - #category : 'BioParsers-Core', + #category : 'BioParsers-Entrez', #package : 'BioParsers', - #tag : 'Core' + #tag : 'Entrez' } { #category : 'accessing' } diff --git a/repository/BioParsers/BioEntrezXMLGBSeqFeatureQualParser.class.st b/repository/BioParsers/BioEntrezXMLGBSeqFeatureQualParser.class.st index 4a123255..45bcca48 100644 --- a/repository/BioParsers/BioEntrezXMLGBSeqFeatureQualParser.class.st +++ b/repository/BioParsers/BioEntrezXMLGBSeqFeatureQualParser.class.st @@ -38,9 +38,9 @@ fileRef := BioObject testFilesFullDirectoryName / 'GenBankTestFiles' / 'TestGBSe Class { #name : 'BioEntrezXMLGBSeqFeatureQualParser', #superclass : 'BioEntrezXMLGBSeqParser', - #category : 'BioParsers-XML', + #category : 'BioParsers-Entrez', #package : 'BioParsers', - #tag : 'XML' + #tag : 'Entrez' } { #category : 'accessing' } diff --git a/repository/BioParsers/BioEntrezXMLGBSeqFullParser.class.st b/repository/BioParsers/BioEntrezXMLGBSeqFullParser.class.st index cce00f7f..ca92e32d 
100644 --- a/repository/BioParsers/BioEntrezXMLGBSeqFullParser.class.st +++ b/repository/BioParsers/BioEntrezXMLGBSeqFullParser.class.st @@ -1,9 +1,9 @@ Class { #name : 'BioEntrezXMLGBSeqFullParser', #superclass : 'BioEntrezXMLGBSeqParser', - #category : 'BioParsers-XML', + #category : 'BioParsers-Entrez', #package : 'BioParsers', - #tag : 'XML' + #tag : 'Entrez' } { #category : 'accessing' } diff --git a/repository/BioParsers/BioEntrezXMLGBSeqJournalParser.class.st b/repository/BioParsers/BioEntrezXMLGBSeqJournalParser.class.st index d2b91438..9f7fd813 100644 --- a/repository/BioParsers/BioEntrezXMLGBSeqJournalParser.class.st +++ b/repository/BioParsers/BioEntrezXMLGBSeqJournalParser.class.st @@ -8,9 +8,9 @@ fileRef := BioObject testFilesFullDirectoryName / 'GenBankTestFiles' / 'TestGBSe Class { #name : 'BioEntrezXMLGBSeqJournalParser', #superclass : 'BioEntrezXMLGBSeqParser', - #category : 'BioParsers-XML', + #category : 'BioParsers-Entrez', #package : 'BioParsers', - #tag : 'XML' + #tag : 'Entrez' } { #category : 'accessing' } diff --git a/repository/BioParsers/BioEntrezXMLGBSeqParser.class.st b/repository/BioParsers/BioEntrezXMLGBSeqParser.class.st index 7049e35a..6c2483dc 100644 --- a/repository/BioParsers/BioEntrezXMLGBSeqParser.class.st +++ b/repository/BioParsers/BioEntrezXMLGBSeqParser.class.st @@ -25,9 +25,9 @@ Class { 'matches', 'records' ], - #category : 'BioParsers-XML', + #category : 'BioParsers-Entrez', #package : 'BioParsers', - #tag : 'XML' + #tag : 'Entrez' } { #category : 'accessing' } diff --git a/repository/BioParsers/BioEntrezXMLGenBankSeqParser.class.st b/repository/BioParsers/BioEntrezXMLGenBankSeqParser.class.st index 3d67ed70..a17b7862 100644 --- a/repository/BioParsers/BioEntrezXMLGenBankSeqParser.class.st +++ b/repository/BioParsers/BioEntrezXMLGenBankSeqParser.class.st @@ -7,9 +7,9 @@ Class { #instVars : [ 'eRecord' ], - #category : 'BioParsers-Core', + #category : 'BioParsers-Entrez', #package : 'BioParsers', - #tag : 'Core' + #tag : 
'Entrez' } { #category : 'accessing' } diff --git a/repository/BioParsers/BioEntrezXMLGenSetParser.class.st b/repository/BioParsers/BioEntrezXMLGenSetParser.class.st index fae3eb23..6f8ad224 100644 --- a/repository/BioParsers/BioEntrezXMLGenSetParser.class.st +++ b/repository/BioParsers/BioEntrezXMLGenSetParser.class.st @@ -4,9 +4,9 @@ Class { #instVars : [ 'eRecord' ], - #category : 'BioParsers-Core', + #category : 'BioParsers-Entrez', #package : 'BioParsers', - #tag : 'Core' + #tag : 'Entrez' } { #category : 'accessing' } diff --git a/repository/BioParsers/BioFASTABasicParser.class.st b/repository/BioParsers/BioFASTABasicParser.class.st index 8f6136dc..449fbff8 100644 --- a/repository/BioParsers/BioFASTABasicParser.class.st +++ b/repository/BioParsers/BioFASTABasicParser.class.st @@ -4,9 +4,9 @@ This class is not intended to be used directly Class { #name : 'BioFASTABasicParser', #superclass : 'BioIDParser', - #category : 'BioParsers-Core', + #category : 'BioParsers-FASTA', #package : 'BioParsers', - #tag : 'Core' + #tag : 'FASTA' } { #category : 'accessing private' } diff --git a/repository/BioParsers/BioFASTAMultiParser.class.st b/repository/BioParsers/BioFASTAMultiParser.class.st index fbb37e1d..e0ef1217 100644 --- a/repository/BioParsers/BioFASTAMultiParser.class.st +++ b/repository/BioParsers/BioFASTAMultiParser.class.st @@ -4,9 +4,9 @@ Parser for a FASTA file with several sequences. This class is not intended to be Class { #name : 'BioFASTAMultiParser', #superclass : 'BioFASTABasicParser', - #category : 'BioParsers-Core', + #category : 'BioParsers-FASTA', #package : 'BioParsers', - #tag : 'Core' + #tag : 'FASTA' } { #category : 'accessing private' } diff --git a/repository/BioParsers/BioFASTAParser.class.st b/repository/BioParsers/BioFASTAParser.class.st index 99760cbc..4ef7aa66 100644 --- a/repository/BioParsers/BioFASTAParser.class.st +++ b/repository/BioParsers/BioFASTAParser.class.st @@ -4,9 +4,9 @@ Parser for several FASTA file format elements. 
This class is not intended to be Class { #name : 'BioFASTAParser', #superclass : 'BioAbstractTextParser', - #category : 'BioParsers-Core', + #category : 'BioParsers-FASTA', #package : 'BioParsers', - #tag : 'Core' + #tag : 'FASTA' } { #category : 'accessing-parsers' } diff --git a/repository/BioParsers/BioGFF3CommentRecordNode.class.st b/repository/BioParsers/BioGFF3CommentRecordNode.class.st new file mode 100644 index 00000000..13326c7d --- /dev/null +++ b/repository/BioParsers/BioGFF3CommentRecordNode.class.st @@ -0,0 +1,34 @@ +Class { + #name : 'BioGFF3CommentRecordNode', + #superclass : 'BioGFF3GFF3FileNode', + #instVars : [ + 'text' + ], + #category : 'BioParsers-GFF3', + #package : 'BioParsers', + #tag : 'GFF3' +} + +{ #category : 'generated' } +BioGFF3CommentRecordNode >> acceptVisitor: aGFF3FileVisitor [ + + ^ aGFF3FileVisitor visitCommentRecord: self +] + +{ #category : 'generated' } +BioGFF3CommentRecordNode >> text [ + + ^ text +] + +{ #category : 'generated' } +BioGFF3CommentRecordNode >> text: aSmaCCToken [ + + text := aSmaCCToken +] + +{ #category : 'generated' } +BioGFF3CommentRecordNode >> tokenVariables [ + + ^ #( #text ) +] diff --git a/repository/BioParsers/BioGFF3DirectiveListNode.class.st b/repository/BioParsers/BioGFF3DirectiveListNode.class.st new file mode 100644 index 00000000..b705d52e --- /dev/null +++ b/repository/BioParsers/BioGFF3DirectiveListNode.class.st @@ -0,0 +1,31 @@ +Class { + #name : 'BioGFF3DirectiveListNode', + #superclass : 'BioGFF3GFF3FileNode', + #category : 'BioParsers-GFF3', + #package : 'BioParsers', + #tag : 'GFF3' +} + +{ #category : 'generated' } +BioGFF3DirectiveListNode >> acceptVisitor: aGFF3FileVisitor [ + + ^ aGFF3FileVisitor visitDirectiveList: self +] + +{ #category : 'generated' } +BioGFF3DirectiveListNode >> compositeNodeVariables [ + + ^ #( #directives ) +] + +{ #category : 'generated' } +BioGFF3DirectiveListNode >> directives [ + + ^ directives +] + +{ #category : 'generated-initialize-release' } 
+BioGFF3DirectiveListNode >> initialize [ + super initialize. + directives := OrderedCollection new: 2. +] diff --git a/repository/BioParsers/BioGFF3DirectiveNode.class.st b/repository/BioParsers/BioGFF3DirectiveNode.class.st new file mode 100644 index 00000000..aab2fce7 --- /dev/null +++ b/repository/BioParsers/BioGFF3DirectiveNode.class.st @@ -0,0 +1,34 @@ +Class { + #name : 'BioGFF3DirectiveNode', + #superclass : 'BioGFF3GFF3FileNode', + #instVars : [ + 'text' + ], + #category : 'BioParsers-GFF3', + #package : 'BioParsers', + #tag : 'GFF3' +} + +{ #category : 'generated' } +BioGFF3DirectiveNode >> acceptVisitor: aGFF3FileVisitor [ + + ^ aGFF3FileVisitor visitDirective: self +] + +{ #category : 'generated' } +BioGFF3DirectiveNode >> text [ + + ^ text +] + +{ #category : 'generated' } +BioGFF3DirectiveNode >> text: aSmaCCToken [ + + text := aSmaCCToken +] + +{ #category : 'generated' } +BioGFF3DirectiveNode >> tokenVariables [ + + ^ #( #text ) +] diff --git a/repository/BioParsers/BioGFF3Feature.class.st b/repository/BioParsers/BioGFF3Feature.class.st new file mode 100644 index 00000000..f36b6f6f --- /dev/null +++ b/repository/BioParsers/BioGFF3Feature.class.st @@ -0,0 +1,250 @@ +Class { + #name : 'BioGFF3Feature', + #superclass : 'Object', + #instVars : [ + 'seqid', + 'source', + 'type', + 'start', + 'end', + 'score', + 'strand', + 'phase', + 'attributes', + 'attributesDict' + ], + #category : 'BioParsers-GFF3', + #package : 'BioParsers', + #tag : 'GFF3' +} + +{ #category : 'as yet unclassified' } +BioGFF3Feature class >> fromFeatureLine: aNode [ + + ^ self new + seqid: aNode seqid value; + source: aNode _source value; + type: aNode type value; + start: aNode start value; + end: aNode end value; + score: aNode score value; + strand: aNode strand value; + phase: aNode phase value; + attributes: aNode _attributes value; + yourself +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> asBioSequenceFeature [ + + | sf | + sf := BioSequenceFeature new. 
+ sf primaryTag: self type. + sf sourceTag: self source. + sf start: self startInteger. + sf end: self endInteger. + sf strand: self strand. + sf score: self scoreFloat. + sf chromosome: self seqid. + sf frame: self phase. + self id ifNotNil: [ :i | sf addTag: 'ID' -> i ]. + self name ifNotNil: [ :n | sf addTag: 'Name' -> n ]. + ^ sf +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> attributeAt: key [ ^ self attributesDict at: key ifAbsent: [ nil ] +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> attributes [ ^ attributes +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> attributes: aString [ attributes := aString. attributesDict := nil +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> attributesDict [ + + ^ attributesDict ifNil: [ attributesDict := self parseAttributes ] +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> derivesFrom [ + + ^ self attributesDict at: 'Derives_from' ifAbsent: [ nil ] +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> end [ ^ end +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> end: aString [ end := aString +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> endInteger [ + + ^ end ifNotNil: [ end asInteger ] +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> hasAttribute: key [ ^ self attributesDict includesKey: key +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> id [ + + ^ self attributesDict at: 'ID' ifAbsent: [ nil ] +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> isForwardStrand [ ^ strand = (String with: $+) +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> isMiRNAPrimaryTranscript [ + + ^ self type = 'miRNA_primary_transcript' +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> isMiRNAType [ + + ^ self type = 'miRNA' +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> isOfCDSType [ + + ^ self type = 'CDS' +] + +{ #category : 'as yet unclassified' } 
+BioGFF3Feature >> isOfExonType [ + + ^ self type = 'exon' +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> isOfGeneType [ + + ^ self type = 'gene' +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> isOfMRNAType [ + + ^ self type = 'mRNA' +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> isReverseStrand [ ^ strand = (String with: $-) +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> isUnstranded [ ^ strand = (String with: $.) +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> name [ + + ^ self attributesDict at: 'Name' ifAbsent: [ nil ] +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> parentIds [ + "Answer an Array with the IDs listed in the Parent attribute, or an empty Array when the attribute is absent. A comma-separated (multi-valued) Parent is split into one trimmed element per ID." + | p | + p := self attributesDict at: 'Parent' ifAbsent: [ ^ #( ) ]. + p isString ifTrue: [ ^ ((p splitOn: ',') collect: [ :each | each trimBoth ]) asArray ]. + ^ p asArray +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> parseAttributes [ + "Answer a Dictionary built from the raw attributes string, read as semicolon-separated key=value pairs. Pairs without an = sign are ignored; a nil attributes string yields an empty Dictionary instead of failing." + | dict pairs | + dict := Dictionary new. + pairs := (attributes ifNil: [ '' ]) splitOn: ';'. + pairs do: [ :pair | + | eqIdx k v | + eqIdx := pair indexOf: $=. + eqIdx > 0 ifTrue: [ + k := (pair copyFrom: 1 to: eqIdx - 1) trimBoth. + v := (pair copyFrom: eqIdx + 1 to: pair size) trimBoth. + dict at: k put: v ] ]. + ^ dict +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> phase [ ^ phase +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> phase: aString [ phase := aString +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> score [ ^ score +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> score: aString [ score := aString +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> scoreFloat [ + "Answer the score converted with asFloat, or nil when the score is unset (nil) or is the single-dot placeholder." + ^ (score isNil or: [ score = (String with: $.) ]) 
+ ifTrue: [ nil ] + ifFalse: [ score asFloat ] +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> seqid [ ^ seqid +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> seqid: aString [ seqid := aString +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> source [ ^ source +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> source: aString [ source := aString +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> start [ ^ start +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> start: aString [ start := aString +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> startInteger [ + + ^ start ifNotNil: [ start asInteger ] +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> strand [ ^ strand +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> strand: aString [ strand := aString +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> type [ ^ type +] + +{ #category : 'as yet unclassified' } +BioGFF3Feature >> type: aString [ type := aString +] diff --git a/repository/BioParsers/BioGFF3FeatureLineNode.class.st b/repository/BioParsers/BioGFF3FeatureLineNode.class.st new file mode 100644 index 00000000..3642a6d8 --- /dev/null +++ b/repository/BioParsers/BioGFF3FeatureLineNode.class.st @@ -0,0 +1,138 @@ +Class { + #name : 'BioGFF3FeatureLineNode', + #superclass : 'BioGFF3GFF3FileNode', + #instVars : [ + 'seqid', + '_source', + 'type', + 'start', + 'end', + 'score', + 'strand', + 'phase', + '_attributes' + ], + #category : 'BioParsers-GFF3', + #package : 'BioParsers', + #tag : 'GFF3' +} + +{ #category : 'generated' } +BioGFF3FeatureLineNode >> _attributes [ + + ^ _attributes +] + +{ #category : 'generated' } +BioGFF3FeatureLineNode >> _attributes: aSmaCCToken [ + + _attributes := aSmaCCToken +] + +{ #category : 'generated' } +BioGFF3FeatureLineNode >> _source [ + + ^ _source +] + +{ #category : 'generated' } +BioGFF3FeatureLineNode >> _source: aSmaCCToken [ + + _source := 
aSmaCCToken +] + +{ #category : 'generated' } +BioGFF3FeatureLineNode >> acceptVisitor: aGFF3FileVisitor [ + + ^ aGFF3FileVisitor visitFeatureLine: self +] + +{ #category : 'generated' } +BioGFF3FeatureLineNode >> end [ + + ^ end +] + +{ #category : 'generated' } +BioGFF3FeatureLineNode >> end: aSmaCCToken [ + + end := aSmaCCToken +] + +{ #category : 'generated' } +BioGFF3FeatureLineNode >> phase [ + + ^ phase +] + +{ #category : 'generated' } +BioGFF3FeatureLineNode >> phase: aSmaCCToken [ + + phase := aSmaCCToken +] + +{ #category : 'generated' } +BioGFF3FeatureLineNode >> score [ + + ^ score +] + +{ #category : 'generated' } +BioGFF3FeatureLineNode >> score: aSmaCCToken [ + + score := aSmaCCToken +] + +{ #category : 'generated' } +BioGFF3FeatureLineNode >> seqid [ + + ^ seqid +] + +{ #category : 'generated' } +BioGFF3FeatureLineNode >> seqid: aSmaCCToken [ + + seqid := aSmaCCToken +] + +{ #category : 'generated' } +BioGFF3FeatureLineNode >> start [ + + ^ start +] + +{ #category : 'generated' } +BioGFF3FeatureLineNode >> start: aSmaCCToken [ + + start := aSmaCCToken +] + +{ #category : 'generated' } +BioGFF3FeatureLineNode >> strand [ + + ^ strand +] + +{ #category : 'generated' } +BioGFF3FeatureLineNode >> strand: aSmaCCToken [ + + strand := aSmaCCToken +] + +{ #category : 'generated' } +BioGFF3FeatureLineNode >> tokenVariables [ + + ^ #( #seqid #_source #type #start #end #score #strand #phase #_attributes ) +] + +{ #category : 'generated' } +BioGFF3FeatureLineNode >> type [ + + ^ type +] + +{ #category : 'generated' } +BioGFF3FeatureLineNode >> type: aSmaCCToken [ + + type := aSmaCCToken +] diff --git a/repository/BioParsers/BioGFF3FeatureListNode.class.st b/repository/BioParsers/BioGFF3FeatureListNode.class.st new file mode 100644 index 00000000..3ef3deee --- /dev/null +++ b/repository/BioParsers/BioGFF3FeatureListNode.class.st @@ -0,0 +1,58 @@ +Class { + #name : 'BioGFF3FeatureListNode', + #superclass : 'BioGFF3GFF3FileNode', + #instVars : [ + 'lines', + 
'_comments' + ], + #category : 'BioParsers-GFF3', + #package : 'BioParsers', + #tag : 'GFF3' +} + +{ #category : 'generated' } +BioGFF3FeatureListNode >> _comments [ + + ^ _comments +] + +{ #category : 'generated' } +BioGFF3FeatureListNode >> _comments: anOrderedCollection [ + + self setParents: self _comments to: nil. + _comments := anOrderedCollection. + self setParents: self _comments to: self +] + +{ #category : 'generated' } +BioGFF3FeatureListNode >> acceptVisitor: aGFF3FileVisitor [ + + ^ aGFF3FileVisitor visitFeatureList: self +] + +{ #category : 'generated' } +BioGFF3FeatureListNode >> compositeNodeVariables [ + + ^ #( #lines #_comments ) +] + +{ #category : 'generated-initialize-release' } +BioGFF3FeatureListNode >> initialize [ + super initialize. + lines := OrderedCollection new: 2. + _comments := OrderedCollection new: 2. +] + +{ #category : 'generated' } +BioGFF3FeatureListNode >> lines [ + + ^ lines +] + +{ #category : 'generated' } +BioGFF3FeatureListNode >> lines: anOrderedCollection [ + + self setParents: self lines to: nil. + lines := anOrderedCollection. + self setParents: self lines to: self +] diff --git a/repository/BioParsers/BioGFF3File.class.st b/repository/BioParsers/BioGFF3File.class.st new file mode 100644 index 00000000..12516c23 --- /dev/null +++ b/repository/BioParsers/BioGFF3File.class.st @@ -0,0 +1,272 @@ +Class { + #name : 'BioGFF3File', + #superclass : 'Object', + #instVars : [ + 'directives', + 'features', + 'sourceDirective', + 'gffVersion' + ], + #category : 'BioParsers-GFF3', + #package : 'BioParsers', + #tag : 'GFF3' +} + +{ #category : 'as yet unclassified' } +BioGFF3File class >> fromFile: aFilename [ + + ^ self fromString: aFilename asFileReference contents +] + +{ #category : 'as yet unclassified' } +BioGFF3File class >> fromString: aString [ + + | parser node input | + input := (aString endsWith: String lf) + ifTrue: [ aString ] + ifFalse: [ aString , String lf ]. + parser := BioGFF3Parser on: (ReadStream on: input). 
+ node := parser parse. + ^ self new fromGFF3FileNode: node +] + +{ #category : 'properties' } +BioGFF3File >> childrenOf: aFeature [ + + | parentId | + parentId := aFeature id ifNil: [ ^ #( ) ]. + ^ self features select: [ :f | f parentIds includes: parentId ] +] + +{ #category : 'querying' } +BioGFF3File >> derivesFromOf: aFeature [ + + | anId | + anId := aFeature id ifNil: [ ^ #( ) ]. + ^ self features select: [ :f | f derivesFrom = anId ] +] + +{ #category : 'accessing' } +BioGFF3File >> directives [ + + ^ directives + ifNil: [ directives := OrderedCollection new ] +] + +{ #category : 'accessing' } +BioGFF3File >> directives: aCollection [ + + directives := aCollection +] + +{ #category : 'querying' } +BioGFF3File >> featureCount [ + + ^ self features size +] + +{ #category : 'querying' } +BioGFF3File >> featureTypes [ + + ^ (self features collect: #type) asSet asSortedCollection asArray +] + +{ #category : 'querying' } +BioGFF3File >> featureWithId: anId [ + + ^ self features detect: [ :f | f id = anId ] ifNone: [ nil ] +] + +{ #category : 'accessing' } +BioGFF3File >> features [ + + ^ features + ifNil: [ features := OrderedCollection new ] +] + +{ #category : 'accessing' } +BioGFF3File >> features: aCollection [ + + features := aCollection +] + +{ #category : 'querying' } +BioGFF3File >> featuresWithSeqid: aSeqid [ + + ^ self features select: [ :f | f seqid = aSeqid ] +] + +{ #category : 'querying' } +BioGFF3File >> featuresWithType: aType [ + + ^ self features select: [ :f | f type = aType ] +] + +{ #category : 'instance creation' } +BioGFF3File >> fromFile: aFilename filteringSeqid: aSeqid [ + "Parse only features for a specific seqid/chromosome. Memory-efficient." + + | stream line tabSeqid | + features := OrderedCollection new. + tabSeqid := aSeqid , (String with: Character tab). + stream := aFilename asFileReference readStream. + [ stream atEnd ] whileFalse: [ + line := stream nextLine. 
+ line ifNotEmpty: [ + line first = $# ifFalse: [ + (line beginsWith: tabSeqid) ifTrue: [ + (self parseFeatureLine: line) ifNotNil: [ :feature | features add: feature ] ] ] ] ]. + stream close. + self features: features. + ^ self +] + +{ #category : 'instance creation' } +BioGFF3File >> fromFile: aFilename filteringTypes: typeSet [ + "Parse only features whose type is in typeSet. Lines that parseFeatureLine: rejects (it answers nil for fewer than 9 columns) are skipped rather than stored as nil. Memory-efficient for large files." + + | stream line | + features := OrderedCollection new. + stream := aFilename asFileReference readStream. + [ stream atEnd ] whileFalse: [ + line := stream nextLine. + line ifNotEmpty: [ + line first = $# ifFalse: [ + | fields type | + fields := line findTokens: String tab. + fields size >= 3 ifTrue: [ + type := fields at: 3. + (typeSet includes: type) ifTrue: [ + (self parseFeatureLine: line) ifNotNil: [ :feature | features add: feature ] ] ] ] ] ]. + stream close. + self features: features. + ^ self +] + +{ #category : 'instance creation' } +BioGFF3File >> fromFile: aFilename maxFeatures: maxCount [ + "Parse at most maxCount features. Useful for previews." + + | stream line count | + features := OrderedCollection new. + count := 0. + stream := aFilename asFileReference readStream. + [ stream atEnd or: [ count >= maxCount ] ] whileFalse: [ + line := stream nextLine. + line ifNotEmpty: [ + line first = $# ifFalse: [ + | feature | + feature := self parseFeatureLine: line. + feature ifNotNil: [ + features add: feature. + count := count + 1 ] ] ] ]. + stream close. + self features: features. + ^ self +] + +{ #category : 'instance creation' } +BioGFF3File >> fromGFF3FileNode: aNode [ + + | directiveTexts featuresList | + directiveTexts := aNode directives + ifNil: [ OrderedCollection new ] + ifNotNil: [ :dl | + dl directives + ifNil: [ OrderedCollection new ] + ifNotNil: [ :ds | + ds collect: [ :d | d text value ] ] ]. + self directives: directiveTexts. 
+ featuresList := aNode features + ifNil: [ OrderedCollection new ] + ifNotNil: [ :fl | + fl lines + ifNil: [ OrderedCollection new ] + ifNotNil: [ :lines | + lines collect: [ :line | + BioGFF3Feature fromFeatureLine: line ] ] ]. + self features: featuresList. + ^ self +] + +{ #category : 'accessing' } +BioGFF3File >> gffVersion [ + + gffVersion ifNil: [ + | directive | + directive := self directives + detect: [ :d | d beginsWith: '##gff-version' ] + ifNone: [ ^ nil ]. + gffVersion := (directive copyReplaceAll: '##gff-version' with: '') + trimBoth ]. + ^ gffVersion +] + +{ #category : 'querying' } +BioGFF3File >> groupByType [ + + | groups | + groups := Dictionary new. + self features do: [ :f | + (groups at: f type ifAbsentPut: [ OrderedCollection new ]) add: f ]. + ^ groups +] + +{ #category : 'parsing' } +BioGFF3File >> parseFeatureLine: aLine [ + + | fields | + fields := aLine findTokens: String tab. + fields size < 9 ifTrue: [ ^ nil ]. + ^ BioGFF3Feature new + seqid: fields first; + source: (fields at: 2); + type: (fields at: 3); + start: (fields at: 4); + end: (fields at: 5); + score: (fields at: 6); + strand: (fields at: 7); + phase: (fields at: 8); + attributes: (fields at: 9); + yourself +] + +{ #category : 'querying' } +BioGFF3File >> seqids [ + + ^ (self features collect: #seqid) asSet asSortedCollection asArray +] + +{ #category : 'accessing' } +BioGFF3File >> source [ + + ^ self directives + detect: [ :d | d beginsWith: '##source' ] + ifNone: [ nil ] +] + +{ #category : 'querying' } +BioGFF3File >> streamFeaturesFromFile: aFilename block: aBlock [ + "Evaluate aBlock for each feature parsed from file, without storing all in memory. + aBlock receives each BioGFF3Feature." + + | stream line count | + count := 0. + stream := aFilename asFileReference readStream. + [ stream atEnd ] whileFalse: [ + line := stream nextLine. 
+ line ifNotEmpty: [ + (line beginsWith: '##') + ifTrue: [ "skip directives" ] + ifFalse: [ + line first = $# + ifTrue: [ "skip comments" ] + ifFalse: [ + | feature | + feature := self parseFeatureLine: line. + feature ifNotNil: [ + aBlock value: feature. + count := count + 1 ] ] ] ] ]. + stream close. + ^ count +] diff --git a/repository/BioParsers/BioGFF3GFF3FileNode.class.st b/repository/BioParsers/BioGFF3GFF3FileNode.class.st new file mode 100644 index 00000000..2bcd8716 --- /dev/null +++ b/repository/BioParsers/BioGFF3GFF3FileNode.class.st @@ -0,0 +1,51 @@ +Class { + #name : 'BioGFF3GFF3FileNode', + #superclass : 'SmaCCParseNode', + #instVars : [ + 'directives', + 'features' + ], + #category : 'BioParsers-GFF3', + #package : 'BioParsers', + #tag : 'GFF3' +} + +{ #category : 'generated' } +BioGFF3GFF3FileNode >> acceptVisitor: aGFF3FileVisitor [ + + ^ aGFF3FileVisitor visitGFF3File: self +] + +{ #category : 'generated' } +BioGFF3GFF3FileNode >> directives [ + + ^ directives +] + +{ #category : 'generated' } +BioGFF3GFF3FileNode >> directives: aBioGFF3DirectiveListNode [ + + self directives notNil ifTrue: [ self directives parent: nil ]. + directives := aBioGFF3DirectiveListNode. + self directives notNil ifTrue: [ self directives parent: self ] +] + +{ #category : 'generated' } +BioGFF3GFF3FileNode >> features [ + + ^ features +] + +{ #category : 'generated' } +BioGFF3GFF3FileNode >> features: aBioGFF3FeatureListNode [ + + self features notNil ifTrue: [ self features parent: nil ]. + features := aBioGFF3FeatureListNode. 
+ self features notNil ifTrue: [ self features parent: self ] +] + +{ #category : 'generated' } +BioGFF3GFF3FileNode >> nodeVariables [ + + ^ #( #directives #features ) +] diff --git a/repository/BioParsers/BioGFF3GFF3FileNodeVisitor.class.st b/repository/BioParsers/BioGFF3GFF3FileNodeVisitor.class.st new file mode 100644 index 00000000..2038d900 --- /dev/null +++ b/repository/BioParsers/BioGFF3GFF3FileNodeVisitor.class.st @@ -0,0 +1,9 @@ +Class { + #name : 'BioGFF3GFF3FileNodeVisitor', + #superclass : 'Object', + #traits : 'TBioGFF3GFF3FileNodeVisitor', + #classTraits : 'TBioGFF3GFF3FileNodeVisitor classTrait', + #category : 'BioParsers-GFF3', + #package : 'BioParsers', + #tag : 'GFF3' +} diff --git a/repository/BioParsers/BioNCBIBlastSAXParser.class.st b/repository/BioParsers/BioNCBIBlastSAXParser.class.st index d136a377..516c8397 100644 --- a/repository/BioParsers/BioNCBIBlastSAXParser.class.st +++ b/repository/BioParsers/BioNCBIBlastSAXParser.class.st @@ -1,9 +1,9 @@ Class { #name : 'BioNCBIBlastSAXParser', #superclass : 'BioSAXParser', - #category : 'BioParsers-Core', + #category : 'BioParsers-XML', #package : 'BioParsers', - #tag : 'Core' + #tag : 'XML' } { #category : 'handling - content' } diff --git a/repository/BioParsers/BioNCBIBlastSAXTokenizer.class.st b/repository/BioParsers/BioNCBIBlastSAXTokenizer.class.st index f38a0dfe..bc56b797 100644 --- a/repository/BioParsers/BioNCBIBlastSAXTokenizer.class.st +++ b/repository/BioParsers/BioNCBIBlastSAXTokenizer.class.st @@ -1,9 +1,9 @@ Class { #name : 'BioNCBIBlastSAXTokenizer', #superclass : 'BioNCBIBlastSAXParser', - #category : 'BioParsers-Core', + #category : 'BioParsers-XML', #package : 'BioParsers', - #tag : 'Core' + #tag : 'XML' } { #category : 'handling - content' } diff --git a/repository/BioParsers/BioNCBIXMLBlastParser.class.st b/repository/BioParsers/BioNCBIXMLBlastParser.class.st index b9d71104..624f435f 100644 --- a/repository/BioParsers/BioNCBIXMLBlastParser.class.st +++ 
b/repository/BioParsers/BioNCBIXMLBlastParser.class.st @@ -1,9 +1,9 @@ Class { #name : 'BioNCBIXMLBlastParser', #superclass : 'BioXMLParser', - #category : 'BioParsers-XML', + #category : 'BioParsers-BLAST', #package : 'BioParsers', - #tag : 'XML' + #tag : 'BLAST' } { #category : 'initialize-release' } diff --git a/repository/BioParsers/BioParser.class.st b/repository/BioParsers/BioParser.class.st index b39429f5..4feb7ad4 100644 --- a/repository/BioParsers/BioParser.class.st +++ b/repository/BioParsers/BioParser.class.st @@ -106,6 +106,20 @@ BioParser class >> parseFullNcbiXmlGBSeq: aString [ ^ BioEntrezXMLGBSeqFullParser parse: aString ] +{ #category : 'parse-gff3' } +BioParser class >> parseGff3: aGffString [ + " Parse aGffString, the text of a GFF3 document, and answer the BioGFF3File built by BioGFF3File class >> fromString: " + + ^ BioGFF3File fromString: aGffString +] + +{ #category : 'parse-gff3' } +BioParser class >> parseGff3File: aGff3FilePath [ + " Read the GFF3 file at aGff3FilePath and answer a BioGFF3File. fromFile: is defined on the class side of BioGFF3File, so it is sent to the class and not to a new instance " + + ^ BioGFF3File fromFile: aGff3FilePath +] + { #category : 'parse-fasta' } BioParser class >> parseMultiFasta: aFastaString [ " Parser aFastaString representing a MultiFASTA sequence. 
diff --git a/repository/BioParsers/BioSAXParser.class.st b/repository/BioParsers/BioSAXParser.class.st index e7ab468d..4472ea5c 100644 --- a/repository/BioParsers/BioSAXParser.class.st +++ b/repository/BioParsers/BioSAXParser.class.st @@ -18,9 +18,9 @@ Class { 'selectedNodes', 'current' ], - #category : 'BioParsers-Core', + #category : 'BioParsers-XML', #package : 'BioParsers', - #tag : 'Core' + #tag : 'XML' } { #category : 'instance creation' } diff --git a/repository/BioParsers/TBioGFF3GFF3FileNodeVisitor.trait.st b/repository/BioParsers/TBioGFF3GFF3FileNodeVisitor.trait.st new file mode 100644 index 00000000..db1041bc --- /dev/null +++ b/repository/BioParsers/TBioGFF3GFF3FileNodeVisitor.trait.st @@ -0,0 +1,44 @@ +Trait { + #name : 'TBioGFF3GFF3FileNodeVisitor', + #traits : 'TSmaCCParseNodeVisitor', + #classTraits : 'TSmaCCParseNodeVisitor classTrait', + #category : 'BioParsers-GFF3', + #package : 'BioParsers', + #tag : 'GFF3' +} + +{ #category : 'generated' } +TBioGFF3GFF3FileNodeVisitor >> visitCommentRecord: aCommentRecord [ + + ^ self visitGFF3File: aCommentRecord +] + +{ #category : 'generated' } +TBioGFF3GFF3FileNodeVisitor >> visitDirective: aDirective [ + + ^ self visitGFF3File: aDirective +] + +{ #category : 'generated' } +TBioGFF3GFF3FileNodeVisitor >> visitDirectiveList: aDirectiveList [ + + ^ self visitGFF3File: aDirectiveList +] + +{ #category : 'generated' } +TBioGFF3GFF3FileNodeVisitor >> visitFeatureLine: aFeatureLine [ + + ^ self visitGFF3File: aFeatureLine +] + +{ #category : 'generated' } +TBioGFF3GFF3FileNodeVisitor >> visitFeatureList: aFeatureList [ + + ^ self visitGFF3File: aFeatureList +] + +{ #category : 'generated' } +TBioGFF3GFF3FileNodeVisitor >> visitGFF3File: aGFF3File [ + + ^ self visitSmaCCParseNode: aGFF3File +]