From cc11eebaecfbde5a4101d82784895e012e00e790 Mon Sep 17 00:00:00 2001 From: Hernan Morales Date: Tue, 28 Apr 2026 00:40:44 -0300 Subject: [PATCH] Add utility methods to BioSequence Add tests --- .../BioSequenceAnalysisTest.class.st | 285 +++++++++++ .../BioSequenceStringTest.class.st | 404 ++++++++++++++++ repository/BioTools/BioSequence.class.st | 448 +++++++++++++++++- 3 files changed, 1130 insertions(+), 7 deletions(-) create mode 100644 repository/BioTools-Tests/BioSequenceAnalysisTest.class.st create mode 100644 repository/BioTools-Tests/BioSequenceStringTest.class.st diff --git a/repository/BioTools-Tests/BioSequenceAnalysisTest.class.st b/repository/BioTools-Tests/BioSequenceAnalysisTest.class.st new file mode 100644 index 00000000..76ec3620 --- /dev/null +++ b/repository/BioTools-Tests/BioSequenceAnalysisTest.class.st @@ -0,0 +1,285 @@ +Class { + #name : 'BioSequenceAnalysisTest', + #superclass : 'BioAbstractTest', + #category : 'BioTools-Tests-Core', + #package : 'BioTools-Tests', + #tag : 'Core' +} + +{ #category : 'as yet unclassified' } +BioSequenceAnalysisTest >> dnaSequence: aString [ + + ^ BioSequence newDNA: aString +] + +{ #category : 'as yet unclassified' } +BioSequenceAnalysisTest >> newSeqRecord [ + + ^ BioSeqRecord new + sequence: (self dnaSequence: 'ATCGATCG'); + id: 'TEST001'; + yourself +] + +{ #category : 'as yet unclassified' } +BioSequenceAnalysisTest >> proteinSequence: aString [ + + ^ BioSequence newProtein: aString +] + +{ #category : 'as yet unclassified' } +BioSequenceAnalysisTest >> rnaSequence: aString [ + + ^ BioSequence newRNA: aString +] + +{ #category : 'as yet unclassified' } +BioSequenceAnalysisTest >> testATRatio [ + + | seq | + seq := self dnaSequence: 'AATT'. + self assert: seq atRatio equals: 1.0. + seq := self dnaSequence: 'GGCC'. + self assert: seq atRatio equals: 0.0 +] + +{ #category : 'as yet unclassified' } +BioSequenceAnalysisTest >> testAnnotationPreservesAll [ + + | rec | + rec := self newSeqRecord. 
+ rec + seqVersion: 1; + keywords: #( 'keyword1' ); + dates: #( '01-JAN-2024' ); + species: 'Homo sapiens'; + primaryAccession: 'NM_001'; + secondaryAccessions: #( 'NM_001.1' ); + division: 'PRI'; + taxonomy: #( 'Eukaryota' ). + self assert: rec seqVersion equals: 1. + self assert: rec keywords first equals: 'keyword1'. + self assert: rec dates first equals: '01-JAN-2024'. + self assert: rec species equals: 'Homo sapiens'. + self assert: rec primaryAccession equals: 'NM_001' +] + +{ #category : 'as yet unclassified' } +BioSequenceAnalysisTest >> testComposition [ + + | seq comp | + seq := self dnaSequence: 'ATCGATCG'. + comp := seq composition. + self assert: (comp at: $A) equals: 2. + self assert: (comp at: $T) equals: 2. + self assert: (comp at: $C) equals: 2. + self assert: (comp at: $G) equals: 2 +] + +{ #category : 'as yet unclassified' } +BioSequenceAnalysisTest >> testCompositionWithFrequencies [ + + | seq comp | + seq := self dnaSequence: 'AATT'. + comp := seq compositionWithFrequencies. + self assert: ((comp at: $A) at: #count) equals: 2. + self assert: ((comp at: $A) at: #frequency) equals: 0.5 +] + +{ #category : 'as yet unclassified' } +BioSequenceAnalysisTest >> testDates [ + + | rec | + rec := self newSeqRecord. + self assert: rec dates isEmpty. + rec dates: #( '01-JAN-2024' '15-FEB-2024' ). + self assert: rec dates size equals: 2 +] + +{ #category : 'as yet unclassified' } +BioSequenceAnalysisTest >> testDefinition [ + + | rec | + rec := self newSeqRecord. + rec definition: 'Test sequence for unit testing'. + self assert: rec definition equals: 'Test sequence for unit testing' +] + +{ #category : 'as yet unclassified' } +BioSequenceAnalysisTest >> testDivision [ + + | rec | + rec := self newSeqRecord. + rec division: 'PRI'. + self assert: rec division equals: 'PRI' +] + +{ #category : 'as yet unclassified' } +BioSequenceAnalysisTest >> testFeatureCount [ + + | rec | + rec := self newSeqRecord. + self assert: rec featureCount equals: 0. 
+ rec addSeqFeature: (BioSequenceFeature new primaryTag: 'gene'). + rec addSeqFeature: (BioSequenceFeature new primaryTag: 'CDS'). + self assert: rec featureCount equals: 2 +] + +{ #category : 'as yet unclassified' } +BioSequenceAnalysisTest >> testGCRatio [ + + | seq | + seq := self dnaSequence: 'GGCC'. + self assert: seq gcRatio equals: 1.0. + seq := self dnaSequence: 'AATT'. + self assert: seq gcRatio equals: 0.0. + seq := self dnaSequence: 'ATGC'. + self assert: seq gcRatio equals: 0.5 +] + +{ #category : 'as yet unclassified' } +BioSequenceAnalysisTest >> testKeywords [ + + | rec | + rec := self newSeqRecord. + self assert: rec keywords isEmpty. + rec keywords: #( 'hypothetical protein' 'complete cds' ). + self assert: rec keywords size equals: 2. + self assert: rec keywords first equals: 'hypothetical protein' +] + +{ #category : 'as yet unclassified' } +BioSequenceAnalysisTest >> testNucleotideComposition [ + + | seq comp | + seq := self dnaSequence: 'AATTTGGC'. + comp := seq nucleotideComposition. + self assert: (comp at: $A) equals: 2. + self assert: (comp at: $T) equals: 3. + self assert: (comp at: $G) equals: 2. + self assert: (comp at: $C) equals: 1 +] + +{ #category : 'as yet unclassified' } +BioSequenceAnalysisTest >> testNucleotideCompositionRNA [ + + | seq comp | + seq := self rnaSequence: 'AAUUUGGC'. + comp := seq nucleotideComposition. + self assert: (comp at: $A) equals: 2. + self assert: (comp at: $U) equals: 3. + self assert: (comp at: $G) equals: 2. + self assert: (comp at: $C) equals: 1 +] + +{ #category : 'as yet unclassified' } +BioSequenceAnalysisTest >> testOrganism [ + + | rec | + rec := self newSeqRecord. + rec organism: 'Drosophila melanogaster'. + self assert: rec organism equals: 'Drosophila melanogaster' +] + +{ #category : 'as yet unclassified' } +BioSequenceAnalysisTest >> testPrimaryAccession [ + + | rec | + rec := self newSeqRecord. + rec primaryAccession: 'NM_12345'. 
+ self assert: rec primaryAccession equals: 'NM_12345' +] + +{ #category : 'as yet unclassified' } +BioSequenceAnalysisTest >> testPrimaryAccessionDefaultsToId [ + + | rec | + rec := self newSeqRecord. + "When no accession set, should return id" + self assert: rec primaryAccession equals: 'TEST001' +] + +{ #category : 'as yet unclassified' } +BioSequenceAnalysisTest >> testSecondaryAccessions [ + + | rec | + rec := self newSeqRecord. + self assert: rec secondaryAccessions isEmpty. + rec secondaryAccessions: #( 'NM_12345.1' 'NM_12345.2' ). + self assert: rec secondaryAccessions size equals: 2 +] + +{ #category : 'as yet unclassified' } +BioSequenceAnalysisTest >> testSeqVersion [ + + | rec | + rec := self newSeqRecord. + self assert: rec seqVersion isNil. + rec seqVersion: 1. + self assert: rec seqVersion equals: 1. + rec seqVersion: 2. + self assert: rec seqVersion equals: 2 +] + +{ #category : 'as yet unclassified' } +BioSequenceAnalysisTest >> testSequenceTypeDNA [ + + | seq | + seq := self dnaSequence: 'ATCG'. + self assert: seq sequenceType equals: #DNA +] + +{ #category : 'as yet unclassified' } +BioSequenceAnalysisTest >> testSequenceTypeProtein [ + + | seq | + seq := self proteinSequence: 'MVLSP'. + self assert: seq sequenceType equals: #Protein +] + +{ #category : 'as yet unclassified' } +BioSequenceAnalysisTest >> testSequenceTypeRNA [ + + | seq | + seq := self rnaSequence: 'AUCG'. + self assert: seq sequenceType equals: #RNA +] + +{ #category : 'as yet unclassified' } +BioSequenceAnalysisTest >> testSpecies [ + + | rec | + rec := self newSeqRecord. + self assert: rec species isNil. + rec species: 'Homo sapiens'. + self assert: rec species equals: 'Homo sapiens' +] + +{ #category : 'as yet unclassified' } +BioSequenceAnalysisTest >> testSpeciesName [ + + | rec | + rec := self newSeqRecord. + rec speciesName: 'Mus musculus'. 
+ self assert: rec speciesName equals: 'Mus musculus' +] + +{ #category : 'as yet unclassified' } +BioSequenceAnalysisTest >> testSpeciesNameFromOrganism [ + + | rec | + rec := self newSeqRecord. + rec organism: 'Escherichia coli'. + self assert: rec speciesName equals: 'Escherichia coli' +] + +{ #category : 'as yet unclassified' } +BioSequenceAnalysisTest >> testTaxonomy [ + + | rec | + rec := self newSeqRecord. + rec taxonomy: + #( 'Eukaryota' 'Metazoa' 'Chordata' 'Craniata' 'Mammalia' ). + self assert: rec taxonomy size equals: 5. + self assert: rec taxonomy first equals: 'Eukaryota' +] diff --git a/repository/BioTools-Tests/BioSequenceStringTest.class.st b/repository/BioTools-Tests/BioSequenceStringTest.class.st new file mode 100644 index 00000000..e9007b25 --- /dev/null +++ b/repository/BioTools-Tests/BioSequenceStringTest.class.st @@ -0,0 +1,404 @@ +Class { + #name : 'BioSequenceStringTest', + #superclass : 'BioAbstractTest', + #category : 'BioTools-Tests-Core', + #package : 'BioTools-Tests', + #tag : 'Core' +} + +{ #category : 'as yet unclassified' } +BioSequenceStringTest >> dnaSequence: aString [ + + ^ BioSequence newDNA: aString +] + +{ #category : 'as yet unclassified' } +BioSequenceStringTest >> testBeginsWith [ + + | seq | + seq := self dnaSequence: 'ATCGATCG'. + self assert: (seq beginsWith: 'ATCG'). + self deny: (seq beginsWith: 'TCGA'). + self assert: (seq beginsWith: ''). + self deny: (seq beginsWith: 'ATCGATCGATCG') +] + +{ #category : 'as yet unclassified' } +BioSequenceStringTest >> testCountSubstring [ + + | seq | + seq := self dnaSequence: 'ATCGATCGATCG'. + self assert: (seq countSubstring: 'ATCG') equals: 3. + self assert: (seq countSubstring: 'GAT') equals: 2. + self assert: (seq countSubstring: 'TTTT') equals: 0 +] + +{ #category : 'as yet unclassified' } +BioSequenceStringTest >> testCountSubstringOverlapping [ + + | seq | + seq := self dnaSequence: 'AAAA'. + self assert: (seq countSubstring: 'AA' overlapping: false) equals: 2. 
+ self assert: (seq countSubstring: 'AA' overlapping: true) equals: 3 +] + +{ #category : 'as yet unclassified' } +BioSequenceStringTest >> testDropFirst [ + + | seq sub | + seq := self dnaSequence: 'ATCGATCG'. + sub := seq dropFirst: 4. + self assert: sub sequence equals: 'ATCG' +] + +{ #category : 'as yet unclassified' } +BioSequenceStringTest >> testDropLast [ + + | seq sub | + seq := self dnaSequence: 'ATCGATCG'. + sub := seq dropLast: 4. + self assert: sub sequence equals: 'ATCG' +] + +{ #category : 'as yet unclassified' } +BioSequenceStringTest >> testEndsWith [ + + | seq | + seq := self dnaSequence: 'ATCGATCG'. + self assert: (seq endsWith: 'ATCG'). + self deny: (seq endsWith: 'CGAT'). + self assert: (seq endsWith: ''). + self deny: (seq endsWith: 'GATCGATCG') +] + +{ #category : 'as yet unclassified' } +BioSequenceStringTest >> testFirstN [ + + | seq sub | + seq := self dnaSequence: 'ATCGATCG'. + sub := seq first: 4. + self assert: sub sequence equals: 'ATCG' +] + +{ #category : 'as yet unclassified' } +BioSequenceStringTest >> testFromTo [ + + | seq sub | + seq := self dnaSequence: 'ATCGATCG'. + sub := seq from: 2 to: 5. + self assert: sub sequence equals: 'TCGA' +] + +{ #category : 'as yet unclassified' } +BioSequenceStringTest >> testIncludesAnyOf [ + + | seq | + seq := self dnaSequence: 'ATCG'. + self assert: (seq includesAnyOf: 'XYZA'). + self deny: (seq includesAnyOf: 'N'). + self deny: (seq includesAnyOf: 'XYZ') +] + +{ #category : 'as yet unclassified' } +BioSequenceStringTest >> testIncludesSubstring [ + + | seq | + seq := self dnaSequence: 'ATCGATCG'. + self assert: (seq includesSubstring: 'ATCG'). + self assert: (seq includesSubstring: 'CGA'). + self deny: (seq includesSubstring: 'TTTT') +] + +{ #category : 'as yet unclassified' } +BioSequenceStringTest >> testIndexOf [ + + | seq | + seq := self dnaSequence: 'ATCGATCGATCG'. + self assert: (seq indexOf: 'ATCG') equals: 1. + self assert: (seq indexOf: 'GAT') equals: 4. 
+ self assert: (seq indexOf: 'XXX') equals: 0 +] + +{ #category : 'as yet unclassified' } +BioSequenceStringTest >> testIndexOfOrFail [ + + | seq | + seq := self dnaSequence: 'ATCGATCG'. + self assert: (seq indexOfOrFail: 'CGA') equals: 3. + self should: [ seq indexOfOrFail: 'XXX' ] raise: NotFound +] + +{ #category : 'as yet unclassified' } +BioSequenceStringTest >> testIndexOfStartingAt [ + + | seq | + seq := self dnaSequence: 'ATCGATCGATCG'. + self assert: (seq indexOf: 'ATCG' startingAt: 1) equals: 1. + self assert: (seq indexOf: 'ATCG' startingAt: 2) equals: 5. + self assert: (seq indexOf: 'ATCG' startingAt: 6) equals: 9. + self assert: (seq indexOf: 'ATCG' startingAt: 10) equals: 0 +] + +{ #category : 'as yet unclassified' } +BioSequenceStringTest >> testIndexOfStartingAtEndingAt [ + + | seq | + seq := self dnaSequence: 'ATCGATCGATCG'. + self assert: (seq indexOf: 'GAT' startingAt: 1 endingAt: 6) equals: 4. + self + assert: (seq indexOf: 'ATCG' startingAt: 1 endingAt: 4) + equals: 1. + self + assert: (seq indexOf: 'ATCG' startingAt: 5 endingAt: 8) + equals: 5. + self + assert: (seq indexOf: 'XXXX' startingAt: 1 endingAt: 12) + equals: 0 +] + +{ #category : 'as yet unclassified' } +BioSequenceStringTest >> testIsAllLowercase [ + + | seq | + seq := self dnaSequence: 'atcg'. + self assert: seq isAllLowercase. + seq := self dnaSequence: 'ATcg'. + self deny: seq isAllLowercase +] + +{ #category : 'as yet unclassified' } +BioSequenceStringTest >> testIsAllUppercase [ + + | seq | + seq := self dnaSequence: 'ATCG'. + self assert: seq isAllUppercase. + seq := self dnaSequence: 'ATcg'. + self deny: seq isAllUppercase +] + +{ #category : 'as yet unclassified' } +BioSequenceStringTest >> testJoin [ + + | seqs joined | + seqs := { + (self dnaSequence: 'AT'). + (self dnaSequence: 'CG'). + (self dnaSequence: 'TA') }. + joined := '' join: (seqs collect: #sequence). 
+ self assert: joined equals: 'ATCGTA' +] + +{ #category : 'as yet unclassified' } +BioSequenceStringTest >> testJoinWithStrings [ + + | joined | + joined := '' join: #( 'AT' 'CG' 'TA' ). + self assert: joined equals: 'ATCGTA' +] + +{ #category : 'as yet unclassified' } +BioSequenceStringTest >> testLastIndexOf [ + + | seq | + seq := self dnaSequence: 'ATCGATCGATCG'. + self assert: (seq lastIndexOf: 'ATCG') equals: 9. + self assert: (seq lastIndexOf: 'GAT') equals: 8. + self assert: (seq lastIndexOf: 'XXX') equals: 0 +] + +{ #category : 'as yet unclassified' } +BioSequenceStringTest >> testLastIndexOfOrFail [ + + | seq | + seq := self dnaSequence: 'ATCGATCG'. + self assert: (seq lastIndexOfOrFail: 'ATC') equals: 5. + self should: [ seq lastIndexOfOrFail: 'XXX' ] raise: NotFound +] + +{ #category : 'as yet unclassified' } +BioSequenceStringTest >> testLastIndexOfStartingAt [ + + | seq | + seq := self dnaSequence: 'ATCGATCGATCG'. + self assert: (seq lastIndexOf: 'ATCG' startingAt: 12) equals: 9. + self assert: (seq lastIndexOf: 'ATCG' startingAt: 8) equals: 5. + self assert: (seq lastIndexOf: 'ATCG' startingAt: 4) equals: 1 +] + +{ #category : 'as yet unclassified' } +BioSequenceStringTest >> testLastN [ + + | seq sub | + seq := self dnaSequence: 'ATCGATCG'. + sub := seq last: 4. + self assert: sub sequence equals: 'ATCG' +] + +{ #category : 'as yet unclassified' } +BioSequenceStringTest >> testPadLeft [ + + | seq padded | + seq := self dnaSequence: 'ATCG'. + padded := seq padLeft: $- to: 8. + self assert: padded sequence equals: '----ATCG' +] + +{ #category : 'as yet unclassified' } +BioSequenceStringTest >> testPadLeftNoPaddingNeeded [ + + | seq padded | + seq := self dnaSequence: 'ATCGATCG'. + padded := seq padLeft: $- to: 4. + self assert: padded sequence equals: 'ATCGATCG' +] + +{ #category : 'as yet unclassified' } +BioSequenceStringTest >> testPadRight [ + + | seq padded | + seq := self dnaSequence: 'ATCG'. + padded := seq padRight: $- to: 8. 
+ self assert: padded sequence equals: 'ATCG----' +] + +{ #category : 'as yet unclassified' } +BioSequenceStringTest >> testRemovePrefix [ + + | seq result | + seq := self dnaSequence: 'PREFIXATCG'. + result := seq removePrefix: 'PREFIX'. + self assert: result sequence equals: 'ATCG' +] + +{ #category : 'as yet unclassified' } +BioSequenceStringTest >> testRemovePrefixNotPresent [ + + | seq result | + seq := self dnaSequence: 'ATCG'. + result := seq removePrefix: 'XXX'. + self assert: result sequence equals: 'ATCG' +] + +{ #category : 'as yet unclassified' } +BioSequenceStringTest >> testRemoveSuffix [ + + | seq result | + seq := self dnaSequence: 'ATCGSUFFIX'. + result := seq removeSuffix: 'SUFFIX'. + self assert: result sequence equals: 'ATCG' +] + +{ #category : 'as yet unclassified' } +BioSequenceStringTest >> testRemoveSuffixNotPresent [ + + | seq result | + seq := self dnaSequence: 'ATCG'. + result := seq removeSuffix: 'XXX'. + self assert: result sequence equals: 'ATCG' +] + +{ #category : 'as yet unclassified' } +BioSequenceStringTest >> testReplaceAllOccurrences [ + + | seq replaced | + seq := self dnaSequence: 'ATCGATCG'. + replaced := seq replace: 'ATCG' with: 'AAAA'. + self assert: replaced sequence equals: 'AAAAAAAA' +] + +{ #category : 'as yet unclassified' } +BioSequenceStringTest >> testReplaceWith [ + + | seq replaced | + seq := self dnaSequence: 'ATCGATCG'. + replaced := seq replace: 'ATC' with: 'TTT'. + self assert: replaced sequence equals: 'TTTGTTTG' +] + +{ #category : 'as yet unclassified' } +BioSequenceStringTest >> testReversed [ + + | seq rev | + seq := self dnaSequence: 'ATCG'. + rev := seq reversed. + self assert: rev sequence equals: 'GCTA' +] + +{ #category : 'as yet unclassified' } +BioSequenceStringTest >> testSplitOn [ + + | seq parts | + seq := self dnaSequence: 'ATCG-ATCG-ATCG'. + parts := seq splitOn: '-'. + self assert: parts size equals: 3. + self assert: parts first sequence equals: 'ATCG'. 
+ self assert: parts second sequence equals: 'ATCG'. + self assert: parts last sequence equals: 'ATCG' +] + +{ #category : 'as yet unclassified' } +BioSequenceStringTest >> testSplitOnEmpty [ + + | seq parts | + seq := self dnaSequence: ''. + parts := seq splitOn: '-'. + self assert: parts size equals: 1. + self assert: parts first sequence equals: '' +] + +{ #category : 'as yet unclassified' } +BioSequenceStringTest >> testSplitOnMaxChunks [ + + | seq parts | + seq := self dnaSequence: 'AT-CG-AT-CG'. + parts := seq splitOn: '-' maxChunks: 3. + self assert: parts size equals: 3. + self assert: parts first sequence equals: 'AT'. + self assert: parts second sequence equals: 'CG'. + self assert: parts last sequence equals: 'AT-CG' +] + +{ #category : 'as yet unclassified' } +BioSequenceStringTest >> testTrim [ + + | seq trimmed | + seq := self dnaSequence: ' ATCG '. + trimmed := seq trim. + self assert: trimmed sequence equals: 'ATCG' +] + +{ #category : 'as yet unclassified' } +BioSequenceStringTest >> testTrimChars [ + + | seq trimmed | + seq := self dnaSequence: 'NNATCGNN'. + trimmed := seq trim: 'N'. + self assert: trimmed sequence equals: 'ATCG' +] + +{ #category : 'as yet unclassified' } +BioSequenceStringTest >> testTrimLeft [ + + | seq trimmed | + seq := self dnaSequence: ' ATCG '. + trimmed := seq trimLeft. + self assert: trimmed sequence equals: 'ATCG ' +] + +{ #category : 'as yet unclassified' } +BioSequenceStringTest >> testTrimNoWhitespace [ + + | seq trimmed | + seq := self dnaSequence: 'ATCG'. + trimmed := seq trim. + self assert: trimmed sequence equals: 'ATCG' +] + +{ #category : 'as yet unclassified' } +BioSequenceStringTest >> testTrimRight [ + + | seq trimmed | + seq := self dnaSequence: ' ATCG '. + trimmed := seq trimRight. 
+ self assert: trimmed sequence equals: ' ATCG' +] diff --git a/repository/BioTools/BioSequence.class.st b/repository/BioTools/BioSequence.class.st index c09433c3..4e546182 100644 --- a/repository/BioTools/BioSequence.class.st +++ b/repository/BioTools/BioSequence.class.st @@ -71,6 +71,18 @@ BioSequence class >> fromUmambiguousRNASequences: aCollection [ ^ aCollection collect: [ : seqString | self newUnambiguousRNA: seqString ] ] +{ #category : 'splitjoin' } +BioSequence class >> join: aCollection [ + "Join a collection of BioSequences (or strings) into one sequence." + + | strings | + strings := aCollection collect: [ :each | + each isBioSequence + ifTrue: [ each sequence ] + ifFalse: [ each asString ] ]. + ^ self new initializeWith: ('' join: strings) +] + { #category : 'convenience' } BioSequence class >> joinSequence: aCollection [ " Private - Answer a String with the sequence ensambled from aCollection. @@ -480,6 +492,13 @@ BioSequence >> at: anInteger put: aLetter [ self signalInvalidObject: 'Sequence is read-only. Use #asMutable to enable modifications it then #asSequence' ] +{ #category : 'accessing' } +BioSequence >> atRatio [ + "Answer the complement of #gcRatio (1.0 - gcRatio). Answer 0.0 instead of 1.0 when the receiver contains none of the letters A, T, U, G or C (e.g. an empty sequence)." + + ^ (#( $A $T $U $G $C ) anySatisfy: [ :base | (self occurrencesOf: base) > 0 ]) ifTrue: [ 1.0 - self gcRatio ] ifFalse: [ 0.0 ] +] + { #category : 'accesing public - protein synthesis' } BioSequence >> backTranscribe [ " Answer a new instance of the receiver's with the receiver's sequence transcribed to its corresponding DNA, adjusting the alphabet " @@ -501,6 +520,13 @@ BioSequence >> backTranscription [ ^ backTranscript ] +{ #category : 'testing' } +BioSequence >> beginsWith: aPrefix [ + "Answer true if the receiver's sequence begins with aPrefix." 
+ + ^ self sequence beginsWith: aPrefix +] + { #category : 'accessing' } BioSequence >> bioConsensusFor: aBioAlignment [ " Answer a representing a consensus base for the receiver " @@ -589,6 +615,29 @@ BioSequence >> complementaryAt: aCharacter [ ^ self alphabet complementaryTable at: aCharacter ] +{ #category : 'accessing - utilities' } +BioSequence >> composition [ + "Answer a Dictionary with nucleotide/amino acid composition. + Keys are letters, values are counts." + + ^ self occurrencesOfLetters +] + +{ #category : 'accessing - utilities' } +BioSequence >> compositionWithFrequencies [ + "Answer a Dictionary with composition including frequencies." + + | counts total result | + counts := self occurrencesOfLetters. + total := self size. + result := Dictionary new. + counts keysAndValuesDo: [ :key :value | + result at: key put: (Dictionary + with: #count -> value + with: #frequency -> (value / total asFloat)) ]. + ^ result +] + { #category : 'accessing' } BioSequence >> contents [ " Compatibility with #dumpToFileNamed: . Answer the receiver's sequence " @@ -637,6 +686,43 @@ BioSequence >> copyTo: stopInteger [ ] +{ #category : 'accessing - utilities' } +BioSequence >> countSubstring: aSubstring [ + "Answer the number of non-overlapping occurrences of aSubstring." + + | count pos start | + count := 0. + start := 1. + [ + pos := self sequence + indexOfSubCollection: aSubstring + startingAt: start. + pos > 0 ] whileTrue: [ + count := count + 1. + start := pos + aSubstring size ]. + ^ count +] + +{ #category : 'accessing - utilities' } +BioSequence >> countSubstring: aSubstring overlapping: aBoolean [ + "Count occurrences. If overlapping is true, count overlapping matches." + + | count pos start increment | + count := 0. + start := 1. + increment := aBoolean + ifTrue: [ 1 ] + ifFalse: [ aSubstring size ]. + [ + pos := self sequence + indexOfSubCollection: aSubstring + startingAt: start. + pos > 0 ] whileTrue: [ + count := count + 1. + start := pos + increment ]. 
+ ^ count +] + { #category : 'accessing - checksum' } BioSequence >> crc32 [ "Answer a with the CRC checksum (edundancy check) for the receiver's sequence" @@ -776,6 +862,31 @@ BioSequence >> do: aClosure [ ] +{ #category : 'accessing - utilities' } +BioSequence >> dropFirst: n [ + "Answer a new BioSequence without the first n characters." + + ^ self class new + initializeWith: (self sequence allButFirst: n) + alphabet: self alphabet +] + +{ #category : 'accessing - utilities' } +BioSequence >> dropLast: n [ + "Answer a new BioSequence without the last n characters." + + ^ self class new + initializeWith: (self sequence allButLast: n) + alphabet: self alphabet +] + +{ #category : 'testing' } +BioSequence >> endsWith: aSuffix [ + "Answer true if the receiver's sequence ends with aSuffix." + + ^ self sequence endsWith: aSuffix +] + { #category : 'copying' } BioSequence >> findHotspots [ " Answer a of the receiver's substrings tokenized by hostspots : [ ] " @@ -785,6 +896,24 @@ BioSequence >> findHotspots [ ] +{ #category : 'accessing' } +BioSequence >> first: n [ + "Answer a new BioSequence with the first n characters." + + ^ self class new + initializeWith: (self sequence first: n) + alphabet: self alphabet +] + +{ #category : 'instance creation' } +BioSequence >> from: startIndex to: endIndex [ + "Answer a new BioSequence with characters from startIndex to endIndex." + + ^ self class new + initializeWith: (self sequence copyFrom: startIndex to: endIndex) + alphabet: self alphabet +] + { #category : 'accessing' } BioSequence >> from: start to: stop do: aClosure [ " Evaluate aBlock for all elements between start and stop (inclusive). " @@ -866,6 +995,18 @@ BioSequence >> gcContentUppercased [ ] +{ #category : 'accessing - utilities' } +BioSequence >> gcRatio [ + "Answer the GC ratio as a fraction: (G + C) / (A + T + U + G + C), so that U is counted for RNA. Answer 0.0 when none of those letters occur." + + | comp gc at | + comp := self occurrencesOfLetters. + gc := (comp at: $G ifAbsent: [ 0 ]) + (comp at: $C ifAbsent: [ 0 ]). 
+ at := (comp at: $A ifAbsent: [ 0 ]) + (comp at: $T ifAbsent: [ 0 ]) + (comp at: $U ifAbsent: [ 0 ]). + at + gc = 0 ifTrue: [ ^ 0.0 ]. + ^ gc / (at + gc) asFloat +] + { #category : 'accessing' } BioSequence >> gcSkew [ " Answer a with ratios . Calculate receiver's GC skew (G-C)/(G+C) for windows of size wLength. @@ -973,6 +1114,61 @@ BioSequence >> hotspotRegionsLeft: leftSize right: rightSize [ ^ self findHotspots tripleSelect: [ : a : b : c | a size >= leftSize and: [ c size >= rightSize ] ]. ] +{ #category : 'testing' } +BioSequence >> includesAnyOf: aCollection [ + "Answer true if the receiver contains any character from aCollection." + + ^ self sequence includesAnyOf: aCollection +] + +{ #category : 'testing' } +BioSequence >> includesSubstring: aSubstring [ + "Answer true if the receiver contains aSubstring." + + ^ (self indexOf: aSubstring) > 0 +] + +{ #category : 'accessing' } +BioSequence >> indexOf: aSubstring [ + "Answer the index of the first occurrence of aSubstring in the receiver. + Answer 0 if not found. Uses 1-based indexing." + + ^ self sequence indexOfSubCollection: aSubstring startingAt: 1 +] + +{ #category : 'accessing' } +BioSequence >> indexOf: aSubstring startingAt: startIndex [ + "Answer the index of the first occurrence of aSubstring at or after startIndex. + Answer 0 if not found." + + ^ self sequence + indexOfSubCollection: aSubstring + startingAt: startIndex +] + +{ #category : 'accessing' } +BioSequence >> indexOf: aSubstring startingAt: startIndex endingAt: endIndex [ + "Answer the index of the first occurrence of aSubstring within the range [startIndex, endIndex]. + Answer 0 if not found. Returns 1-based index in the original sequence." + + | subSeq idx | + subSeq := self sequence copyFrom: startIndex to: endIndex. + idx := subSeq indexOfSubCollection: aSubstring startingAt: 1. + idx = 0 ifTrue: [ ^ 0 ]. + ^ idx + startIndex - 1 +] + +{ #category : 'accessing' } +BioSequence >> indexOfOrFail: aSubstring [ + "Answer the index of the first occurrence of aSubstring. + Raise NotFound signal if not found." + + | idx | + idx := self indexOf: aSubstring. 
+ Raise NotFound signal if not found." + + | idx | + idx := self indexOf: aSubstring. + idx = 0 ifTrue: [ NotFound signalFor: aSubstring ]. + ^ idx +] + { #category : 'accessing' } BioSequence >> indicesOfSubsequence: aBioSequence [ " See comment in #indicesOfSubstring: " @@ -1009,6 +1205,24 @@ BioSequence >> initializeWith: aString alphabet: anAlphabet [ alphabet := anAlphabet ] +{ #category : 'testing' } +BioSequence >> isAllLowercase [ + "Answer true if all alphabetic characters are lowercase." + + self sequence do: [ :c | + c isLetter ifTrue: [ c isLowercase ifFalse: [ ^ false ] ] ]. + ^ true +] + +{ #category : 'testing' } +BioSequence >> isAllUppercase [ + "Answer true if all alphabetic characters are uppercase." + + self sequence do: [ :c | + c isLetter ifTrue: [ c isUppercase ifFalse: [ ^ false ] ] ]. + ^ true +] + { #category : 'testing' } BioSequence >> isBioSequence [ "Answer whether the receiver represents a Biological sequence." @@ -1135,6 +1349,50 @@ BioSequence >> kmersCount: patString mismatches: d [ ^ (self sequence indicesOfSubstring: patString mismatches: d) size ] +{ #category : 'accessing' } +BioSequence >> last: n [ + "Answer a new BioSequence with the last n characters." + + ^ self class new + initializeWith: (self sequence last: n) + alphabet: self alphabet +] + +{ #category : 'accessing' } +BioSequence >> lastIndexOf: aSubstring [ + "Answer the index of the last occurrence of aSubstring in the receiver. + Answer 0 if not found." + + ^ self sequence findLastOccurrenceOfString: aSubstring startingAt: 1 +] + +{ #category : 'accessing' } +BioSequence >> lastIndexOf: aSubstring startingAt: startIndex [ + "Answer the index of the last occurrence of aSubstring at or before startIndex." + + | result lastPos | + result := 0. + lastPos := 0. + [ + lastPos := self sequence + findString: aSubstring + startingAt: lastPos + 1. + lastPos > 0 and: [ lastPos <= startIndex ] ] whileTrue: [ + result := lastPos ]. 
+ ^ result +] + +{ #category : 'accessing' } +BioSequence >> lastIndexOfOrFail: aSubstring [ + "Answer the index of the last occurrence of aSubstring. + Raise NotFound signal if not found." + + | idx | + idx := self lastIndexOf: aSubstring. + idx = 0 ifTrue: [ NotFound signalFor: aSubstring ]. + ^ idx +] + { #category : 'accessing' } BioSequence >> lcc [ "Answer a of with the Local Composition Complexity (LCC) value for the receiver. Assume the receiver is unambiguous sequence " @@ -1484,6 +1742,37 @@ BioSequence >> notEmpty [ ^ seq notEmpty ] +{ #category : 'accessing - utilities' } +BioSequence >> nucleotideComposition [ + "Answer nucleotide composition for DNA/RNA sequences. + Returns Dictionary with A, T/U, G, C counts and 'other' for non-standard." + + | comp a t u g c other | + self isProteinSequence ifTrue: [ + self error: + 'nucleotideComposition not applicable to protein sequences' ]. + comp := self occurrencesOfLetters. + a := comp at: $A ifAbsent: [ 0 ]. + g := comp at: $G ifAbsent: [ 0 ]. + c := comp at: $C ifAbsent: [ 0 ]. + self isRNASequence + ifTrue: [ + t := 0. + u := comp at: $U ifAbsent: [ 0 ] ] + ifFalse: [ + t := comp at: $T ifAbsent: [ 0 ]. + u := comp at: $U ifAbsent: [ 0 ] ]. + other := self size - a - t - u - g - c. + ^ Dictionary new + at: $A put: a; + at: $T put: t; + at: $U put: u; + at: $G put: g; + at: $C put: c; + at: 'other' put: other; + yourself +] + { #category : 'accessing - frequencies' } BioSequence >> occurrencesOf: aCharacter [ " Answer how many of the receiver's elements are equal to aLetter " @@ -1513,6 +1802,30 @@ BioSequence >> oligonucleotideFrequency [ ^ self kmerFrequencies: 1 ] +{ #category : 'accessing - utilities' } +BioSequence >> padLeft: padChar to: targetLength [ + "Pad the sequence on the left with padChar to reach targetLength." + + | padding | + self size >= targetLength ifTrue: [ ^ self copy ]. + padding := String new: targetLength - self size withAll: padChar. 
+ ^ self class new + initializeWith: padding , self sequence + alphabet: self alphabet +] + +{ #category : 'accessing - utilities' } +BioSequence >> padRight: padChar to: targetLength [ + "Answer a new BioSequence padded on the right with padChar up to targetLength; answer a copy of the receiver when it is already at least that long." + + | padding | + self size >= targetLength ifTrue: [ ^ self copy ]. + padding := String new: targetLength - self size withAll: padChar. + ^ self class new + initializeWith: self sequence , padding + alphabet: self alphabet +] + { #category : 'accessing' } BioSequence >> positionsOf: aCharacterOrString [ " Answer a Collection with the positions of aminoacidLetter in the receiver's sequence " @@ -1563,6 +1876,19 @@ BioSequence >> randomLength: size for: anAlphabetClass [ nextPutAll: b asString ] ]. ] +{ #category : 'removing' } +BioSequence >> removePrefix: aPrefix [ + "Answer a new BioSequence without aPrefix when the receiver's sequence begins with it. + Otherwise answer a copy of the receiver. The receiver itself is never modified." + + (self beginsWith: aPrefix) ifTrue: [ + ^ self class new + initializeWith: + (self sequence copyFrom: aPrefix size + 1 to: self size) + alphabet: self alphabet ]. + ^ self copy +] + { #category : 'accessing - sequence record' } BioSequence >> removeSeqFeature: aBioSequenceFeature [ " Remove aBioSequenceFeature from the receiver " @@ -1570,6 +1896,28 @@ BioSequence >> removeSeqFeature: aBioSequenceFeature [ self sequenceFeatures remove: aBioSequenceFeature ] +{ #category : 'removing' } +BioSequence >> removeSuffix: aSuffix [ + "Answer a new BioSequence without aSuffix when the receiver's sequence ends with it; otherwise answer a copy of the receiver." + + (self endsWith: aSuffix) ifTrue: [ + ^ self class new + initializeWith: + (self sequence copyFrom: 1 to: self size - aSuffix size) + alphabet: self alphabet ]. + ^ self copy +] + +{ #category : 'transforming' } +BioSequence >> replace: oldSubstring with: newSubstring [ + "Answer a new BioSequence with all occurrences of oldSubstring replaced by newSubstring." 
+ + ^ self class new + initializeWith: + (self sequence copyReplaceAll: oldSubstring with: newSubstring) + alphabet: self alphabet +] + { #category : 'accessing - restriction' } BioSequence >> resolveEnzyme: anEnzymeOrName [ @@ -1602,9 +1950,12 @@ BioSequence >> reverseComplement [ { #category : 'accesing public - protein synthesis' } BioSequence >> reversed [ - " Answer a copy of the receiver with element order reversed " - - ^ self newPrototypeWith: seq reversed + "Answer a new BioSequence with characters in reverse order. + Note: This is NOT reverse-complement, just reverse." + + ^ self class new + initializeWith: self sequence reversed + alphabet: self alphabet ] { #category : 'accessing - checksum' } @@ -1660,6 +2011,16 @@ BioSequence >> sequenceRecord: anObject [ sequenceRecord := anObject ] +{ #category : 'accessing - utilities' } +BioSequence >> sequenceType [ + "Answer the sequence type as a Symbol: #DNA, #RNA or #Protein (tested in that order), or #Unknown when none of the three type tests succeeds." + + self isDNASequence ifTrue: [ ^ #DNA ]. + self isRNASequence ifTrue: [ ^ #RNA ]. + self isProteinSequence ifTrue: [ ^ #Protein ]. + ^ #Unknown +] + { #category : 'accessing' } BioSequence >> size [ " Answer how many symbols the receiver contains " @@ -1687,6 +2048,45 @@ BioSequence >> splitByCodons [ ^ self sequence findTokens: $* ] +{ #category : 'splitjoin' } +BioSequence >> splitOn: aSeparator [ + "Answer a collection of new sequences, one per piece obtained by splitting the receiver's sequence string on aSeparator; each piece keeps the receiver's alphabet." + + ^ (self sequence splitOn: aSeparator) collect: [ :s | + self class new initializeWith: s alphabet: self alphabet ] +] + +{ #category : 'enumerating' } +BioSequence >> splitOn: aSeparator indicesDo: aBlock [ + "Forward aSeparator and aBlock to the sequence string's splitOn:indicesDo:. NOTE(review): in Pharo that selector appears to pass the start and end index of each piece to aBlock rather than separator positions - confirm." + + self sequence splitOn: aSeparator indicesDo: aBlock +] + +{ #category : 'accessing - utilities' } +BioSequence >> splitOn: aSeparator maxChunks: maxChunks [ + "Split into at most maxChunks parts using aSeparator; the last part carries the remainder unsplit and at least one part is always answered. Answer an OrderedCollection of new sequences sharing the receiver's alphabet." + + | parts remaining start pos | + parts := OrderedCollection new. + remaining := self sequence. 
+ start := 1. + [ parts size < (maxChunks - 1) ] whileTrue: [ + pos := remaining indexOfSubCollection: aSeparator startingAt: start. + pos = 0 ifTrue: [ + parts add: remaining. + ^ parts collect: [ :s | + self class new initializeWith: s alphabet: self alphabet ] ]. + parts add: (remaining copyFrom: 1 to: pos - 1). + remaining := remaining + copyFrom: pos + aSeparator size + to: remaining size. + start := 1 ]. + parts add: remaining. + ^ parts collect: [ :s | + self class new initializeWith: s alphabet: self alphabet ] +] + { #category : 'accessing private' } BioSequence >> stopSymbol [ " Answer a terminator " @@ -1847,11 +2247,27 @@ BioSequence >> translationTable: aTableIdentifier stopSymbol: stopCharacter toSt ] +{ #category : 'growing' } +BioSequence >> trim [ + "Remove leading and trailing whitespace from sequence. + Answer a new BioSequence." + + ^ self class new + initializeWith: self sequence trimBoth + alphabet: self alphabet +] + { #category : 'accessing public - utils' } -BioSequence >> trim: aCharacter [ - " Modify the receiver by removing aCharacter in its sequence " - - seq := seq copyWithout: aCharacter. +BioSequence >> trim: chars [ + "Answer a new BioSequence without the leading and trailing characters for which chars includes: answers true; the receiver is not modified. NOTE(review): the replaced version took a single Character and mutated the receiver - confirm existing callers pass a collection." + + | trimmed | + trimmed := self sequence. + [ trimmed notEmpty and: [ chars includes: trimmed first ] ] + whileTrue: [ trimmed := trimmed copyFrom: 2 to: trimmed size ]. + [ trimmed notEmpty and: [ chars includes: trimmed last ] ] + whileTrue: [ trimmed := trimmed copyFrom: 1 to: trimmed size - 1 ]. + ^ self class new initializeWith: trimmed alphabet: self alphabet ] { #category : 'accessing public - utils' } @@ -1863,6 +2279,24 @@ BioSequence >> trimAmbiguityCodes [ ] +{ #category : 'trimming' } +BioSequence >> trimLeft [ + "Remove leading whitespace from sequence. Answer a new BioSequence with the receiver's alphabet." + + ^ self class new + initializeWith: self sequence trimLeft + alphabet: self alphabet +] + +{ #category : 'trimming' } +BioSequence >> trimRight [ + "Remove trailing whitespace from sequence. Answer a new BioSequence with the receiver's alphabet." 
+ + ^ self class new + initializeWith: self sequence trimRight + alphabet: self alphabet +] + { #category : 'accessing - frequencies' } BioSequence >> trinucleotideFrequency [ "Answer a Dictionary mapping each trinucleotide (3-mer) to its frequency in the receiver.