From cc11eebaecfbde5a4101d82784895e012e00e790 Mon Sep 17 00:00:00 2001
From: Hernan Morales <hernan.morales@gmail.com>
Date: Tue, 28 Apr 2026 00:40:44 -0300
Subject: [PATCH] Add utility methods to BioSequence and add tests

---
 .../BioSequenceAnalysisTest.class.st          | 285 +++++++++++
 .../BioSequenceStringTest.class.st            | 404 ++++++++++++++++
 repository/BioTools/BioSequence.class.st      | 448 +++++++++++++++++-
 3 files changed, 1130 insertions(+), 7 deletions(-)
 create mode 100644 repository/BioTools-Tests/BioSequenceAnalysisTest.class.st
 create mode 100644 repository/BioTools-Tests/BioSequenceStringTest.class.st

diff --git a/repository/BioTools-Tests/BioSequenceAnalysisTest.class.st b/repository/BioTools-Tests/BioSequenceAnalysisTest.class.st
new file mode 100644
index 00000000..76ec3620
--- /dev/null
+++ b/repository/BioTools-Tests/BioSequenceAnalysisTest.class.st
@@ -0,0 +1,285 @@
+Class {
+	#name : 'BioSequenceAnalysisTest',
+	#superclass : 'BioAbstractTest',
+	#category : 'BioTools-Tests-Core',
+	#package : 'BioTools-Tests',
+	#tag : 'Core'
+}
+
+{ #category : 'as yet unclassified' }
+BioSequenceAnalysisTest >> dnaSequence: aString [
+	"Fixture helper: answer the result of BioSequence class >> newDNA: for aString."
+	^ BioSequence newDNA: aString
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceAnalysisTest >> newSeqRecord [
+	"Fixture helper: answer a new BioSeqRecord with id 'TEST001' holding the DNA sequence 'ATCGATCG'."
+	| record |
+	(record := BioSeqRecord new) sequence: (self dnaSequence: 'ATCGATCG').
+	record id: 'TEST001'.
+	^ record
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceAnalysisTest >> proteinSequence: aString [
+	"Fixture helper: answer the result of BioSequence class >> newProtein: for aString."
+	^ BioSequence newProtein: aString
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceAnalysisTest >> rnaSequence: aString [
+	"Fixture helper: answer the result of BioSequence class >> newRNA: for aString."
+	^ BioSequence newRNA: aString
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceAnalysisTest >> testATRatio [
+	"A DNA made only of A/T has an AT ratio of 1.0; one made only of G/C has 0.0."
+	| atOnly gcOnly |
+	atOnly := self dnaSequence: 'AATT'.
+	gcOnly := self dnaSequence: 'GGCC'.
+	self assert: atOnly atRatio equals: 1.0.
+	self assert: gcOnly atRatio equals: 0.0
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceAnalysisTest >> testAnnotationPreservesAll [
+	"Set eight annotations on one record and check that every one of them is read back, including secondaryAccessions, division and taxonomy."
+	| rec |
+	(rec := self newSeqRecord)
+		seqVersion: 1;
+		keywords: #( 'keyword1' );
+		dates: #( '01-JAN-2024' );
+		species: 'Homo sapiens';
+		primaryAccession: 'NM_001';
+		secondaryAccessions: #( 'NM_001.1' );
+		division: 'PRI';
+		taxonomy: #( 'Eukaryota' ).
+	self assert: rec seqVersion equals: 1.
+	self assert: rec keywords first equals: 'keyword1'.
+	self assert: rec dates first equals: '01-JAN-2024'.
+	self assert: rec species equals: 'Homo sapiens'.
+	self assert: rec primaryAccession equals: 'NM_001'.
+	self assert: { rec secondaryAccessions first. rec division. rec taxonomy first } equals: #( 'NM_001.1' 'PRI' 'Eukaryota' )
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceAnalysisTest >> testComposition [
+	"Composition of 'ATCGATCG': each of the four bases
+	 is counted exactly twice."
+
+	| dna counts |
+	dna := self dnaSequence: 'ATCGATCG'.
+	counts := dna composition.
+	#( $A $T $C $G ) do: [ :base |
+		self assert: (counts at: base) equals: 2 ]
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceAnalysisTest >> testCompositionWithFrequencies [
+	"In 'AATT' the entry for $A reports a count of 2 and a frequency of 0.5."
+
+	| entryForA |
+	entryForA := (self dnaSequence: 'AATT') compositionWithFrequencies at: $A.
+	self assert: (entryForA at: #count) equals: 2.
+	self assert: (entryForA at: #frequency) equals: 0.5
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceAnalysisTest >> testDates [
+	"A new record has no dates; after setting two dates it reports two."
+	| record |
+	record := self newSeqRecord.
+	self assert: record dates isEmpty.
+	record dates: #( '01-JAN-2024' '15-FEB-2024' ).
+	self assert: record dates size equals: 2
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceAnalysisTest >> testDefinition [
+	"The definition text set on a record is read back equal."
+	| record text |
+	text := 'Test sequence for unit testing'.
+	record := self newSeqRecord definition: text; yourself.
+	self assert: record definition equals: text
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceAnalysisTest >> testDivision [
+	"The division code set on a record is read back equal."
+	| record |
+	record := self newSeqRecord.
+	record division: 'PRI'.
+	self assert: record division equals: 'PRI'
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceAnalysisTest >> testFeatureCount [
+	"featureCount is 0 for a new record and 2 after adding a 'gene' and a 'CDS' feature."
+	| record |
+	record := self newSeqRecord.
+	self assert: record featureCount equals: 0.
+	#( 'gene' 'CDS' ) do: [ :tag |
+		record addSeqFeature: (BioSequenceFeature new primaryTag: tag) ].
+	self assert: record featureCount equals: 2
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceAnalysisTest >> testGCRatio [
+	"gcRatio is 1.0 for 'GGCC', 0.0 for 'AATT' and 0.5 for 'ATGC'."
+	{
+		('GGCC' -> 1.0).
+		('AATT' -> 0.0).
+		('ATGC' -> 0.5) } do: [ :case |
+		self
+			assert: (self dnaSequence: case key) gcRatio
+			equals: case value ]
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceAnalysisTest >> testKeywords [
+	"A new record has no keywords; after setting two, both are kept in order."
+	| record |
+	record := self newSeqRecord.
+	self assert: record keywords isEmpty.
+	record keywords: #( 'hypothetical protein' 'complete cds' ).
+	self assert: record keywords size equals: 2.
+	self assert: record keywords first equals: 'hypothetical protein'
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceAnalysisTest >> testNucleotideComposition [
+	"Per-base counts for the DNA 'AATTTGGC': A=2, T=3, G=2, C=1."
+	| dna counts |
+	dna := self dnaSequence: 'AATTTGGC'.
+	counts := dna nucleotideComposition.
+	{ ($A -> 2). ($T -> 3). ($G -> 2). ($C -> 1) } do: [ :expected |
+		self
+			assert: (counts at: expected key)
+			equals: expected value ]
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceAnalysisTest >> testNucleotideCompositionRNA [
+	"Per-base counts for the RNA 'AAUUUGGC': A=2, U=3, G=2, C=1."
+	| rna counts |
+	rna := self rnaSequence: 'AAUUUGGC'.
+	counts := rna nucleotideComposition.
+	{ ($A -> 2). ($U -> 3). ($G -> 2). ($C -> 1) } do: [ :expected |
+		self
+			assert: (counts at: expected key)
+			equals: expected value ]
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceAnalysisTest >> testOrganism [
+	"The organism name set on a record is read back equal."
+	| record name |
+	name := 'Drosophila melanogaster'.
+	record := self newSeqRecord organism: name; yourself.
+	self assert: record organism equals: name
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceAnalysisTest >> testPrimaryAccession [
+	"An explicitly set primary accession is read back equal."
+	| record |
+	record := self newSeqRecord.
+	record primaryAccession: 'NM_12345'.
+	self assert: record primaryAccession equals: 'NM_12345'
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceAnalysisTest >> testPrimaryAccessionDefaultsToId [
+	"When no accession was set, primaryAccession is expected to answer the record id ('TEST001' in the fixture)."
+
+	self
+		assert: self newSeqRecord primaryAccession
+		equals: 'TEST001'
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceAnalysisTest >> testSecondaryAccessions [
+	"A new record has no secondary accessions; after setting two it reports two."
+	| record |
+	record := self newSeqRecord.
+	self assert: record secondaryAccessions isEmpty.
+	record secondaryAccessions: #( 'NM_12345.1' 'NM_12345.2' ).
+	self assert: record secondaryAccessions size equals: 2
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceAnalysisTest >> testSeqVersion [
+	"seqVersion is nil on a new record; after each assignment (1, then 2) it answers the value just set."
+
+	| record |
+	record := self newSeqRecord.
+	self assert: record seqVersion isNil.
+	#( 1 2 ) do: [ :version |
+		record seqVersion: version.
+		self assert: record seqVersion equals: version ]
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceAnalysisTest >> testSequenceTypeDNA [
+	"A sequence built by the DNA fixture helper reports #DNA as its sequenceType."
+	self
+		assert: (self dnaSequence: 'ATCG') sequenceType
+		equals: #DNA
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceAnalysisTest >> testSequenceTypeProtein [
+	"A sequence built by the protein fixture helper reports #Protein as its sequenceType."
+	self
+		assert: (self proteinSequence: 'MVLSP') sequenceType
+		equals: #Protein
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceAnalysisTest >> testSequenceTypeRNA [
+	"A sequence built by the RNA fixture helper reports #RNA as its sequenceType."
+	self
+		assert: (self rnaSequence: 'AUCG') sequenceType
+		equals: #RNA
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceAnalysisTest >> testSpecies [
+	"species is nil on a new record and answers the assigned name afterwards."
+	| record |
+	record := self newSeqRecord.
+	self assert: record species isNil.
+	record species: 'Homo sapiens'.
+	self assert: record species equals: 'Homo sapiens'
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceAnalysisTest >> testSpeciesName [
+	"The species name set on a record is read back equal."
+	| record name |
+	name := 'Mus musculus'.
+	record := self newSeqRecord speciesName: name; yourself.
+	self assert: record speciesName equals: name
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceAnalysisTest >> testSpeciesNameFromOrganism [
+	"After only organism: was set, speciesName is expected to answer that organism name."
+	| record |
+	record := self newSeqRecord.
+	record organism: 'Escherichia coli'.
+	self assert: record speciesName equals: 'Escherichia coli'
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceAnalysisTest >> testTaxonomy [
+	"A five-level lineage set on a record is read back with its size and first element intact."
+	| record lineage |
+	lineage := #( 'Eukaryota' 'Metazoa' 'Chordata' 'Craniata' 'Mammalia' ).
+	record := self newSeqRecord.
+	record taxonomy: lineage.
+	self assert: record taxonomy size equals: 5.
+	self assert: record taxonomy first equals: 'Eukaryota'
+]
diff --git a/repository/BioTools-Tests/BioSequenceStringTest.class.st b/repository/BioTools-Tests/BioSequenceStringTest.class.st
new file mode 100644
index 00000000..e9007b25
--- /dev/null
+++ b/repository/BioTools-Tests/BioSequenceStringTest.class.st
@@ -0,0 +1,404 @@
+Class {
+	#name : 'BioSequenceStringTest',
+	#superclass : 'BioAbstractTest',
+	#category : 'BioTools-Tests-Core',
+	#package : 'BioTools-Tests',
+	#tag : 'Core'
+}
+
+{ #category : 'as yet unclassified' }
+BioSequenceStringTest >> dnaSequence: aString [
+	"Fixture helper: answer the result of BioSequence class >> newDNA: for aString."
+	^ BioSequence newDNA: aString
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceStringTest >> testBeginsWith [
+	"beginsWith: accepts a real prefix and the empty string; it rejects an inner substring and a candidate longer than the sequence."
+
+	| dna |
+	dna := self dnaSequence: 'ATCGATCG'.
+	#( 'ATCG' '' ) do: [ :prefix | self assert: (dna beginsWith: prefix) ].
+	#( 'TCGA' 'ATCGATCGATCG' ) do: [ :other |
+		self deny: (dna beginsWith: other) ]
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceStringTest >> testCountSubstring [
+	"Occurrence counts in 'ATCGATCGATCG': 'ATCG' 3, 'GAT' 2, 'TTTT' 0."
+	| dna |
+	dna := self dnaSequence: 'ATCGATCGATCG'.
+	self assert: (dna countSubstring: 'ATCG') equals: 3.
+	self assert: (dna countSubstring: 'GAT') equals: 2.
+	self assert: (dna countSubstring: 'TTTT') equals: 0
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceStringTest >> testCountSubstringOverlapping [
+	"'AA' occurs twice in 'AAAA' without overlap and three times when overlaps are allowed."
+	| dna |
+	dna := self dnaSequence: 'AAAA'.
+	self assert: (dna countSubstring: 'AA' overlapping: false) equals: 2.
+	self assert: (dna countSubstring: 'AA' overlapping: true) equals: 3
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceStringTest >> testDropFirst [
+	"Uses a sequence whose two halves differ, so the result cannot be confused with dropLast:, first: or last:."
+	| seq sub |
+	seq := self dnaSequence: 'AAAACCGG'.
+	sub := seq dropFirst: 4.
+	self assert: sub sequence equals: 'CCGG'
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceStringTest >> testDropLast [
+	"Uses a sequence whose two halves differ, so the result cannot be confused with dropFirst:, first: or last:."
+	| seq sub |
+	seq := self dnaSequence: 'AAAACCGG'.
+	sub := seq dropLast: 4.
+	self assert: sub sequence equals: 'AAAA'
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceStringTest >> testEndsWith [
+	"endsWith: accepts a real suffix and the empty string; it rejects an inner substring and a candidate longer than the sequence."
+
+	| dna |
+	dna := self dnaSequence: 'ATCGATCG'.
+	#( 'ATCG' '' ) do: [ :suffix | self assert: (dna endsWith: suffix) ].
+	#( 'CGAT' 'GATCGATCG' ) do: [ :other |
+		self deny: (dna endsWith: other) ]
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceStringTest >> testFirstN [
+	"Uses a sequence whose two halves differ, so the result cannot be confused with last:, dropFirst: or dropLast:."
+	| seq sub |
+	seq := self dnaSequence: 'AAAACCGG'.
+	sub := seq first: 4.
+	self assert: sub sequence equals: 'AAAA'
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceStringTest >> testFromTo [
+	"Positions 2 through 5 of 'ATCGATCG' are 'TCGA'."
+
+	| slice |
+	slice := (self dnaSequence: 'ATCGATCG') from: 2 to: 5.
+	self assert: slice sequence equals: 'TCGA'
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceStringTest >> testIncludesAnyOf [
+	"'ATCG' shares $A with 'XYZA' but shares no character with 'N' or 'XYZ'."
+	| dna |
+	dna := self dnaSequence: 'ATCG'.
+	self assert: (dna includesAnyOf: 'XYZA').
+	self deny: (dna includesAnyOf: 'N').
+	self deny: (dna includesAnyOf: 'XYZ')
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceStringTest >> testIncludesSubstring [
+	"'ATCG' and 'CGA' occur inside 'ATCGATCG'; 'TTTT' does not."
+
+	| dna |
+	dna := self dnaSequence: 'ATCGATCG'.
+	#( 'ATCG' 'CGA' ) do: [ :each | self assert: (dna includesSubstring: each) ].
+	self deny: (dna includesSubstring: 'TTTT')
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceStringTest >> testIndexOf [
+	"First occurrence positions in 'ATCGATCGATCG': 'ATCG' at 1, 'GAT' at 4, and 0 for the absent 'XXX'."
+	| dna |
+	dna := self dnaSequence: 'ATCGATCGATCG'.
+	self assert: (dna indexOf: 'ATCG') equals: 1.
+	self assert: (dna indexOf: 'GAT') equals: 4.
+	self assert: (dna indexOf: 'XXX') equals: 0
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceStringTest >> testIndexOfOrFail [
+	"indexOfOrFail: answers 3 for 'CGA' in 'ATCGATCG' and signals NotFound for the absent 'XXX'."
+	| dna |
+	dna := self dnaSequence: 'ATCGATCG'.
+	self assert: (dna indexOfOrFail: 'CGA') equals: 3.
+	self should: [ dna indexOfOrFail: 'XXX' ] raise: NotFound
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceStringTest >> testIndexOfStartingAt [
+	"'ATCG' starts at 1, 5 and 9 in 'ATCGATCGATCG'. Each pair is search start -> expected answer (0 when nothing is found)."
+	| dna |
+	dna := self dnaSequence: 'ATCGATCGATCG'.
+	{ (1 -> 1). (2 -> 5). (6 -> 9). (10 -> 0) } do: [ :case |
+		self
+			assert: (dna indexOf: 'ATCG' startingAt: case key)
+			equals: case value ]
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceStringTest >> testIndexOfStartingAtEndingAt [
+	"Range-limited searches in 'ATCGATCGATCG': answers are positions in the
+	 whole sequence, and 0 is expected for a substring that does not occur."
+
+	| dna |
+	dna := self dnaSequence: 'ATCGATCGATCG'.
+	self
+		assert: (dna indexOf: 'GAT' startingAt: 1 endingAt: 6)
+		equals: 4.
+	self assert: (dna indexOf: 'ATCG' startingAt: 1 endingAt: 4) equals: 1.
+	self assert: (dna indexOf: 'ATCG' startingAt: 5 endingAt: 8) equals: 5.
+	self
+		assert: (dna indexOf: 'XXXX' startingAt: 1 endingAt: 12)
+		equals: 0
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceStringTest >> testIsAllLowercase [
+	"'atcg' is all lowercase; the mixed-case 'ATcg' is not."
+	| lower mixed |
+	lower := self dnaSequence: 'atcg'.
+	mixed := self dnaSequence: 'ATcg'.
+	self assert: lower isAllLowercase.
+	self deny: mixed isAllLowercase
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceStringTest >> testIsAllUppercase [
+	"'ATCG' is all uppercase; the mixed-case 'ATcg' is not."
+	| upper mixed |
+	upper := self dnaSequence: 'ATCG'.
+	mixed := self dnaSequence: 'ATcg'.
+	self assert: upper isAllUppercase.
+	self deny: mixed isAllUppercase
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceStringTest >> testJoin [
+	"Exercise BioSequence class >> join: with BioSequence elements; the previous version only called String >> join:."
+	| seqs joined |
+	seqs := {
+		        (self dnaSequence: 'AT').
+		        (self dnaSequence: 'CG').
+		        (self dnaSequence: 'TA') }.
+	joined := BioSequence join: seqs.
+	self assert: joined sequence equals: 'ATCGTA'
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceStringTest >> testJoinWithStrings [
+	"Exercise BioSequence class >> join: with plain strings, which its comment says it accepts; the previous version only called String >> join:."
+	| joined |
+	joined := BioSequence join: #( 'AT' 'CG' 'TA' ).
+	self assert: joined sequence equals: 'ATCGTA'
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceStringTest >> testLastIndexOf [
+	"Last occurrence positions in 'ATCGATCGATCG': 'ATCG' at 9, 'GAT' at 8, and 0 for the absent 'XXX'."
+	| dna |
+	dna := self dnaSequence: 'ATCGATCGATCG'.
+	self assert: (dna lastIndexOf: 'ATCG') equals: 9.
+	self assert: (dna lastIndexOf: 'GAT') equals: 8.
+	self assert: (dna lastIndexOf: 'XXX') equals: 0
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceStringTest >> testLastIndexOfOrFail [
+	"lastIndexOfOrFail: answers 5 for 'ATC' in 'ATCGATCG' and signals NotFound for the absent 'XXX'."
+	| dna |
+	dna := self dnaSequence: 'ATCGATCG'.
+	self assert: (dna lastIndexOfOrFail: 'ATC') equals: 5.
+	self should: [ dna lastIndexOfOrFail: 'XXX' ] raise: NotFound
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceStringTest >> testLastIndexOfStartingAt [
+	"'ATCG' starts at 1, 5 and 9 in 'ATCGATCGATCG'. Each pair is upper bound for the match start -> expected answer."
+
+	| dna |
+	dna := self dnaSequence: 'ATCGATCGATCG'.
+	{ (12 -> 9). (8 -> 5). (4 -> 1) } do: [ :case |
+		self assert: (dna lastIndexOf: 'ATCG' startingAt: case key) equals: case value ]
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceStringTest >> testLastN [
+	"Uses a sequence whose two halves differ, so the result cannot be confused with first:, dropFirst: or dropLast:."
+	| seq sub |
+	seq := self dnaSequence: 'AAAACCGG'.
+	sub := seq last: 4.
+	self assert: sub sequence equals: 'CCGG'
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceStringTest >> testPadLeft [
+	"Padding 'ATCG' on the left with $- up to length 8 gives '----ATCG'."
+
+	| padded |
+	padded := (self dnaSequence: 'ATCG') padLeft: $- to: 8.
+	self assert: padded sequence equals: '----ATCG'
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceStringTest >> testPadLeftNoPaddingNeeded [
+	"A target length shorter than the sequence leaves the sequence text as it was."
+
+	| padded |
+	padded := (self dnaSequence: 'ATCGATCG') padLeft: $- to: 4.
+	self assert: padded sequence equals: 'ATCGATCG'
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceStringTest >> testPadRight [
+	"Padding 'ATCG' on the right with $- up to length 8 gives 'ATCG----'."
+
+	| padded |
+	padded := (self dnaSequence: 'ATCG') padRight: $- to: 8.
+	self assert: padded sequence equals: 'ATCG----'
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceStringTest >> testRemovePrefix [
+	"Removing the leading 'PREFIX' from 'PREFIXATCG' leaves 'ATCG'."
+
+	| stripped |
+	stripped := (self dnaSequence: 'PREFIXATCG') removePrefix: 'PREFIX'.
+	self assert: stripped sequence equals: 'ATCG'
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceStringTest >> testRemovePrefixNotPresent [
+	"Asking to remove a prefix the sequence does not start with leaves the text as it was."
+
+	| stripped |
+	stripped := (self dnaSequence: 'ATCG') removePrefix: 'XXX'.
+	self assert: stripped sequence equals: 'ATCG'
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceStringTest >> testRemoveSuffix [
+	"Removing the trailing 'SUFFIX' from 'ATCGSUFFIX' leaves 'ATCG'."
+
+	| stripped |
+	stripped := (self dnaSequence: 'ATCGSUFFIX') removeSuffix: 'SUFFIX'.
+	self assert: stripped sequence equals: 'ATCG'
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceStringTest >> testRemoveSuffixNotPresent [
+	"Asking to remove a suffix the sequence does not end with leaves the text as it was."
+
+	| stripped |
+	stripped := (self dnaSequence: 'ATCG') removeSuffix: 'XXX'.
+	self assert: stripped sequence equals: 'ATCG'
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceStringTest >> testReplaceAllOccurrences [
+	"Both occurrences of 'ATCG' in 'ATCGATCG' are replaced, giving 'AAAAAAAA'."
+
+	| rewritten |
+	rewritten := (self dnaSequence: 'ATCGATCG') replace: 'ATCG' with: 'AAAA'.
+	self assert: rewritten sequence equals: 'AAAAAAAA'
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceStringTest >> testReplaceWith [
+	"Replacing 'ATC' with 'TTT' in 'ATCGATCG' gives 'TTTGTTTG'."
+
+	| rewritten |
+	rewritten := (self dnaSequence: 'ATCGATCG') replace: 'ATC' with: 'TTT'.
+	self assert: rewritten sequence equals: 'TTTGTTTG'
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceStringTest >> testReversed [
+	"Reversing 'ATCG' gives 'GCTA' (plain reversal, no complement)."
+
+	| backwards |
+	backwards := (self dnaSequence: 'ATCG') reversed.
+	self assert: backwards sequence equals: 'GCTA'
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceStringTest >> testSplitOn [
+	"Splitting 'ATCG-ATCG-ATCG' on '-' answers three parts, each with the text 'ATCG'."
+
+	| pieces |
+	pieces := (self dnaSequence: 'ATCG-ATCG-ATCG') splitOn: '-'.
+	self assert: pieces size equals: 3.
+	self assert: pieces first sequence equals: 'ATCG'.
+	self assert: pieces second sequence equals: 'ATCG'.
+	self assert: pieces last sequence equals: 'ATCG'
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceStringTest >> testSplitOnEmpty [
+	"Splitting an empty sequence answers a single part whose text is empty."
+
+	| pieces |
+	pieces := (self dnaSequence: '') splitOn: '-'.
+	self assert: pieces size equals: 1.
+	self assert: pieces first sequence equals: ''
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceStringTest >> testSplitOnMaxChunks [
+	"With at most 3 chunks, 'AT-CG-AT-CG' yields 'AT', 'CG' and the unsplit remainder 'AT-CG'."
+
+	| pieces |
+	pieces := (self dnaSequence: 'AT-CG-AT-CG') splitOn: '-' maxChunks: 3.
+	self assert: pieces size equals: 3.
+	self assert: pieces first sequence equals: 'AT'.
+	self assert: pieces second sequence equals: 'CG'.
+	self assert: pieces last sequence equals: 'AT-CG'
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceStringTest >> testTrim [
+	"trim removes the leading and the trailing space of ' ATCG '."
+
+	| cleaned |
+	cleaned := (self dnaSequence: ' ATCG ') trim.
+	self assert: cleaned sequence equals: 'ATCG'
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceStringTest >> testTrimChars [
+	"trim: 'N' removes the N runs at both ends of 'NNATCGNN'."
+
+	| cleaned |
+	cleaned := (self dnaSequence: 'NNATCGNN') trim: 'N'.
+	self assert: cleaned sequence equals: 'ATCG'
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceStringTest >> testTrimLeft [
+	"trimLeft removes only the leading space of ' ATCG '."
+
+	| cleaned |
+	cleaned := (self dnaSequence: ' ATCG ') trimLeft.
+	self assert: cleaned sequence equals: 'ATCG '
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceStringTest >> testTrimNoWhitespace [
+	"trim leaves a sequence without surrounding whitespace as it was."
+
+	| cleaned |
+	cleaned := (self dnaSequence: 'ATCG') trim.
+	self assert: cleaned sequence equals: 'ATCG'
+]
+
+{ #category : 'as yet unclassified' }
+BioSequenceStringTest >> testTrimRight [
+	"trimRight removes only the trailing space of ' ATCG '."
+
+	| cleaned |
+	cleaned := (self dnaSequence: ' ATCG ') trimRight.
+	self assert: cleaned sequence equals: ' ATCG'
+]
diff --git a/repository/BioTools/BioSequence.class.st b/repository/BioTools/BioSequence.class.st
index c09433c3..4e546182 100644
--- a/repository/BioTools/BioSequence.class.st
+++ b/repository/BioTools/BioSequence.class.st
@@ -71,6 +71,18 @@ BioSequence class >> fromUmambiguousRNASequences: aCollection [
 	^ aCollection collect: [ : seqString | self newUnambiguousRNA: seqString ]
 ]
 
+{ #category : 'splitjoin' }
+BioSequence class >> join: aCollection [
+	"Answer a new instance whose sequence is the concatenation of the elements of aCollection. Elements answering true to isBioSequence contribute their sequence, all others their asString. The instance is built with initializeWith: only, so no alphabet is passed along."
+
+	| fragments |
+	fragments := aCollection collect: [ :element |
+			             element isBioSequence
+				             ifFalse: [ element asString ]
+				             ifTrue: [ element sequence ] ].
+	^ self new initializeWith: ('' join: fragments)
+]
+
 { #category : 'convenience' }
 BioSequence class >> joinSequence: aCollection [
 	" Private - Answer a String with the sequence ensambled from aCollection.
@@ -480,6 +492,13 @@ BioSequence >> at: anInteger put: aLetter [
 	self signalInvalidObject: 'Sequence is read-only. Use #asMutable to enable modifications it then #asSequence' 
 ]
 
+{ #category : 'accessing' }
+BioSequence >> atRatio [
+	"Answer the AT ratio as a Float, i.e. 1.0 - gcRatio. Answer 0.0 when occurrencesOfLetters counts no A, T, U, G or C (e.g. an empty receiver), where 1.0 - gcRatio would wrongly answer 1.0."
+	^ ('ATUGC' anySatisfy: [ :base | (self occurrencesOfLetters at: base ifAbsent: [ 0 ]) > 0 ])
+		  ifTrue: [ 1.0 - self gcRatio ] ifFalse: [ 0.0 ]
+]
+
 { #category : 'accesing public - protein synthesis' }
 BioSequence >> backTranscribe [
 	" Answer a new instance of the receiver's with the receiver's sequence transcribed to its corresponding DNA, adjusting the alphabet "
@@ -501,6 +520,13 @@ BioSequence >> backTranscription [
 	^ backTranscript
 ]
 
+{ #category : 'testing' }
+BioSequence >> beginsWith: aPrefix [
+	"Answer whether the receiver's sequence begins with aPrefix. The check is delegated to beginsWith: on the object answered by #sequence."
+
+	^ self sequence beginsWith: aPrefix
+]
+
 { #category : 'accessing' }
 BioSequence >> bioConsensusFor: aBioAlignment [ 
 	" Answer a <Character> representing a consensus base for the receiver "
@@ -589,6 +615,29 @@ BioSequence >> complementaryAt: aCharacter [
 	^ self alphabet complementaryTable at: aCharacter
 ]
 
+{ #category : 'accessing - utilities' }
+BioSequence >> composition [
+	"Answer the letter composition of the receiver (nucleotides or amino acids).
+	 This is an alias: the answer is exactly what #occurrencesOfLetters answers."
+
+	^ self occurrencesOfLetters
+]
+
+{ #category : 'accessing - utilities' }
+BioSequence >> compositionWithFrequencies [
+	"Answer a Dictionary mapping each letter counted by occurrencesOfLetters to a Dictionary
+	 holding #count (the occurrences) and #frequency (count divided by the receiver's size, as a Float)."
+	| length summary |
+	length := self size asFloat.
+	summary := Dictionary new.
+	self occurrencesOfLetters keysAndValuesDo: [ :letter :tally |
+		| entry |
+		entry := Dictionary new.
+		entry at: #count put: tally; at: #frequency put: tally / length.
+		summary at: letter put: entry ].
+	^ summary
+]
+
 { #category : 'accessing' }
 BioSequence >> contents [
 	" Compatibility with #dumpToFileNamed: . Answer the receiver's sequence <String> "
@@ -637,6 +686,43 @@ BioSequence >> copyTo: stopInteger [
 	
 ]
 
+{ #category : 'accessing - utilities' }
+BioSequence >> countSubstring: aSubstring [
+	"Answer how many times aSubstring occurs in the receiver's sequence.
+	 The sequence is scanned left to right and every search resumes just
+	 past the previous match, so overlapping occurrences are not counted."
+
+	| hits cursor found |
+	hits := 0.
+	cursor := 1.
+	[
+		found := self sequence indexOfSubCollection: aSubstring startingAt: cursor.
+		found > 0 ] whileTrue: [
+		hits := hits + 1.
+		cursor := found + aSubstring size ].
+	^ hits
+]
+
+{ #category : 'accessing - utilities' }
+BioSequence >> countSubstring: aSubstring overlapping: aBoolean [
+	"Answer the number of occurrences of aSubstring in the receiver's sequence.
+	 When aBoolean is true each search resumes one position after the previous match
+	 start, so overlapping matches count; otherwise it resumes just past the match."
+
+	| hits cursor found step |
+	step := 1.
+	aBoolean ifFalse: [ step := aSubstring size ].
+	hits := 0.
+	cursor := 1.
+	[
+		(found := self sequence
+			          indexOfSubCollection: aSubstring
+			          startingAt: cursor) > 0 ] whileTrue: [
+		hits := hits + 1.
+		cursor := found + step ].
+	^ hits
+]
+
 { #category : 'accessing - checksum' }
 BioSequence >> crc32 [
 	"Answer a <Number> with the CRC checksum (edundancy check) for the receiver's sequence"
@@ -776,6 +862,31 @@ BioSequence >> do: aClosure [
 
 ]
 
+{ #category : 'accessing - utilities' }
+BioSequence >> dropFirst: n [
+	"Answer a new instance of the receiver's class holding the receiver's sequence minus its first n characters, with the receiver's alphabet."
+
+	| remainder |
+	remainder := self sequence allButFirst: n.
+	^ self class new initializeWith: remainder alphabet: self alphabet
+]
+
+{ #category : 'accessing - utilities' }
+BioSequence >> dropLast: n [
+	"Answer a new instance of the receiver's class holding the receiver's sequence minus its last n characters, with the receiver's alphabet."
+
+	| remainder |
+	remainder := self sequence allButLast: n.
+	^ self class new initializeWith: remainder alphabet: self alphabet
+]
+
+{ #category : 'testing' }
+BioSequence >> endsWith: aSuffix [
+	"Answer whether the receiver's sequence ends with aSuffix. The check is delegated to endsWith: on the object answered by #sequence."
+
+	^ self sequence endsWith: aSuffix
+]
+
 { #category : 'copying' }
 BioSequence >> findHotspots [
 	" Answer a <Collection> of the receiver's substrings tokenized by hostspots : [ ] "
@@ -785,6 +896,24 @@ BioSequence >> findHotspots [
 	
 ]
 
+{ #category : 'accessing' }
+BioSequence >> first: n [
+	"Answer a new instance of the receiver's class holding the first n characters of the receiver's sequence, with the receiver's alphabet."
+
+	| head |
+	head := self sequence first: n.
+	^ self class new initializeWith: head alphabet: self alphabet
+]
+
+{ #category : 'instance creation' }
+BioSequence >> from: startIndex to: endIndex [
+	"Answer a new instance of the receiver's class holding the characters startIndex through endIndex (inclusive) of the receiver's sequence, with the receiver's alphabet."
+
+	| slice |
+	slice := self sequence copyFrom: startIndex to: endIndex.
+	^ self class new initializeWith: slice alphabet: self alphabet
+]
+
 { #category : 'accessing' }
 BioSequence >> from: start to: stop do: aClosure [
 	" Evaluate aBlock for all elements between start and stop (inclusive). "
@@ -866,6 +995,18 @@ BioSequence >> gcContentUppercased [
 
 ]
 
+{ #category : 'accessing - utilities' }
+BioSequence >> gcRatio [
+	"Answer the GC ratio as a Float: GC / (A + T + U + G + C), using the counts from occurrencesOfLetters. U is counted next to T so an RNA receiver gets the same ratio as the equivalent DNA. Answer 0.0 when none of these letters is counted."
+
+	| comp countOf gc total |
+	comp := self occurrencesOfLetters.
+	countOf := [ :letters | letters inject: 0 into: [ :sum :each | sum + (comp at: each ifAbsent: [ 0 ]) ] ].
+	gc := countOf value: 'GC'.
+	total := gc + (countOf value: 'ATU').
+	^ total = 0 ifTrue: [ 0.0 ] ifFalse: [ gc / total asFloat ]
+]
+
 { #category : 'accessing' }
 BioSequence >> gcSkew [
 	" Answer a <Collection> with ratios <Float>. Calculate receiver's GC skew (G-C)/(G+C) for windows of size wLength.
@@ -973,6 +1114,61 @@ BioSequence >> hotspotRegionsLeft: leftSize right: rightSize [
 	^ self findHotspots tripleSelect: [ : a : b : c | a size >= leftSize and: [ c size >= rightSize ] ].
 ]
 
+{ #category : 'testing' }
+BioSequence >> includesAnyOf: aCollection [
+	"Answer true if the receiver's sequence contains any element of aCollection. Delegates to Collection >> includesAny:, the current Pharo selector, instead of the legacy includesAnyOf: spelling on collections."
+
+	^ self sequence includesAny: aCollection
+]
+
+{ #category : 'testing' }
+BioSequence >> includesSubstring: aSubstring [
+	"Answer true if aSubstring occurs in the receiver, i.e. when #indexOf: answers a position greater than 0."
+
+	^ (self indexOf: aSubstring) > 0
+]
+
+{ #category : 'accessing' }
+BioSequence >> indexOf: aSubstring [
+	"Answer the 1-based position of the first occurrence of aSubstring in the receiver's sequence,
+	 searching from position 1 with indexOfSubCollection:startingAt:. Answer 0 if not found."
+
+	^ self sequence indexOfSubCollection: aSubstring startingAt: 1
+]
+
+{ #category : 'accessing' }
+BioSequence >> indexOf: aSubstring startingAt: startIndex [
+	"Answer the position of the first occurrence of aSubstring that starts at or after startIndex.
+	 Answer 0 if not found. Delegates to indexOfSubCollection:startingAt: on the receiver's sequence."
+
+	^ self sequence
+		  indexOfSubCollection: aSubstring
+		  startingAt: startIndex
+]
+
+{ #category : 'accessing' }
+BioSequence >> indexOf: aSubstring startingAt: startIndex endingAt: endIndex [
+	"Answer the position, in the receiver's full sequence, of the first occurrence of aSubstring
+	 found inside the copy of positions startIndex..endIndex; answer 0 when there is none."
+
+	| window offset |
+	window := self sequence copyFrom: startIndex to: endIndex.
+	offset := window indexOfSubCollection: aSubstring startingAt: 1.
+	^ offset = 0
+		  ifTrue: [ 0 ] ifFalse: [ offset + startIndex - 1 ]
+]
+
+{ #category : 'accessing' }
+BioSequence >> indexOfOrFail: aSubstring [
+	"Answer the position of the first occurrence of aSubstring, as given by #indexOf:.
+	 Signal NotFound for aSubstring when that position is 0."
+
+	| position |
+	(position := self indexOf: aSubstring) = 0 ifTrue: [
+		NotFound signalFor: aSubstring ].
+	^ position
+]
+
 { #category : 'accessing' }
 BioSequence >> indicesOfSubsequence: aBioSequence [
 	" See comment in #indicesOfSubstring: "
@@ -1009,6 +1205,24 @@ BioSequence >> initializeWith: aString alphabet: anAlphabet [
 	alphabet := anAlphabet
 ]
 
+{ #category : 'testing' }
+BioSequence >> isAllLowercase [
+	"Answer true when every letter of the receiver's sequence is lowercase.
+	 Non-letter characters are skipped, so a sequence without letters answers true."
+
+	^ self sequence allSatisfy: [ :each |
+		  each isLetter not or: [ each isLowercase ] ]
+]
+
+{ #category : 'testing' }
+BioSequence >> isAllUppercase [
+	"Answer true when every letter of the receiver's sequence is uppercase.
+	 Non-letter characters are skipped, so a sequence without letters answers true."
+
+	^ self sequence allSatisfy: [ :each |
+		  each isLetter not or: [ each isUppercase ] ]
+]
+
 { #category : 'testing' }
 BioSequence >> isBioSequence [
 	"Answer whether the receiver represents a Biological sequence."
@@ -1135,6 +1349,50 @@ BioSequence >> kmersCount: patString mismatches: d [
 	^ (self sequence indicesOfSubstring: patString mismatches: d) size
 ]
 
+{ #category : 'accessing' }
+BioSequence >> last: n [
+	"Answer a new instance of the receiver's class holding the last n characters of the receiver's sequence, with the receiver's alphabet."
+
+	| tail |
+	tail := self sequence last: n.
+	^ self class new initializeWith: tail alphabet: self alphabet
+]
+
+{ #category : 'accessing' }
+BioSequence >> lastIndexOf: aSubstring [
+	"Answer the position of the last occurrence of aSubstring in the receiver's sequence, or 0 if not found.
+	 Delegates to findLastOccurrenceOfString:startingAt: on the sequence, searching from position 1."
+
+	^ self sequence findLastOccurrenceOfString: aSubstring startingAt: 1
+]
+
+{ #category : 'accessing' }
+BioSequence >> lastIndexOf: aSubstring startingAt: startIndex [
+	"Answer the start position of the last occurrence of aSubstring that begins at or before
+	 startIndex, or 0 when no occurrence begins there. Occurrences are visited left to right
+	 with findString:startingAt:, each search resuming one past the previous match start."
+
+	| best candidate |
+	best := 0.
+	candidate := 0.
+	[
+		candidate := self sequence findString: aSubstring startingAt: candidate + 1.
+		candidate between: 1 and: startIndex ] whileTrue: [
+		best := candidate ].
+	^ best
+]
+
+{ #category : 'accessing' }
+BioSequence >> lastIndexOfOrFail: aSubstring [
+	"Answer the position of the last occurrence of aSubstring, as given by #lastIndexOf:.
+	 Signal NotFound for aSubstring when that position is 0."
+
+	| position |
+	(position := self lastIndexOf: aSubstring) = 0 ifTrue: [
+		NotFound signalFor: aSubstring ].
+	^ position
+]
+
 { #category : 'accessing' }
 BioSequence >> lcc [
 	"Answer a <Collection> of <Float> with the Local Composition Complexity (LCC) value for the receiver. Assume the receiver is unambiguous sequence "
@@ -1484,6 +1742,37 @@ BioSequence >> notEmpty [
 	^ seq notEmpty
 ]
 
+{ #category : 'accessing - utilities' }
+BioSequence >> nucleotideComposition [
+	"Answer a Dictionary with the counts of $A, $T, $U, $G and $C taken from
+	 occurrencesOfLetters, plus the String key 'other' holding the number of
+	 remaining characters. For an RNA receiver $T is reported as 0, so any T
+	 present is counted under 'other'. Signal an error for protein sequences."
+
+	| tallies countOf thymine result counted |
+	self isProteinSequence ifTrue: [
+		self error:
+			'nucleotideComposition not applicable to protein sequences' ].
+	tallies := self occurrencesOfLetters.
+	countOf := [ :base | tallies at: base ifAbsent: [ 0 ] ].
+	"T is only looked up for non-RNA receivers."
+	thymine := self isRNASequence
+		           ifTrue: [ 0 ]
+		           ifFalse: [ countOf value: $T ].
+	result := Dictionary new.
+	result
+		at: $A put: (countOf value: $A);
+		at: $T put: thymine;
+		at: $U put: (countOf value: $U);
+		at: $G put: (countOf value: $G);
+		at: $C put: (countOf value: $C).
+	counted := 0.
+	result do: [ :each | counted := counted + each ].
+	"Whatever is not covered by the five entries above is reported as 'other'."
+	result at: 'other' put: self size - counted.
+	^ result
+]
+
 { #category : 'accessing - frequencies' }
 BioSequence >> occurrencesOf: aCharacter [
 	" Answer how many of the receiver's elements are equal to aLetter "
@@ -1513,6 +1802,30 @@ BioSequence >> oligonucleotideFrequency [
 	^ self kmerFrequencies: 1
 ]
 
+{ #category : 'accessing - utilities' }
+BioSequence >> padLeft: padChar to: targetLength [
+	"Answer a new instance whose sequence is the receiver's sequence preceded by as many
+	 padChar as needed to reach targetLength, with the receiver's alphabet. Answer a copy
+	 of the receiver when it is already at least targetLength long."
+	| missing padded |
+	missing := targetLength - self size.
+	missing <= 0 ifTrue: [ ^ self copy ].
+	padded := (String new: missing withAll: padChar) , self sequence.
+	^ self class new initializeWith: padded alphabet: self alphabet
+]
+
+{ #category : 'accessing - utilities' }
+BioSequence >> padRight: padChar to: targetLength [
+	"Answer a new instance whose sequence is the receiver's sequence followed by as many
+	 padChar as needed to reach targetLength, with the receiver's alphabet. Answer a copy
+	 of the receiver when it is already at least targetLength long."
+	| missing padded |
+	missing := targetLength - self size.
+	missing <= 0 ifTrue: [ ^ self copy ].
+	padded := self sequence , (String new: missing withAll: padChar).
+	^ self class new initializeWith: padded alphabet: self alphabet
+]
+
 { #category : 'accessing' }
 BioSequence >> positionsOf: aCharacterOrString [
 	" Answer a Collection with the positions of aminoacidLetter in the receiver's sequence "
@@ -1563,6 +1876,19 @@ BioSequence >> randomLength: size for: anAlphabetClass [
 								nextPutAll: b asString ] ].
 ]
 
+{ #category : 'removing' }
+BioSequence >> removePrefix: aPrefix [
+	"Answer a new sequence holding the receiver's symbols without the leading aPrefix.
+	 When the receiver does not begin with aPrefix, answer a copy of the receiver."
+
+	| rest |
+	(self beginsWith: aPrefix) ifFalse: [ ^ self copy ].
+	rest := self sequence copyFrom: aPrefix size + 1 to: self size.
+	^ self class new
+		  initializeWith: rest
+		  alphabet: self alphabet
+]
+
 { #category : 'accessing - sequence record' }
 BioSequence >> removeSeqFeature: aBioSequenceFeature [
 	" Remove aBioSequenceFeature from the receiver "
@@ -1570,6 +1896,28 @@ BioSequence >> removeSeqFeature: aBioSequenceFeature [
 	self sequenceFeatures remove: aBioSequenceFeature
 ]
 
+{ #category : 'removing' }
+BioSequence >> removeSuffix: aSuffix [
+	"Answer a new sequence holding the receiver's symbols without the trailing aSuffix; when the receiver does not end with aSuffix, answer a copy of the receiver."
+
+	| kept |
+	(self endsWith: aSuffix) ifFalse: [ ^ self copy ].
+	kept := self sequence copyFrom: 1 to: self size - aSuffix size.
+	^ self class new
+		  initializeWith: kept
+		  alphabet: self alphabet
+]
+
+{ #category : 'transforming' }
+BioSequence >> replace: oldSubstring with: newSubstring [
+	"Answer a new sequence, created with the receiver's alphabet, in which every occurrence of oldSubstring in the receiver's sequence has been substituted by newSubstring."
+
+	| replaced |
+	replaced := self sequence copyReplaceAll: oldSubstring with: newSubstring.
+	^ self class new
+		  initializeWith: replaced alphabet: self alphabet
+]
+
 { #category : 'accessing - restriction' }
 BioSequence >> resolveEnzyme: anEnzymeOrName [
 
@@ -1602,9 +1950,12 @@ BioSequence >> reverseComplement [
 
 { #category : 'accesing public - protein synthesis' }
 BioSequence >> reversed [
-	" Answer a copy of the receiver with element order reversed "
-	
-	^ self newPrototypeWith: seq reversed
+	"Answer a new sequence whose symbols are the receiver's in opposite order.
+	 The symbols themselves are kept as they are (no complementing takes place)."
+
+	| backwards |
+	backwards := self sequence reversed.
+	^ self class new initializeWith: backwards alphabet: self alphabet
 ]
 
 { #category : 'accessing - checksum' }
@@ -1660,6 +2011,16 @@ BioSequence >> sequenceRecord: anObject [
 	sequenceRecord := anObject
 ]
 
+{ #category : 'accessing - utilities' }
+BioSequence >> sequenceType [
+	"Classify the receiver, answering one of the Symbols #DNA, #RNA or #Protein; answer #Unknown when none of the three type tests succeeds."
+
+	^ self isDNASequence ifTrue: [ #DNA ] ifFalse: [
+		  self isRNASequence ifTrue: [ #RNA ] ifFalse: [
+			  self isProteinSequence
+				  ifTrue: [ #Protein ] ifFalse: [ #Unknown ] ] ]
+]
+
 { #category : 'accessing' }
 BioSequence >> size [
 	" Answer how many symbols the receiver contains "
@@ -1687,6 +2048,45 @@ BioSequence >> splitByCodons [
 	^ self sequence findTokens: $*
 ]
 
+{ #category : 'splitjoin' }
+BioSequence >> splitOn: aSeparator [
+	"Split the receiver's sequence on aSeparator and answer the collection of pieces, each wrapped in a new instance created with the receiver's alphabet."
+	| pieces |
+	pieces := self sequence splitOn: aSeparator.
+	^ pieces collect: [ :each | self class new initializeWith: each alphabet: self alphabet ]
+]
+
+{ #category : 'enumerating' }
+BioSequence >> splitOn: aSeparator indicesDo: aBlock [
+	"Forward #splitOn:indicesDo: to the receiver's sequence, handing aSeparator and aBlock over unchanged. Answer the receiver.
+	 NOTE(review): Pharo's implementation appears to evaluate aBlock with the start and end index of each piece, not with separator positions - confirm."
+	self sequence splitOn: aSeparator indicesDo: aBlock
+]
+
+{ #category : 'accessing - utilities' }
+BioSequence >> splitOn: aSeparator maxChunks: maxChunks [
+	"Answer an OrderedCollection with at most maxChunks new sequences obtained by
+	 cutting the receiver's sequence at successive occurrences of aSeparator.
+	 aSeparator is a String or a single Character. Once maxChunks - 1 pieces have
+	 been cut, the rest of the sequence (separators included) is the last piece.
+	 With maxChunks <= 1 the whole sequence is answered as the only piece."
+
+	| separator parts remaining pos |
+	"Accept a Character too: indexOfSubCollection:startingAt: and #size need a collection."
+	separator := aSeparator isCharacter
+		             ifTrue: [ String with: aSeparator ]
+		             ifFalse: [ aSeparator ].
+	parts := OrderedCollection new.
+	remaining := self sequence.
+	[ parts size < (maxChunks - 1) and: [
+		  (pos := remaining indexOfSubCollection: separator startingAt: 1) > 0 ] ]
+		whileTrue: [
+				parts add: (remaining copyFrom: 1 to: pos - 1).
+				remaining := remaining copyFrom: pos + separator size to: remaining size ].
+	parts add: remaining.
+	^ parts collect: [ :s | self class new initializeWith: s alphabet: self alphabet ]
+]
+
 { #category : 'accessing private' }
 BioSequence >> stopSymbol [
 	" Answer a terminator <Character> "
@@ -1847,11 +2247,27 @@ BioSequence >> translationTable: aTableIdentifier stopSymbol: stopCharacter toSt
 	
 ]
 
+{ #category : 'trimming' }
+BioSequence >> trim [
+	"Answer a new sequence built from the receiver's sequence with the whitespace
+	 at both ends removed (String>>trimBoth), using the receiver's alphabet."
+
+	| stripped |
+	stripped := self sequence trimBoth.
+	^ self class new initializeWith: stripped alphabet: self alphabet
+]
+
 { #category : 'accessing public - utils' }
-BioSequence >> trim: aCharacter [
-	" Modify the receiver by removing aCharacter in its sequence "
-	
-	seq := seq copyWithout: aCharacter.
+BioSequence >> trim: chars [
+	"Answer a new sequence without the leading and trailing symbols that are included in chars
+	 (a collection of Characters). The receiver is left untouched; inner symbols are kept."
+	| str first last |
+	str := self sequence.
+	first := 1.
+	last := str size.
+	[ first <= last and: [ chars includes: (str at: first) ] ] whileTrue: [ first := first + 1 ].
+	[ first <= last and: [ chars includes: (str at: last) ] ] whileTrue: [ last := last - 1 ].
+	^ self class new initializeWith: (str copyFrom: first to: last) alphabet: self alphabet
 ]
 
 { #category : 'accessing public - utils' }
@@ -1863,6 +2279,24 @@ BioSequence >> trimAmbiguityCodes [
 
 ]
 
+{ #category : 'trimming' }
+BioSequence >> trimLeft [
+	"Answer a new sequence built from the receiver's sequence with its leading whitespace removed, using the receiver's alphabet."
+
+	| stripped |
+	stripped := self sequence trimLeft.
+	^ self class new initializeWith: stripped alphabet: self alphabet
+]
+
+{ #category : 'trimming' }
+BioSequence >> trimRight [
+	"Answer a new sequence built from the receiver's sequence with its trailing whitespace removed, using the receiver's alphabet."
+
+	| stripped |
+	stripped := self sequence trimRight.
+	^ self class new initializeWith: stripped alphabet: self alphabet
+]
+
 { #category : 'accessing - frequencies' }
 BioSequence >> trinucleotideFrequency [
 	"Answer a Dictionary mapping each trinucleotide (3-mer) to its frequency in the receiver.