|
# chr22_fasta_file = 'tests/data/chr22.fa.gz'

# GTF annotation passed to UTRFetcher._read_utr by the UTR tests in this file.
chr22_gtf_file = 'tests/data/chr22_ENST00000319363.gtf'

# chr22_5UTR_vcf_file = 'tests/data/chr22_ENST00000319363_5UTR.vcf.gz'
|
|
|
|
|
def test_5UTRFetcher__read_utr():
    """Check ``UTRFetcher._read_utr`` for the ``5UTR`` feature of the test GTF.

    Reads the ``5UTR`` feature from ``chr22_gtf_file`` and asserts that the
    result has shape ``(1, 12)`` and that its first row is on ``chr22``,
    ``+`` strand, with ``Start == 17565848`` and ``End == 17565981``.

    The same feature is then read with ``infer_from_cds=True`` and must
    equal the first frame once the ``exon_number`` and ``exon_id`` columns
    are dropped from both (dtypes are not compared).
    """
    utr5 = UTRFetcher._read_utr(chr22_gtf_file, feature_type="5UTR")

    assert utr5.shape == (1, 12)

    first_row = utr5.iloc[0]
    assert first_row.Chromosome == 'chr22'
    assert first_row.Start == 17565848
    assert first_row.End == 17565981
    assert first_row.Strand == "+"

    utr5_from_cds = UTRFetcher._read_utr(
        chr22_gtf_file, feature_type="5UTR", infer_from_cds=True)

    # NOTE(review): the exon columns are excluded from the comparison,
    # presumably because they differ for CDS-inferred UTRs -- confirm.
    exon_columns = ['exon_number', 'exon_id']
    pd.testing.assert_frame_equal(
        left=utr5.drop(exon_columns, axis=1),
        right=utr5_from_cds.drop(exon_columns, axis=1),
        check_dtype=False,
    )
|
|
|
|
|
def test_3UTRFetcher__read_utr():
    """Check ``UTRFetcher._read_utr`` for the ``3UTR`` feature of the test GTF.

    Reads the ``3UTR`` feature from ``chr22_gtf_file`` and asserts that the
    result has shape ``(1, 12)`` and that its first row is on ``chr22``,
    ``+`` strand, with ``Start == 17590710`` and ``End == 17596583``.

    The same feature is then read with ``infer_from_cds=True`` and must
    equal the first frame once the ``exon_number`` and ``exon_id`` columns
    are dropped from both (dtypes are not compared).
    """
    utr3 = UTRFetcher._read_utr(chr22_gtf_file, feature_type="3UTR")

    assert utr3.shape == (1, 12)

    first_row = utr3.iloc[0]
    assert first_row.Chromosome == 'chr22'
    assert first_row.Start == 17590710
    assert first_row.End == 17596583
    assert first_row.Strand == "+"

    utr3_from_cds = UTRFetcher._read_utr(
        chr22_gtf_file, feature_type="3UTR", infer_from_cds=True)

    # NOTE(review): the exon columns are excluded from the comparison,
    # presumably because they differ for CDS-inferred UTRs -- confirm.
    exon_columns = ['exon_number', 'exon_id']
    pd.testing.assert_frame_equal(
        left=utr3.drop(exon_columns, axis=1),
        right=utr3_from_cds.drop(exon_columns, axis=1),
        check_dtype=False,
    )
Currently, UTR region inference works only for non-spliced UTR regions:
kipoiseq/kipoiseq/extractors/gtf.py
Lines 348 to 368 in e67fab6
TODO:
- `tabix /s/genomes/GenBank/hg38/annotation/hg38.ensGene.gtf.gz chr22 | grep -i ENST00000263207 > kipoiseq/tests/data/chr22_ENST00000263207.gtf`
- `chr22_ENST00000263207_3UTR.vcf.gz`
- `chr22_ENST00000263207_5UTR.vcf.gz`
- `infer_from_cds=False`:
  - `chr22_ENST00000263207_3UTR.alt_seqs.txt`
  - `chr22_ENST00000263207_3UTR.ref_seq.txt`
  - `chr22_ENST00000263207_5UTR.alt_seqs.txt`
  - `chr22_ENST00000263207_5UTR.ref_seq.txt`

kipoiseq/tests/extractors/test_protein.py
Lines 323 to 358 in 1d72daf