read.py
: read functions for .fast5
, .fasta
, .fastq
and .fna
files.
normalization.py
: several raw data normalization functions.
numpy
scipy
ont_fast5_api
from read import read_fast5
from normalization import med_mad, normalize_signal
import numpy as np
read_data = read_fast5('data/0a0bdc5c-8f8f-41ea-a4d1-4ff6344fac3e.fast5')
read_data = read_data[list(read_data.keys())[0]]
# check if it has a segmentation table
if not read_data.segmentation:
print('This file does not have a reference')
# normalize the raw signal
med, mad = med_mad(read_data.raw, factor = 1.0)
norm_signal = normalize_signal(read_data.raw, med, mad)
# read_data.segmentation contains the mapping between the DNA sequence and
# the raw signal, but this is relative to the start of the segmentation
dna_bases = read_data.segmentation['base']
dna_bases_start_in_raw = read_data.segmentation['start'] + read_data.start_rel_to_raw
dna_bases_end_in_raw = dna_bases_start_in_raw + read_data.segmentation['length']
# here we create an array, the length of the raw signal, and annotate where
# each DNA base corresponds to the DNA signal
dna_in_raw = np.full(norm_signal.shape, '', dtype='U1')
dna_in_raw[dna_bases_start_in_raw] = dna_bases
from resquiggle import resquiggle_read_normalized
mapping = resquiggle_read_normalized(
read_id = '0a0bdc5c-8f8f-41ea-a4d1-4ff6344fac3e', # this does not matter
raw_signal = modelled_signal, #use the same as in norm_signal
genome_seq = conditioning_dna_seq, # DNA sequence, as string that should match the raw signal e.g. 'ACGTACAGATC'
norm_signal = modelled_signal, #numpy array
)