speed up in silico mutagenesis
Add code for in silico mutagenesis and test cases. This method should be able to accept an input sequence and output sequences where N=1 number of bases have been mutated in the sequence.
We may want to be able to adjust N to be 2 or 3 later on, so the code should be able to accommodate this.
Benchmarking is needed to determine whether we should generate those sequences using parallel processing / multiple cores.
I tried using joblib to parallelize in_silico_mutagenesis_sequences as follows. For 1 < n_jobs < 7 it did not perform significantly different than n_jobs = 1. For n_jobs >= 7 it was much slower. This was the case for mutate_n_bases < 3. I tried testing mutate_n_bases = 3 but it was taking a very long time. I imagine there could be other places to parallelize this that might change performance more. I used joblib but it may be worth checking out multiprocessing.
import itertools
from time import time
from selene.sequences import Genome
from joblib import Parallel, delayed
def _ism_combinations(sequence, mutate_n_bases):
for x in itertools.combinations(range(len(sequence)), mutate_n_bases):
yield x
def _ism_mutator(indices, sequence_alts):
pos_mutations = []
for i in indices:
pos_mutations.append(sequence_alts[i])
cur_mutated_sequences = []
for mutations in itertools.product(*pos_mutations):
cur_mutated_sequences.append(list(zip(indices, mutations)))
return cur_mutated_sequences
def in_silico_mutagenesis_sequences(sequence,
mutate_n_bases=1,
bases_arr=None,
n_jobs=1):
if not bases_arr:
bases_arr = Genome.BASES_ARR
sequence_alts = []
for index, ref in enumerate(sequence):
alts = []
for base in bases_arr:
if base == ref:
continue
alts.append(base)
sequence_alts.append(alts)
all_mutated_sequences = []
if n_jobs == 1:
for indices in itertools.combinations(
range(len(sequence)), mutate_n_bases):
pos_mutations = []
for i in indices:
pos_mutations.append(sequence_alts[i])
for mutations in itertools.product(*pos_mutations):
all_mutated_sequences.append(list(zip(indices, mutations)))
else:
all_mutated_sequences = Parallel(n_jobs=n_jobs,
pre_dispatch="1.25*n_jobs")(
delayed(_ism_mutator)(x, sequence_alts) for x in
_ism_combinations(sequence, mutate_n_bases))
return all_mutated_sequences
if __name__ == "__main__":
for mutate_n_bases in [1, 2]:
for n_jobs in range(1, 16):
t_s = time()
sequence = "ATCGN" * 200
in_silico_mutagenesis_sequences(sequence,
mutate_n_bases=mutate_n_bases,
n_jobs=n_jobs)
t_f = time()
print(f"Time to mutate {mutate_n_bases} bases with {n_jobs} jobs "
f"is {t_f - t_s} s.")