"""
Library for doing sequence design that can be expressed as linear algebra
operations for rapid processing by numpy (e.g., generating all DNA sequences
of a certain length and calculating all their full duplex binding energies
in the nearest neighbor model and filtering those outside a given range).
Based on the DNA single-stranded tile (SST) sequence designer used in the following publication.
"Diverse and robust molecular algorithms using reprogrammable DNA self-assembly"
Woods\*, Doty\*, Myhrvold, Hui, Zhou, Yin, Winfree. (\*Joint first co-authors)
""" # noqa
from __future__ import annotations
from typing import Tuple, List, Collection, Sequence, Dict, Iterable
from dataclasses import dataclass
import math
import itertools as it
from functools import lru_cache
import numpy as np
default_rng: np.random.Generator = np.random.default_rng() # noqa
bits2base = ['A', 'C', 'G', 'T']
base2bits = {'A': 0b00, 'C': 0b01, 'G': 0b10, 'T': 0b11,
'a': 0b00, 'c': 0b01, 'g': 0b10, 't': 0b11}
[docs]
def idx2seq(idx: int, length: int) -> str:
"""Return the lexicographic idx'th DNA sequence of given length."""
seq = ['x'] * length
for i in range(length - 1, -1, -1):
seq[i] = bits2base[idx & 0b11]
idx >>= 2
return ''.join(seq)
[docs]
def seq2arr(seq: str, base2bits_local: Dict[str, int] | None = None) -> np.ndarray:
"""Convert seq (string with DNA alphabet) to numpy array with integers 0,1,2,3."""
if base2bits_local is None:
base2bits_local = base2bits
return np.array([base2bits_local[base] for base in seq], dtype=np.ubyte)
# this was about 5 times slower than the new implementation of `seqs2arr` below
# def seqs2arr_old(seqs: Sequence[str]) -> np.ndarray:
# """Return numpy 2D array converting the given DNA sequences to integers."""
# if len(seqs) == 0:
# return np.empty((0, 0), dtype=np.ubyte)
# if isinstance(seqs, str):
# raise ValueError('seqs must be a sequence of strings, not a single string')
# seq_len = len(seqs[0])
# for seq in seqs:
# if len(seq) != seq_len:
# raise ValueError('All sequences in seqs must be equal length')
# num_seqs = len(seqs)
# arr = np.empty((num_seqs, seq_len), dtype=np.ubyte)
# for i in range(num_seqs):
# arr[i] = [base2bits[base] for base in seqs[i]]
# return arr
[docs]
def seqs2arr(seqs: Sequence[str]) -> np.ndarray:
"""Return numpy 2D array converting the given DNA sequences to integers."""
if len(seqs) == 0:
return np.empty((0, 0), dtype=np.ubyte)
if isinstance(seqs, str):
raise ValueError('seqs must be a sequence of strings, not a single string')
# check equal length of each sequence (a bit faster than a Python loop,
# e.g., 3.5 ms for 10^5 seqs compared to 5 ms with Python loop)
seq_len = len(seqs[0])
lengths_it = map(len, seqs)
lengths_arr = np.fromiter(lengths_it, dtype=int)
if np.any(lengths_arr != seq_len):
raise ValueError('All sequences in seqs must be equal length')
num_seqs = len(seqs)
# the code below is about 5 times faster than the old implementation (commented out above)
seqs_cat = ''.join(seqs)
seqs_cat = seqs_cat.upper()
seqs_cat_bytes = seqs_cat.encode()
seqs_cat_byte_array = bytearray(seqs_cat_bytes)
arr1d = np.frombuffer(seqs_cat_byte_array, dtype=np.ubyte)
# arr1d = np.fromstring(seqs_cat_bytes, dtype=np.ubyte) # generates warning about using frombuffer
# convert ASCII bytes for 'A', 'C', 'G', 'T' to 0, 1, 2, 3, respectively
# code below is magical to me, but it works and is slightly faster than more obvious ways:
# https://stackoverflow.com/a/35464758
from_values = np.array([ord(base) for base in ['A', 'C', 'G', 'T']])
to_values = np.arange(4)
sort_idx = np.argsort(from_values)
idx = np.searchsorted(from_values, arr1d, sorter=sort_idx)
arr1d = to_values[sort_idx][idx]
# this is a bit slower than the code above, e.g., 75 ms compared to 55 ms for 10^5 sequences
# for i, base in enumerate(['A', 'C', 'G', 'T']):
# idxs_with_base = arr2d == ord(base)
# arr2d[idxs_with_base] = i
arr2d = arr1d.reshape((num_seqs, seq_len))
return arr2d
[docs]
def arr2seqs(arr: np.ndarray) -> List[str]:
"""Return list of strings converting the given numpy array of integers to DNA sequences."""
return [''.join(bits2base[base] for base in row) for row in arr]
[docs]
def arr2seq(arr: np.ndarray) -> str:
"""Return string converting the given numpy array of integers to DNA sequence."""
bases_ch = [bits2base[base] for base in arr]
return ''.join(bases_ch)
def make_array_with_all_sequences(length: int, digits: Sequence[int]) -> np.ndarray:
num_digits = len(digits)
num_seqs = num_digits ** length
powers_num_digits = [num_digits ** k for k in range(length)]
digits = np.array(digits, dtype=np.ubyte)
arr = np.zeros((num_seqs, length), dtype=np.ubyte)
for i, j, c in zip(reversed(powers_num_digits), powers_num_digits, list(range(length))):
arr[:, c] = np.tile(np.repeat(digits, i), j)
return arr
[docs]
def make_array_with_all_dna_seqs(length: int, bases: Collection[str] = ('A', 'C', 'G', 'T')) -> np.ndarray:
"""Return 2D numpy array with all DNA sequences of given length in
lexicographic order. Bases contains bases to be used: ('A','C','G','T') by
default, but can be set to a subset of these.
Uses the encoding described in the documentation for DNASeqList. The result
is a 2D array, where each row represents a DNA sequence, and that row
has one byte per base."""
if len(bases) == 0:
raise ValueError('bases cannot be empty')
if not set(bases) <= {'A', 'C', 'G', 'T'}:
raise ValueError(f"bases must be a subset of {{A, C, G, T}}; cannot be {bases}")
base_bits = [base2bits[base] for base in bases]
digits = np.array(base_bits, dtype=np.ubyte)
return make_array_with_all_sequences(length, digits)
# num_bases = len(bases)
# num_seqs = num_bases ** length
#
# # the former code took up too much memory (using int32 or int64)
# # the following code makes sure it's 1 byte per base
# powers_num_bases = [num_bases ** k for k in range(length)]
#
# list_of_arrays = False
# if list_of_arrays:
# # this one seems to be faster but takes more memory, probably because just before the last command
# # there are two copies of the array in memory at once
# columns = []
# for i, j, c in zip(reversed(powers_num_bases), powers_num_bases, list(range(length))):
# columns.append(np.tile(np.repeat(bases, i), j))
# arr = np.vstack(columns).transpose()
# else:
# # this seems to be slightly slower but takes less memory, since it
# # allocates only the final array, plus one extra column of that
# # array at a time
# arr = np.empty((num_seqs, length), dtype=np.ubyte)
# for i, j, c in zip(reversed(powers_num_bases), powers_num_bases, list(range(length))):
# arr[:, c] = np.tile(np.repeat(bases, i), j)
#
# return arr
[docs]
def make_array_with_random_subset_of_dna_seqs(
length: int, num_random_seqs: int, rng: np.random.Generator = default_rng,
bases: Collection[str] = ('A', 'C', 'G', 'T')) -> np.ndarray:
"""
Return 2D numpy array with random subset of size `num_seqs` of DNA sequences of given length.
Bases contains bases to be used: ('A','C','G','T') by default, but can be set to a subset of these.
Uses the encoding described in the documentation for DNASeqList. The result is a 2D array,
where each row represents a DNA sequence, and that row has one byte per base.
Sequences returned will be unique (i.e., sampled without replacement) and in a random order
:param length:
length of each row
:param num_random_seqs:
number of rows
:param bases:
DNA bases to use
:param rng:
numpy random number generator (type returned by numpy.random.default_rng())
:return:
2D numpy array with random subset of size `num_seqs` of DNA sequences of given length
"""
if length < 0:
raise ValueError(f'length = {num_random_seqs} must be nonnegative')
elif length == 0:
return np.array([[]], dtype=np.ubyte)
if num_random_seqs <= 0:
raise ValueError(f'num_seqs = {num_random_seqs} must be positive')
if not set(bases) <= {'A', 'C', 'G', 'T'}:
raise ValueError(f"bases must be a subset of {'A', 'C', 'G', 'T'}; cannot be {bases}")
if len(bases) == 0:
raise ValueError('bases cannot be empty')
elif len(bases) == 1:
raise ValueError('bases must have at least two elements')
max_possible = len(bases) ** length
if num_random_seqs > max_possible:
raise ValueError(f'''\
num_random_seqs = {num_random_seqs} is greater than the total number {max_possible}
of sequences of length {length} using alphabet {bases}, so we cannot guarantee
that many unique sequences. Please set num_random_seqs <= {max_possible}.''')
# If we want sufficiently many sequences, then it's simpler to just generate all sequences
# of that length and choose a random subset of size num_seqs.
if num_random_seqs >= max_possible // 4:
all_seqs = make_array_with_all_dna_seqs(length=length, bases=bases)
# https://stackoverflow.com/a/27815343/5339430
idxs = rng.choice(all_seqs.shape[0], num_random_seqs, replace=False)
sampled_seqs = all_seqs[idxs]
return sampled_seqs
# This comment justifies why we sample 2*num_random_seqs sequences randomly (with replacement) in order
# to get our goal of at least num_random_seqs *unique* sequences. We sample with replacement since that's
# the only option using numpy's random number generation routines: we are generating a 2D array
# of numbers and want each *row* to be unique, but numpy can only let us say each *number* is unique,
# which doesn't describe what we want.
#
# Define
#
# m = num_seqs_to_sample = number of sequences we sample with replacement
# n = max_possible = total number of sequences of that length
# k = num_random_seqs = desired number of unique sequences
#
# If we sample m sequences with replacement, out of n total, this is tossing m balls into n bins.
# We want m sufficiently large that at least k bins are non-empty. We throw m=2*k balls.
#
# Above we check if k >= n/4 and generate all sequences if so. Thus, if we need the random selection
# below, then k < n/4. Thus at least 3/4 of bins are empty the entire time, so each time we throw a ball,
# there is a < 1/4 chance for it to land in a non-empty bin. We are throwing m=2*k balls,
# so the number of these that land in non-empty bins is at most X=Binomial(2*k, 1/4), with E[X] = k/2.
# Using Chernoff bounds for X=Binomial(n,p) with E[X]=np and delta=2 giving
# Pr[we generate fewer than k unique sequences]
# = Pr[more than k out of 2*k balls land in non-empty bins]
# = Pr[X > (1+delta)E[X]]
# < e^{-delta^2 E[X] / 3}
# = e^{-2*k/3},
# i.e., for large k, a very small probability. Even for k=1, this is about 1/2,
# so we'll only need to execute expected 2 iterations of the while loop below.
base_bits = np.array([base2bits[base] for base in bases], dtype=np.ubyte)
num_seqs_to_sample = 2 * num_random_seqs # c*k in analysis above
unique_sorted_arr = None
# odds are low to have a collision, so for simplicity we just repeat the whole process if needed
while unique_sorted_arr is None or len(unique_sorted_arr) < num_random_seqs:
arr = rng.choice(a=base_bits, size=(num_seqs_to_sample, length))
unique_sorted_arr = np.unique(arr, axis=0)
if len(unique_sorted_arr) < num_random_seqs:
print(f'WARNING: did not find {num_random_seqs} unique sequences. If you are seeing this warning '
f'repeatedly, check the parameters to make_array_with_random_subset_of_dna_seqs.')
# We probably have too many, so pick a random subset of sequences to return.
# We need a random subset, rather than just taking the first num_random_seqs elements,
# since numpy.unique above sorts the elements.
idxs = rng.choice(unique_sorted_arr.shape[0], num_random_seqs, replace=False)
sampled_seqs = unique_sorted_arr[idxs]
return sampled_seqs
# https://stackoverflow.com/questions/4941753/is-there-a-math-ncr-function-in-python
# In Python 3.8 there's math.comb, but this is about 3x faster somehow.
import operator as op
from functools import reduce
def comb(n: int, k: int) -> int:
# n choose k = n! / (k! * (n-k)!)
k = min(k, n - k)
numer = reduce(op.mul, range(n, n - k, -1), 1)
denom = reduce(op.mul, range(1, k + 1), 1)
return numer // denom
[docs]
def make_array_with_all_dna_seqs_hamming_distance(
dist: int, seq: str, bases: Collection[str] = ('A', 'C', 'G', 'T')) -> np.ndarray:
"""
Return 2D numpy array with all DNA sequences of given length in lexicographic order. Bases contains
bases to be used: ('A','C','G','T') by default, but can be set to a subset of these.
Uses the encoding described in the documentation for DNASeqList. The result is a 2D array, where each
row represents a DNA sequence, and that row has one byte per base.
"""
length = len(seq)
assert 1 <= dist <= length
num_bases = len(bases)
if num_bases == 0:
raise ValueError('bases cannot be empty')
if not set(bases) <= {'A', 'C', 'G', 'T'}:
raise ValueError(f"bases must be a subset of {'A', 'C', 'G', 'T'}; cannot be {bases}")
num_ways_to_choose_subsequence_indices = comb(length, dist)
num_different_bases = len(bases) - 1
num_subsequences = num_different_bases ** dist
num_seqs = num_ways_to_choose_subsequence_indices * num_subsequences
# for simplicity of modular arithmetic, we use integers 0,...,len(bases)-1 to represent the bases,
# then map these back to the correct subset of 0,1,2,3 when we are done
offsets = range(1, num_different_bases + 1)
subseq_offsets = make_array_with_all_sequences(length=dist, digits=offsets)
assert len(subseq_offsets) == num_subsequences
subseq_offsets_repeats = \
np.tile(subseq_offsets.flatten(), num_ways_to_choose_subsequence_indices).reshape(num_seqs, dist)
# all (length choose dist) indices where we could change the bases
idxs = combnr_idxs(length, dist)
assert len(idxs) == num_ways_to_choose_subsequence_indices
idxs_repeat = np.tile(idxs, num_subsequences).reshape(num_seqs, length)
assert len(idxs_repeat) == num_seqs
# map subset of bases used to *prefix* of 0,1,2,3
base2bits_local = {base: digit for base, digit in zip(bases, range(4))}
seq_as_arr = seq2arr(seq, base2bits_local=base2bits_local)
new_arr = np.tile(seq_as_arr, num_seqs).reshape(num_seqs, length)
new_arr[idxs_repeat] += subseq_offsets_repeats.flatten()
new_arr %= num_bases
# now map back to correct subset of 0,1,2,3 to represent bases
for base, digit in zip(['A', 'C', 'G', 'T'], range(4)):
if base not in bases:
idxs_to_inc = new_arr >= digit
new_arr[idxs_to_inc] += 1
return new_arr
def combnr_idxs(length: int, number: int) -> np.ndarray:
# Gives 2D Boolean numpy array, with `length` columns and (`length` choose `number`) rows,
# representing all ways to set exactly `number` elements of the row True and the others to False.
# Useful for indexing into a same-shape numpy array, changing exactly `number` elements in each row.
#
# :param length:
# number of columns
# :param number:
# number of True values in each row
# :return:
# numpy array, with `length` columns and (`length` choose `number`) rows,
# representing all ways to set exactly `number` elements of the row True and the others to False.
if number == 0:
return np.array([[False] * length], dtype=bool)
elif number > length // 2:
vals = combnr_idxs(length, length - number)
return np.logical_not(vals)
x = np.array(np.meshgrid(*([np.arange(0, length)] * number))).T.reshape(-1, number)
z = np.sum(np.identity(length)[x], 1, dtype=bool).astype(int)
return np.unique(z[np.sum(z, axis=1) == number], axis=0).astype(bool)
[docs]
def make_array_with_random_subset_of_dna_seqs_hamming_distance(
num_seqs: int, dist: int, seq: str, rng: np.random.Generator = default_rng,
bases: Collection[str] = ('A', 'C', 'G', 'T')) -> np.ndarray:
"""
Return 2D numpy array with random subset of size `num_seqs` of DNA sequences of given length.
Bases contains bases to be used: ('A','C','G','T') by default, but can be set to a subset of these.
Uses the encoding described in the documentation for DNASeqList. The result is a 2D array,
where each row represents a DNA sequence, and that row has one byte per base.
Sampled *with* replacement, so the same row may appear twice in the returned array
:param num_seqs:
number of sequences to generate
:param dist:
Hamming distance to be from `seq`
:param seq:
sequence to generate other sequences close to
:param bases:
DNA bases to use
:param rng:
numpy random number generator (type returned by numpy.random.default_rng())
:return:
2D numpy array with random subset of size `num_seqs` of DNA sequences of given length
"""
if not set(bases) <= {'A', 'C', 'G', 'T'}:
raise ValueError(f"bases must be a subset of {'A', 'C', 'G', 'T'}; cannot be {bases}")
if len(bases) == 0:
raise ValueError('bases cannot be empty')
elif len(bases) == 1:
raise ValueError('bases must have at least two elements')
length = len(seq)
if dist < 1:
raise ValueError(f'dist must be positive, but dist = {dist}')
if dist > length:
raise ValueError(f'should have dist <= len("{seq}") = {length}, but dist = {dist}')
num_different_bases = len(bases) - 1
# map subset of bases used to *prefix* of 0,1,2,3
base2bits_local = {base: digit for base, digit in zip(bases, range(4))}
seq_as_arr = seq2arr(seq, base2bits_local=base2bits_local)
seqs = np.tile(seq_as_arr, num_seqs).reshape(num_seqs, length)
# for simplicity of modular arithmetic, we use integers 0,...,len(bases)-1 to represent the bases,
# then map these back to the correct subset of 0,1,2,3 when we are done
subseq_offsets = rng.integers(low=1, high=num_different_bases + 1, size=(num_seqs, dist))
assert len(subseq_offsets) == num_seqs
# all (length choose dist) indices where we could change the bases
idxs = random_choice_noreplace(np.arange(length), dist, num_seqs, rng)
assert len(idxs) == num_seqs
changes = rng.integers(1, num_different_bases + 1, size=idxs.shape, dtype=np.uint8)
seqs[np.arange(num_seqs)[:, None], idxs] += changes
seqs = np.mod(seqs, num_different_bases + 1)
# now map back to correct subset of 0,1,2,3 to represent bases
for base, digit in zip(['A', 'C', 'G', 'T'], range(4)):
if base not in bases:
idxs_to_inc = seqs >= digit
seqs[idxs_to_inc] += 1
# use the next two lines to return only unique rows
# seqs = np.unique(seqs, axis=0)
# rng.shuffle(seqs)
return seqs
# def random_hamming(sequence: Union[List[int], np.ndarray], distance: int, number: int,
# rng: np.random.Generator) -> np.ndarray:
# sequence = np.array(sequence, dtype=np.uint8)
# length = len(sequence)
# seqrepeats = np.tile(sequence, number).reshape((number, length))
# places = random_choice_noreplace(np.arange(length), distance, number, rng)
# changes = rng.integers(1, 4, size=places.shape, dtype=np.uint8)
# seqrepeats[np.arange(number)[:, None], places] += changes
# seqrepeats = np.mod(seqrepeats, 4)
# return seqrepeats
# https://stackoverflow.com/a/59328647/5339430
def random_choice_noreplace(lst: np.ndarray, n_sample: int, num_draw: int,
rng: np.random.Generator) -> np.ndarray:
# lst: 1-D array or list
# n_sample: sample size for each draw
# num_draw: number of draws
# Intuition: Randomly generate numbers, get the index of the smallest n_sample number for each row.
lst = np.array(lst)
random_array_floats = rng.random((num_draw, len(lst)))
return lst[np.argpartition(random_array_floats, n_sample - 1, axis=-1)[:, :n_sample]]
# @lru_cache(maxsize=10000000)
[docs]
def longest_common_substring(a1: np.ndarray, a2: np.ndarray, vectorized: bool = True) -> Tuple[int, int, int]:
"""Return start and end indices (a1start, a2start, length) of longest common
substring (subarray) of 1D arrays a1 and a2."""
assert len(a1.shape) == 1
assert len(a2.shape) == 1
counter = np.zeros(shape=(len(a1) + 1, len(a2) + 1), dtype=np.int)
a1idx_longest = a2idx_longest = -1
len_longest = 0
if vectorized:
for i1 in range(len(a1)):
idx = (a2 == a1[i1])
idx_shifted = np.hstack([[False], idx])
counter[i1 + 1, idx_shifted] = counter[i1, idx] + 1
idx_longest = np.unravel_index(np.argmax(counter), counter.shape)
if idx_longest[0] > 0:
len_longest = counter[idx_longest]
a1idx_longest = int(idx_longest[0] - len_longest)
a2idx_longest = int(idx_longest[1] - len_longest)
else:
for i1 in range(len(a1)):
for i2 in range(len(a2)):
if a1[i1] == a2[i2]:
c = counter[i1, i2] + 1
counter[i1 + 1, i2 + 1] = c
if c > len_longest:
len_longest = c
a1idx_longest = i1 + 1 - c
a2idx_longest = i2 + 1 - c
return a1idx_longest, a2idx_longest, len_longest
# @lru_cache(maxsize=10000000)
[docs]
def longest_common_substrings_singlea1(a1: np.ndarray, a2s: np.ndarray) \
-> Tuple[np.ndarray, np.ndarray, np.ndarray]:
"""Return start and end indices (a1starts, a2starts, lengths) of longest common
substring (subarray) of 1D array a1 and rows of 2D array a2s.
If length[i]=0, then a1starts[i]=a2starts[i]=0 (not -1), so be sure to check
length[i] to see if any substrings actually matched."""
assert len(a1.shape) == 1
assert len(a2s.shape) == 2
numa2s = a2s.shape[0]
len_a1 = len(a1)
len_a2 = a2s.shape[1]
counter = np.zeros(shape=(len_a1 + 1, numa2s, len_a2 + 1), dtype=np.int)
for i1 in range(len(a1)):
idx = (a2s == a1[i1])
idx_shifted = np.insert(idx, 0, np.zeros(numa2s, dtype=bool), axis=1)
counter[i1 + 1, idx_shifted] = counter[i1, idx] + 1
counter = np.swapaxes(counter, 0, 1)
counter_flat = counter.reshape(numa2s, (len_a1 + 1) * (len_a2 + 1))
idx_longest_raveled = np.argmax(counter_flat, axis=1)
len_longest = counter_flat[np.arange(counter_flat.shape[0]), idx_longest_raveled]
idx_longest = np.unravel_index(idx_longest_raveled, dims=(len_a1 + 1, len_a2 + 1))
a1idx_longest = idx_longest[0] - len_longest
a2idx_longest = idx_longest[1] - len_longest
return a1idx_longest, a2idx_longest, len_longest
# @lru_cache(maxsize=10000000)
[docs]
def longest_common_substrings_product(a1s: np.ndarray, a2s: np.ndarray) \
-> Tuple[np.ndarray, np.ndarray, np.ndarray]:
"""Return start and end indices (a1starts, a2starts, lengths) of longest common
substring (subarray) of each pair in the cross product of rows of a1s and a2s.
If length[i]=0, then a1starts[i]=a2starts[i]=0 (not -1), so be sure to check
length[i] to see if any substrings actually matched."""
numa1s = a1s.shape[0]
numa2s = a2s.shape[0]
a1s_cp = np.repeat(a1s, numa2s, axis=0)
a2s_cp = np.tile(a2s, (numa1s, 1))
a1idx_longest, a2idx_longest, len_longest = _longest_common_substrings_pairs(a1s_cp, a2s_cp)
a1idx_longest = a1idx_longest.reshape(numa1s, numa2s)
a2idx_longest = a2idx_longest.reshape(numa1s, numa2s)
len_longest = len_longest.reshape(numa1s, numa2s)
return a1idx_longest, a2idx_longest, len_longest
def pair_index(n: int) -> np.ndarray:
index = np.fromiter(it.chain.from_iterable(it.combinations(range(n), 2)), int, count=n * (n - 1))
return index.reshape(-1, 2)
def _longest_common_substrings_pairs(a1s: np.ndarray, a2s: np.ndarray) \
-> Tuple[np.ndarray, np.ndarray, np.ndarray]:
assert len(a1s.shape) == 2
assert len(a2s.shape) == 2
assert a1s.shape[0] == a2s.shape[0]
numpairs = a1s.shape[0]
len_a1 = a1s.shape[1]
len_a2 = a2s.shape[1]
counter = np.zeros(shape=(len_a1 + 1, numpairs, len_a2 + 1), dtype=np.int)
for i1 in range(len_a1):
a1s_cp_col = a1s[:, i1].reshape(numpairs, 1)
a1s_cp_col_rp = np.repeat(a1s_cp_col, len_a2, axis=1)
idx = (a2s == a1s_cp_col_rp)
idx_shifted = np.hstack([np.zeros(shape=(numpairs, 1), dtype=bool), idx])
counter[i1 + 1, idx_shifted] = counter[i1, idx] + 1
counter = np.swapaxes(counter, 0, 1)
counter_flat = counter.reshape(numpairs, (len_a1 + 1) * (len_a2 + 1))
idx_longest_raveled = np.argmax(counter_flat, axis=1)
len_longest = counter_flat[np.arange(counter_flat.shape[0]), idx_longest_raveled]
idx_longest = np.unravel_index(idx_longest_raveled, dims=(len_a1 + 1, len_a2 + 1))
a1idx_longest = idx_longest[0] - len_longest
a2idx_longest = idx_longest[1] - len_longest
return a1idx_longest, a2idx_longest, len_longest
[docs]
def longest_common_substrings_all_pairs_strings(seqs1: Sequence[str], seqs2: Sequence[str]) \
-> Tuple[np.ndarray, np.ndarray, np.ndarray]:
"""For Python strings"""
a1s = seqs2arr(seqs1)
a2s = seqs2arr(seqs2)
return _longest_common_substrings_pairs(a1s, a2s)
def _strongest_common_substrings_all_pairs_return_energies_and_counter(
a1s: np.ndarray, a2s: np.ndarray, temperature: float) \
-> Tuple[np.ndarray, np.ndarray]:
assert len(a1s.shape) == 2
assert len(a2s.shape) == 2
assert a1s.shape[0] == a2s.shape[0]
numpairs = a1s.shape[0]
len_a1 = a1s.shape[1]
len_a2 = a2s.shape[1]
counter = np.zeros(shape=(len_a1 + 1, numpairs, len_a2 + 1), dtype=np.int)
energies = np.zeros(shape=(len_a1 + 1, numpairs, len_a2 + 1), dtype=np.float)
# if not loop_energies:
loop_energies = calculate_loop_energies(temperature)
prev_match_shifted_idxs = None
for i1 in range(len_a1):
a1s_col = a1s[:, i1].reshape(numpairs, 1)
a1s_col_rp = np.repeat(a1s_col, len_a2, axis=1)
# find matching chars and extend length of substring
match_idxs = (a2s == a1s_col_rp)
match_shifted_idxs = np.hstack([np.zeros(shape=(numpairs, 1), dtype=bool), match_idxs])
counter[i1 + 1, match_shifted_idxs] = counter[i1, match_idxs] + 1
if i1 > 0:
# calculate energy if matching substring has length > 1
prev_bases = a1s[:, i1 - 1]
cur_bases = a1s[:, i1]
loops = (prev_bases << 2) + cur_bases
latest_energies = loop_energies[loops].reshape(numpairs, 1)
latest_energies_rp = np.repeat(latest_energies, len_a2, axis=1)
match_idxs_false_at_end = np.hstack([match_idxs, np.zeros(shape=(numpairs, 1), dtype=bool)])
both_match_idxs = match_idxs_false_at_end & prev_match_shifted_idxs
prev_match_shifted_shifted_idxs = np.hstack(
[np.zeros(shape=(numpairs, 1), dtype=bool), prev_match_shifted_idxs])[:, :-1]
both_match_shifted_idxs = match_shifted_idxs & prev_match_shifted_shifted_idxs
energies[i1 + 1, both_match_shifted_idxs] = energies[i1, both_match_idxs] + latest_energies_rp[
both_match_idxs]
# prev_match_idxs = match_idxs
prev_match_shifted_idxs = match_shifted_idxs
counter = counter.swapaxes(0, 1)
energies = energies.swapaxes(0, 1)
return counter, energies
def internal_loop_penalty(n: int, temperature: float) -> float:
return 1.5 + (2.5 * 0.002 * temperature * math.log(1 + n))
def _strongest_common_substrings_all_pairs(a1s: np.ndarray, a2s: np.ndarray, temperature: float) \
-> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
numpairs = a1s.shape[0]
len_a1 = a1s.shape[1]
len_a2 = a2s.shape[1]
counter, energies = _strongest_common_substrings_all_pairs_return_energies_and_counter(a1s, a2s,
temperature)
counter_flat = counter.reshape(numpairs, (len_a1 + 1) * (len_a2 + 1))
energies_flat = energies.reshape(numpairs, (len_a1 + 1) * (len_a2 + 1))
idx_strongest_raveled = np.argmax(energies_flat, axis=1)
len_strongest = counter_flat[np.arange(counter_flat.shape[0]), idx_strongest_raveled]
energy_strongest = energies_flat[np.arange(counter_flat.shape[0]), idx_strongest_raveled]
idx_strongest = np.unravel_index(idx_strongest_raveled, dims=(len_a1 + 1, len_a2 + 1))
a1idx_strongest = idx_strongest[0] - len_strongest
a2idx_strongest = idx_strongest[1] - len_strongest
return a1idx_strongest, a2idx_strongest, len_strongest, energy_strongest
[docs]
def strongest_common_substrings_all_pairs_string(seqs1: Sequence[str], seqs2: Sequence[str],
temperature: float) \
-> Tuple[List[float], List[float], List[float], List[float]]:
"""For Python strings representing DNA; checks for reverse complement matches
rather than direct matches, and evaluates nearest neighbor energy, returning
indices lengths, and energies of strongest complementary substrings."""
a1s = seqs2arr(seqs1)
a2s = seqs2arr(seqs2)
a1idx_strongest, a2idx_strongest, len_strongest, energy_strongest = _strongest_common_substrings_all_pairs(
a1s,
wc_arr(a2s),
temperature)
return list(a1idx_strongest), list(a2idx_strongest), list(len_strongest), list(energy_strongest)
def energies_strongest_common_substrings(seqs1: Sequence[str], seqs2: Sequence[str], temperature: float) \
-> List[float]:
a1s = seqs2arr(seqs1)
a2s = seqs2arr(seqs2)
a1idx_strongest, a2idx_strongest, len_strongest, energy_strongest = \
_strongest_common_substrings_all_pairs(a1s, wc_arr(a2s), temperature)
return list(energy_strongest)
[docs]
@dataclass
class DNASeqList:
"""
Represents a list of DNA sequences of identical length. The sequences are stored as a 2D numpy array
of bytes :py:data:`DNASeqList.seqarr`. Each byte represents a single DNA base (so it is not a compact
representation; the most significant 6 bits of the byte will always be 0).
"""
seqarr: np.ndarray
"""
Uses a (noncompact) internal representation using 8 bits (1 byte, dtype = np.ubyte) per base,
stored in a numpy 2D array of bytes.
Each row (axis 0) is a DNA sequence, and each column (axis 1) is a base in a sequence.
The code used is :math:`A \\to 0, C \\to 1, G \\to 2, T \\to 3`.
"""
numseqs: int
"""Number of DNA sequences (number of rows, axis 0, in :py:data:`DNASeqList.seqarr`)"""
seqlen: int
"""Length of each DNA sequence (number of columns, axis 1, in :py:data:`DNASeqList.seqarr`)"""
rng: np.random.Generator
"""Random number generator to use."""
[docs]
def __init__(self,
length: int | None = None,
num_random_seqs: int | None = None,
shuffle: bool = False,
alphabet: Collection[str] = ('A', 'C', 'G', 'T'),
seqs: Sequence[str] | None = None,
seqarr: np.ndarray | None = None,
filename: str | None = None,
rng: np.random.Generator = default_rng,
hamming_distance_from_sequence: Tuple[int, str] | None = None):
"""
Creates a set of DNA sequences, all of the same length.
Create either all sequences of a given length if seqs is not specified,
or all sequences in seqs if seqs is specified. If neither is specified
then all sequences of length 3 are created.
*Exactly one* of the following should be specified:
- `length` (possibly along with `alphabet` and `num_random_seqs`)
- `seqs`
- `seqarr`
- `filename`
- `hamming_distance_from_sequence` (possibly along with `alphabet` and `num_random_seqs`)
:param length:
length of sequences; `num_seqs` and `alphabet` can also be specified along with it
:param hamming_distance_from_sequence:
if specified and equal to `(dist, seq)` of type (int, str),
then only sequences at Hamming distance `dist` from `seq` will be generated.
Raises error if `length`, `seqs`, `seqarr`, or `filename` is specified.
:param num_random_seqs:
number of sequences to generate; if not specified, then all sequences
of length `length` using bases from `alphabet` are generated.
Sequences are sampled *with* replacement, so the same sequence may appear twice.
:param shuffle:
whether to shuffle sequences
:param alphabet:
a subset of {'A', 'C', 'G', 'T'}
:param seqs:
sequence (e.g., list or tuple) of strings, all of the same length
:param seqarr:
2D NumPy array, with axis 0 moving between sequences,
and axis 1 moving between consecutive DNA bases in a sequence
:param filename:
name of file containing a :any:`DNASeqList`
as written by :py:meth:`DNASeqList.write_to_file`
:param rng:
numpy random number generator (type returned by numpy.random.default_rng())
"""
for v1, v2 in it.combinations([length, seqs, seqarr, filename, hamming_distance_from_sequence], 2):
if v1 is not None and v2 is not None:
raise ValueError('exactly one of length, seqs, seqarr, filename, or '
'hamming_distance_from_sequence must be non-None')
self.rng = rng
if seqarr is not None:
self.seqarr = seqarr
self._update_size()
elif seqs is not None:
if len(seqs) == 0:
raise ValueError('seqs must have positive length')
self.seqlen = len(seqs[0])
for seq in seqs:
if len(seq) != self.seqlen:
raise ValueError('All sequences in seqs must be equal length')
self.numseqs = len(seqs)
self.seqarr = seqs2arr(seqs)
elif filename is not None:
self._read_from_file(filename)
elif length is not None:
if num_random_seqs is None:
self.seqarr = make_array_with_all_dna_seqs(length=length, bases=alphabet)
else:
self.seqarr = make_array_with_random_subset_of_dna_seqs(
length=length, num_random_seqs=num_random_seqs, rng=self.rng, bases=alphabet)
self.seqlen = length
self.numseqs = len(self.seqarr) if self.seqlen > 0 else 1
elif hamming_distance_from_sequence is not None:
dist, seq = hamming_distance_from_sequence
if num_random_seqs is None:
self.seqarr = make_array_with_all_dna_seqs_hamming_distance(dist=dist, seq=seq,
bases=alphabet)
else:
self.seqarr = make_array_with_random_subset_of_dna_seqs_hamming_distance(
num_seqs=num_random_seqs, dist=dist, seq=seq, rng=self.rng, bases=alphabet)
self.seqlen = len(seq)
self.numseqs = len(self.seqarr) if self.seqlen > 0 else 1
else:
raise ValueError('at least one of length, seqs, seqarr, filename, or '
'hamming_distance_from_sequence must be specified')
self.shift = np.arange(2 * (self.seqlen - 1), -1, -2)
if shuffle:
self.shuffle()
[docs]
def random_choice(self, num: int, rng: np.random.Generator = default_rng,
replace: bool = False) -> List[str]:
"""
Returns random choice of `num` DNA sequence(s) (represented as list of Python strings).
:param num:
number of sequences to sample
:param rng:
random number generator to use
:param replace:
whether to sample with replacement
:return:
sampled sequences
"""
idxs = rng.choice(np.arange(self.numseqs), num, replace=replace)
seqs = [self[int(idx)] for idx in idxs]
return seqs
[docs]
def random_sequence(self, rng: np.random.Generator = default_rng) -> str:
"""
Returns random DNA sequence (represented as Python string).
:return:
sampled sequence
"""
idx = int(rng.integers(0, self.numseqs))
return self[idx]
def _update_size(self) -> None:
# updates numseqs and seqlen based on shape of seqarr
self.numseqs, self.seqlen = self.seqarr.shape
def __len__(self) -> int:
return self.numseqs
def __contains__(self, seq: str) -> bool:
if len(seq) != self.seqlen:
return False
arr = seq2arr(seq)
return np.any(~np.any(self.seqarr - arr, 1))
def _read_from_file(self, filename: str) -> None:
"""Reads from fileName in the format defined in writeToFile.
Only meant to be called from constructor."""
with open(filename, 'r+') as f:
first_line = f.readline()
num_seqs_str, seq_len_str, temperature = first_line.split()
self.numseqs = int(num_seqs_str)
self.seqlen = int(seq_len_str)
self.seqarr = np.empty((self.numseqs, self.seqlen), dtype=np.ubyte)
for i in range(self.numseqs):
line = f.readline()
seq = line.strip()
self.seqarr[i] = [base2bits[base] for base in seq]
[docs]
def write_to_file(self, filename: str) -> None:
"""Writes text file describing DNA sequence list, in format
numseqs seqlen
seq1
seq2
seq3
...
where numseqs, seqlen are integers, and seq1,
... are strings from {A,C,G,T}"""
with open(filename, 'w+') as f:
f.write(str(self.numseqs) + ' ' + str(self.seqlen) + '\n')
for i in range(self.numseqs):
f.write(self.get_seq_str(i) + '\n')
[docs]
def wcenergy(self, idx: int, temperature: float) -> float:
"""Return energy of idx'th sequence binding to its complement."""
return wcenergy(self.seqarr[idx], temperature)
def __repr__(self) -> str:
return 'DNASeqSet(seqs={})'.format(str([self[i] for i in range(self.numseqs)]))
def __str__(self) -> str:
if self.numseqs <= 256:
ret = [self.get_seq_str(i) for i in range(self.numseqs)]
return ','.join(ret)
else:
ret = [self.get_seq_str(i) for i in range(3)] + ['...'] + \
[self.get_seq_str(i) for i in range(self.numseqs - 3, self.numseqs)]
return ','.join(ret)
def shuffle(self) -> None:
self.rng.shuffle(self.seqarr)
[docs]
def to_list(self) -> List[str]:
"""Return list of strings representing the sequences, e.g. ['ACG','TAA']"""
return [self.get_seq_str(idx) for idx in range(self.numseqs)]
[docs]
def get_seq_str(self, idx: int) -> str:
"""Return idx'th DNA sequence as a string."""
return arr2seq(self.seqarr[idx])
[docs]
def get_seqs_str_list(self, slice_: slice) -> List[str]:
"""Return a list of strings specified by slice."""
bases_lst = self.seqarr[slice_]
ret = []
for bases in bases_lst:
bases_ch = [bits2base[base] for base in bases]
ret.append(''.join(bases_ch))
return ret
[docs]
def keep_seqs_at_indices(self, indices: Iterable[int]) -> None:
"""Keeps only sequences at the given indices."""
if not isinstance(indices, list):
indices = list(indices)
self.seqarr = self.seqarr[indices]
self._update_size()
def __getitem__(self, slice_: int | slice) -> str | List[str]:
if isinstance(slice_, int):
return self.get_seq_str(slice_)
elif isinstance(slice_, slice):
return self.get_seqs_str_list(slice_)
else:
raise ValueError('slice_ must be int or slice')
def __setitem__(self, idx: int, seq: str) -> None:
# cannot set on slice
self.seqarr[idx] = seq2arr(seq)
[docs]
def pop(self) -> str:
"""Remove and return last seq, as a string."""
seq_str = self.get_seq_str(-1)
self.seqarr = np.delete(self.seqarr, -1, 0)
self.numseqs -= 1
return seq_str
[docs]
def pop_array(self) -> np.ndarray:
"""Remove and return last seq, as a string."""
arr = self.seqarr[-1]
self.seqarr = np.delete(self.seqarr, -1, 0)
self.numseqs -= 1
return arr
def append_seq(self, newseq: str) -> None:
self.append_arr(seq2arr(newseq))
def append_arr(self, newarr: np.ndarray) -> None:
self.seqarr = np.vstack([self.seqarr, newarr])
self.numseqs += 1
def sequences_at_hamming_distance(self, sequence: str, distance: int) -> DNASeqList:
sequence_1d_array = seq2arr(sequence)
distances = np.sum(np.bitwise_xor(self.seqarr, sequence_1d_array) != 0, axis=1)
indices_at_distance = distances == distance
arr = self.seqarr[indices_at_distance]
return DNASeqList(seqarr=arr)
[docs]
def hamming_map(self, sequence: str) -> Dict[int, DNASeqList]:
"""Return dict mapping each length `d` to a :any:`DNASeqList` of sequences that are
Hamming distance `d` from `seq`."""
# import time
# times = []
# before = time.perf_counter_ns()
sequence_1d_array = seq2arr(sequence)
distances = np.sum(np.bitwise_xor(self.seqarr, sequence_1d_array) != 0, axis=1)
distance_map = {}
# after_it_prev = time.perf_counter_ns()
# print(f'time to calculate Hamming distances: {(after_it_prev - before) / 1e6:.1f} ms')
for distance in range(self.seqlen + 1):
indices_at_distance = distances == distance
arr = self.seqarr[indices_at_distance]
if arr.shape[0] > 0: # don't bother putting empty array into map
distance_map[distance] = DNASeqList(seqarr=arr)
# after_it_next = time.perf_counter_ns()
# times.append((after_it_next - after_it_prev) / 1e6)
# after_it_prev = after_it_next
# after = time.perf_counter_ns()
# print(f'time spent finding neighbors: {(after - before) / 1e6:.1f} ms')
# print(f'times in each iteration: {times}')
return distance_map
[docs]
def sublist(self, start: int, end: int | None = None) -> DNASeqList:
"""Return sublist of DNASeqList from `start`, inclusive, to `end`, exclusive.
If `end` is not specified, goes until the end of the list."""
if end is None:
end = self.numseqs
arr = self.seqarr[start:end]
return DNASeqList(seqarr=arr)
# def filter_hamming(self, threshold: int) -> None:
# seq = self.pop_array()
# arr_keep = np.array([seq])
# self.shuffle()
# while self.seqarr.shape[0] > 0:
# seq = self.pop_array()
# while self.seqarr.shape[0] > 0:
# hamming_min = np.min(np.sum(np.bitwise_xor(arr_keep, seq) != 0, axis=1))
# too_close = (hamming_min < threshold)
# if not too_close:
# break
# seq = self.pop_array()
# arr_keep = np.vstack([arr_keep, seq])
# self.seqarr = arr_keep
# self.numseqs = self.seqarr.shape[0]
#
# def hamming_min(self, arr: np.ndarray) -> int:
# """Returns minimum Hamming distance between arr and any sequence in
# this DNASeqList."""
# distances = np.sum(np.bitwise_xor(self.seqarr, arr) != 0, axis=1)
# return np.min(distances)
[docs]
def filter_energy(self, low: float, high: float, temperature: float) -> DNASeqList:
"""Return new DNASeqList with seqs whose wc complement energy is within
the given range."""
wcenergies = calculate_wc_energies(self.seqarr, temperature)
within_range = (low <= wcenergies) & (wcenergies <= high)
new_seqarr = self.seqarr[within_range]
return DNASeqList(seqarr=new_seqarr)
[docs]
def energies(self, temperature: float) -> np.ndarray:
"""
:param temperature:
temperature in Celsius
:return:
nearest-neighbor energies of each sequence with its perfect Watson-Crick complement
"""
wcenergies = calculate_wc_energies(self.seqarr, temperature)
return wcenergies
[docs]
def filter_end_gc(self) -> DNASeqList:
"""Remove any sequence with A or T on the end. Also remove domains that
do not have an A or T either next to that base, or one away. Otherwise
we could get a domain ending in {C,G}^3, which, placed next to any
domain ending in C or G, will create a substring in {C,G}^4 and be
rejected if we are filtering those."""
left = self.seqarr[:, 0]
right = self.seqarr[:, -1]
left_p1 = self.seqarr[:, 1]
left_p2 = self.seqarr[:, 2]
right_m1 = self.seqarr[:, -2]
right_m2 = self.seqarr[:, -3]
abits = base2bits['A']
cbits = base2bits['C']
gbits = base2bits['G']
tbits = base2bits['T']
good = (((left == cbits) | (left == gbits)) & ((right == cbits) | (right == gbits)) &
((left_p1 == abits) | (left_p1 == tbits) | (left_p2 == abits) | (left_p2 == tbits)) &
((right_m1 == abits) | (right_m1 == tbits) | (right_m2 == abits) | (right_m2 == tbits)))
seqarrpass = self.seqarr[good]
return DNASeqList(seqarr=seqarrpass)
[docs]
def filter_end_at(self, gc_near_end: bool = False) -> DNASeqList:
"""Remove any sequence with C or G on the end. Also, if gc_near_end is True,
remove domains that do not have an C or G either next to that base,
or one away, to prevent breathing."""
left = self.seqarr[:, 0]
right = self.seqarr[:, -1]
abits = base2bits['A']
tbits = base2bits['T']
good = ((left == abits) | (left == tbits)) & ((right == abits) | (right == tbits))
if gc_near_end:
cbits = base2bits['C']
gbits = base2bits['G']
left_p1 = self.seqarr[:, 1]
left_p2 = self.seqarr[:, 2]
right_m1 = self.seqarr[:, -2]
right_m2 = self.seqarr[:, -3]
good = (good &
((left_p1 == cbits) | (left_p1 == gbits) | (left_p2 == cbits) | (left_p2 == gbits)) &
((right_m1 == cbits) | (right_m1 == gbits) | (right_m2 == cbits) | (right_m2 == gbits)))
seqarrpass = self.seqarr[good]
return DNASeqList(seqarr=seqarrpass)
[docs]
def filter_base_nowhere(self, base: str) -> DNASeqList:
"""Remove any sequence that has given base anywhere."""
good = (self.seqarr != base2bits[base]).all(axis=1)
seqarrpass = self.seqarr[good]
return DNASeqList(seqarr=seqarrpass)
[docs]
def filter_base_count(self, base: str, low: int, high: int) -> DNASeqList:
"""Remove any sequence not satisfying low <= #base <= high."""
sumarr = np.sum(self.seqarr == base2bits[base], axis=1)
good = (low <= sumarr) & (sumarr <= high)
seqarrpass = self.seqarr[good]
return DNASeqList(seqarr=seqarrpass)
[docs]
def filter_base_at_pos(self, pos: int, base: str) -> DNASeqList:
"""Remove any sequence that does not have given base at position pos."""
mid = self.seqarr[:, pos]
good = (mid == base2bits[base])
seqarrpass = self.seqarr[good]
return DNASeqList(seqarr=seqarrpass)
[docs]
def filter_substring(self, subs: Sequence[str]) -> DNASeqList:
"""Remove any sequence with any elements from subs as a substring."""
if len(set([len(sub) for sub in subs])) != 1:
raise ValueError(f'All substrings in subs must be equal length: {subs}')
sublen = len(subs[0])
subints = [[base2bits[base] for base in sub] for sub in subs]
powarr = [4 ** k for k in range(sublen)]
subvals = np.dot(subints, powarr)
toeplitz = create_toeplitz(self.seqlen, sublen)
convolution = np.dot(toeplitz, self.seqarr.transpose())
passall = np.ones(self.numseqs, dtype=bool)
for subval in subvals:
passsub = np.all(convolution != subval, axis=0)
passall = passall & passsub
seqarrpass = self.seqarr[passall]
return DNASeqList(seqarr=seqarrpass)
[docs]
def filter_seqs_by_g_quad(self) -> DNASeqList:
"""Removes any sticky ends with 4 G's in a row (a G-quadruplex)."""
return self.filter_substring(['GGGG'])
[docs]
def filter_seqs_by_g_quad_c_quad(self) -> DNASeqList:
"""Removes any sticky ends with 4 G's or C's in a row (a quadruplex)."""
return self.filter_substring(['GGGG', 'CCCC'])
def index(self, sequence: str | np.ndarray) -> int:
# finds index of sequence in (rows of) self.seqarr
# raises IndexError if not present
# taken from https://stackoverflow.com/questions/40382384/finding-a-matching-row-in-a-numpy-matrix
if isinstance(sequence, str):
sequence = seq2arr(sequence)
matching_condition = (self.seqarr == sequence).all(axis=1)
all_indices_tuple = np.where(matching_condition)
all_indices = all_indices_tuple[0]
first_index = all_indices[0]
return int(first_index)
[docs]
def create_toeplitz(seqlen: int, sublen: int, indices: Sequence[int] | None = None) -> np.ndarray:
"""Creates a toeplitz matrix, useful for finding subsequences.
`seqlen` is length of larger sequence; `sublen` is length of substring we're checking for.
If `indices` is None, then all rows are created, otherwise only rows for checking those indices
are created."""
powarr = [4 ** k for k in range(sublen)]
if indices is None:
rows = list(range(seqlen - (sublen - 1)))
else:
rows = sorted(list(set(indices)))
for idx in rows:
if idx < 0:
raise ValueError(f'index must be nonnegative, but {idx} is not; all indices = {indices}')
if idx >= seqlen - (sublen - 1):
raise ValueError(f'index must be less than {seqlen - (sublen - 1)}, '
f'but {idx} is not; all indices = {indices}')
num_rows = len(rows)
num_cols = seqlen
toeplitz = np.zeros((num_rows, num_cols), dtype=int)
toeplitz[:, 0:sublen] = [powarr] * num_rows
shift = list(rows)
for i in range(len(rows)):
toeplitz[i] = np.roll(toeplitz[i], shift[i])
return toeplitz
[docs]
@lru_cache(maxsize=32)
def calculate_loop_energies(temperature: float, negate: bool = False) -> np.ndarray:
"""Get SantaLucia and Hicks nearest-neighbor loop energies for given temperature,
1 M Na+. """
energies = (_dH - (temperature + 273.15) * _dS / 1000.0)
if negate:
energies = -energies
return energies
# SantaLucia & Hicks' values are in cal/mol/K for dS, and kcal/mol for dH.
# Here we divide dS by 1000 to get the RHS term into units of kcal/mol/K
# which gives an overall dG in units of kcal/mol.
# One reason we want dG to be in units of kcal/mol is to
# give reasonable/readable numbers close to 0 for dG(Assembly).
# The reason we might want to flip the sign is that, by convention, in the kTAM, G_se
# (which is computed from the usually negative dG here) is usually positive.
# _dH and _dS come from Table 1 in SantaLucia and Hicks, Annu Rev Biophys Biomol Struct. 2004;33:415-40.
# AA AC AG AT CA CC CG CT
_dH = np.array([-7.6, -8.4, -7.8, -7.2, -8.5, -8.0, -10.6, -7.8,
# GA GC GG GT TA TC TG TT
-8.2, -9.8, -8.0, -8.4, -7.2, -8.2, -8.5, -7.6],
dtype=np.float32)
# AA AC AG AT CA CC CG CT
_dS = np.array([-21.3, -22.4, -21.0, -20.4, -22.7, -19.9, -27.2, -21.0,
# GA GC GG GT TA TC TG TT
-22.2, -24.4, -19.9, -22.4, -21.3, -22.2, -22.7, -21.3],
dtype=np.float32)
# AA AC AG AT CA CC CG CT GA GC GG GT TA TC TG TT
# 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 34
# nearest-neighbor energies for Watson-Crick complements at 37C
# (Table 1 in SantaLucia and Hicks 2004)
# ordering of array is
# # AA AC AG AT CA CC CG CT
# _nndGwc = np.array([-1.00,-1.44,-1.28,-0.88,-1.45,-1.84,-2.17,-1.28,
# # GA GC GG GT TA TC TG TT
# -1.30,-2.24,-1.84,-1.44,-0.58,-1.30,-1.45,-1.00],
# dtype=np.float32)
# # AA AC AG AT CA CC CG CT
# _nndGwc = np.array([1.00,1.44,1.28,0.88,1.45,1.84,2.17,1.28,
# # GA GC GG GT TA TC TG TT
# 1.30,2.24,1.84,1.44,0.58,1.30,1.45,1.00],
# dtype=np.float32)
# _nndGwcStr = {'AA':1.00,'AC':1.44,'AG':1.28,'AT':0.88,'CA':1.45,'CC':1.84,
# 'CG':2.17,'CT':1.28,'GA':1.30,'GC':2.24,'GG':1.84,'GT':1.44,
# 'TA':0.58,'TC':1.30,'TG':1.45,'TT':1.00}
# nearest-neighbor energies for single mismatches (Table 2 in SantaLucia)
# ordering of array is
# # GA/CA GA/CG AG AT CA CC CG CT GA GC GG GT TA TC TG TT
# _nndGsmm = np.array([0.17,-1.44,-1.28,-0.88,-1.45,-1.84,-2.17,-1.28,-1.3,-2.24,-1.84,-1.44,-0.58,-1.3,-1.45,-1.0], dtype=np.float32)
_all_pairs = [((i << 2) + j, bits2base[i] + bits2base[j])
for i in range(4) for j in range(4)]
@lru_cache(maxsize=32)
def calculate_loop_energies_dict(temperature: float, negate: bool = False) -> Dict[str, float]:
loop_energies = calculate_loop_energies(temperature, negate)
return {pair[1]: loop_energies[pair[0]] for pair in _all_pairs}
[docs]
@lru_cache(maxsize=100000)
def wcenergy(seq: str, temperature: float, negate: bool = False) -> float:
"""Return the wc energy of seq binding to its complement."""
loop_energies = calculate_loop_energies_dict(temperature, negate)
return sum(loop_energies[seq[i:i + 2]] for i in range(len(seq) - 1))
def wcenergies_str(seqs: Sequence[str], temperature: float, negate: bool = False) -> List[float]:
seqarr = seqs2arr(seqs)
return list(calculate_wc_energies(seqarr, temperature, negate))
def wcenergy_str(seq: str, temperature: float, negate: bool = False) -> float:
return wcenergies_str([seq], temperature, negate)[0]
[docs]
def calculate_wc_energies(seqarr: np.ndarray, temperature: float, negate: bool = False) -> np.ndarray:
"""Calculate and store in an array all energies of all sequences in seqarr
with their Watson-Crick complements."""
loop_energies = calculate_loop_energies(temperature, negate)
left_index_bits = seqarr[:, :-1] << 2
right_index_bits = seqarr[:, 1:]
pair_indices = left_index_bits + right_index_bits
pair_energies = loop_energies[pair_indices]
energies: np.ndarray = np.sum(pair_energies, axis=1)
return energies
[docs]
def wc_arr(seqarr: np.ndarray) -> np.ndarray:
"""Return numpy array of reverse complements of sequences in `seqarr`."""
return (3 - seqarr)[:, ::-1]