Updating sequence alignment to work with clustal
Changed definitions Working on auto scoring
This commit is contained in:
parent
afed80ca7f
commit
a09d13f1e5
@ -18,7 +18,6 @@ Date: 25 November
|
||||
Herreweb
|
||||
"""
|
||||
|
||||
|
||||
def parse_sequence(sequence):
|
||||
"""
|
||||
Parse a sequence, either as a pandas DataFrame or a string, and return the result.
|
||||
@ -38,36 +37,29 @@ def parse_sequence(sequence):
|
||||
|
||||
def write_stockholm_alignment_with_metadata(aligned_identifiers1, aligned_identifiers2, aligned_metadata1, aligned_metadata2, score, output_filename):
|
||||
"""
|
||||
Write an alignment in Stockholm format with metadata as annotations.
|
||||
Write a multiple sequence alignment with associated metadata in Stockholm format to a file.
|
||||
|
||||
Parameters:
|
||||
aligned_identifiers1 (list): List of aligned identifiers for the first sequence.
|
||||
aligned_identifiers2 (list): List of aligned identifiers for the second sequence.
|
||||
aligned_metadata1 (list): List of metadata corresponding to aligned_identifiers1.
|
||||
aligned_metadata2 (list): List of metadata corresponding to aligned_identifiers2.
|
||||
score (int): Alignment score.
|
||||
output_filename (str): Name of the output Stockholm format file.
|
||||
- aligned_identifiers1 (list): List of identifiers for the first aligned sequence.
|
||||
- aligned_identifiers2 (list): List of identifiers for the second aligned sequence.
|
||||
- aligned_metadata1 (list): List of metadata annotations for the first aligned sequence.
|
||||
- aligned_metadata2 (list): List of metadata annotations for the second aligned sequence.
|
||||
- score (float): Alignment score to be included as a global feature.
|
||||
- output_filename (str): Name of the file to write the Stockholm-formatted alignment.
|
||||
|
||||
Description:
|
||||
This function writes an alignment in the Stockholm format with custom metadata as annotations.
|
||||
It takes two lists of aligned identifiers (aligned_identifiers1 and aligned_identifiers2),
|
||||
two lists of corresponding metadata (aligned_metadata1 and aligned_metadata2), an alignment score,
|
||||
and the desired output filename.
|
||||
|
||||
The function creates a Stockholm file where each sequence in the alignment is represented by its identifier.
|
||||
It includes the metadata as custom annotations (#=GC METADATA1 and #=GC METADATA2) in the Stockholm file.
|
||||
|
||||
The Stockholm format is commonly used for representing sequence alignments in bioinformatics.
|
||||
The function opens the specified file in write mode, writes the Stockholm header,
|
||||
and iterates over aligned sequences and their associated metadata, writing them to the file.
|
||||
The alignment score is also included as a global feature. The file is closed automatically
|
||||
upon exiting the function.
|
||||
|
||||
Example:
|
||||
aligned_identifiers1 = ['COM12018', 'COM17003']
|
||||
aligned_identifiers2 = ['COM12018', 'COM17003']
|
||||
aligned_metadata1 = ['some_metadata', 'some_data']
|
||||
aligned_metadata2 = ['some_other_metadata', 'some_more_metadata']
|
||||
score = 42
|
||||
output_filename = 'alignment.stockholm'
|
||||
|
||||
write_stockholm_alignment_with_metadata(aligned_identifiers1, aligned_identifiers2, aligned_metadata1, aligned_metadata2, score, output_filename)
|
||||
>>> aligned_identifiers1 = ['A', 'B', 'C']
|
||||
>>> aligned_identifiers2 = ['X', 'Y', 'Z']
|
||||
>>> aligned_metadata1 = ['metaA', 'metaB', 'metaC']
|
||||
>>> aligned_metadata2 = ['metaX', 'metaY', 'metaZ']
|
||||
>>> score = 42.0
|
||||
>>> write_stockholm_alignment_with_metadata(aligned_identifiers1, aligned_identifiers2,
|
||||
... aligned_metadata1, aligned_metadata2, score, 'output.sto')
|
||||
"""
|
||||
with open(output_filename, 'w') as stockholm_file:
|
||||
stockholm_file.write("# STOCKHOLM 1.0\n")
|
||||
@ -77,6 +69,16 @@ def write_stockholm_alignment_with_metadata(aligned_identifiers1, aligned_identi
|
||||
stockholm_file.write(f"#=GC METADATA1 {metadata1}\n")
|
||||
stockholm_file.write(f"#=GC METADATA2 {metadata2}\n")
|
||||
stockholm_file.write(f"#=GF SCORE: {score}\n")
|
||||
stockholm_file.close()
|
||||
|
||||
|
||||
def write_clustal_alignment(sequences, output_filename):
|
||||
"""
|
||||
Write to clustal format
|
||||
"""
|
||||
with open(output_filename, 'w') as clustal_file:
|
||||
for sequence in sequences:
|
||||
clustal_file.write(f"{sequence.id.ljust(20)} {sequence.seq}\n")
|
||||
|
||||
|
||||
def write_text_format(aligned_identifiers1, aligned_identifiers2, score, output_filename, aligned_metadata1=None,
|
||||
@ -96,34 +98,33 @@ def write_text_format(aligned_identifiers1, aligned_identifiers2, score, output_
|
||||
|
||||
|
||||
def global_alignment_np(sequence1, sequence2, metadata1=None, metadata2=None, gap_penalty=-1,
|
||||
match_score=1, mismatch_penalty=-10, fasta_name="alignment", threads=None):
|
||||
match_score=1, mismatch_penalty=-10, filename="alignment", threads=None):
|
||||
"""
|
||||
Perform global sequence alignment using dynamic programming (Needleman-Wunsch).
|
||||
|
||||
Parameters:
|
||||
sequence1 (str): The first sequence to align.
|
||||
sequence2 (str): The second sequence to align.
|
||||
gap_penalty (int, optional): Penalty for introducing a gap. Default is -1.
|
||||
match_score (int, optional): Score for a match. Default is 1.
|
||||
mismatch_penalty (int, optional): Penalty for a mismatch. Default is -1.
|
||||
|
||||
Returns:
|
||||
tuple: A tuple containing the aligned longer sequence, aligned shorter sequence, and alignment score.
|
||||
|
||||
Description:
|
||||
This function performs global sequence alignment between two input sequences, `sequence1` and `sequence2`,
|
||||
using the Needleman-Wunsch algorithm. It aligns the sequences based on the specified scoring parameters
|
||||
for gap penalties, match scores, and mismatch penalties.
|
||||
|
||||
The function returns a tuple containing the following elements:
|
||||
- The aligned longer sequence (string).
|
||||
- The aligned shorter sequence (string).
|
||||
- The aligned longer sequence (list of strings) where gaps are indicated by '-' characters.
|
||||
- The aligned shorter sequence (list of strings) where gaps are indicated by '-' characters.
|
||||
- Aligned metadata for sequence1 (list of strings).
|
||||
- Aligned metadata for sequence2 (list of strings).
|
||||
- The alignment score (int).
|
||||
|
||||
The aligned sequences are represented as strings where gaps are indicated by '-' characters.
|
||||
Note:
|
||||
If additional metadata is not provided (metadata1 or metadata2 is None), the corresponding aligned_metadata
|
||||
lists will also be None.
|
||||
|
||||
Example:
|
||||
```python
|
||||
sequence1 = "AGCT"
|
||||
sequence2 = "AAGCT"
|
||||
aligned_seq1, aligned_seq2, align_metadata1, align_metadata2, score = global_alignment_np(
|
||||
sequence1, sequence2, metadata1="ABC", metadata2="XYZ"
|
||||
)
|
||||
```
|
||||
|
||||
Additionally, the function saves the alignment as a FASTA file named 'alignment.fasta' and prints a
|
||||
human-readable alignment using Biopython's format_alignment function for visualization.
|
||||
"""
|
||||
identifiers1, metadata1 = parse_sequence(sequence1), parse_sequence(metadata1)
|
||||
identifiers2, metadata2 = parse_sequence(sequence2), parse_sequence(metadata2)
|
||||
@ -208,18 +209,73 @@ def global_alignment_np(sequence1, sequence2, metadata1=None, metadata2=None, ga
|
||||
padded_seq2 = seq2 + '-' * (len(seq1) - len(seq2))
|
||||
padded_sequences1.append(seq1)
|
||||
padded_sequences2.append(padded_seq2)
|
||||
|
||||
return padded_sequences1, padded_sequences2, aligned_metadata1, aligned_metadata2, score
|
||||
|
||||
|
||||
def sequence_alignment(sequence1, sequence2, metadata1=None, metadata2=None, gap_penalty=-1,
|
||||
match_score=1, mismatch_penalty=-10, filename="alignment", threads=None,
|
||||
stockholm=True, fasta=True, clustal=False):
|
||||
"""
|
||||
Perform global sequence alignment and save the results in various formats.
|
||||
|
||||
Parameters:
|
||||
sequence1 (str): The first sequence to align.
|
||||
sequence2 (str): The second sequence to align.
|
||||
metadata1 (str, optional): Metadata for the first sequence. Default is None.
|
||||
metadata2 (str, optional): Metadata for the second sequence. Default is None.
|
||||
gap_penalty (int, optional): Penalty for introducing a gap. Default is -1.
|
||||
match_score (int, optional): Score for a match. Default is 1.
|
||||
mismatch_penalty (int, optional): Penalty for a mismatch. Default is -10.
|
||||
filename (str, optional): Name for the output files. Default is "alignment".
|
||||
threads (int, optional): Number of threads for parallel execution. Default is None.
|
||||
stockholm (bool, optional): Whether to output in Stockholm format. Default is True.
|
||||
fasta (bool, optional): Whether to output in FASTA format. Default is True.
|
||||
|
||||
Returns:
|
||||
int: The alignment score.
|
||||
|
||||
Description:
|
||||
This function performs global sequence alignment between two input sequences, `sequence1` and `sequence2`,
|
||||
using the Needleman-Wunsch algorithm. It automatically determines the optimal scoring parameters for gap penalties,
|
||||
match scores, and mismatch penalties based on a sample alignment.
|
||||
|
||||
The function saves the alignment in various formats based on the specified options:
|
||||
- Stockholm format if `stockholm` is True.
|
||||
- FASTA format if `fasta` is True.
|
||||
- A text file with the aligned sequences and metadata.
|
||||
|
||||
The alignment score is returned as an integer.
|
||||
|
||||
Example:
|
||||
alignment_score = sequence_alignment("AGTACG", "ATGC", metadata1="abc", metadata2="def", gap_penalty=-2,
|
||||
match_score=2, mismatch_penalty=-1, filename="my_alignment",
|
||||
threads=4, stockholm=True, fasta=True)
|
||||
"""
|
||||
padded_sequences1, padded_sequences2, aligned_metadata1, \
|
||||
aligned_metadata2, score = global_alignment_np(sequence1, sequence2, metadata1, metadata2, gap_penalty,
|
||||
match_score, mismatch_penalty, threads)
|
||||
|
||||
if metadata1 is not None and metadata2 is not None:
|
||||
write_stockholm_alignment_with_metadata(padded_sequences1, padded_sequences2, aligned_metadata1,
|
||||
aligned_metadata2, score, f'{fasta_name}.sto')
|
||||
write_text_format(padded_sequences1, padded_sequences2, score, f'{fasta_name}-text.txt',
|
||||
if stockholm is True:
|
||||
write_stockholm_alignment_with_metadata(padded_sequences1, padded_sequences2, aligned_metadata1,
|
||||
aligned_metadata2, score, f'{filename}.sto')
|
||||
if fasta is True:
|
||||
write_text_format(padded_sequences1, padded_sequences2, score, f'{filename}-text.txt',
|
||||
aligned_metadata1, aligned_metadata2)
|
||||
else:
|
||||
write_text_format(padded_sequences1, padded_sequences2, score, f'{fasta_name}-text.txt')
|
||||
write_text_format(padded_sequences1, padded_sequences2, score, f'{filename}-text.txt')
|
||||
|
||||
record1 = SeqRecord(Seq("|".join(padded_sequences1)), id="sequence1")
|
||||
record2 = SeqRecord(Seq("|".join(padded_sequences2)), id="sequence2")
|
||||
|
||||
SeqIO.write([record1, record2], f'{fasta_name}.fasta', "fasta")
|
||||
if fasta is True:
|
||||
SeqIO.write([record1, record2], f'{filename}.fasta', "fasta")
|
||||
|
||||
if clustal is True:
|
||||
sequences = [SeqRecord(Seq("|".join(padded_sequences1)), id="sequence1"),
|
||||
SeqRecord(Seq("|".join(padded_sequences2)), id="sequence2")]
|
||||
write_clustal_alignment(sequences, f'{filename}.aln')
|
||||
|
||||
return score
|
||||
|
||||
return '|'.join(aligned_identifiers1), '|'.join(aligned_identifiers2), score
|
||||
|
100001
sample_data/logdata.csv
Normal file
100001
sample_data/logdata.csv
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user