Updating sequence alignment to work with clustal

Changed definitions
Working on auto scoring
This commit is contained in:
Jonathan Herrewijnen 2023-11-29 22:28:47 +01:00
parent afed80ca7f
commit a09d13f1e5
2 changed files with 100107 additions and 50 deletions

View File

@ -18,7 +18,6 @@ Date: 25 November
Herreweb
"""
def parse_sequence(sequence):
"""
Parse a sequence, either as a pandas DataFrame or a string, and return the result.
@ -38,36 +37,29 @@ def parse_sequence(sequence):
def write_stockholm_alignment_with_metadata(aligned_identifiers1, aligned_identifiers2, aligned_metadata1, aligned_metadata2, score, output_filename):
"""
Write an alignment in Stockholm format with metadata as annotations.
Write a multiple sequence alignment with associated metadata in Stockholm format to a file.
Parameters:
aligned_identifiers1 (list): List of aligned identifiers for the first sequence.
aligned_identifiers2 (list): List of aligned identifiers for the second sequence.
aligned_metadata1 (list): List of metadata corresponding to aligned_identifiers1.
aligned_metadata2 (list): List of metadata corresponding to aligned_identifiers2.
score (int): Alignment score.
output_filename (str): Name of the output Stockholm format file.
- aligned_identifiers1 (list): List of identifiers for the first aligned sequence.
- aligned_identifiers2 (list): List of identifiers for the second aligned sequence.
- aligned_metadata1 (list): List of metadata annotations for the first aligned sequence.
- aligned_metadata2 (list): List of metadata annotations for the second aligned sequence.
- score (float): Alignment score to be included as a global feature.
- output_filename (str): Name of the file to write the Stockholm-formatted alignment.
Description:
This function writes an alignment in the Stockholm format with custom metadata as annotations.
It takes two lists of aligned identifiers (aligned_identifiers1 and aligned_identifiers2),
two lists of corresponding metadata (aligned_metadata1 and aligned_metadata2), an alignment score,
and the desired output filename.
The function creates a Stockholm file where each sequence in the alignment is represented by its identifier.
It includes the metadata as custom annotations (#=GC METADATA1 and #=GC METADATA2) in the Stockholm file.
The Stockholm format is commonly used for representing sequence alignments in bioinformatics.
The function opens the specified file in write mode, writes the Stockholm header,
and iterates over aligned sequences and their associated metadata, writing them to the file.
The alignment score is also included as a global feature. The file is closed automatically
upon exiting the function.
Example:
aligned_identifiers1 = ['COM12018', 'COM17003']
aligned_identifiers2 = ['COM12018', 'COM17003']
aligned_metadata1 = ['some_metadata', 'some_data']
aligned_metadata2 = ['some_other_metadata', 'some_more_metadata']
score = 42
output_filename = 'alignment.stockholm'
write_stockholm_alignment_with_metadata(aligned_identifiers1, aligned_identifiers2, aligned_metadata1, aligned_metadata2, score, output_filename)
>>> aligned_identifiers1 = ['A', 'B', 'C']
>>> aligned_identifiers2 = ['X', 'Y', 'Z']
>>> aligned_metadata1 = ['metaA', 'metaB', 'metaC']
>>> aligned_metadata2 = ['metaX', 'metaY', 'metaZ']
>>> score = 42.0
>>> write_stockholm_alignment_with_metadata(aligned_identifiers1, aligned_identifiers2,
... aligned_metadata1, aligned_metadata2, score, 'output.sto')
"""
with open(output_filename, 'w') as stockholm_file:
stockholm_file.write("# STOCKHOLM 1.0\n")
@ -77,6 +69,16 @@ def write_stockholm_alignment_with_metadata(aligned_identifiers1, aligned_identi
stockholm_file.write(f"#=GC METADATA1 {metadata1}\n")
stockholm_file.write(f"#=GC METADATA2 {metadata2}\n")
stockholm_file.write(f"#=GF SCORE: {score}\n")
stockholm_file.close()
def write_clustal_alignment(sequences, output_filename):
"""
Write to clustal format
"""
with open(output_filename, 'w') as clustal_file:
for sequence in sequences:
clustal_file.write(f"{sequence.id.ljust(20)} {sequence.seq}\n")
def write_text_format(aligned_identifiers1, aligned_identifiers2, score, output_filename, aligned_metadata1=None,
@ -96,34 +98,33 @@ def write_text_format(aligned_identifiers1, aligned_identifiers2, score, output_
def global_alignment_np(sequence1, sequence2, metadata1=None, metadata2=None, gap_penalty=-1,
match_score=1, mismatch_penalty=-10, fasta_name="alignment", threads=None):
match_score=1, mismatch_penalty=-10, filename="alignment", threads=None):
"""
Perform global sequence alignment using dynamic programming (Needleman-Wunsch).
Parameters:
sequence1 (str): The first sequence to align.
sequence2 (str): The second sequence to align.
gap_penalty (int, optional): Penalty for introducing a gap. Default is -1.
match_score (int, optional): Score for a match. Default is 1.
mismatch_penalty (int, optional): Penalty for a mismatch. Default is -1.
Returns:
tuple: A tuple containing the aligned longer sequence, aligned shorter sequence, and alignment score.
Description:
This function performs global sequence alignment between two input sequences, `sequence1` and `sequence2`,
using the Needleman-Wunsch algorithm. It aligns the sequences based on the specified scoring parameters
for gap penalties, match scores, and mismatch penalties.
The function returns a tuple containing the following elements:
- The aligned longer sequence (string).
- The aligned shorter sequence (string).
- The aligned longer sequence (list of strings) where gaps are indicated by '-' characters.
- The aligned shorter sequence (list of strings) where gaps are indicated by '-' characters.
- Aligned metadata for sequence1 (list of strings).
- Aligned metadata for sequence2 (list of strings).
- The alignment score (int).
The aligned sequences are represented as strings where gaps are indicated by '-' characters.
Note:
If additional metadata is not provided (metadata1 or metadata2 is None), the corresponding aligned_metadata
lists will also be None.
Example:
```python
sequence1 = "AGCT"
sequence2 = "AAGCT"
aligned_seq1, aligned_seq2, align_metadata1, align_metadata2, score = global_alignment_np(
sequence1, sequence2, metadata1="ABC", metadata2="XYZ"
)
```
Additionally, the function saves the alignment as a FASTA file named 'alignment.fasta' and prints a
human-readable alignment using Biopython's format_alignment function for visualization.
"""
identifiers1, metadata1 = parse_sequence(sequence1), parse_sequence(metadata1)
identifiers2, metadata2 = parse_sequence(sequence2), parse_sequence(metadata2)
@ -208,18 +209,73 @@ def global_alignment_np(sequence1, sequence2, metadata1=None, metadata2=None, ga
padded_seq2 = seq2 + '-' * (len(seq1) - len(seq2))
padded_sequences1.append(seq1)
padded_sequences2.append(padded_seq2)
return padded_sequences1, padded_sequences2, aligned_metadata1, aligned_metadata2, score
def sequence_alignment(sequence1, sequence2, metadata1=None, metadata2=None, gap_penalty=-1,
match_score=1, mismatch_penalty=-10, filename="alignment", threads=None,
stockholm=True, fasta=True, clustal=False):
"""
Perform global sequence alignment and save the results in various formats.
Parameters:
sequence1 (str): The first sequence to align.
sequence2 (str): The second sequence to align.
metadata1 (str, optional): Metadata for the first sequence. Default is None.
metadata2 (str, optional): Metadata for the second sequence. Default is None.
gap_penalty (int, optional): Penalty for introducing a gap. Default is -1.
match_score (int, optional): Score for a match. Default is 1.
mismatch_penalty (int, optional): Penalty for a mismatch. Default is -10.
filename (str, optional): Name for the output files. Default is "alignment".
threads (int, optional): Number of threads for parallel execution. Default is None.
stockholm (bool, optional): Whether to output in Stockholm format. Default is True.
fasta (bool, optional): Whether to output in FASTA format. Default is True.
Returns:
int: The alignment score.
Description:
This function performs global sequence alignment between two input sequences, `sequence1` and `sequence2`,
using the Needleman-Wunsch algorithm. It automatically determines the optimal scoring parameters for gap penalties,
match scores, and mismatch penalties based on a sample alignment.
The function saves the alignment in various formats based on the specified options:
- Stockholm format if `stockholm` is True.
- FASTA format if `fasta` is True.
- A text file with the aligned sequences and metadata.
The alignment score is returned as an integer.
Example:
alignment_score = sequence_alignment("AGTACG", "ATGC", metadata1="abc", metadata2="def", gap_penalty=-2,
match_score=2, mismatch_penalty=-1, filename="my_alignment",
threads=4, stockholm=True, fasta=True)
"""
padded_sequences1, padded_sequences2, aligned_metadata1, \
aligned_metadata2, score = global_alignment_np(sequence1, sequence2, metadata1, metadata2, gap_penalty,
match_score, mismatch_penalty, threads)
if metadata1 is not None and metadata2 is not None:
write_stockholm_alignment_with_metadata(padded_sequences1, padded_sequences2, aligned_metadata1,
aligned_metadata2, score, f'{fasta_name}.sto')
write_text_format(padded_sequences1, padded_sequences2, score, f'{fasta_name}-text.txt',
if stockholm is True:
write_stockholm_alignment_with_metadata(padded_sequences1, padded_sequences2, aligned_metadata1,
aligned_metadata2, score, f'{filename}.sto')
if fasta is True:
write_text_format(padded_sequences1, padded_sequences2, score, f'{filename}-text.txt',
aligned_metadata1, aligned_metadata2)
else:
write_text_format(padded_sequences1, padded_sequences2, score, f'{fasta_name}-text.txt')
write_text_format(padded_sequences1, padded_sequences2, score, f'{filename}-text.txt')
record1 = SeqRecord(Seq("|".join(padded_sequences1)), id="sequence1")
record2 = SeqRecord(Seq("|".join(padded_sequences2)), id="sequence2")
SeqIO.write([record1, record2], f'{fasta_name}.fasta', "fasta")
if fasta is True:
SeqIO.write([record1, record2], f'{filename}.fasta', "fasta")
if clustal is True:
sequences = [SeqRecord(Seq("|".join(padded_sequences1)), id="sequence1"),
SeqRecord(Seq("|".join(padded_sequences2)), id="sequence2")]
write_clustal_alignment(sequences, f'{filename}.aln')
return score
return '|'.join(aligned_identifiers1), '|'.join(aligned_identifiers2), score

100001
sample_data/logdata.csv Normal file

File diff suppressed because it is too large Load Diff