Updating sequence alignment to work with clustal
Changed definitions. Working on auto scoring.
This commit is contained in:
parent
afed80ca7f
commit
a09d13f1e5
@ -18,7 +18,6 @@ Date: 25 November
|
|||||||
Herreweb
|
Herreweb
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
def parse_sequence(sequence):
|
def parse_sequence(sequence):
|
||||||
"""
|
"""
|
||||||
Parse a sequence, either as a pandas DataFrame or a string, and return the result.
|
Parse a sequence, either as a pandas DataFrame or a string, and return the result.
|
||||||
@ -38,36 +37,29 @@ def parse_sequence(sequence):
|
|||||||
|
|
||||||
def write_stockholm_alignment_with_metadata(aligned_identifiers1, aligned_identifiers2, aligned_metadata1, aligned_metadata2, score, output_filename):
|
def write_stockholm_alignment_with_metadata(aligned_identifiers1, aligned_identifiers2, aligned_metadata1, aligned_metadata2, score, output_filename):
|
||||||
"""
|
"""
|
||||||
Write an alignment in Stockholm format with metadata as annotations.
|
Write a multiple sequence alignment with associated metadata in Stockholm format to a file.
|
||||||
|
|
||||||
Parameters:
|
Parameters:
|
||||||
aligned_identifiers1 (list): List of aligned identifiers for the first sequence.
|
- aligned_identifiers1 (list): List of identifiers for the first aligned sequence.
|
||||||
aligned_identifiers2 (list): List of aligned identifiers for the second sequence.
|
- aligned_identifiers2 (list): List of identifiers for the second aligned sequence.
|
||||||
aligned_metadata1 (list): List of metadata corresponding to aligned_identifiers1.
|
- aligned_metadata1 (list): List of metadata annotations for the first aligned sequence.
|
||||||
aligned_metadata2 (list): List of metadata corresponding to aligned_identifiers2.
|
- aligned_metadata2 (list): List of metadata annotations for the second aligned sequence.
|
||||||
score (int): Alignment score.
|
- score (float): Alignment score to be included as a global feature.
|
||||||
output_filename (str): Name of the output Stockholm format file.
|
- output_filename (str): Name of the file to write the Stockholm-formatted alignment.
|
||||||
|
|
||||||
Description:
|
The function opens the specified file in write mode, writes the Stockholm header,
|
||||||
This function writes an alignment in the Stockholm format with custom metadata as annotations.
|
and iterates over aligned sequences and their associated metadata, writing them to the file.
|
||||||
It takes two lists of aligned identifiers (aligned_identifiers1 and aligned_identifiers2),
|
The alignment score is also included as a global feature. The file is closed automatically
|
||||||
two lists of corresponding metadata (aligned_metadata1 and aligned_metadata2), an alignment score,
|
upon exiting the function.
|
||||||
and the desired output filename.
|
|
||||||
|
|
||||||
The function creates a Stockholm file where each sequence in the alignment is represented by its identifier.
|
|
||||||
It includes the metadata as custom annotations (#=GC METADATA1 and #=GC METADATA2) in the Stockholm file.
|
|
||||||
|
|
||||||
The Stockholm format is commonly used for representing sequence alignments in bioinformatics.
|
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
aligned_identifiers1 = ['COM12018', 'COM17003']
|
>>> aligned_identifiers1 = ['A', 'B', 'C']
|
||||||
aligned_identifiers2 = ['COM12018', 'COM17003']
|
>>> aligned_identifiers2 = ['X', 'Y', 'Z']
|
||||||
aligned_metadata1 = ['some_metadata', 'some_data']
|
>>> aligned_metadata1 = ['metaA', 'metaB', 'metaC']
|
||||||
aligned_metadata2 = ['some_other_metadata', 'some_more_metadata']
|
>>> aligned_metadata2 = ['metaX', 'metaY', 'metaZ']
|
||||||
score = 42
|
>>> score = 42.0
|
||||||
output_filename = 'alignment.stockholm'
|
>>> write_stockholm_alignment_with_metadata(aligned_identifiers1, aligned_identifiers2,
|
||||||
|
... aligned_metadata1, aligned_metadata2, score, 'output.sto')
|
||||||
write_stockholm_alignment_with_metadata(aligned_identifiers1, aligned_identifiers2, aligned_metadata1, aligned_metadata2, score, output_filename)
|
|
||||||
"""
|
"""
|
||||||
with open(output_filename, 'w') as stockholm_file:
|
with open(output_filename, 'w') as stockholm_file:
|
||||||
stockholm_file.write("# STOCKHOLM 1.0\n")
|
stockholm_file.write("# STOCKHOLM 1.0\n")
|
||||||
@ -77,6 +69,16 @@ def write_stockholm_alignment_with_metadata(aligned_identifiers1, aligned_identi
|
|||||||
stockholm_file.write(f"#=GC METADATA1 {metadata1}\n")
|
stockholm_file.write(f"#=GC METADATA1 {metadata1}\n")
|
||||||
stockholm_file.write(f"#=GC METADATA2 {metadata2}\n")
|
stockholm_file.write(f"#=GC METADATA2 {metadata2}\n")
|
||||||
stockholm_file.write(f"#=GF SCORE: {score}\n")
|
stockholm_file.write(f"#=GF SCORE: {score}\n")
|
||||||
|
stockholm_file.close()
|
||||||
|
|
||||||
|
|
||||||
|
def write_clustal_alignment(sequences, output_filename):
    """
    Write aligned sequences to a file in a Clustal-style layout.

    Parameters:
    - sequences (iterable): Records exposing an `id` string and a `seq` attribute
      (for example Biopython SeqRecord objects).
    - output_filename (str): Path of the file to create or overwrite.

    The file starts with a `CLUSTAL` header line followed by a blank line. Each record is
    then written on its own line: the identifier left-justified to 20 characters, a single
    space, and the sequence. Sequences are written whole; they are not wrapped into
    fixed-width blocks.
    """
    with open(output_filename, 'w') as clustal_file:
        # Clustal readers recognise the format by a first line that starts with "CLUSTAL";
        # without it the file is rejected before any sequence is read.
        clustal_file.write("CLUSTAL multiple sequence alignment\n\n")
        for sequence in sequences:
            clustal_file.write(f"{sequence.id.ljust(20)} {sequence.seq}\n")
|
||||||
|
|
||||||
|
|
||||||
def write_text_format(aligned_identifiers1, aligned_identifiers2, score, output_filename, aligned_metadata1=None,
|
def write_text_format(aligned_identifiers1, aligned_identifiers2, score, output_filename, aligned_metadata1=None,
|
||||||
@ -96,34 +98,33 @@ def write_text_format(aligned_identifiers1, aligned_identifiers2, score, output_
|
|||||||
|
|
||||||
|
|
||||||
def global_alignment_np(sequence1, sequence2, metadata1=None, metadata2=None, gap_penalty=-1,
|
def global_alignment_np(sequence1, sequence2, metadata1=None, metadata2=None, gap_penalty=-1,
|
||||||
match_score=1, mismatch_penalty=-10, fasta_name="alignment", threads=None):
|
match_score=1, mismatch_penalty=-10, filename="alignment", threads=None):
|
||||||
"""
|
"""
|
||||||
Perform global sequence alignment using dynamic programming (Needleman-Wunsch).
|
|
||||||
|
|
||||||
Parameters:
|
|
||||||
sequence1 (str): The first sequence to align.
|
|
||||||
sequence2 (str): The second sequence to align.
|
|
||||||
gap_penalty (int, optional): Penalty for introducing a gap. Default is -1.
|
|
||||||
match_score (int, optional): Score for a match. Default is 1.
|
|
||||||
mismatch_penalty (int, optional): Penalty for a mismatch. Default is -1.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
tuple: A tuple containing the aligned longer sequence, aligned shorter sequence, and alignment score.
|
|
||||||
|
|
||||||
Description:
|
Description:
|
||||||
This function performs global sequence alignment between two input sequences, `sequence1` and `sequence2`,
|
This function performs global sequence alignment between two input sequences, `sequence1` and `sequence2`,
|
||||||
using the Needleman-Wunsch algorithm. It aligns the sequences based on the specified scoring parameters
|
using the Needleman-Wunsch algorithm. It aligns the sequences based on the specified scoring parameters
|
||||||
for gap penalties, match scores, and mismatch penalties.
|
for gap penalties, match scores, and mismatch penalties.
|
||||||
|
|
||||||
The function returns a tuple containing the following elements:
|
The function returns a tuple containing the following elements:
|
||||||
- The aligned longer sequence (string).
|
- The aligned longer sequence (list of strings) where gaps are indicated by '-' characters.
|
||||||
- The aligned shorter sequence (string).
|
- The aligned shorter sequence (list of strings) where gaps are indicated by '-' characters.
|
||||||
|
- Aligned metadata for sequence1 (list of strings).
|
||||||
|
- Aligned metadata for sequence2 (list of strings).
|
||||||
- The alignment score (int).
|
- The alignment score (int).
|
||||||
|
|
||||||
The aligned sequences are represented as strings where gaps are indicated by '-' characters.
|
Note:
|
||||||
|
If additional metadata is not provided (metadata1 or metadata2 is None), the corresponding aligned_metadata
|
||||||
|
lists will also be None.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
```python
|
||||||
|
sequence1 = "AGCT"
|
||||||
|
sequence2 = "AAGCT"
|
||||||
|
aligned_seq1, aligned_seq2, align_metadata1, align_metadata2, score = global_alignment_np(
|
||||||
|
sequence1, sequence2, metadata1="ABC", metadata2="XYZ"
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
Additionally, the function saves the alignment as a FASTA file named 'alignment.fasta' and prints a
|
|
||||||
human-readable alignment using Biopython's format_alignment function for visualization.
|
|
||||||
"""
|
"""
|
||||||
identifiers1, metadata1 = parse_sequence(sequence1), parse_sequence(metadata1)
|
identifiers1, metadata1 = parse_sequence(sequence1), parse_sequence(metadata1)
|
||||||
identifiers2, metadata2 = parse_sequence(sequence2), parse_sequence(metadata2)
|
identifiers2, metadata2 = parse_sequence(sequence2), parse_sequence(metadata2)
|
||||||
@ -209,17 +210,72 @@ def global_alignment_np(sequence1, sequence2, metadata1=None, metadata2=None, ga
|
|||||||
padded_sequences1.append(seq1)
|
padded_sequences1.append(seq1)
|
||||||
padded_sequences2.append(padded_seq2)
|
padded_sequences2.append(padded_seq2)
|
||||||
|
|
||||||
|
return padded_sequences1, padded_sequences2, aligned_metadata1, aligned_metadata2, score
|
||||||
|
|
||||||
|
|
||||||
|
def sequence_alignment(sequence1, sequence2, metadata1=None, metadata2=None, gap_penalty=-1,
                       match_score=1, mismatch_penalty=-10, filename="alignment", threads=None,
                       stockholm=True, fasta=True, clustal=False):
    """
    Perform global sequence alignment and save the results in various formats.

    Parameters:
    sequence1: The first sequence to align; forwarded unchanged to global_alignment_np.
    sequence2: The second sequence to align; forwarded unchanged to global_alignment_np.
    metadata1 (optional): Metadata for the first sequence. Default is None.
    metadata2 (optional): Metadata for the second sequence. Default is None.
    gap_penalty (int, optional): Penalty for introducing a gap. Default is -1.
    match_score (int, optional): Score for a match. Default is 1.
    mismatch_penalty (int, optional): Penalty for a mismatch. Default is -10.
    filename (str, optional): Base name (without extension) for the output files.
        Default is "alignment".
    threads (optional): Forwarded to global_alignment_np. Default is None.
    stockholm (bool, optional): Whether to output in Stockholm format. Default is True.
    fasta (bool, optional): Whether to output in FASTA format. Default is True.
    clustal (bool, optional): Whether to output in Clustal format. Default is False.

    Returns:
    The alignment score reported by global_alignment_np.

    Description:
    The alignment itself is delegated to global_alignment_np using the scoring parameters
    given here. The aligned identifiers are then written to disk according to the options:
    - `<filename>.sto` (Stockholm) when both metadata arguments are given and `stockholm` is set.
    - `<filename>-text.txt` (plain text). When both metadata arguments are given, this file
      includes the metadata and is only written if `fasta` is set; otherwise it is always written.
      NOTE(review): gating the text file on `fasta` looks unintentional - confirm.
    - `<filename>.fasta` when `fasta` is set.
    - `<filename>.aln` when `clustal` is set.

    In the FASTA and Clustal outputs each sequence is the aligned identifiers joined by "|".

    Example:
    alignment_score = sequence_alignment("AGTACG", "ATGC", metadata1="abc", metadata2="def", gap_penalty=-2,
                                         match_score=2, mismatch_penalty=-1, filename="my_alignment",
                                         threads=4, stockholm=True, fasta=True, clustal=False)
    """
    # `filename` and `threads` are passed by keyword on purpose: global_alignment_np declares
    # `filename` before `threads`, so a positional `threads` would be bound to `filename`.
    padded_sequences1, padded_sequences2, aligned_metadata1, aligned_metadata2, score = global_alignment_np(
        sequence1, sequence2, metadata1, metadata2, gap_penalty, match_score, mismatch_penalty,
        filename=filename, threads=threads)

    if metadata1 is not None and metadata2 is not None:
        if stockholm:
            write_stockholm_alignment_with_metadata(padded_sequences1, padded_sequences2, aligned_metadata1,
                                                    aligned_metadata2, score, f'{filename}.sto')
        if fasta:
            write_text_format(padded_sequences1, padded_sequences2, score, f'{filename}-text.txt',
                              aligned_metadata1, aligned_metadata2)
    else:
        write_text_format(padded_sequences1, padded_sequences2, score, f'{filename}-text.txt')

    # One pair of records serves both the FASTA and the Clustal writers.
    record1 = SeqRecord(Seq("|".join(padded_sequences1)), id="sequence1")
    record2 = SeqRecord(Seq("|".join(padded_sequences2)), id="sequence2")

    if fasta:
        SeqIO.write([record1, record2], f'{filename}.fasta', "fasta")

    if clustal:
        write_clustal_alignment([record1, record2], f'{filename}.aln')

    return score
|
||||||
|
|
||||||
return '|'.join(aligned_identifiers1), '|'.join(aligned_identifiers2), score
|
|
||||||
|
100001
sample_data/logdata.csv
Normal file
100001
sample_data/logdata.csv
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user