Updating sequence alignment to work with clustal

Changed definitions Working on auto scoring
2023-11-29 22:28:47 +01:00 · 2023-11-29 22:28:47 +01:00 · a09d13f1e5
commit a09d13f1e5
parent afed80ca7f
2 changed files with 100107 additions and 50 deletions
--- a/herrewebpy/bioinformatics/sequence_alignment.py
+++ b/herrewebpy/bioinformatics/sequence_alignment.py
@ -18,7 +18,6 @@ Date: 25 November
 Herreweb
 """
 def parse_sequence(sequence):
    """
    Parse a sequence, either as a pandas DataFrame or a string, and return the result.
@ -38,36 +37,29 @@ def parse_sequence(sequence):
 def write_stockholm_alignment_with_metadata(aligned_identifiers1, aligned_identifiers2, aligned_metadata1, aligned_metadata2, score, output_filename):
    """
-    Write an alignment in Stockholm format with metadata as annotations.
+    Write a multiple sequence alignment with associated metadata in Stockholm format to a file.
    Parameters:
-        aligned_identifiers1 (list): List of aligned identifiers for the first sequence.
+    - aligned_identifiers1 (list): List of identifiers for the first aligned sequence.
-        aligned_identifiers2 (list): List of aligned identifiers for the second sequence.
+    - aligned_identifiers2 (list): List of identifiers for the second aligned sequence.
-        aligned_metadata1 (list): List of metadata corresponding to aligned_identifiers1.
+    - aligned_metadata1 (list): List of metadata annotations for the first aligned sequence.
-        aligned_metadata2 (list): List of metadata corresponding to aligned_identifiers2.
+    - aligned_metadata2 (list): List of metadata annotations for the second aligned sequence.
-        score (int): Alignment score.
+    - score (float): Alignment score to be included as a global feature.
-        output_filename (str): Name of the output Stockholm format file.
+    - output_filename (str): Name of the file to write the Stockholm-formatted alignment.
-    Description:
+    The function opens the specified file in write mode, writes the Stockholm header,
-        This function writes an alignment in the Stockholm format with custom metadata as annotations.
+    and iterates over aligned sequences and their associated metadata, writing them to the file.
-        It takes two lists of aligned identifiers (aligned_identifiers1 and aligned_identifiers2),
+    The alignment score is also included as a global feature. The file is closed automatically
-        two lists of corresponding metadata (aligned_metadata1 and aligned_metadata2), an alignment score,
+    upon exiting the function.
        and the desired output filename.
        The function creates a Stockholm file where each sequence in the alignment is represented by its identifier.
        It includes the metadata as custom annotations (#=GC METADATA1 and #=GC METADATA2) in the Stockholm file.
        The Stockholm format is commonly used for representing sequence alignments in bioinformatics.
    Example:
-        aligned_identifiers1 = ['COM12018', 'COM17003']
+    >>> aligned_identifiers1 = ['A', 'B', 'C']
-        aligned_identifiers2 = ['COM12018', 'COM17003']
+    >>> aligned_identifiers2 = ['X', 'Y', 'Z']
-        aligned_metadata1 = ['some_metadata', 'some_data']
+    >>> aligned_metadata1 = ['metaA', 'metaB', 'metaC']
-        aligned_metadata2 = ['some_other_metadata', 'some_more_metadata']
+    >>> aligned_metadata2 = ['metaX', 'metaY', 'metaZ']
-        score = 42
+    >>> score = 42.0
-        output_filename = 'alignment.stockholm'
+    >>> write_stockholm_alignment_with_metadata(aligned_identifiers1, aligned_identifiers2,
-
+    ...                                          aligned_metadata1, aligned_metadata2, score, 'output.sto')
        write_stockholm_alignment_with_metadata(aligned_identifiers1, aligned_identifiers2, aligned_metadata1, aligned_metadata2, score, output_filename)
    """
    with open(output_filename, 'w') as stockholm_file:
        stockholm_file.write("# STOCKHOLM 1.0\n")
@ -77,6 +69,16 @@ def write_stockholm_alignment_with_metadata(aligned_identifiers1, aligned_identi
            stockholm_file.write(f"#=GC METADATA1 {metadata1}\n")
            stockholm_file.write(f"#=GC METADATA2 {metadata2}\n")
        stockholm_file.write(f"#=GF SCORE: {score}\n")
    stockholm_file.close()
 def write_clustal_alignment(sequences, output_filename):
    """
    Write to clustal format
    """
    with open(output_filename, 'w') as clustal_file:
        for sequence in sequences:
            clustal_file.write(f"{sequence.id.ljust(20)} {sequence.seq}\n")
 def write_text_format(aligned_identifiers1, aligned_identifiers2, score, output_filename, aligned_metadata1=None,
@ -96,34 +98,33 @@ def write_text_format(aligned_identifiers1, aligned_identifiers2, score, output_
 def global_alignment_np(sequence1, sequence2, metadata1=None, metadata2=None, gap_penalty=-1, 
-                        match_score=1, mismatch_penalty=-10, fasta_name="alignment", threads=None):
+                        match_score=1, mismatch_penalty=-10, filename="alignment", threads=None):
    """
    Perform global sequence alignment using dynamic programming (Needleman-Wunsch).
    Parameters:
        sequence1 (str): The first sequence to align.
        sequence2 (str): The second sequence to align.
        gap_penalty (int, optional): Penalty for introducing a gap. Default is -1.
        match_score (int, optional): Score for a match. Default is 1.
        mismatch_penalty (int, optional): Penalty for a mismatch. Default is -1.
    Returns:
        tuple: A tuple containing the aligned longer sequence, aligned shorter sequence, and alignment score.
    Description:
        This function performs global sequence alignment between two input sequences, `sequence1` and `sequence2`,
        using the Needleman-Wunsch algorithm. It aligns the sequences based on the specified scoring parameters
        for gap penalties, match scores, and mismatch penalties.
        The function returns a tuple containing the following elements:
-        - The aligned longer sequence (string).
+        - The aligned longer sequence (list of strings) where gaps are indicated by '-' characters.
-        - The aligned shorter sequence (string).
+        - The aligned shorter sequence (list of strings) where gaps are indicated by '-' characters.
        - Aligned metadata for sequence1 (list of strings).
        - Aligned metadata for sequence2 (list of strings).
        - The alignment score (int).
-        The aligned sequences are represented as strings where gaps are indicated by '-' characters.
+    Note:
        If additional metadata is not provided (metadata1 or metadata2 is None), the corresponding aligned_metadata
        lists will also be None.
    Example:
        ```python
        sequence1 = "AGCT"
        sequence2 = "AAGCT"
        aligned_seq1, aligned_seq2, align_metadata1, align_metadata2, score = global_alignment_np(
            sequence1, sequence2, metadata1="ABC", metadata2="XYZ"
        )
        ```
        Additionally, the function saves the alignment as a FASTA file named 'alignment.fasta' and prints a
        human-readable alignment using Biopython's format_alignment function for visualization.
    """
    identifiers1, metadata1 = parse_sequence(sequence1), parse_sequence(metadata1)
    identifiers2, metadata2 = parse_sequence(sequence2), parse_sequence(metadata2)
@ -209,17 +210,72 @@ def global_alignment_np(sequence1, sequence2, metadata1=None, metadata2=None, ga
                padded_sequences1.append(seq1)
                padded_sequences2.append(padded_seq2)
    return padded_sequences1, padded_sequences2, aligned_metadata1, aligned_metadata2, score
 def sequence_alignment(sequence1, sequence2, metadata1=None, metadata2=None, gap_penalty=-1, 
                       match_score=1, mismatch_penalty=-10, filename="alignment", threads=None,
                       stockholm=True, fasta=True, clustal=False):
    """
    Perform global sequence alignment and save the results in various formats.
    Parameters:
        sequence1 (str): The first sequence to align.
        sequence2 (str): The second sequence to align.
        metadata1 (str, optional): Metadata for the first sequence. Default is None.
        metadata2 (str, optional): Metadata for the second sequence. Default is None.
        gap_penalty (int, optional): Penalty for introducing a gap. Default is -1.
        match_score (int, optional): Score for a match. Default is 1.
        mismatch_penalty (int, optional): Penalty for a mismatch. Default is -10.
        filename (str, optional): Name for the output files. Default is "alignment".
        threads (int, optional): Number of threads for parallel execution. Default is None.
        stockholm (bool, optional): Whether to output in Stockholm format. Default is True.
        fasta (bool, optional): Whether to output in FASTA format. Default is True.
    Returns:
        int: The alignment score.
    Description:
        This function performs global sequence alignment between two input sequences, `sequence1` and `sequence2`,
        using the Needleman-Wunsch algorithm. It automatically determines the optimal scoring parameters for gap penalties,
        match scores, and mismatch penalties based on a sample alignment.
        The function saves the alignment in various formats based on the specified options:
        - Stockholm format if `stockholm` is True.
        - FASTA format if `fasta` is True.
        - A text file with the aligned sequences and metadata.
        The alignment score is returned as an integer.
    Example:
        alignment_score = sequence_alignment("AGTACG", "ATGC", metadata1="abc", metadata2="def", gap_penalty=-2,
                                            match_score=2, mismatch_penalty=-1, filename="my_alignment",
                                            threads=4, stockholm=True, fasta=True)
    """
    padded_sequences1, padded_sequences2, aligned_metadata1, \
        aligned_metadata2, score = global_alignment_np(sequence1, sequence2, metadata1, metadata2, gap_penalty, 
                                                       match_score, mismatch_penalty, threads)
    if metadata1 is not None and metadata2 is not None:
        if stockholm is True:
            write_stockholm_alignment_with_metadata(padded_sequences1, padded_sequences2, aligned_metadata1, 
-                                                aligned_metadata2, score, f'{fasta_name}.sto')
+                                                    aligned_metadata2, score, f'{filename}.sto')
-        write_text_format(padded_sequences1, padded_sequences2, score, f'{fasta_name}-text.txt', 
+        if fasta is True:
            write_text_format(padded_sequences1, padded_sequences2, score, f'{filename}-text.txt', 
                            aligned_metadata1, aligned_metadata2)
    else:
-        write_text_format(padded_sequences1, padded_sequences2, score, f'{fasta_name}-text.txt')
+            write_text_format(padded_sequences1, padded_sequences2, score, f'{filename}-text.txt')
    record1 = SeqRecord(Seq("|".join(padded_sequences1)), id="sequence1")
    record2 = SeqRecord(Seq("|".join(padded_sequences2)), id="sequence2")
-    SeqIO.write([record1, record2], f'{fasta_name}.fasta', "fasta")
+    if fasta is True:
        SeqIO.write([record1, record2], f'{filename}.fasta', "fasta")
    if clustal is True:
        sequences = [SeqRecord(Seq("|".join(padded_sequences1)), id="sequence1"),
                     SeqRecord(Seq("|".join(padded_sequences2)), id="sequence2")]
        write_clustal_alignment(sequences, f'{filename}.aln')
    return score
    return '|'.join(aligned_identifiers1), '|'.join(aligned_identifiers2), score
--- a/sample_data/logdata.csv
+++ b/sample_data/logdata.csv