Updating sequence alignment to work with clustal

Changed definitions Working on auto scoring
2023-11-29 22:28:47 +01:00 · 2023-11-29 22:28:47 +01:00 · a09d13f1e5
commit a09d13f1e5
parent afed80ca7f
2 changed files with 100107 additions and 50 deletions
--- a/herrewebpy/bioinformatics/sequence_alignment.py
+++ b/herrewebpy/bioinformatics/sequence_alignment.py
@ -18,7 +18,6 @@ Date: 25 November
 Herreweb
 """

-
 def parse_sequence(sequence):
    """
    Parse a sequence, either as a pandas DataFrame or a string, and return the result.
@ -38,36 +37,29 @@ def parse_sequence(sequence):

 def write_stockholm_alignment_with_metadata(aligned_identifiers1, aligned_identifiers2, aligned_metadata1, aligned_metadata2, score, output_filename):
    """
-    Write an alignment in Stockholm format with metadata as annotations.
+    Write a multiple sequence alignment with associated metadata in Stockholm format to a file.

    Parameters:
-        aligned_identifiers1 (list): List of aligned identifiers for the first sequence.
-        aligned_identifiers2 (list): List of aligned identifiers for the second sequence.
-        aligned_metadata1 (list): List of metadata corresponding to aligned_identifiers1.
-        aligned_metadata2 (list): List of metadata corresponding to aligned_identifiers2.
-        score (int): Alignment score.
-        output_filename (str): Name of the output Stockholm format file.
+    - aligned_identifiers1 (list): List of identifiers for the first aligned sequence.
+    - aligned_identifiers2 (list): List of identifiers for the second aligned sequence.
+    - aligned_metadata1 (list): List of metadata annotations for the first aligned sequence.
+    - aligned_metadata2 (list): List of metadata annotations for the second aligned sequence.
+    - score (float): Alignment score to be included as a global feature.
+    - output_filename (str): Name of the file to write the Stockholm-formatted alignment.

-    Description:
-        This function writes an alignment in the Stockholm format with custom metadata as annotations.
-        It takes two lists of aligned identifiers (aligned_identifiers1 and aligned_identifiers2),
-        two lists of corresponding metadata (aligned_metadata1 and aligned_metadata2), an alignment score,
-        and the desired output filename.
-
-        The function creates a Stockholm file where each sequence in the alignment is represented by its identifier.
-        It includes the metadata as custom annotations (#=GC METADATA1 and #=GC METADATA2) in the Stockholm file.
-
-        The Stockholm format is commonly used for representing sequence alignments in bioinformatics.
+    The function opens the specified file in write mode, writes the Stockholm header,
+    and iterates over aligned sequences and their associated metadata, writing them to the file.
+    The alignment score is also included as a global feature. The file is closed automatically
+    upon exiting the function.

    Example:
-        aligned_identifiers1 = ['COM12018', 'COM17003']
-        aligned_identifiers2 = ['COM12018', 'COM17003']
-        aligned_metadata1 = ['some_metadata', 'some_data']
-        aligned_metadata2 = ['some_other_metadata', 'some_more_metadata']
-        score = 42
-        output_filename = 'alignment.stockholm'
-
-        write_stockholm_alignment_with_metadata(aligned_identifiers1, aligned_identifiers2, aligned_metadata1, aligned_metadata2, score, output_filename)
+    >>> aligned_identifiers1 = ['A', 'B', 'C']
+    >>> aligned_identifiers2 = ['X', 'Y', 'Z']
+    >>> aligned_metadata1 = ['metaA', 'metaB', 'metaC']
+    >>> aligned_metadata2 = ['metaX', 'metaY', 'metaZ']
+    >>> score = 42.0
+    >>> write_stockholm_alignment_with_metadata(aligned_identifiers1, aligned_identifiers2,
+    ...                                          aligned_metadata1, aligned_metadata2, score, 'output.sto')
    """
    with open(output_filename, 'w') as stockholm_file:
        stockholm_file.write("# STOCKHOLM 1.0\n")
@ -77,6 +69,16 @@ def write_stockholm_alignment_with_metadata(aligned_identifiers1, aligned_identi
            stockholm_file.write(f"#=GC METADATA1 {metadata1}\n")
            stockholm_file.write(f"#=GC METADATA2 {metadata2}\n")
        stockholm_file.write(f"#=GF SCORE: {score}\n")
+    stockholm_file.close()
+
+
+def write_clustal_alignment(sequences, output_filename):
+    """
+    Write to clustal format
+    """
+    with open(output_filename, 'w') as clustal_file:
+        for sequence in sequences:
+            clustal_file.write(f"{sequence.id.ljust(20)} {sequence.seq}\n")


 def write_text_format(aligned_identifiers1, aligned_identifiers2, score, output_filename, aligned_metadata1=None,
@ -96,34 +98,33 @@ def write_text_format(aligned_identifiers1, aligned_identifiers2, score, output_


 def global_alignment_np(sequence1, sequence2, metadata1=None, metadata2=None, gap_penalty=-1, 
-                        match_score=1, mismatch_penalty=-10, fasta_name="alignment", threads=None):
+                        match_score=1, mismatch_penalty=-10, filename="alignment", threads=None):
    """
-    Perform global sequence alignment using dynamic programming (Needleman-Wunsch).
-
-    Parameters:
-        sequence1 (str): The first sequence to align.
-        sequence2 (str): The second sequence to align.
-        gap_penalty (int, optional): Penalty for introducing a gap. Default is -1.
-        match_score (int, optional): Score for a match. Default is 1.
-        mismatch_penalty (int, optional): Penalty for a mismatch. Default is -1.
-
-    Returns:
-        tuple: A tuple containing the aligned longer sequence, aligned shorter sequence, and alignment score.
-
    Description:
        This function performs global sequence alignment between two input sequences, `sequence1` and `sequence2`,
        using the Needleman-Wunsch algorithm. It aligns the sequences based on the specified scoring parameters
        for gap penalties, match scores, and mismatch penalties.

        The function returns a tuple containing the following elements:
-        - The aligned longer sequence (string).
-        - The aligned shorter sequence (string).
+        - The aligned longer sequence (list of strings) where gaps are indicated by '-' characters.
+        - The aligned shorter sequence (list of strings) where gaps are indicated by '-' characters.
+        - Aligned metadata for sequence1 (list of strings).
+        - Aligned metadata for sequence2 (list of strings).
        - The alignment score (int).

-        The aligned sequences are represented as strings where gaps are indicated by '-' characters.
+    Note:
+        If additional metadata is not provided (metadata1 or metadata2 is None), the corresponding aligned_metadata
+        lists will also be None.
+
+    Example:
+        ```python
+        sequence1 = "AGCT"
+        sequence2 = "AAGCT"
+        aligned_seq1, aligned_seq2, align_metadata1, align_metadata2, score = global_alignment_np(
+            sequence1, sequence2, metadata1="ABC", metadata2="XYZ"
+        )
+        ```

-        Additionally, the function saves the alignment as a FASTA file named 'alignment.fasta' and prints a
-        human-readable alignment using Biopython's format_alignment function for visualization.
    """
    identifiers1, metadata1 = parse_sequence(sequence1), parse_sequence(metadata1)
    identifiers2, metadata2 = parse_sequence(sequence2), parse_sequence(metadata2)
@ -209,17 +210,72 @@ def global_alignment_np(sequence1, sequence2, metadata1=None, metadata2=None, ga
                padded_sequences1.append(seq1)
                padded_sequences2.append(padded_seq2)
    
+    return padded_sequences1, padded_sequences2, aligned_metadata1, aligned_metadata2, score
+    
+
+def sequence_alignment(sequence1, sequence2, metadata1=None, metadata2=None, gap_penalty=-1, 
+                       match_score=1, mismatch_penalty=-10, filename="alignment", threads=None,
+                       stockholm=True, fasta=True, clustal=False):
+    """
+    Perform global sequence alignment and save the results in various formats.
+
+    Parameters:
+        sequence1 (str): The first sequence to align.
+        sequence2 (str): The second sequence to align.
+        metadata1 (str, optional): Metadata for the first sequence. Default is None.
+        metadata2 (str, optional): Metadata for the second sequence. Default is None.
+        gap_penalty (int, optional): Penalty for introducing a gap. Default is -1.
+        match_score (int, optional): Score for a match. Default is 1.
+        mismatch_penalty (int, optional): Penalty for a mismatch. Default is -10.
+        filename (str, optional): Name for the output files. Default is "alignment".
+        threads (int, optional): Number of threads for parallel execution. Default is None.
+        stockholm (bool, optional): Whether to output in Stockholm format. Default is True.
+        fasta (bool, optional): Whether to output in FASTA format. Default is True.
+
+    Returns:
+        int: The alignment score.
+
+    Description:
+        This function performs global sequence alignment between two input sequences, `sequence1` and `sequence2`,
+        using the Needleman-Wunsch algorithm. It automatically determines the optimal scoring parameters for gap penalties,
+        match scores, and mismatch penalties based on a sample alignment.
+
+        The function saves the alignment in various formats based on the specified options:
+        - Stockholm format if `stockholm` is True.
+        - FASTA format if `fasta` is True.
+        - A text file with the aligned sequences and metadata.
+
+        The alignment score is returned as an integer.
+
+    Example:
+        alignment_score = sequence_alignment("AGTACG", "ATGC", metadata1="abc", metadata2="def", gap_penalty=-2,
+                                            match_score=2, mismatch_penalty=-1, filename="my_alignment",
+                                            threads=4, stockholm=True, fasta=True)
+    """
+    padded_sequences1, padded_sequences2, aligned_metadata1, \
+        aligned_metadata2, score = global_alignment_np(sequence1, sequence2, metadata1, metadata2, gap_penalty, 
+                                                       match_score, mismatch_penalty, threads)
+
    if metadata1 is not None and metadata2 is not None:
-        write_stockholm_alignment_with_metadata(padded_sequences1, padded_sequences2, aligned_metadata1, 
-                                                aligned_metadata2, score, f'{fasta_name}.sto')
-        write_text_format(padded_sequences1, padded_sequences2, score, f'{fasta_name}-text.txt', 
+        if stockholm is True:
+            write_stockholm_alignment_with_metadata(padded_sequences1, padded_sequences2, aligned_metadata1, 
+                                                    aligned_metadata2, score, f'{filename}.sto')
+        if fasta is True:
+            write_text_format(padded_sequences1, padded_sequences2, score, f'{filename}-text.txt', 
                            aligned_metadata1, aligned_metadata2)
    else:
-        write_text_format(padded_sequences1, padded_sequences2, score, f'{fasta_name}-text.txt')
+            write_text_format(padded_sequences1, padded_sequences2, score, f'{filename}-text.txt')

    record1 = SeqRecord(Seq("|".join(padded_sequences1)), id="sequence1")
    record2 = SeqRecord(Seq("|".join(padded_sequences2)), id="sequence2")

-    SeqIO.write([record1, record2], f'{fasta_name}.fasta', "fasta")
+    if fasta is True:
+        SeqIO.write([record1, record2], f'{filename}.fasta', "fasta")
+
+    if clustal is True:
+        sequences = [SeqRecord(Seq("|".join(padded_sequences1)), id="sequence1"),
+                     SeqRecord(Seq("|".join(padded_sequences2)), id="sequence2")]
+        write_clustal_alignment(sequences, f'{filename}.aln')
+    
+    return score

-    return '|'.join(aligned_identifiers1), '|'.join(aligned_identifiers2), score
--- a/sample_data/logdata.csv
+++ b/sample_data/logdata.csv