# Source code for openprotein.align.align

"""Align API interface for creating alignments and MSAs (multiple sequence alignments) which can be used for other protein tasks."""

from collections.abc import Sequence
from io import BytesIO
from typing import BinaryIO, Iterator

from openprotein.base import APISession
from openprotein.errors import DeprecationError
from openprotein.jobs import Job
from openprotein.protein import Protein

from . import api
from .msa import MSAFuture
from .schemas import AbNumberScheme, AlignType


class AlignAPI:
    """
    Align API interface for creating alignments and MSAs.

    MSAs (multiple sequence alignments) produced here can be used as inputs
    for other protein tasks. Every method issues its request through the
    `APISession` supplied at construction time.
    """

    def __init__(self, session: APISession):
        """
        Store the session used for all subsequent API calls.

        Parameters
        ----------
        session : APISession
            Session through which requests are sent.
        """
        self.session = session

    @staticmethod
    def _build_sequence_stream(
        sequences: Sequence[bytes | str],
        names: Sequence[str] | None = None,
    ) -> BytesIO:
        """
        Serialize sequences into an in-memory file for the `*_file` methods.

        Without names, the sequences are written one per line (a headerless,
        single-column CSV). With names, each entry is written as a FASTA
        record: a ``>name`` header line followed by the sequence line.

        Parameters
        ----------
        sequences : Sequence[bytes or str]
            Sequences to serialize; `str` values are encoded with
            `str.encode()` defaults.
        names : Sequence[str], optional
            Sequence names; must be the same length as `sequences`.

        Returns
        -------
        BytesIO
            Stream positioned at the start of the serialized content.

        Raises
        ------
        ValueError
            If names and sequences are not the same length.
        """
        if names is not None and len(names) != len(sequences):
            # ValueError is a subclass of Exception, so callers that caught
            # the previously raised bare Exception keep working.
            raise ValueError(
                f"Names and sequences must be same length, but were {len(names)} and {len(sequences)}"
            )

        def as_bytes(value: bytes | str) -> bytes:
            return value.encode() if isinstance(value, str) else value

        if names is None:
            # as CSV
            lines = [as_bytes(sequence) for sequence in sequences]
        else:
            # as fasta
            lines = []
            for name, sequence in zip(names, sequences):
                lines.append(b">" + as_bytes(name))
                lines.append(as_bytes(sequence))
        return BytesIO(b"\n".join(lines))

    def mafft(
        self,
        sequences: Sequence[bytes | str],
        names: Sequence[str] | None = None,
        auto: bool = True,
        ep: float | None = None,
        op: float | None = None,
    ) -> MSAFuture:
        """
        Align sequences using the `mafft` algorithm.

        The sequences are serialized in memory (FASTA when `names` is given,
        one sequence per line otherwise) and submitted via `mafft_file`.
        Set `auto` to True to automatically attempt the best parameters.
        Leave a parameter as None to use system defaults.

        Parameters
        ----------
        sequences : Sequence[bytes or str]
            Sequences to align.
        names : Sequence[str], optional
            Optional list of sequence names, must be the same length as
            sequences if provided.
        auto : bool, default=True
            Set to True to automatically set algorithm parameters.
        ep : float, optional
            MAFFT "ep" parameter (offset value, which MAFFT documents as
            working like a gap extension penalty). If None, uses system
            default.
        op : float, optional
            MAFFT "op" parameter (gap opening penalty). If None, uses system
            default.

        Returns
        -------
        MSAFuture
            Future object for the submitted alignment job.

        Raises
        ------
        ValueError
            If names and sequences are not the same length.
        """
        stream = self._build_sequence_stream(sequences, names)
        return self.mafft_file(stream, auto=auto, ep=ep, op=op)

    def mafft_file(
        self,
        file,
        auto: bool = True,
        ep: float | None = None,
        op: float | None = None,
    ) -> MSAFuture:
        """
        Align sequences from a file using the `mafft` algorithm.

        Sequences can be provided as FASTA or CSV formats. If CSV, the file
        must be headerless with either a single sequence column or name,
        sequence columns.

        Set `auto` to True to automatically attempt the best parameters.
        Leave a parameter as None to use system defaults.

        Parameters
        ----------
        file : file-like object
            Sequences to align in FASTA or CSV format.
        auto : bool, default=True
            Set to True to automatically set algorithm parameters.
        ep : float, optional
            MAFFT "ep" parameter (offset value, which MAFFT documents as
            working like a gap extension penalty). If None, uses system
            default.
        op : float, optional
            MAFFT "op" parameter (gap opening penalty). If None, uses system
            default.

        Returns
        -------
        MSAFuture
            Future object for the submitted alignment job.
        """
        job = api.mafft_post(self.session, file, auto=auto, ep=ep, op=op)
        return MSAFuture.create(session=self.session, job=job)

    def clustalo(
        self,
        sequences: Sequence[bytes | str],
        names: Sequence[str] | None = None,
        clustersize: int | None = None,
        iterations: int | None = None,
    ) -> MSAFuture:
        """
        Align sequences using the `clustal omega` algorithm.

        The sequences are serialized in memory (FASTA when `names` is given,
        one sequence per line otherwise) and submitted via `clustalo_file`.

        Parameters
        ----------
        sequences : Sequence[bytes or str]
            Sequences to align.
        names : Sequence[str], optional
            Optional list of sequence names, must be the same length as
            sequences if provided.
        clustersize : int, optional
            Maximum number of sequences per cluster during guide tree
            generation. If None, uses the default value.
        iterations : int, optional
            Number of refinement iterations performed during alignment.
            If None, uses the default value.

        Returns
        -------
        MSAFuture
            Future object for the submitted alignment job.

        Raises
        ------
        ValueError
            If names and sequences are not the same length.
        """
        stream = self._build_sequence_stream(sequences, names)
        return self.clustalo_file(
            stream, clustersize=clustersize, iterations=iterations
        )

    def clustalo_file(
        self,
        file,
        clustersize: int | None = None,
        iterations: int | None = None,
    ) -> MSAFuture:
        """
        Align sequences from a file using the `clustal omega` algorithm.

        Sequences can be provided as FASTA or CSV formats. If CSV, the file
        must be headerless with either a single sequence column or name,
        sequence columns.

        Parameters
        ----------
        file : file-like object
            Sequences to align in FASTA or CSV format.
        clustersize : int, optional
            Maximum number of sequences per cluster during guide tree
            generation. If None, uses the default value.
        iterations : int, optional
            Number of refinement iterations performed during alignment.
            If None, uses the default value.

        Returns
        -------
        MSAFuture
            Future object for the submitted alignment job.
        """
        job = api.clustalo_post(
            self.session, file, clustersize=clustersize, iterations=iterations
        )
        return MSAFuture.create(session=self.session, job=job)

    def abnumber(
        self,
        sequences: Sequence[bytes | str],
        names: Sequence[str] | None = None,
        scheme: AbNumberScheme = AbNumberScheme.CHOTHIA,
    ) -> MSAFuture:
        """
        Align antibody sequences using `AbNumber`.

        The sequences are serialized in memory (FASTA when `names` is given,
        one sequence per line otherwise) and submitted via `abnumber_file`.
        The antibody numbering scheme can be specified.

        Parameters
        ----------
        sequences : Sequence[bytes or str]
            Sequences to align.
        names : Sequence[str], optional
            Optional list of sequence names, must be the same length as
            sequences if provided.
        scheme : AbNumberScheme, default=AbNumberScheme.CHOTHIA
            Antibody numbering scheme.

        Returns
        -------
        MSAFuture
            Future object for the submitted alignment job.

        Raises
        ------
        ValueError
            If names and sequences are not the same length.
        """
        stream = self._build_sequence_stream(sequences, names)
        return self.abnumber_file(stream, scheme=scheme)

    def abnumber_file(
        self, file, scheme: AbNumberScheme = AbNumberScheme.CHOTHIA
    ) -> MSAFuture:
        """
        Align antibody sequences from a file using `AbNumber`.

        Sequences can be provided as FASTA or CSV formats. If CSV, the file
        must be headerless with either a single sequence column or name,
        sequence columns.

        The antibody numbering scheme can be specified.

        Parameters
        ----------
        file : file-like object
            Sequences to align in FASTA or CSV format.
        scheme : AbNumberScheme, default=AbNumberScheme.CHOTHIA
            Antibody numbering scheme.

        Returns
        -------
        MSAFuture
            Future object for the submitted alignment job.
        """
        job = api.abnumber_post(self.session, file, scheme=scheme)
        return MSAFuture.create(session=self.session, job=job)

    def upload_msa(self, msa_file: BinaryIO) -> MSAFuture:
        """
        Upload an MSA from a file.

        Parameters
        ----------
        msa_file : BinaryIO
            Binary file-like object containing a ready-made MSA.

        Returns
        -------
        MSAFuture
            Future object awaiting the contents of the MSA upload.

        Raises
        ------
        APIError
            If there is an issue with the API request.
        """
        job = api.msa_post(self.session, msa_file=msa_file)
        return MSAFuture.create(session=self.session, job=job)

    def create_msa(self, seed: bytes) -> MSAFuture:
        """
        Construct an MSA via homology search with the seed sequence.

        Parameters
        ----------
        seed : bytes
            Seed sequence for the MSA construction.

        Returns
        -------
        MSAFuture
            Future object awaiting the contents of the constructed MSA.

        Raises
        ------
        APIError
            If there is an issue with the API request.
        """
        job = api.msa_post(self.session, seed=seed)
        return MSAFuture.create(session=self.session, job=job)

    def upload_prompt(self, prompt_file: BinaryIO):
        """
        Directly upload a prompt.

        This method is deprecated and always raises. Use `create_prompt` on
        the `prompt` module instead.

        Parameters
        ----------
        prompt_file : BinaryIO
            Binary I/O object representing the prompt file. Unused; kept so
            the signature stays backward-compatible.

        Raises
        ------
        DeprecationError
            Always; this method is no longer supported.
        """
        raise DeprecationError(
            "This method is no longer supported! Use `create_prompt` on the `prompt` module instead."
        )

    def get_prompt(
        self, job: Job, prompt_index: int | None = None
    ) -> Iterator[list[str]]:
        """
        Get prompts for a given job.

        This method is deprecated and always raises. Use `get_prompt` on the
        `prompt` module instead.

        Parameters
        ----------
        job : Job
            The job for which to retrieve data. Unused; kept so the
            signature stays backward-compatible.
        prompt_index : int, optional
            The replicate number for the prompt. Unused; kept so the
            signature stays backward-compatible.

        Raises
        ------
        DeprecationError
            Always; this method is no longer supported.
        """
        raise DeprecationError(
            "This method is no longer supported! Use `get_prompt` on the `prompt` module instead."
        )

    def get_seed(self, job_id: str) -> str:
        """
        Get seed sequence for a given MSA job.

        Parameters
        ----------
        job_id : str
            ID of the MSA job for which to retrieve data.

        Returns
        -------
        str
            Seed sequence that was used to generate the MSA.
        """
        return api.get_seed(session=self.session, job_id=job_id)

    def get_msa(self, job_id: str) -> Iterator[tuple[str, str]]:
        """
        Get generated MSA for a given job.

        Parameters
        ----------
        job_id : str
            ID of the MSA job for which to retrieve data.

        Returns
        -------
        Iterator[tuple[str, str]]
            An iterator over names and sequences of the MSA data.
        """
        return api.get_msa(session=self.session, job_id=job_id)