Source code for openprotein.align.msa

"""MSA results represented as a future."""

from typing import Iterator

from openprotein import config
from openprotein.base import APISession
from openprotein.jobs import Future, JobType
from openprotein.prompt import Prompt

from . import api
from .future import AlignFuture
from .schemas import (
    AbNumberJob,
    ClustalOJob,
    MafftJob,
    MSAJob,
    MSASamplingMethod,
)


# TODO - AbNumber should probably be a different subclass, because it supports an additional `get` API for the antibody numbering
class MSAFuture(AlignFuture, Future):
    """
    Represents a future for MSA (Multiple Sequence Alignment) results.

    Parameters
    ----------
    session : APISession
        An instance of APISession for API interactions.
    job : MSAJob | MafftJob | ClustalOJob | AbNumberJob
        The MSA job.
    page_size : int, optional
        The number of results to fetch in a single page. Defaults to
        config.POET_PAGE_SIZE.

    Attributes
    ----------
    session : APISession
        An instance of APISession for API interactions.
    job : MSAJob | MafftJob | ClustalOJob | AbNumberJob
        The MSA job.
    page_size : int
        The number of results to fetch in a single page.
    msa_id : str
        The job ID for the MSA (taken from ``job.job_id``).

    Methods
    -------
    get(verbose=False)
        Retrieve the MSA of the job as an iterator over
        (name, sequence) pairs.
    sample_prompt(...)
        Create a protein sequence prompt from the linked MSA for PoET Jobs.
    """

    job: MSAJob | MafftJob | ClustalOJob | AbNumberJob

    def __init__(
        self,
        session: APISession,
        job: MSAJob | MafftJob | ClustalOJob | AbNumberJob,
        page_size: int = config.POET_PAGE_SIZE,
    ):
        """
        Initialize an MSAFuture instance.

        Parameters
        ----------
        session : APISession
            An instance of APISession for API interactions.
        job : MSAJob | MafftJob | ClustalOJob | AbNumberJob
            The MSA job. The accepted types mirror the class-level ``job``
            attribute annotation.
        page_size : int, optional
            The number of results to fetch in a single page. Defaults to
            config.POET_PAGE_SIZE.
        """
        super().__init__(session, job)
        self.page_size = page_size
        # Read the id back from self.job (after the base-class initializer
        # has run) rather than from the raw argument.
        self.msa_id = self.job.job_id

    def get(self, verbose: bool = False) -> Iterator[tuple[str, str]]:
        """
        Retrieve the MSA of the job.

        Parameters
        ----------
        verbose : bool, optional
            Accepted for interface compatibility. It is currently not used
            by this method and is not forwarded to the API call.
            Defaults to False.

        Returns
        -------
        Iterator[tuple[str, str]]
            An iterator over names and sequences of the MSA data.
        """
        return api.get_msa(session=self.session, job_id=self.job.job_id)

    def sample_prompt(
        self,
        num_sequences: int | None = None,
        num_residues: int | None = None,
        method: MSASamplingMethod = MSASamplingMethod.NEIGHBORS_NONGAP_NORM_NO_LIMIT,
        homology_level: float = 0.8,
        max_similarity: float = 1.0,
        min_similarity: float = 0.0,
        always_include_seed_sequence: bool = False,
        num_ensemble_prompts: int = 1,
        random_seed: int | None = None,
    ) -> Prompt:
        """
        Create a protein sequence prompt from the linked MSA for PoET Jobs.

        Parameters
        ----------
        num_sequences : int, optional
            Maximum number of sequences in the prompt. Must be less than 100.
        num_residues : int, optional
            Maximum number of residues (tokens) in the prompt. Must be less
            than 24577.
        method : MSASamplingMethod, optional
            Method to use for MSA sampling. Defaults to
            NEIGHBORS_NONGAP_NORM_NO_LIMIT.
        homology_level : float, optional
            Level of homology for sequences in the MSA (neighbors methods
            only). Must be between 0 and 1. Defaults to 0.8.
        max_similarity : float, optional
            Maximum similarity between sequences in the MSA and the seed.
            Must be between 0 and 1. Defaults to 1.0.
        min_similarity : float, optional
            Minimum similarity between sequences in the MSA and the seed.
            Must be between 0 and 1. Defaults to 0.0.
        always_include_seed_sequence : bool, optional
            Whether to always include the seed sequence in the MSA.
            Defaults to False.
        num_ensemble_prompts : int, optional
            Number of ensemble jobs to run. Defaults to 1.
        random_seed : int, optional
            Seed for random number generation. Defaults to a random number
            between 0 and 2**32-1.

        Returns
        -------
        Prompt
            A Prompt instance for the created prompt job.

        Raises
        ------
        InvalidParameterError
            If provided parameter values are not in the allowed range.
        MissingParameterError
            If both or none of 'num_sequences' and 'num_residues' are
            specified.
        """
        # Submit the sampling request against this future's MSA; all sampling
        # options are forwarded unchanged.
        job = api.prompt_post(
            self.session,
            msa_id=self.msa_id,
            num_sequences=num_sequences,
            num_residues=num_residues,
            method=method,
            homology_level=homology_level,
            max_similarity=max_similarity,
            min_similarity=min_similarity,
            always_include_seed_sequence=always_include_seed_sequence,
            num_ensemble_prompts=num_ensemble_prompts,
            random_seed=random_seed,
        )
        # The number of replicates on the resulting Prompt is the number of
        # ensemble prompts that was requested.
        return Prompt.create(
            session=self.session, job=job, num_replicates=num_ensemble_prompts
        )