Source code for openprotein.embeddings.poet

"""Original PoET model handling various protein engineering tasks."""

from typing import TYPE_CHECKING

from openprotein.base import APISession
from openprotein.common import ModelMetadata, ReductionType
from openprotein.data import AssayDataset, AssayMetadata
from openprotein.prompt import Prompt

from . import api
from .future import (
    EmbeddingsGenerateFuture,
    EmbeddingsResultFuture,
    EmbeddingsScoreFuture,
)
from .models import EmbeddingModel

if TYPE_CHECKING:
    from openprotein.predictor import PredictorModel
    from openprotein.svd import SVDModel
    from openprotein.umap import UMAPModel


class PoETModel(EmbeddingModel):
    """
    Class for OpenProtein's foundation model PoET.

    Note
    ----
    PoET functions are dependent on a prompt supplied via the prompt endpoints.

    Examples
    --------
    View specific model details (including supported tokens) with the `?` operator.

    >>> import openprotein
    >>> session = openprotein.connect(username="user", password="password")
    >>> session.embedding.poet.<embeddings_method>
    """

    model_id = "poet"

    # TODO - Add model to explicitly require prompt_id
    def __init__(
        self,
        session: APISession,
        model_id: str,
        metadata: ModelMetadata | None = None,
    ):
        """
        Initialize the PoET model by delegating to the base class.

        Parameters
        ----------
        session : APISession
            Session passed through to the base class initializer.
        model_id : str
            Model identifier passed through to the base class initializer.
        metadata : ModelMetadata or None, optional
            Model metadata passed through to the base class initializer.
        """
        super().__init__(session=session, model_id=model_id, metadata=metadata)

    @staticmethod
    def _resolve_prompt_id(prompt: str | Prompt | None) -> str | None:
        """
        Normalize a prompt argument into a prompt id.

        Parameters
        ----------
        prompt : str or Prompt or None
            A prompt id string, a prompt object, or None.

        Returns
        -------
        str or None
            None when no prompt was given, the string itself when a string
            was given, otherwise the ``id`` attribute of the given object.
        """
        # None and plain strings are passed through unchanged; anything else
        # is expected to expose its identifier as ``.id``.
        if prompt is None or isinstance(prompt, str):
            return prompt
        return prompt.id

    def embed(
        self,
        sequences: list[bytes],
        prompt: str | Prompt | None = None,
        reduction: ReductionType | None = ReductionType.MEAN,
        **kwargs,
    ) -> EmbeddingsResultFuture:
        """
        Embed sequences using the PoET model.

        Parameters
        ----------
        sequences : list of bytes
            Sequences to embed.
        prompt : str or Prompt or None, optional
            Prompt from an align workflow to condition the PoET model.
        reduction : ReductionType or None, optional
            Embeddings reduction to use (e.g., mean). Default is
            ReductionType.MEAN.
        **kwargs
            Additional keyword arguments forwarded to the base class
            ``embed``.

        Returns
        -------
        EmbeddingsResultFuture
            Future object that returns the embeddings of the submitted
            sequences.
        """
        return super().embed(
            sequences=sequences,
            reduction=reduction,
            prompt_id=self._resolve_prompt_id(prompt),
            **kwargs,
        )

    def logits(
        self,
        sequences: list[bytes],
        prompt: str | Prompt | None = None,
        **kwargs,
    ) -> EmbeddingsResultFuture:
        """
        Compute logits for sequences using the PoET model.

        Parameters
        ----------
        sequences : list of bytes
            Sequences to analyze.
        prompt : str or Prompt or None, optional
            Prompt from an align workflow to condition the PoET model.
        **kwargs
            Additional keyword arguments forwarded to the base class
            ``logits``.

        Returns
        -------
        EmbeddingsResultFuture
            Future object that returns the logits of the submitted sequences.
        """
        # Forward **kwargs like ``embed`` does; previously they were accepted
        # by this method but silently discarded.
        return super().logits(
            sequences=sequences,
            prompt_id=self._resolve_prompt_id(prompt),
            **kwargs,
        )

    def attn(self, *args, **kwargs):
        """
        Attention is not available for PoET.

        Parameters
        ----------
        *args
            Ignored.
        **kwargs
            Ignored.

        Raises
        ------
        ValueError
            Always raised, as attention is not supported for PoET.
        """
        # Arguments are accepted (and ignored) so that any call shape reaches
        # the ValueError below instead of failing earlier with a TypeError.
        raise ValueError("Attn not yet supported for poet")

    def score(
        self,
        sequences: list[bytes],
        prompt: str | Prompt | None = None,
        **kwargs,
    ) -> EmbeddingsScoreFuture:
        """
        Score query sequences using the specified prompt.

        Parameters
        ----------
        sequences : list of bytes
            Sequences to score.
        prompt : str or Prompt or None, optional
            Prompt from an align workflow to condition the PoET model.
        **kwargs
            Additional keyword arguments forwarded to the score request.

        Returns
        -------
        EmbeddingsScoreFuture
            Future object that returns the scores of the submitted sequences.
        """
        return EmbeddingsScoreFuture.create(
            session=self.session,
            job=api.request_score_post(
                session=self.session,
                model_id=self.id,
                prompt_id=self._resolve_prompt_id(prompt),
                sequences=sequences,
                **kwargs,
            ),
        )

    def indel(
        self,
        sequence: bytes,
        prompt: str | Prompt | None = None,
        insert: str | None = None,
        delete: list[int] | None = None,
        **kwargs,
    ) -> EmbeddingsScoreFuture:
        """
        Score all indels of the query sequence using the specified prompt.

        Parameters
        ----------
        sequence : bytes
            Sequence to analyze.
        prompt : str or Prompt or None, optional
            Prompt from an align workflow to condition the PoET model.
        insert : str or None, optional
            Insertion fragment at each site.
        delete : list of int or None, optional
            Range of size of fragment to delete at each site.
        **kwargs
            Additional keyword arguments forwarded to the indel score
            request.

        Returns
        -------
        EmbeddingsScoreFuture
            Future object that returns the scores of the indel-ed sequence.

        Raises
        ------
        ValueError
            If neither insert nor delete is provided.
        """
        # Truthiness check: an empty string / empty list counts as "not
        # provided", same as None.
        if not insert and not delete:
            raise ValueError("Expected insert and/or delete to be provided")
        return EmbeddingsScoreFuture.create(
            session=self.session,
            job=api.request_score_indel_post(
                session=self.session,
                model_id=self.id,
                base_sequence=sequence,
                prompt_id=self._resolve_prompt_id(prompt),
                insert=insert,
                delete=delete,
                **kwargs,
            ),
        )

    def single_site(
        self,
        sequence: bytes,
        prompt: str | Prompt | None = None,
        **kwargs,
    ) -> EmbeddingsScoreFuture:
        """
        Score all single substitutions of the query sequence using the specified prompt.

        Parameters
        ----------
        sequence : bytes
            Sequence to analyze.
        prompt : str or Prompt or None, optional
            Prompt from an align workflow to condition the PoET model.
        **kwargs
            Additional keyword arguments forwarded to the single-site score
            request.

        Returns
        -------
        EmbeddingsScoreFuture
            Future object that returns the scores of the mutated sequence.
        """
        return EmbeddingsScoreFuture.create(
            session=self.session,
            job=api.request_score_single_site_post(
                session=self.session,
                model_id=self.id,
                base_sequence=sequence,
                prompt_id=self._resolve_prompt_id(prompt),
                **kwargs,
            ),
        )

    def generate(
        self,
        prompt: str | Prompt,
        num_samples: int = 100,
        temperature: float = 1.0,
        topk: float | None = None,
        topp: float | None = None,
        max_length: int = 1000,
        seed: int | None = None,
        **kwargs,
    ) -> EmbeddingsGenerateFuture:
        """
        Generate protein sequences conditioned on a prompt.

        Parameters
        ----------
        prompt : str or Prompt
            Prompt from an align workflow to condition the PoET model.
        num_samples : int, optional
            Number of samples to generate. Default is 100.
        temperature : float, optional
            Temperature for sampling. Higher values produce more random
            outputs. Default is 1.0.
        topk : float or None, optional
            Number of top-k residues to consider during sampling. Default is
            None.
        topp : float or None, optional
            Cumulative probability threshold for top-p sampling. Default is
            None.
        max_length : int, optional
            Maximum length of generated proteins. Default is 1000.
        seed : int or None, optional
            Seed for random number generation. Default is None.
        **kwargs
            Additional keyword arguments forwarded to the generate request.

        Returns
        -------
        EmbeddingsGenerateFuture
            Future object representing the status and information about the
            generation job.
        """
        # The prompt is required here, so it is resolved inline rather than
        # through the None-tolerant helper used by the other methods.
        prompt_id = prompt if isinstance(prompt, str) else prompt.id
        return EmbeddingsGenerateFuture.create(
            session=self.session,
            job=api.request_generate_post(
                session=self.session,
                model_id=self.id,
                num_samples=num_samples,
                temperature=temperature,
                topk=topk,
                topp=topp,
                max_length=max_length,
                # The request function names this parameter ``random_seed``.
                random_seed=seed,
                prompt_id=prompt_id,
                **kwargs,
            ),
        )

    def fit_svd(
        self,
        prompt: str | Prompt | None = None,
        sequences: list[bytes] | list[str] | None = None,
        assay: AssayDataset | None = None,
        n_components: int = 1024,
        reduction: ReductionType | None = None,
        **kwargs,
    ) -> "SVDModel":
        """
        Fit an SVD on the embedding results of PoET.

        This function creates an SVDModel based on the embeddings from this
        model as well as the hyperparameters specified in the arguments.

        Parameters
        ----------
        prompt : str or Prompt or None, optional
            Prompt from an align workflow to condition the PoET model.
        sequences : list of bytes or list of str or None, optional
            Sequences to use for SVD.
        assay : AssayDataset or None, optional
            Assay dataset to use for SVD.
        n_components : int, optional
            Number of components in SVD. Determines output shapes. Default is
            1024.
        reduction : ReductionType or None, optional
            Embeddings reduction to use (e.g., mean).
        **kwargs
            Additional keyword arguments forwarded to the base class
            ``fit_svd``.

        Returns
        -------
        SVDModel
            Future that represents the fitted SVD model.
        """
        return super().fit_svd(
            sequences=sequences,
            assay=assay,
            n_components=n_components,
            reduction=reduction,
            prompt_id=self._resolve_prompt_id(prompt),
            **kwargs,
        )

    def fit_umap(
        self,
        prompt: str | Prompt | None = None,
        sequences: list[bytes] | list[str] | None = None,
        assay: AssayDataset | None = None,
        n_components: int = 2,
        reduction: ReductionType | None = ReductionType.MEAN,
        **kwargs,
    ) -> "UMAPModel":
        """
        Fit a UMAP on assay using PoET and hyperparameters.

        This function creates a UMAP based on the embeddings from this PoET
        model as well as the hyperparameters specified in the arguments.

        Parameters
        ----------
        prompt : str or Prompt or None, optional
            Prompt from an align workflow to condition the PoET model.
        sequences : list of bytes or list of str or None, optional
            Optional sequences to fit UMAP with. Either use sequences or
            assay. Sequences is preferred.
        assay : AssayDataset or None, optional
            Optional assay containing sequences to fit UMAP with. Either use
            sequences or assay. Ignored if sequences are provided.
        n_components : int, optional
            Number of components in UMAP fit. Determines output shapes.
            Default is 2.
        reduction : ReductionType or None, optional
            Embeddings reduction to use (e.g., mean). Default is
            ReductionType.MEAN.
        **kwargs
            Additional keyword arguments forwarded to the base class
            ``fit_umap``.

        Returns
        -------
        UMAPModel
            Future that represents the fitted UMAP model.
        """
        return super().fit_umap(
            sequences=sequences,
            assay=assay,
            n_components=n_components,
            reduction=reduction,
            prompt_id=self._resolve_prompt_id(prompt),
            **kwargs,
        )

    def fit_gp(
        self,
        assay: AssayMetadata | AssayDataset | str,
        properties: list[str],
        prompt: str | Prompt | None = None,
        **kwargs,
    ) -> "PredictorModel":
        """
        Fit a Gaussian Process (GP) on assay using this embedding model and hyperparameters.

        Parameters
        ----------
        assay : AssayMetadata or AssayDataset or str
            Assay to fit GP on.
        properties : list of str
            Properties in the assay to fit the GP on.
        prompt : str or Prompt or None, optional
            Prompt from an align workflow to condition the PoET model.
        **kwargs
            Additional keyword arguments forwarded to the base class
            ``fit_gp``.

        Returns
        -------
        PredictorModel
            Future that represents the trained predictor model.
        """
        return super().fit_gp(
            assay=assay,
            properties=properties,
            prompt_id=self._resolve_prompt_id(prompt),
            **kwargs,
        )