Source code for openprotein.embeddings.poet2

"""Proprietary PoET-2 model providing top-class performance on protein engineering tasks."""

from collections.abc import Sequence
from typing import TYPE_CHECKING, Literal

import numpy as np

from openprotein.base import APISession
from openprotein.common import ModelMetadata, ReductionType
from openprotein.data import AssayDataset, AssayMetadata
from openprotein.prompt import Prompt, PromptAPI, Query
from openprotein.protein import Protein
from openprotein.utils import uuid

from .future import (
    EmbeddingsGenerateFuture,
    EmbeddingsResultFuture,
    EmbeddingsScoreFuture,
)
from .models import EmbeddingModel
from .poet import PoETModel

if TYPE_CHECKING:
    from openprotein.predictor import PredictorModel
    from openprotein.svd import SVDModel
    from openprotein.umap import UMAPModel


[docs] class PoET2Model(PoETModel, EmbeddingModel): """ Class for OpenProtein's foundation model PoET 2. PoET functions are dependent on a prompt supplied via the prompt endpoints. Examples -------- View specific model details (including supported tokens) with the `?` operator. Examples -------- >>> import openprotein >>> session = openprotein.connect(username="user", password="password") >>> session.embedding.poet2.<embeddings_method> """ model_id = "poet-2" # TODO - Add model to explicitly require prompt_id
[docs] def __init__( self, session: APISession, model_id: str, metadata: ModelMetadata | None = None, ): super().__init__(session=session, model_id=model_id, metadata=metadata)
def __resolve_query( self, query: str | bytes | Protein | Query | None = None, ) -> str | None: if query is None: query_id = None elif ( isinstance(query, Protein) or isinstance(query, bytes) or (isinstance(query, str) and not uuid.is_valid_uuid(query)) ): prompt_api = getattr(self.session, "prompt", None) assert isinstance(prompt_api, PromptAPI) query_ = prompt_api.create_query(query=query) query_id = query_.id else: query_id = query if isinstance(query, str) else query.id return query_id
[docs] def embed( self, sequences: list[bytes], reduction: ReductionType | None = ReductionType.MEAN, prompt: str | Prompt | None = None, query: str | bytes | Protein | Query | None = None, use_query_structure_in_decoder: bool = True, decoder_type: Literal["mlm", "clm"] | None = None, ) -> EmbeddingsResultFuture: """ Embed sequences using this model. Parameters ---------- sequences : list of bytes Sequences to embed. reduction : ReductionType or None, optional Embeddings reduction to use (e.g. mean). Default is ReductionType.MEAN. prompt : str or Prompt or None, optional Prompt or prompt_id or prompt from an align workflow to condition PoET model. query : str or bytes or Protein or Query or None, optional Query to use with prompt. use_query_structure_in_decoder : bool, optional Whether to use query structure in decoder. Default is True. decoder_type : {'mlm', 'clm'} or None, optional Decoder type. Default is None. Returns ------- EmbeddingsResultFuture A future object that returns the embeddings of the submitted sequences. """ query_id = self.__resolve_query(query=query) return super().embed( sequences=sequences, reduction=reduction, prompt=prompt, query_id=query_id, use_query_structure_in_decoder=use_query_structure_in_decoder, decoder_type=decoder_type, )
[docs] def logits( self, sequences: list[bytes], prompt: str | Prompt | None = None, query: str | bytes | Protein | Query | None = None, use_query_structure_in_decoder: bool = True, decoder_type: Literal["mlm", "clm"] | None = None, ) -> EmbeddingsResultFuture: """ Compute logit embeddings for sequences using this model. Parameters ---------- sequences : list of bytes Sequences to analyze. prompt : str or Prompt or None, optional Prompt or prompt_id or prompt from an align workflow to condition PoET model. query : str or bytes or Protein or Query or None, optional Query to use with prompt. use_query_structure_in_decoder : bool, optional Whether to use query structure in decoder. Default is True. decoder_type : {'mlm', 'clm'} or None, optional Decoder type. Default is None. Returns ------- EmbeddingsResultFuture A future object that returns the logits of the submitted sequences. """ query_id = self.__resolve_query(query=query) return super().logits( sequences=sequences, prompt=prompt, query_id=query_id, use_query_structure_in_decoder=use_query_structure_in_decoder, decoder_type=decoder_type, )
[docs] def score( self, sequences: list[bytes], prompt: str | Prompt | None = None, query: str | bytes | Protein | Query | None = None, use_query_structure_in_decoder: bool = True, decoder_type: Literal["mlm", "clm"] | None = None, ) -> EmbeddingsScoreFuture: """ Score query sequences using the specified prompt. Parameters ---------- sequences : list of bytes Sequences to score. prompt : str or Prompt or None, optional Prompt or prompt_id or prompt from an align workflow to condition PoET model. query : str or bytes or Protein or Query or None, optional Query to use with prompt. use_query_structure_in_decoder : bool, optional Whether to use query structure in decoder. Default is True. decoder_type : {'mlm', 'clm'} or None, optional Decoder type. Default is None. Returns ------- EmbeddingsScoreFuture A future object that returns the scores of the submitted sequences. """ query_id = self.__resolve_query(query=query) return super().score( sequences=sequences, prompt=prompt, query_id=query_id, use_query_structure_in_decoder=use_query_structure_in_decoder, decoder_type=decoder_type, )
[docs] def indel( self, sequence: bytes, prompt: str | Prompt | None = None, query: str | bytes | Protein | Query | None = None, use_query_structure_in_decoder: bool = True, decoder_type: Literal["mlm", "clm"] | None = None, insert: str | None = None, delete: list[int] | None = None, **kwargs, ) -> EmbeddingsScoreFuture: """ Score all indels of the query sequence using the specified prompt. Parameters ---------- sequence : bytes Sequence to analyze. prompt : str or Prompt or None, optional Prompt from an align workflow to condition the PoET model. query : str or bytes or Protein or Query or None, optional Query to use with prompt. use_query_structure_in_decoder : bool, optional Whether to use query structure in decoder. Default is True. decoder_type : {'mlm', 'clm'} or None, optional Decoder type. Default is None. insert : str or None, optional Insertion fragment at each site. delete : list of int or None, optional Range of size of fragment to delete at each site. **kwargs Additional keyword arguments. Returns ------- EmbeddingsScoreFuture A future object that returns the scores of the indel-ed sequence. Raises ------ ValueError If neither insert nor delete is provided. """ query_id = self.__resolve_query(query=query) return super().indel( sequence=sequence, prompt=prompt, query_id=query_id, use_query_structure_in_decoder=use_query_structure_in_decoder, decoder_type=decoder_type, insert=insert, delete=delete, )
[docs] def single_site( self, sequence: bytes, prompt: str | Prompt | None = None, query: str | bytes | Protein | Query | None = None, use_query_structure_in_decoder: bool = True, decoder_type: Literal["mlm", "clm"] | None = None, ) -> EmbeddingsScoreFuture: """ Score all single substitutions of the query sequence using the specified prompt. Parameters ---------- sequence : bytes Sequence to analyze. prompt : str or Prompt or None, optional Prompt or prompt_id or prompt from an align workflow to condition PoET model. query : str or bytes or Protein or Query or None, optional Query to use with prompt. use_query_structure_in_decoder : bool, optional Whether to use query structure in decoder. Default is True. decoder_type : {'mlm', 'clm'} or None, optional Decoder type. Default is None. Returns ------- EmbeddingsScoreFuture A future object that returns the scores of the mutated sequence. """ query_id = self.__resolve_query(query=query) return super().single_site( sequence=sequence, prompt=prompt, query_id=query_id, use_query_structure_in_decoder=use_query_structure_in_decoder, decoder_type=decoder_type, )
[docs] def generate( self, prompt: str | Prompt, query: str | bytes | Protein | Query | None = None, use_query_structure_in_decoder: bool = True, num_samples: int = 100, temperature: float = 1.0, topk: float | None = None, topp: float | None = None, max_length: int = 1000, seed: int | None = None, ensemble_weights: Sequence[float] | None = None, ensemble_method: Literal["arithmetic", "geometric"] | None = None, ) -> EmbeddingsGenerateFuture: """ Generate protein sequences conditioned on a prompt. Parameters ---------- prompt : str or Prompt Prompt from an align workflow to condition PoET model. query : str or bytes or Protein or Query or None, optional Query to use with prompt. use_query_structure_in_decoder : bool, optional Whether to use query structure in decoder. Default is True. num_samples : int, optional The number of samples to generate. Default is 100. temperature : float, optional The temperature for sampling. Higher values produce more random outputs. Default is 1.0. topk : float or None, optional The number of top-k residues to consider during sampling. Default is None. topp : float or None, optional The cumulative probability threshold for top-p sampling. Default is None. max_length : int, optional The maximum length of generated proteins. Default is 1000. seed : int or None, optional Seed for random number generation. Default is None. ensemble_weights : Sequence of float or None, optional Weights for combining likelihoods from multiple prompts in the ensemble. The length of this sequence must match the number of prompts. All weights must be finite. If ensemble_method is "arithmetic", then weights must also be non-negative, and have a non-zero sum. ensemble_method : {'arithmetic', 'geometric'} or None, optional Method used to combine likelihoods from multiple prompts in the ensemble. If "arithmetic", the weighted mean is used; if "geometric", the weighted geometric mean is used. If None (default), the method defaults to "arithmetic", but this behavior may change in the future. Returns ------- EmbeddingsGenerateFuture A future object representing the status and information about the generation job. """ query_id = self.__resolve_query(query=query) if ensemble_weights is not None: # NB: for now, ensemble_method is None -> ensemble_method == "arithmetic" if ensemble_method is None or (ensemble_method == "arithmetic"): assert all(w >= 0 for w in ensemble_weights) assert sum(ensemble_weights) >= 0 assert np.isfinite(np.array(ensemble_weights)).all() if isinstance(prompt, Prompt): assert len(ensemble_weights) == prompt.num_replicates, ( f"Number of ensemble weights ({len(ensemble_weights)}) must be " f"equal to the number of prompts ({prompt.num_replicates})" ) return super().generate( prompt=prompt, num_samples=num_samples, temperature=temperature, topk=topk, topp=topp, max_length=max_length, seed=seed, query_id=query_id, use_query_structure_in_decoder=use_query_structure_in_decoder, ensemble_weights=ensemble_weights, ensemble_method=ensemble_method, )
[docs] def fit_svd( self, sequences: list[bytes] | list[str] | None = None, assay: AssayDataset | None = None, n_components: int = 1024, reduction: ReductionType | None = None, prompt: str | Prompt | None = None, query: str | bytes | Protein | Query | None = None, use_query_structure_in_decoder: bool = True, ) -> "SVDModel": """ Fit an SVD on the embedding results of PoET. This function will create an SVDModel based on the embeddings from this model as well as the hyperparameters specified in the arguments. Parameters ---------- sequences : list of bytes or list of str or None, optional Sequences to fit SVD. If None, assay must be provided. assay : AssayDataset or None, optional Assay containing sequences to fit SVD. Ignored if sequences are provided. n_components : int, optional Number of components in SVD. Determines output shapes. Default is 1024. reduction : ReductionType or None, optional Embeddings reduction to use (e.g. mean). prompt : str or Prompt or None, optional Prompt from an align workflow to condition PoET model. query : str or bytes or Protein or Query or None, optional Query to use with prompt. use_query_structure_in_decoder : bool, optional Whether to use query structure in decoder. Default is True. Returns ------- SVDModel A future that represents the fitted SVD model. """ query_id = self.__resolve_query(query=query) return super().fit_svd( sequences=sequences, assay=assay, n_components=n_components, reduction=reduction, prompt=prompt, query_id=query_id, use_query_structure_in_decoder=use_query_structure_in_decoder, )
[docs] def fit_umap( self, sequences: list[bytes] | list[str] | None = None, assay: AssayDataset | None = None, n_components: int = 2, reduction: ReductionType | None = ReductionType.MEAN, prompt: str | Prompt | None = None, query: str | bytes | Protein | Query | None = None, use_query_structure_in_decoder: bool = True, ) -> "UMAPModel": """ Fit a UMAP on assay using PoET and hyperparameters. This function will create a UMAP based on the embeddings from this PoET model as well as the hyperparameters specified in the arguments. Parameters ---------- sequences : list of bytes or list of str or None, optional Sequences to fit UMAP. If None, assay must be provided. assay : AssayDataset or None, optional Assay containing sequences to fit UMAP. Ignored if sequences are provided. n_components : int, optional Number of components in UMAP fit. Determines output shapes. Default is 2. reduction : ReductionType or None, optional Embeddings reduction to use (e.g. mean). Default is ReductionType.MEAN. prompt : str or Prompt or None, optional Prompt from an align workflow to condition PoET model. query : str or bytes or Protein or Query or None, optional Query to use with prompt. use_query_structure_in_decoder : bool, optional Whether to use query structure in decoder. Default is True. Returns ------- UMAPModel A future that represents the fitted UMAP model. """ query_id = self.__resolve_query(query=query) return super().fit_umap( sequences=sequences, assay=assay, n_components=n_components, reduction=reduction, prompt=prompt, query_id=query_id, use_query_structure_in_decoder=use_query_structure_in_decoder, )
[docs] def fit_gp( self, assay: AssayMetadata | AssayDataset | str, properties: list[str], prompt: str | Prompt | None = None, query: str | bytes | Protein | Query | None = None, use_query_structure_in_decoder: bool = True, **kwargs, ) -> "PredictorModel": """ Fit a Gaussian Process (GP) on assay using this embedding model and hyperparameters. Parameters ---------- assay : AssayMetadata or AssayDataset or str Assay to fit GP on. properties : list of str Properties in the assay to fit the GP on. prompt : str or Prompt or None, optional Prompt from an align workflow to condition PoET model. query : str or bytes or Protein or Query or None, optional Query to use with prompt. use_query_structure_in_decoder : bool, optional Whether to use query structure in decoder. Default is True. **kwargs Additional keyword arguments. Returns ------- PredictorModel A future that represents the trained predictor model. """ query_id = self.__resolve_query(query=query) return super().fit_gp( assay=assay, properties=properties, prompt=prompt, query_id=query_id, use_query_structure_in_decoder=use_query_structure_in_decoder, **kwargs, )