Source code for openprotein.app.models.embeddings.poet2

from typing import TYPE_CHECKING

from openprotein.base import APISession
from openprotein.protein import Protein
from openprotein.schemas import ModelMetadata, ReductionType
from openprotein.utils import uuid

from ..assaydata import AssayDataset, AssayMetadata
from ..prompt import Prompt, Query
from .base import EmbeddingModel
from .future import (
    EmbeddingsGenerateFuture,
    EmbeddingsResultFuture,
    EmbeddingsScoreFuture,
)
from .poet import PoETModel

if TYPE_CHECKING:
    from openprotein import OpenProtein

    from ..predictor import PredictorModel
    from ..svd import SVDModel
    from ..umap import UMAPModel


class PoET2Model(PoETModel, EmbeddingModel):
    """
    Class for OpenProtein's foundation model PoET 2.

    NB. PoET functions are dependent on a prompt supplied via the align endpoints.

    Examples
    --------
    View specific model details (inc. supported tokens) with the `?` operator.

    .. code-block:: python

        import openprotein
        session = openprotein.connect(username="user", password="password")
        session.embedding.poet2.<embeddings_method>
    """

    model_id = "poet-2"  # TODO - Add model to explicitly require prompt_id

    def __init__(
        self,
        session: "OpenProtein",
        model_id: str,
        metadata: ModelMetadata | None = None,
    ):
        self.session = session
        self.id = model_id
        self._metadata = metadata

    # could add prompt here?
    def __resolve_query(
        self,
        query: str | bytes | Protein | Query | None = None,
    ) -> str | None:
        if query is None:
            query_id = None
        elif (
            isinstance(query, Protein)
            or isinstance(query, bytes)
            or (isinstance(query, str) and not uuid.is_valid_uuid(query))
        ):
            query_ = self.session.prompt.create_query(query=query)
            query_id = query_.id
        else:
            query_id = query if isinstance(query, str) else query.id
        return query_id

    def embed(
        self,
        sequences: list[bytes],
        reduction: ReductionType | None = ReductionType.MEAN,
        prompt: str | Prompt | None = None,
        query: str | bytes | Protein | Query | None = None,
        use_query_structure_in_decoder: bool = True,
    ) -> EmbeddingsResultFuture:
        """
        Embed sequences using this model.

        Parameters
        ----------
        sequences : list[bytes]
            Sequences to embed.
        reduction : ReductionType | None
            Embeddings reduction to use (e.g. mean).
        prompt : str | Prompt | None
            Prompt, or prompt_id of a prompt from an align workflow, to condition the PoET model.
        query : str | bytes | Protein | Query | None
            Query to use with the prompt. Optional.
        use_query_structure_in_decoder : bool
            Whether to use the query structure in the decoder. Defaults to True.

        Returns
        -------
        EmbeddingsResultFuture
            A future object that returns the embeddings of the submitted sequences.
        """
        query_id = self.__resolve_query(query=query)
        return super().embed(
            sequences=sequences,
            reduction=reduction,
            prompt=prompt,
            query_id=query_id,
            use_query_structure_in_decoder=use_query_structure_in_decoder,
        )
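
    # Illustrative usage sketch: embedding a few sequences conditioned on an
    # existing prompt. The prompt id and sequences are placeholders, and
    # retrieving results via ``.wait()`` assumes the same future interface as
    # the other embeddings futures in this SDK.
    #
    #   model = session.embedding.poet2
    #   future = model.embed(
    #       sequences=[b"MSKGEELFTGVVPILVELDG", b"MSKGEELFTGVVPILVELDA"],
    #       reduction=ReductionType.MEAN,
    #       prompt="<prompt-id>",
    #   )
    #   embeddings = future.wait()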

    def logits(
        self,
        sequences: list[bytes],
        prompt: str | Prompt | None = None,
        query: str | bytes | Protein | Query | None = None,
        use_query_structure_in_decoder: bool = True,
    ) -> EmbeddingsResultFuture:
        """
        Compute logit embeddings for sequences using this model.

        Parameters
        ----------
        sequences : list[bytes]
            Sequences to analyse.
        prompt : str | Prompt | None
            Prompt, or prompt_id of a prompt from an align workflow, to condition the PoET model.
        query : str | bytes | Protein | Query | None
            Query to use with the prompt. Optional.
        use_query_structure_in_decoder : bool
            Whether to use the query structure in the decoder. Defaults to True.

        Returns
        -------
        EmbeddingsResultFuture
            A future object that returns the logits of the submitted sequences.
        """
        query_id = self.__resolve_query(query=query)
        return super().logits(
            sequences=sequences,
            prompt=prompt,
            query_id=query_id,
            use_query_structure_in_decoder=use_query_structure_in_decoder,
        )

    def score(
        self,
        sequences: list[bytes],
        prompt: str | Prompt | None = None,
        query: str | bytes | Protein | Query | None = None,
        use_query_structure_in_decoder: bool = True,
    ) -> EmbeddingsScoreFuture:
        """
        Score query sequences using the specified prompt.

        Parameters
        ----------
        sequences : list[bytes]
            Sequences to score.
        prompt : str | Prompt | None
            Prompt, or prompt_id of a prompt from an align workflow, to condition the PoET model.
        query : str | bytes | Protein | Query | None
            Query to use with the prompt. Optional.
        use_query_structure_in_decoder : bool
            Whether to use the query structure in the decoder. Defaults to True.

        Returns
        -------
        EmbeddingsScoreFuture
            A future object that returns the scores of the submitted sequences.
        """
        query_id = self.__resolve_query(query=query)
        return super().score(
            sequences=sequences,
            prompt=prompt,
            query_id=query_id,
            use_query_structure_in_decoder=use_query_structure_in_decoder,
        )
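
    # Illustrative usage sketch: scoring candidate variants against a prompt.
    # "<prompt-id>" is a placeholder; ``.wait()`` is assumed to behave like the
    # other score futures in this SDK.
    #
    #   score_future = session.embedding.poet2.score(
    #       sequences=[b"MSKGEELFTGV", b"MSKGEDLFTGV"],
    #       prompt="<prompt-id>",
    #   )
    #   scores = score_future.wait()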

    def single_site(
        self,
        sequence: bytes,
        prompt: str | Prompt | None = None,
        query: str | bytes | Protein | Query | None = None,
        use_query_structure_in_decoder: bool = True,
    ) -> EmbeddingsScoreFuture:
        """
        Score all single substitutions of the query sequence using the specified prompt.

        Parameters
        ----------
        sequence : bytes
            Sequence to analyse.
        prompt : str | Prompt | None
            Prompt, or prompt_id of a prompt from an align workflow, to condition the PoET model.
        query : str | bytes | Protein | Query | None
            Query to use with the prompt. Optional.
        use_query_structure_in_decoder : bool
            Whether to use the query structure in the decoder. Defaults to True.

        Returns
        -------
        EmbeddingsScoreFuture
            A future object that returns the scores of the mutated sequences.
        """
        query_id = self.__resolve_query(query=query)
        return super().single_site(
            sequence=sequence,
            prompt=prompt,
            query_id=query_id,
            use_query_structure_in_decoder=use_query_structure_in_decoder,
        )
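
    # Illustrative usage sketch: scoring every single-site substitution of a
    # wild-type sequence. The sequence and prompt id are placeholders.
    #
    #   ssp_future = session.embedding.poet2.single_site(
    #       sequence=b"MSKGEELFTGV",
    #       prompt="<prompt-id>",
    #   )
    #   site_scores = ssp_future.wait()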

    def generate(
        self,
        prompt: str | Prompt,
        query: str | bytes | Protein | Query | None = None,
        use_query_structure_in_decoder: bool = True,
        num_samples: int = 100,
        temperature: float = 1.0,
        topk: float | None = None,
        topp: float | None = None,
        max_length: int = 1000,
        seed: int | None = None,
    ) -> EmbeddingsGenerateFuture:
        """
        Generate protein sequences conditioned on a prompt.

        Parameters
        ----------
        prompt : str | Prompt
            Prompt, or prompt_id of a prompt from an align workflow, to condition the PoET model.
        query : str | bytes | Protein | Query | None
            Query to use with the prompt. Optional.
        use_query_structure_in_decoder : bool
            Whether to use the query structure in the decoder. Defaults to True.
        num_samples : int, optional
            The number of samples to generate, by default 100.
        temperature : float, optional
            The temperature for sampling. Higher values produce more random outputs, by default 1.0.
        topk : float, optional
            The number of top-k residues to consider during sampling, by default None.
        topp : float, optional
            The cumulative probability threshold for top-p sampling, by default None.
        max_length : int, optional
            The maximum length of generated proteins, by default 1000.
        seed : int, optional
            Seed for random number generation, by default a random number.

        Returns
        -------
        EmbeddingsGenerateFuture
            A future object representing the status and information about the generation job.
        """
        query_id = self.__resolve_query(query=query)
        return super().generate(
            prompt=prompt,
            num_samples=num_samples,
            temperature=temperature,
            topk=topk,
            topp=topp,
            max_length=max_length,
            seed=seed,
            query_id=query_id,
            use_query_structure_in_decoder=use_query_structure_in_decoder,
        )
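
    # Illustrative usage sketch: sampling new sequences conditioned on a prompt,
    # with a fixed seed for reproducibility. All values are placeholders.
    #
    #   gen_future = session.embedding.poet2.generate(
    #       prompt="<prompt-id>",
    #       num_samples=10,
    #       temperature=0.8,
    #       topp=0.9,
    #       max_length=300,
    #       seed=42,
    #   )
    #   generated = gen_future.wait()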

    def fit_svd(
        self,
        sequences: list[bytes] | list[str] | None = None,
        assay: AssayDataset | None = None,
        n_components: int = 1024,
        reduction: ReductionType | None = None,
        prompt: str | Prompt | None = None,
        query: str | bytes | Protein | Query | None = None,
        use_query_structure_in_decoder: bool = True,
    ) -> "SVDModel":
        """
        Fit an SVD on the embedding results of PoET.

        This function will create an SVDModel based on the embeddings from this model
        as well as the hyperparameters specified in the args.

        Parameters
        ----------
        sequences : list[bytes] | list[str] | None
            Sequences to fit the SVD with. Either use sequences or assay.
        assay : AssayDataset | None
            Optional assay containing sequences to fit the SVD with. Either use sequences or assay.
        n_components : int
            Number of components in the SVD. Will determine output shapes.
        reduction : ReductionType | None
            Embeddings reduction to use (e.g. mean).
        prompt : str | Prompt | None
            Prompt, or prompt_id of a prompt from an align workflow, to condition the PoET model.
        query : str | bytes | Protein | Query | None
            Query to use with the prompt. Optional.
        use_query_structure_in_decoder : bool
            Whether to use the query structure in the decoder. Defaults to True.

        Returns
        -------
        SVDModel
            A future that represents the fitted SVD model.
        """
        query_id = self.__resolve_query(query=query)
        return super().fit_svd(
            sequences=sequences,
            assay=assay,
            n_components=n_components,
            reduction=reduction,
            prompt=prompt,
            query_id=query_id,
            use_query_structure_in_decoder=use_query_structure_in_decoder,
        )
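
    # Illustrative usage sketch: fitting an SVD on PoET-2 embeddings of a small
    # sequence set. The sequences, component count, and prompt id are placeholders.
    #
    #   svd = session.embedding.poet2.fit_svd(
    #       sequences=[b"MSKGEELFTGV", b"MSKGEDLFTGV", b"MSKGEQLFTGV"],
    #       n_components=256,
    #       prompt="<prompt-id>",
    #   )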

    def fit_umap(
        self,
        sequences: list[bytes] | list[str] | None = None,
        assay: AssayDataset | None = None,
        n_components: int = 2,
        reduction: ReductionType | None = ReductionType.MEAN,
        prompt: str | Prompt | None = None,
        query: str | bytes | Protein | Query | None = None,
        use_query_structure_in_decoder: bool = True,
    ) -> "UMAPModel":
        """
        Fit a UMAP on assay using PoET and hyperparameters.

        This function will create a UMAP based on the embeddings from this PoET model
        as well as the hyperparameters specified in the args.

        Parameters
        ----------
        sequences : list[bytes] | None
            Optional sequences to fit UMAP with. Either use sequences or assay. sequences is preferred.
        assay : AssayDataset | None
            Optional assay containing sequences to fit UMAP with. Either use sequences or assay. Ignored if sequences are provided.
        n_components : int
            Number of components in UMAP fit. Will determine output shapes. Defaults to 2.
        reduction : ReductionType | None
            Embeddings reduction to use (e.g. mean). Defaults to MEAN.
        prompt : str | Prompt | None
            Prompt, or prompt_id of a prompt from an align workflow, to condition the PoET model.
        query : str | bytes | Protein | Query | None
            Query to use with the prompt. Optional.
        use_query_structure_in_decoder : bool
            Whether to use the query structure in the decoder. Defaults to True.

        Returns
        -------
        UMAPModel
            A future that represents the fitted UMAP model.
        """
        query_id = self.__resolve_query(query=query)
        return super().fit_umap(
            sequences=sequences,
            assay=assay,
            n_components=n_components,
            reduction=reduction,
            prompt=prompt,
            query_id=query_id,
            use_query_structure_in_decoder=use_query_structure_in_decoder,
        )
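
    # Illustrative usage sketch: fitting a 2-component UMAP on mean-reduced
    # PoET-2 embeddings of an uploaded assay. ``assay`` stands in for an
    # AssayDataset obtained from the session's assay data endpoints.
    #
    #   umap = session.embedding.poet2.fit_umap(
    #       assay=assay,
    #       n_components=2,
    #       reduction=ReductionType.MEAN,
    #       prompt="<prompt-id>",
    #   )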

    def fit_gp(
        self,
        assay: AssayMetadata | AssayDataset | str,
        properties: list[str],
        prompt: str | Prompt | None = None,
        query: str | bytes | Protein | Query | None = None,
        use_query_structure_in_decoder: bool = True,
        **kwargs,
    ) -> "PredictorModel":
        """
        Fit a GP on assay using this embedding model and hyperparameters.

        Parameters
        ----------
        assay : AssayMetadata | AssayDataset | str
            Assay to fit GP on.
        properties : list[str]
            Properties in the assay to fit the GP on.
        reduction : str
            Type of embedding reduction to use for computing features (passed via **kwargs). PLM must use reduction.
        prompt : str | Prompt | None
            Prompt, or prompt_id of a prompt from an align workflow, to condition the PoET model.
        query : str | bytes | Protein | Query | None
            Query to use with the prompt. Optional.
        use_query_structure_in_decoder : bool
            Whether to use the query structure in the decoder. Defaults to True.

        Returns
        -------
        PredictorModel
            A future that represents the trained predictor model.
        """
        query_id = self.__resolve_query(query=query)
        return super().fit_gp(
            assay=assay,
            properties=properties,
            prompt=prompt,
            query_id=query_id,
            use_query_structure_in_decoder=use_query_structure_in_decoder,
            **kwargs,
        )
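
# Illustrative end-to-end sketch: training a GP predictor on an assay using
# PoET-2 embeddings. ``assay`` and the property name stand in for an uploaded
# dataset and one of its measured properties.
#
#   predictor = session.embedding.poet2.fit_gp(
#       assay=assay,
#       properties=["activity"],
#       prompt="<prompt-id>",
#   )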