Source code for openprotein.app.models.umap
import numpy as np
from openprotein.api import job as job_api
from openprotein.api import umap
from openprotein.base import APISession
from openprotein.schemas import UMAPEmbeddingsJob, UMAPFitJob, UMAPMetadata
from .embeddings import EmbeddingModel, EmbeddingsResultFuture
from .futures import Future
[docs]
class UMAPModel(Future):
    """
    Handle for a UMAP model fitted through the API.

    Provides the embedding endpoint for UMAP models (`embed`) and allows
    retrieving the embeddings of the sequences used to fit the UMAP with
    `get` / `embeddings`. Implements a Future to allow waiting for a fit job.
    """

    job: UMAPFitJob

    def __init__(
        self,
        session: APISession,
        job: UMAPFitJob | None = None,
        metadata: UMAPMetadata | None = None,
    ):
        """
        Initialize from either a fit job or UMAP metadata.

        Parameters
        ----------
        session : APISession
            Session used for the API requests made by this model.
        job : UMAPFitJob or None
            Fit job. When `metadata` is omitted, the metadata is fetched
            with `umap.umap_get` using `job.job_id`.
        metadata : UMAPMetadata or None
            UMAP metadata. When `job` is omitted, the job is fetched with
            `job_api.job_get` using `metadata.id`.

        Raises
        ------
        ValueError
            If neither `job` nor `metadata` is provided.
        """
        if metadata is None:
            # No metadata supplied: it can only be looked up through the job.
            if job is None:
                raise ValueError("Expected umap metadata or job")
            metadata = umap.umap_get(session, job.job_id)
        self._metadata = metadata
        if job is None:
            # No job supplied: look it up through the metadata id.
            job = UMAPFitJob.create(
                job_api.job_get(session=session, job_id=metadata.id)
            )
        # Lazily filled caches backing the `sequences` / `embeddings` properties.
        self._sequences = None
        self._embeddings = None
        super().__init__(session, job)

    def __str__(self) -> str:
        # Goes through the `metadata` property, so this may refresh the
        # metadata from the API while the fit is not done.
        return str(self.metadata)

    def __repr__(self) -> str:
        # Same refresh behaviour as `__str__`.
        return repr(self.metadata)

    @property
    def id(self):
        """Id of the UMAP, as stored in the metadata."""
        return self._metadata.id

    @property
    def n_components(self):
        """`n_components` value stored in the metadata."""
        return self._metadata.n_components

    @property
    def n_neighbors(self):
        """`n_neighbors` value stored in the metadata."""
        return self._metadata.n_neighbors

    @property
    def min_dist(self):
        """`min_dist` value stored in the metadata."""
        return self._metadata.min_dist

    @property
    def sequence_length(self):
        """`sequence_length` value stored in the metadata."""
        return self._metadata.sequence_length

    @property
    def reduction(self):
        """`reduction` value stored in the metadata."""
        return self._metadata.reduction

    @property
    def metadata(self):
        """UMAP metadata, re-fetched first if it does not report done."""
        self._refresh_metadata()
        return self._metadata

    @property
    def sequences(self):
        """Inputs returned by `get_inputs()`, cached after the first access."""
        if self._sequences is not None:
            return self._sequences
        # NOTE(review): `get_inputs` is not defined in this class; presumably
        # inherited from `Future` -- confirm.
        self._sequences = self.get_inputs()
        return self._sequences

    @property
    def embeddings(self):
        """
        List of `(sequence, embedding)` tuples, cached after the first access.

        Pairs `self.sequences` positionally with the decoded batch result of
        this UMAP; `zip` stops at the shorter of the two.
        """
        if self._embeddings is not None:
            return self._embeddings
        data = umap.embed_get_batch_result(session=self.session, job_id=self.id)
        # `list(zip(...))` instead of a comprehension whose loop variable is
        # named `umap`: that name would shadow the `umap` api module used here.
        self._embeddings = list(zip(self.sequences, umap.embed_batch_decode(data)))
        return self._embeddings

    def _refresh_metadata(self):
        """Re-fetch the metadata unless it already reports done."""
        if not self._metadata.is_done():
            self._metadata = umap.umap_get(self.session, self._metadata.id)

    def get_model(self) -> EmbeddingModel:
        """Fetch the embeddings model whose `model_id` is this UMAP's metadata id."""
        model = EmbeddingModel.create(session=self.session, model_id=self._metadata.id)
        return model

    @property
    def model(self) -> EmbeddingModel:
        """Embeddings model from `get_model()`; not cached, fetched on every access."""
        return self.get_model()

    def delete(self) -> bool:
        """
        Delete this UMAP model.

        Returns
        -------
        bool
            Result of `umap.umap_delete` for this UMAP's id.
        """
        return umap.umap_delete(self.session, self.id)

    def get(self, verbose: bool = False):
        """
        Return the embeddings of the sequences used to fit the UMAP.

        Parameters
        ----------
        verbose : bool
            Accepted but not used by this implementation.

        Returns
        -------
        list
            Same value as the `embeddings` property.
        """
        return self.embeddings

    def embed(
        self, sequences: list[bytes] | list[str], **kwargs
    ) -> EmbeddingsResultFuture:
        """
        Use this UMAP model to get reduced embeddings from input sequences.

        Parameters
        ----------
        sequences : list[bytes] or list[str]
            List of protein sequences.
        **kwargs
            Forwarded unchanged to `umap.umap_embed_post`.

        Returns
        -------
        EmbeddingsResultFuture
            Class for further job manipulation.
        """
        return EmbeddingsResultFuture.create(
            session=self.session,
            job=umap.umap_embed_post(
                session=self.session, umap_id=self.id, sequences=sequences, **kwargs
            ),
            sequences=sequences,
        )
class UMAPEmbeddingResultFuture(EmbeddingsResultFuture, Future):
    """
    Future for working with the results of UMAP embeddings requests.

    Adds no behaviour of its own; it only narrows the declared type of `job`.
    """

    # Declared job type for this future.
    job: UMAPEmbeddingsJob