Source code for openprotein.models.foundation.boltzgen

"""BoltzGen model for protein structure and sequence design."""

import base64
import gzip
import io
import tarfile
from typing import Any, BinaryIO, Literal

from pydantic import BaseModel

from openprotein.base import APISession
from openprotein.common import ModelMetadata
from openprotein.jobs import Future, Job
from openprotein.models.base import ProteinModel
from openprotein.models.structure_generation import StructureGenerationFuture
from openprotein.molecules import Protein, Complex
from openprotein.prompt import PromptAPI, Query
from openprotein.scaffolds import Scaffolds

from .boltzgen_schema import BoltzGenDesignSpec


def _create_assets_archive(
    scaffolds: dict[str, str | bytes | BinaryIO] | None = None,
    extra_files: dict[str, str | bytes | BinaryIO] | None = None,
) -> bytes | None:
    """
    Create a gzipped tar archive from scaffolds and extra files.

    Returns base64-encoded gzipped tar bytes, or None if no files provided.
    """
    if not scaffolds and not extra_files:
        return None

    # Create in-memory tar.gz
    tar_buffer = io.BytesIO()

    with gzip.GzipFile(fileobj=tar_buffer, mode="wb") as gz:
        with tarfile.open(fileobj=gz, mode="w") as tar:
            # Add scaffolds
            if scaffolds:
                for filename, content in scaffolds.items():
                    # Read content
                    if isinstance(content, bytes):
                        data = content
                    elif isinstance(content, str):
                        # Assume it's a file path
                        with open(content, "rb") as f:
                            data = f.read()
                    else:
                        # BinaryIO
                        data = content.read()

                    # Add to tar
                    info = tarfile.TarInfo(name=f"scaffolds/{filename}")
                    info.size = len(data)
                    tar.addfile(info, io.BytesIO(data))

            # Add extra files
            if extra_files:
                for filename, content in extra_files.items():
                    # Read content
                    if isinstance(content, bytes):
                        data = content
                    elif isinstance(content, str):
                        with open(content, "rb") as f:
                            data = f.read()
                    else:
                        data = content.read()

                    # Add to tar
                    info = tarfile.TarInfo(name=f"extra/{filename}")
                    info.size = len(data)
                    tar.addfile(info, io.BytesIO(data))

    # Get the gzipped tar bytes
    tar_buffer.seek(0)
    return tar_buffer.read()


class BoltzGenRequest(BaseModel):
    """Specification for a BoltzGen request."""

    # Number of designs to generate; also forwarded to the returned future as `N`.
    N: int = 1
    # Identifier of a query resolved through the prompt API (None when a
    # design_spec is supplied instead).
    query_id: str | None = None
    # Low-level BoltzGen design specification, typed or as a plain dict.
    design_spec: BoltzGenDesignSpec | dict[str, Any] | None = None
    # NOTE(review): not populated anywhere in this module — presumably a
    # leftover from the removed `structure_file` option; confirm before relying on it.
    structure_text: str | None = None
    # Optional diffusion sampling controls; omitted from the request body when None.
    diffusion_batch_size: int | None = None
    step_scale: float | None = None
    noise_scale: float | None = None
    assets: str | None = None  # base64-encoded gzipped tar
    # Name of a pre-defined scaffold set (the string value of a `Scaffolds` member).
    scaffold_set: str | None = None


class BoltzGenJob(Job):
    """Job schema for a BoltzGen request."""

    # Only this literal job type validates as a BoltzGen job.
    job_type: Literal["/models/boltzgen"]


class BoltzGenFuture(StructureGenerationFuture):
    """Future for handling the results of a BoltzGen job."""

    job: BoltzGenJob

    def get_item(self, replicate: int = 0) -> Complex:
        """
        Retrieve the output Complex for a specific design.

        Args:
            replicate (int): The 0-based index of the design to retrieve.

        Returns:
            Complex: The designed Complex.
        """
        # The results endpoint returns the structure as text; it is parsed
        # below as CIF.
        cif_text = _boltzgen_api_result_get(
            session=self.session, job_id=self.id, replicate=replicate
        )
        return Complex.from_string(cif_text, format="cif")
def _boltzgen_api_post(
    session: APISession, request: BoltzGenRequest, **kwargs
) -> BoltzGenJob:
    """
    POST a request for BoltzGen design.

    The request is serialized with unset (None) fields dropped; any extra
    keyword arguments are merged into the body afterwards and therefore
    override request fields of the same name.

    Returns a Job object that can be used to retrieve results later.
    """
    endpoint = "v1/design/models/boltzgen"
    body = request.model_dump(exclude_none=True)
    body.update(kwargs)
    response = session.post(endpoint, json=body)
    return BoltzGenJob.model_validate(response.json())


def _boltzgen_api_get_metadata(session: APISession) -> ModelMetadata:
    """
    GET the metadata for the BoltzGen model.

    Returns the response body validated as a ModelMetadata object.
    """
    endpoint = "v1/design/models/boltzgen"
    response = session.get(endpoint)
    return ModelMetadata.model_validate(response.json())


def _boltzgen_api_result_get(
    session: APISession, job_id: str, replicate: int = 0
) -> str:
    """
    GET the result of a BoltzGen design job for one replicate.

    Returns the raw response text for the requested replicate index.
    """
    endpoint = f"v1/design/{job_id}/results"
    response = session.get(endpoint, params={"replicate": replicate})
    return response.text
class BoltzGenModel(ProteinModel):
    """
    BoltzGen model for generating de novo protein structures.

    This model supports functionalities like unconditional design, scaffolding,
    and binder design.
    """

    model_id: str = "boltzgen"

    def __init__(self, session: APISession, model_id: str = "boltzgen"):
        # The model_id from the API might be more specific, e.g., "boltzgen-v1.1"
        super().__init__(session, model_id)

    def get_metadata(self) -> ModelMetadata:
        """Fetch the metadata for the BoltzGen model from the design API."""
        return _boltzgen_api_get_metadata(session=self.session)

    def generate(
        self,
        query: str | bytes | Protein | Complex | Query | None = None,
        design_spec: BoltzGenDesignSpec | dict[str, Any] | None = None,
        structure_file: str | bytes | BinaryIO | None = None,
        N: int = 1,
        diffusion_batch_size: int | None = None,
        step_scale: float | None = None,
        noise_scale: float | None = None,
        # scaffolds that can be provided for design
        scaffolds: dict[str, str | bytes | BinaryIO] | None = None,
        scaffold_set: Scaffolds | str | None = None,
        # extra structures that can be bundled together as assets
        extra_structure_files: dict[str, str | bytes | BinaryIO] | None = None,
        **kwargs,
    ) -> BoltzGenFuture:
        """
        Run a protein structure generate job using BoltzGen.

        Parameters
        ----------
        query : str or bytes or Protein or Complex or Query, optional
            A query representing the design specification. Either `query` or
            `design_spec` must be provided. `query` provides a unified way to
            represent design specifications on the OpenProtein platform. In
            this case, the structure mask of the containing Complex proteins
            are specified to be designed. Other parameters like binding,
            group, secondary structures, etc. are also passed through to
            BoltzGen.
        design_spec : BoltzGenDesignSpec | dict[str, Any] | None, optional
            The BoltzGen design specification to run. Either `query` or
            `design_spec` must be provided. `design_spec` exposes a low-level
            interface to using BoltzGen by accepting the YAML specification
            used by official BoltzGen examples. Can be a typed
            BoltzGenDesignSpec object or a dict representing the BoltzGen yaml
            request specification. Note: If the design_spec includes file
            paths, provide these extra files either using `scaffolds` or
            `extra_structure_files`.
        structure_file : str | bytes | BinaryIO | None, optional
            No longer accepted: passing any value raises ``ValueError``. Use
            `extra_structure_files` instead. It previously supplied an input
            PDB/CIF file used for inpainting or other guided design tasks
            where parts of an existing structure are provided.
        N : int, optional
            The number of unique design trajectories to run (default is 1).
        diffusion_batch_size : int, optional
            The batch size for diffusion sampling. Controls how many samples
            are processed in parallel during the diffusion process.
        step_scale : float, optional
            Scaling factor for the number of diffusion steps. Higher values
            may improve quality at the cost of longer generation time.
        noise_scale : float, optional
            Scaling factor for the noise schedule during diffusion. Controls
            the amount of noise added at each step of the reverse diffusion
            process.
        scaffolds : dict[str, str | bytes | BinaryIO] | None, optional
            Dictionary mapping scaffold filenames to their content. Each value
            can be:

            - A file path (str) to read from
            - Raw file content (bytes)
            - A file-like object (BinaryIO)

            These files will be packaged into a gzipped tar archive and made
            available to the design process under the 'scaffolds/' directory.
        scaffold_set : Scaffolds | str | None, optional
            A pre-defined scaffold set object. Alternative to providing
            individual scaffold files via the `scaffolds` parameter.
        extra_structure_files : dict[str, str | bytes | BinaryIO] | None, optional
            Dictionary mapping additional structure filenames to their
            content, with the same format options as `scaffolds`. These files
            will be packaged into the same archive under the 'extra/'
            directory and can be referenced in the design specification.

        Other Parameters
        ----------------
        **kwargs : dict
            Additional keyword args that are passed directly to the boltzgen
            inference script. Overwrites any preceding options.

        Returns
        -------
        BoltzGenFuture
            A future object that can be used to retrieve the results of the
            design job upon completion.

        Raises
        ------
        ValueError
            If neither or both of `query` and `design_spec` are provided, or
            if `structure_file` is provided.
        """
        # Ensure only query or design_spec is provided
        if (query is None) == (design_spec is None):
            raise ValueError("Expected either `query` or `design_spec`")

        # Reject the removed option before doing any remote work, so an
        # invalid call never resolves a query on the platform first.
        if structure_file is not None:
            raise ValueError(
                "structure_file no longer accepted. use extra_structure_files instead to provide multiple structure files."
            )

        query_id = None
        if query is not None:
            prompt_api = getattr(self.session, "prompt", None)
            assert isinstance(prompt_api, PromptAPI)
            query_id = prompt_api._resolve_query(
                query=query, force_structure=True
            )  # ensure we have a structure query

        # Validate design_spec if it's a dict
        if isinstance(design_spec, dict):
            design_spec = BoltzGenDesignSpec.model_validate(design_spec)

        # Extract the string
        if isinstance(scaffold_set, Scaffolds):
            scaffold_set = scaffold_set.value

        request = BoltzGenRequest(
            N=N,
            query_id=query_id,
            design_spec=design_spec,
            diffusion_batch_size=diffusion_batch_size,
            step_scale=step_scale,
            noise_scale=noise_scale,
            scaffold_set=scaffold_set,
        )

        # Create assets archive from scaffolds and extra files; the raw
        # archive bytes are base64-encoded for transport in the JSON body.
        assets_bytes = _create_assets_archive(
            scaffolds=scaffolds, extra_files=extra_structure_files
        )
        if assets_bytes is not None:
            request.assets = base64.b64encode(assets_bytes).decode("utf-8")

        # Submit the job
        job = _boltzgen_api_post(
            session=self.session,
            request=request,
            **kwargs,
        )
        return BoltzGenFuture(session=self.session, job=job, N=request.N)

    predict = generate