Source code for openprotein.molecules.chains
"""Additional chains that can be used with OpenProtein."""
from dataclasses import dataclass, replace
from typing import ClassVar, Protocol
import gemmi
import openprotein.utils.cif as _cif_utils
from .protein import (
_extract_full_sequence_from_residues,
_extract_one_letter_from_full_sequence,
)
class _BasicSerde(Protocol):
    """
    Shared serialization behaviour for chain types backed by a CIF block.

    Implementers supply the two gemmi type constants and a
    ``_structure_block``; in return they get :meth:`_make_structure`, which
    copies one of the block's subchains into a ``gemmi.Structure``.
    """

    # gemmi entity / polymer type written onto entities created for this chain.
    _GEMMI_ENTITY_TYPE: ClassVar[gemmi.EntityType]
    _GEMMI_POLYMER_TYPE: ClassVar[gemmi.PolymerType]
    # CIF block the chain was read from; None when there is no backing file.
    _structure_block: _cif_utils.StructureCIFBlock | None

    def _make_structure(
        self,
        structure: gemmi.Structure | None = None,
        model_idx: int = 0,
        chain_id: str = "A",
        entity_name: str = "1",
    ) -> gemmi.Structure:
        """
        Copy this chain's residues into ``structure`` and register its entity.

        Args:
            structure: Structure to extend. A new empty ``gemmi.Structure`` is
                created when this is None.
            model_idx: Index of the model read from the source structure. It
                is also used to index the target structure when that already
                has models; otherwise a new model named ``str(model_idx)`` is
                added to the target.
            chain_id: Subchain id looked up in the source model, and the name
                of the chain added to the target model.
            entity_name: Name given to any entity created by this call.

        Returns:
            The structure that was passed in (or the newly created one) with
            the chain appended.

        Raises:
            AssertionError: If there is no backing structure block, if that
                block was not read as mmCIF, or if the requested subchain is
                empty.
            ValueError: For non-polymer chains, if more than one entity in the
                target structure already lists ``chain_id``.
        """
        block = self._structure_block
        assert (
            block is not None
        ), "only chains constructed directly from a structure file can be serialized for now"
        source = block.structure
        assert (
            source.input_format == gemmi.CoorFormat.Mmcif
        ), "only chains that were deserialized from cif can be serialized for now"

        # Start from an empty structure when the caller did not provide one.
        if structure is None:
            structure = gemmi.Structure()

        # Reuse a model of a populated structure, otherwise add the first one.
        if len(structure) == 0:
            model = structure.add_model(gemmi.Model(str(model_idx)))  # type: ignore - gemmi 0.6 needs str
        else:
            model = structure[model_idx]

        # Locate the residues to copy in the source structure.
        subchain = source[model_idx].get_subchain(chain_id)
        assert len(subchain) > 0

        if self._GEMMI_ENTITY_TYPE == gemmi.EntityType.Polymer:
            # Polymers always get a fresh entity that owns only this chain;
            # no check is made for an existing entity with the same name.
            polymer_entity = gemmi.Entity(entity_name)
            polymer_entity.name = entity_name
            polymer_entity.subchains = [chain_id]
            polymer_entity.entity_type = self._GEMMI_ENTITY_TYPE
            polymer_entity.polymer_type = self._GEMMI_POLYMER_TYPE
            polymer_entity.full_sequence = source.get_entity_of(subchain).full_sequence
            structure.entities.append(polymer_entity)
        else:
            # Non-polymers may share an entity: only create one when the
            # target does not already have an entity listing this chain.
            already_declared = sum(
                1 for known in structure.entities if chain_id in known.subchains
            )
            if already_declared > 1:
                raise ValueError("more matching entities found than expected")
            if already_declared == 0:
                template = source.get_entity_of(subchain)
                shared_entity = gemmi.Entity(entity_name)
                shared_entity.name = entity_name
                # Everything except the name is mirrored from the source
                # entity, including its complete subchain list.
                shared_entity.subchains = template.subchains
                shared_entity.entity_type = template.entity_type
                shared_entity.polymer_type = template.polymer_type
                shared_entity.full_sequence = template.full_sequence
                structure.entities.append(shared_entity)

        # Add the chain itself, populated from the first conformer only.
        new_chain = model.add_chain(gemmi.Chain(chain_id))
        new_chain.append_residues(list(subchain.first_conformer()))
        return structure

    def _append_loop_data(
        self, chain_id: str, sequence_loop: gemmi.cif.Loop, atom_loop: gemmi.cif.Loop
    ):
        """Placeholder hook; currently does nothing and returns None."""
[docs]
@dataclass(frozen=True, eq=False)
class DNA(_BasicSerde):
    """
    Represents a DNA sequence.

    Attributes:
        sequence (str): The nucleotide sequence of the DNA. Only A, C, G and T
            are accepted; the check is case-insensitive and the value is
            stored exactly as given.
        cyclic (bool): Defaults to False. NOTE(review): presumably marks a
            circular molecule - nothing in this module reads it, confirm
            against callers.
    """

    sequence: str
    cyclic: bool = False
    _GEMMI_ENTITY_TYPE: ClassVar[gemmi.EntityType] = gemmi.EntityType.Polymer
    _GEMMI_POLYMER_TYPE: ClassVar[gemmi.PolymerType] = gemmi.PolymerType.Dna
    # Source CIF block when built via _from_structure_block, otherwise None.
    _structure_block: _cif_utils.StructureCIFBlock | None = None

    def __post_init__(self):
        """Reject sequences containing anything other than A, C, G or T."""
        alphabet = set("ACGT")
        if any(base not in alphabet for base in self.sequence.upper()):
            raise ValueError("Sequence contains invalid DNA nucleotides.")

    def __len__(self):
        """Return the number of nucleotides in the sequence."""
        return len(self.sequence)

    def copy(self) -> "DNA":
        """Return a new instance carrying the same field values."""
        return replace(self)

    @staticmethod
    def _from_structure_block(
        structure_block: _cif_utils.StructureCIFBlock, chain_id: str, model_idx: int
    ) -> "DNA":
        """
        Build a DNA chain from one subchain of a parsed mmCIF block.

        The residue-name sequence is taken from the first available source:
        the gemmi entity's ``full_sequence``, then the block's
        ``full_sequences`` entry for the entity name, then the residues
        themselves. It is then converted to one-letter codes.

        Raises:
            AssertionError: If the block was not read as mmCIF or the
                requested subchain is empty.
        """
        assert structure_block.structure.input_format == gemmi.CoorFormat.Mmcif
        parsed = structure_block.structure
        span = parsed[model_idx].get_subchain(chain_id)
        assert len(span) > 0

        # extract sequence
        owner = parsed.get_entity_of(span)
        first_conformer = list(span.first_conformer())
        # TODO: consider utilizing polymer.make_one_letter_sequence() here or elsewhere
        del span

        if len(owner.full_sequence) > 0:
            residue_names = owner.full_sequence
        elif owner.name in structure_block.full_sequences:
            residue_names, _ = structure_block.full_sequences[owner.name]
        else:
            residue_names, _ = _extract_full_sequence_from_residues(
                residues=first_conformer
            )

        letters = _extract_one_letter_from_full_sequence(full_sequence=residue_names)
        return DNA(sequence="".join(letters), _structure_block=structure_block)
[docs]
@dataclass(frozen=True, eq=False)
class RNA(_BasicSerde):
    """
    Represents an RNA sequence.

    Attributes:
        sequence (str): The nucleotide sequence of the RNA. Only A, C, G and U
            are accepted; the check is case-insensitive and the value is
            stored exactly as given.
        cyclic (bool): Defaults to False. NOTE(review): presumably marks a
            circular molecule - nothing in this module reads it, confirm
            against callers.
    """

    sequence: str
    cyclic: bool = False
    _GEMMI_ENTITY_TYPE: ClassVar[gemmi.EntityType] = gemmi.EntityType.Polymer
    _GEMMI_POLYMER_TYPE: ClassVar[gemmi.PolymerType] = gemmi.PolymerType.Rna
    # Source CIF block when built via _from_structure_block, otherwise None.
    _structure_block: _cif_utils.StructureCIFBlock | None = None

    def __post_init__(self):
        """Reject sequences containing anything other than A, C, G or U."""
        alphabet = set("ACGU")
        if any(base not in alphabet for base in self.sequence.upper()):
            raise ValueError("Sequence contains invalid RNA nucleotides.")

    def __len__(self):
        """Return the number of nucleotides in the sequence."""
        return len(self.sequence)

    def copy(self) -> "RNA":
        """Return a new instance carrying the same field values."""
        return replace(self)

    @staticmethod
    def _from_structure_block(
        structure_block: _cif_utils.StructureCIFBlock, chain_id: str, model_idx: int
    ) -> "RNA":
        """
        Build an RNA chain from one subchain of a parsed mmCIF block.

        The residue-name sequence is taken from the first available source:
        the gemmi entity's ``full_sequence``, then the block's
        ``full_sequences`` entry for the entity name, then the residues
        themselves. It is then converted to one-letter codes.

        Raises:
            AssertionError: If the block was not read as mmCIF or the
                requested subchain is empty.
        """
        assert structure_block.structure.input_format == gemmi.CoorFormat.Mmcif
        parsed = structure_block.structure
        span = parsed[model_idx].get_subchain(chain_id)
        assert len(span) > 0

        # extract sequence
        owner = parsed.get_entity_of(span)
        first_conformer = list(span.first_conformer())
        # TODO: consider utilizing polymer.make_one_letter_sequence() here or elsewhere
        del span

        if len(owner.full_sequence) > 0:
            residue_names = owner.full_sequence
        elif owner.name in structure_block.full_sequences:
            residue_names, _ = structure_block.full_sequences[owner.name]
        else:
            residue_names, _ = _extract_full_sequence_from_residues(
                residues=first_conformer
            )

        letters = _extract_one_letter_from_full_sequence(full_sequence=residue_names)
        return RNA(sequence="".join(letters), _structure_block=structure_block)
[docs]
@dataclass(frozen=True, eq=False)
class Ligand(_BasicSerde):
    """
    Represents a ligand identified by either a Chemical Component Dictionary
    (CCD) identifier or a SMILES string.

    Exactly one of the two must be given; supplying both or neither is
    rejected at construction time.

    Attributes:
        ccd (str | None): The CCD identifier for the ligand.
        smiles (str | None): The SMILES representation of the ligand.
    """

    ccd: str | None = None
    smiles: str | None = None
    _GEMMI_ENTITY_TYPE: ClassVar[gemmi.EntityType] = gemmi.EntityType.NonPolymer
    _GEMMI_POLYMER_TYPE: ClassVar[gemmi.PolymerType] = gemmi.PolymerType.Unknown
    # Source CIF block when built via _from_structure_block, otherwise None.
    _structure_block: _cif_utils.StructureCIFBlock | None = None

    def __post_init__(self):
        """Enforce that exactly one of ``ccd`` and ``smiles`` is set."""
        ccd_missing = self.ccd is None
        smiles_missing = self.smiles is None
        # Equal flags mean both are missing or both are present: either is invalid.
        if ccd_missing == smiles_missing:
            raise ValueError("Exactly one of 'ccd' or 'smiles' must be provided.")

    def copy(self) -> "Ligand":
        """Return a new instance carrying the same field values."""
        return replace(self)

    @staticmethod
    def _from_structure_block(
        structure_block: _cif_utils.StructureCIFBlock, chain_id: str, model_idx: int
    ) -> "Ligand":
        """
        Build a ligand from a single-residue subchain of a parsed mmCIF block.

        The residue's name is used as the ligand's CCD identifier.

        Raises:
            AssertionError: If the block was not read as mmCIF, or the
                subchain's first conformer does not hold exactly one residue.
        """
        assert structure_block.structure.input_format == gemmi.CoorFormat.Mmcif
        model = structure_block.structure[model_idx]
        span = model.get_subchain(chain_id)
        members = list(span.first_conformer())
        assert len(members) == 1
        return Ligand(ccd=members[0].name, _structure_block=structure_block)