[1]:
%matplotlib inline

Getting started with PoET

This notebook briefly covers how to run the align (MSA) and PoET workflows.

For more information please read the docs.

[2]:
import matplotlib.pyplot as plt
import json
import pandas as pd

Setup

Connect to the OpenProtein backend with your credentials:

[3]:
import openprotein

# Credentials are read from a local JSON file (keys: 'username', 'password')
# so they are never written into the notebook itself.
with open('secrets.config', 'r') as f:
    config = json.load(f)

# Authenticate against the OpenProtein backend; `session` is the handle
# through which the API calls in this notebook are made.
session = openprotein.connect(
    username=config['username'],
    password=config['password'],
)

We will use a small sample of the AMIE PSEAE dataset as a demo; the full data is available on our website:

[4]:
# Load the demo CSV and keep a one-column DataFrame holding just `sequence`.
dataset = pd.read_csv("./data/AMIE_PSEAE.csv").loc[:, ["sequence"]]
dataset.head(2)  # preview the first two rows
[4]:
sequence
0 WRHGDISSSNDTVGVAVVNYKMPRLHTAAEVLDNARKIAEMIVGMK...
1 WRHGDISSSNDTVGVAVVNYKMPRLHTAAEVLDNARKIAEMIVGMK...

Create an MSA

We can create an MSA either from a seed, or by uploading a ready-made file. Here we will explore the seed workflow:

[5]:
# Take the first sequence in the dataset as the seed for the MSA.
seed = dataset["sequence"][0]

Start a ColabFold job to create an MSA:

[6]:
# Submit the MSA job for the seed; the sequence is passed as bytes, hence .encode().
msa = session.poet.create_msa(seed.encode())
print(msa)  # show the job record (status, job_id, msa_id, ...)


model_config={'protected_namespaces': ()} status=<JobStatus.SUCCESS: 'SUCCESS'> job_id='b0a87413-053b-4607-af05-e2a571e3fb83' job_type='/align/align' created_date=datetime.datetime(2023, 8, 7, 4, 16, 27, 948640) start_date=None end_date=datetime.datetime(2023, 8, 7, 4, 16, 27, 988728) prerequisite_job_id=None progress_message=None progress_counter=None num_records=None msa_id='b0a87413-053b-4607-af05-e2a571e3fb83'
[7]:
# Presumably blocks until the MSA job has completed — confirm in the client docs.
# The result is consumed as rows of [name, sequence]; show the first three.
r = msa.wait()
list(r)[0:3]
[7]:
[['seed',
  'WRHGDISSSNDTVGVAVVNYKMPRLHTAAEVLDNARKIAEMIVGMKQGLPGMDLVVFPEYSLQGIMYDPAEMMETAVAIPGEETEIFSRACRKANVWGVFSLTGERHEEHPRKAPYNTLVLIDNNGEIVQKYRKIIPWCPIEGWYPGGQTYVSEGPKGMKISLIICDDGNYPEIWRDCAMKGAELIVRCQGYMYPAKDQQVMMAKAMAWANNCYVAVANAAGFDGVYSYFGHSAIIGFDGRTLGECGEEEMGIQYAQLSLSQIRDARANDQSQNHLFKILHRGYSGLQASGDGDRGLAECPFEFYRTWVTDAEKARENVERLTRSTTGVAQCPVGRLPYEGLEKEA'],
 ['UniRef100_UPI0004660BEB',
  '-RHGDISSSNDTVGVAVVNYKMPRLHTVAEVLDNARKIADMIVGMKQGLPGMDLVVFPEYSLQGIMYDPAEMMETAVAIPGEETEIFSRACRKANVWGVFSLTGERHEEHPRKAPYNTLVLIDNNGEIVQKYRKIIPWCPIEGWYPGGQTYVSEGPKGMKISLIICDDGNYPEIWRDCAMKGAELIVRCQGYMYPAKDQQVMMAKAMAWANNCYVAVANAAGFDGVYSYFGHSAIIGFDGRTLGECGEEEMGIQYAQLSLSQIRDARSNDQSQNHLFKILHRGYSGLQASGDGDRGLAECPFEFYRTWVTDAEKARENVERLTRSTTGVAQCPVGRLPYEGLEKEA'],
 ['UniRef100_UPI000730B3B9',
  '-RHGDISSSNDTVGVAVVNYKMPRLHSREEVLANAQKIADMVVGMKQGLPGMDLVIFPEYSLQGIMYDPAEMMETAVAIPGDETELLARACRKANVWGVFSLTGERHEEHPNKAPYNTLVLIDNKGEVVQKYRKIIPWCPIEGWYPGGQTYVSEGPKGMKISMIICDDGNYPEIWRDCAMKGAELIVRCQGYMYPAKEQQVLMAKAMAWANNTYVAVANAAGFDGVYSYFGHSAIIGFDGRTLGECGEEEMGIQYAQLSVSQIRDARANDQSQNHLYKILHRGYTGLNNSGEGDRGLAECPFEFYKTWVTDAEKARENVEKITRSTSGVAQCPVGRLPYEGEEKEA']]

We can examine our inputs:

[8]:
# "RAW" returns the input as submitted: here a single [name, sequence] row for the seed.
list(msa.get_input("RAW"))
[8]:
[['seed',
  'WRHGDISSSNDTVGVAVVNYKMPRLHTAAEVLDNARKIAEMIVGMKQGLPGMDLVVFPEYSLQGIMYDPAEMMETAVAIPGEETEIFSRACRKANVWGVFSLTGERHEEHPRKAPYNTLVLIDNNGEIVQKYRKIIPWCPIEGWYPGGQTYVSEGPKGMKISLIICDDGNYPEIWRDCAMKGAELIVRCQGYMYPAKDQQVMMAKAMAWANNCYVAVANAAGFDGVYSYFGHSAIIGFDGRTLGECGEEEMGIQYAQLSLSQIRDARANDQSQNHLFKILHRGYSGLQASGDGDRGLAECPFEFYRTWVTDAEKARENVERLTRSTTGVAQCPVGRLPYEGLEKEA']]

and the resulting MSA (limited here to 4 sequences for brevity):

[9]:
# "GENERATED" returns the aligned MSA rows; slice to the first four to keep the output short.
list(msa.get_input("GENERATED"))[0:4]
[9]:
[['seed',
  'WRHGDISSSNDTVGVAVVNYKMPRLHTAAEVLDNARKIAEMIVGMKQGLPGMDLVVFPEYSLQGIMYDPAEMMETAVAIPGEETEIFSRACRKANVWGVFSLTGERHEEHPRKAPYNTLVLIDNNGEIVQKYRKIIPWCPIEGWYPGGQTYVSEGPKGMKISLIICDDGNYPEIWRDCAMKGAELIVRCQGYMYPAKDQQVMMAKAMAWANNCYVAVANAAGFDGVYSYFGHSAIIGFDGRTLGECGEEEMGIQYAQLSLSQIRDARANDQSQNHLFKILHRGYSGLQASGDGDRGLAECPFEFYRTWVTDAEKARENVERLTRSTTGVAQCPVGRLPYEGLEKEA'],
 ['UniRef100_UPI0004660BEB',
  '-RHGDISSSNDTVGVAVVNYKMPRLHTVAEVLDNARKIADMIVGMKQGLPGMDLVVFPEYSLQGIMYDPAEMMETAVAIPGEETEIFSRACRKANVWGVFSLTGERHEEHPRKAPYNTLVLIDNNGEIVQKYRKIIPWCPIEGWYPGGQTYVSEGPKGMKISLIICDDGNYPEIWRDCAMKGAELIVRCQGYMYPAKDQQVMMAKAMAWANNCYVAVANAAGFDGVYSYFGHSAIIGFDGRTLGECGEEEMGIQYAQLSLSQIRDARSNDQSQNHLFKILHRGYSGLQASGDGDRGLAECPFEFYRTWVTDAEKARENVERLTRSTTGVAQCPVGRLPYEGLEKEA'],
 ['UniRef100_UPI000730B3B9',
  '-RHGDISSSNDTVGVAVVNYKMPRLHSREEVLANAQKIADMVVGMKQGLPGMDLVIFPEYSLQGIMYDPAEMMETAVAIPGDETELLARACRKANVWGVFSLTGERHEEHPNKAPYNTLVLIDNKGEVVQKYRKIIPWCPIEGWYPGGQTYVSEGPKGMKISMIICDDGNYPEIWRDCAMKGAELIVRCQGYMYPAKEQQVLMAKAMAWANNTYVAVANAAGFDGVYSYFGHSAIIGFDGRTLGECGEEEMGIQYAQLSVSQIRDARANDQSQNHLYKILHRGYTGLNNSGEGDRGLAECPFEFYKTWVTDAEKARENVEKITRSTSGVAQCPVGRLPYEGEEKEA'],
 ['UniRef100_UPI00235F2AA4',
  '-RHGDISSSNDTVGVAVVNYKMPRLHNREQVLDNAERIAAMIVGMKQGLPGMDLVIFPEYSLQGIMYDPAEMYETAVSIPGDETEIFSRACRKAGTWGVFSLTGERHEEHPRKAPYNTLVLINNKGEVVQKYRKIIPWCPIEGWYPGNQTFVSEGPKGLKISLIICDDGNYPEIWRDCAMKGAELIVRCQGYMYPAKEQQVLMAKAMAWANNCYVAVANAAGFDGVYSYFGHSAIIGFDGRTLGECGEEEMGIQYAQLSLSQIRDARANDQSQNHLFKILHRGYSGLHASGEGDKGLAECPFEFYKTWVNDAEKAREQVQAITRTTSGVAQCPVGKLPYDGLEKQA']]

Prompt

We can use this MSA to create a prompt with a sampling regime (see the docs for details):

[10]:
# Sample 3 ensemble prompts from the MSA with random_seed fixed at 42
# (presumably so the sampling is repeatable — confirm in the docs).
prompt = msa.sample_prompt(num_ensemble_prompts=3, random_seed=42)

[11]:
# Identifier of the prompt job; also available as prompt.job.job_id.
prompt.id
[11]:
'87b127eb-d043-43ad-9f18-c52f0abad0f9'
[12]:
# Optionally wait for the prompt job here.
prompt.wait()  # optional: waiting at this point is not necessary
[12]:
<_csv.reader at 0x7ff91dd00dd0>

As we specified 3 prompts above, we will have 3 different prompts, all drawn from the same MSA:

[13]:
# Prompt number 1 of the 3 sampled prompts; show its first three [name, sequence] rows.
list(prompt.get_prompt(1))[0:3]
[13]:
[['UniRef100_A0A959K4C9',
  'GLMICYDTRFPEMARSLALAGAEIIIVPTAWPFPRVEHWQLLSRARAIENQCYVVTANRVGKDGQAIFCGNSRVIDPHGVVVSSASEDQEEIIYAEIKRDKLDFIRTRMPVFEHRRPDVY'],
 ['UniRef100_UPI00041A74DE',
  'GSVSAWDEALLIAAIQYPVPVIKRPEDIQVQVQQICKTIDSTKAGYPDLDLIVFPEYSAQGLNTKIWTYDEMLLSLESPEVDSFRQACIRNNIWGVFSLMERNEDPSQPPYNTAIIINNSGEIVLHYRKLQPWVPIEPWMPGNGMPVCGGPKGAKLAVCICHDGMFPELAREAAYKGCNVFIRISGYSTQVNDQWIWTNRTNAWQNLMYTVSVNLAGYDE'],
 ['UniRef100_A0A7W9FMQ2',
  'GGLNKSENGVVIGLVQLQLPVTVTRDDLARQTKRIVELVGKARRNNAGMDLVVFPEYALHGLSMDTNPAIMCDLDGPEVAAFKAACAEHRIWGCFSIMERNPGGNPYNSGIVIDDQGALKLYYRKLHPWVPVEPWEPGDGIPVIDGPKGAKLALIICHDGMFPEMARECAYKGAEIMIRTAGYTAPIRESWRFTNQANAFQNLMVTANVCMCGSDGTFDSMGEGMIVNFDGTVIAHGVTGRPEIITAEVRPDLVREARAGWGVENNIYQLWHRGYVAVKGGAMDCPYTFMQDMVAG']]
[14]:
# Prompt number 2 of the 3 sampled prompts; show its first three [name, sequence] rows.
list(prompt.get_prompt(2))[0:3]
[14]:
[['UniRef100_A0A194RN05',
  'FNTHIIIDNKGDIVQTYRKLHLFDESDFTSPGSHVVTPVDTPVGRIGLEICYDMRFPELSTTLGSMRADILTFPSAFTYTGMAHWHLLLRARAIENQCYVLAAAQTGHNAKRRSYGHALCVDPWGEVLADCEEEGPCYKIAEISLEKLADVRRNMPVFQHR'],
 ['UniRef100_A0A7W0G9W8',
  'GGSAILGPDGAYLAGPLYDEEGILYAELDPTRLAEERQRDPAGHYHRPDV'],
 ['UniRef100_A0A6F9EEE2',
  'RHGDISSSPDTVGVAVVNYKMPRLHTREQVLDNARKIADMIVGMKQGLPGMDLVVFPEYSTMGIMYDPDEMFETACTVPGEETEIFGRACREANTWGVFSLTGERHEEHPRKSPYNTLVLINNRGEIVQKYRKILPWAPIEGWYPGDKTYVSDGPKGLKVSLIICDDGNYPEIWRDCAMKGAELIVRPQGYMYPAKEQQIMMAKTMAWANNVYVAVANATGFDGVYSYFGHSAIIGFDGRTLGECGEEEYGIQYAELSISAIRDARQNWQSQNQLFKLLHRGYTGIYNSGDGDKGLAECPFDFYRTWVLDAKKAQENVEKITRTELTTACCPVGGLPYNGAEREA']]
[15]:
# Prompt number 3 of the 3 sampled prompts; show its first three [name, sequence] rows.
list(prompt.get_prompt(3))[0:3]
[15]:
[['UniRef100_UPI0009488FB3',
  'RHGDISSSPDTVGVAVVNYKMPRLHTKSDVLANAEQIADMIIGIKQGLPGMDLIVFPEYSTMGIMYDKDEMMATATTIPGEETAIFSAACKKANTWGVFSLTGEQHEEHPHKSPYNTLVLINNEGEIVQKYRKCIPWCPIEGWYPGDRTYVTTGPKGMKISLIICDDGNYPEIWRDCAMRGAELIVRCQGYMYPAKEQQVMMAKTMAWANNCYVAVANAAGFDGVYSYFGHSAIVGFDGRTLGECGEEDMGIQYAQLSVSQIRDARANDQSQNHLFKLLHRGYTGVHNSGDGDKGIADCPFEFYRTWVMDAEKAQSDVEAMTRDTIGVVDCPVGNLPAGASEKE'],
 ['UniRef100_UPI001BD4A459',
  'GSVSAWDEALLIAAIQYPVPVIKVPEDIQVQVRQICKTIDSTKAGYPDLDLIVFPEYSAQGLNTKIWTYDEMLLSLDSPEVDCFRQACIRNDIWGVFSVMERNEDSSQPPYNAAIIINNNGEIALHYRKLQPWVPIEPWMPGNGMPVCEGPKGAKLAVCICHDGMFPELAREAAYKGCNVFIRISGYSTQVNDQWIWTNRTNAWQNLMYTVSVNLAGYDEVFYYFGEGTICNYDGNVIQQGQRNPWEIVTAELFPRLADKARENWALENSIFNLGCRGYVGKPGGERANYLTWVRDLANGEYK'],
 ['UniRef100_UPI000248378F',
  'HGDISSSYDSVGVAVVNYKMPRLHTQDEVLANCNNIAEVIDGMKQGLPGLDLVIFPEYSTHGIMYDSQEMMDTASSIPGPETDIFSEACIRNKVWGVFSLTGERHEQHPDKVPYNTLILMNDQGDIVQKYRKIMPWTPIEGWYPGNCTYVTDGPKGLKISLIICDDGNYPEIWRDCVMKGAELVIRCQGYMYPAKEQQIIVSKAMAWMNNTYVAVANAAGFDGVYSYFGHSAIVGFDGRTLGECGEEENGIQYAALSKFSIRDFRKHAQSQNHLFKLLHRGYTGIINSGEGDQGMMECPYDFYREWVLDPESTKKKVEALTRPTVGTHECPIDGIP']]
[16]:
# Each prompt row is a [name, sequence] pair; keep only the sequence (index 1).
prompt1_seqs = [row[1] for row in prompt.get_prompt(1)]
prompt2_seqs = [row[1] for row in prompt.get_prompt(2)]
prompt3_seqs = [row[1] for row in prompt.get_prompt(3)]

# Report the size of each prompt and how many sequences are shared by all three.
print(f"N seqs in prompt1: {len(prompt1_seqs)}, prompt2: {len(prompt2_seqs)} prompt3: {len(prompt3_seqs)}")
print(f"Seqs found in all 3 prompts: {len(set(prompt1_seqs).intersection(prompt2_seqs, prompt3_seqs))} ")
N seqs in prompt1: 44, prompt2: 44 prompt3: 46
Seqs found in all 3 prompts: 0
[17]:
# Display the MSA and prompt IDs as a tuple.
msa.msa_id, prompt.prompt_id
[17]:
('b0a87413-053b-4607-af05-e2a571e3fb83',
 '87b127eb-d043-43ad-9f18-c52f0abad0f9')

Scoring with PoET

[18]:
# Encode every dataset sequence to bytes before sending it for scoring.
seqs = [sequence.encode() for sequence in dataset["sequence"]]
[19]:
# Submit a scoring job: the encoded dataset sequences are the queries, scored against the prompt.
scorejob = session.poet.score(prompt.prompt_id, queries=seqs )
[20]:
# Collect the scoring results and inspect the first one.
# NOTE(review): each result carries a list of 3 scores — presumably one per ensemble prompt; confirm.
score_results = scorejob.wait()
score_results[0]
[20]:
PoetScoreResult(model_config={'protected_namespaces': ()}, sequence=b'WRHGDISSSNDTVGVAVVNYKMPRLHTAAEVLDNARKIAEMIVGMKQGLPGMDLVVFPEYSLQGIMYDPAEMMETAVAIPGEETEIFSRACRKANVWGVFSLTGERHEEHPRKAPYNTLVLIDNNGEIVQKYRKIIPWCPIEGWYPGGQTYVSEGPKGMKISLIICDDGNYPEIWRDCAMKGAELIVRCQGYMYPAKDQQVMMAKAMAWANNCYVAVANAAGFDGVYSYFGHSAIIGFDGRTLGECGEEEMGIQYAQLSLSQIRDARANDQSQNHLFKILHRGYSGLQASGDGDRGLAECPFEFYRTWVTDAEKARENVERLTRSTTGVAQCPVGRLPYEGLEKEA', score=[-67.385009765625, -161.78848266601562, -173.0670166015625], name='sequence-01')

Single site analysis with PoET

A similar flow yields a single site mutation analysis of a sequence:

[21]:
# Single-site analysis of the short demo sequence "AAPLAA", passed as bytes.
sspjob = session.poet.single_site(prompt, sequence=b"AAPLAA")
[22]:
# Collect the single-site results; the first entry is labelled b'input' and the
# following ones carry mutation codes such as b'A1R'. Show the first three.
ssp_results = sspjob.wait()
ssp_results[0:3]
[22]:
[PoetSSPResult(model_config={'protected_namespaces': ()}, sequence=b'input', score=[-28.7412109375, -28.05859375, -28.6044921875], name=None),
 PoetSSPResult(model_config={'protected_namespaces': ()}, sequence=b'A1R', score=[-30.0703125, -29.6171875, -29.744140625], name=None),
 PoetSSPResult(model_config={'protected_namespaces': ()}, sequence=b'A1N', score=[-30.44921875, -30.638671875, -31.3056640625], name=None)]

Generate de novo sequences

Lastly, we can use the generation workflow:

[23]:
# Request 10 generated sequences based on our prompt.
genjob  = session.poet.generate(prompt.prompt_id, num_samples=10)

[24]:

# Collect the generation results, then display the first generated sequence.
# (These must be two separate statements; fused on one line they are a SyntaxError.)
gen_results = genjob.wait()
gen_results[0]
[24]:
PoetScoreResult(model_config={'protected_namespaces': ()}, sequence=b'RHGDISSSRSGVGTAVVQYKLPRLHTRDEVLRNVEDIVNMIIGMKQGLPGMDLVIFPEYSLQGIMYDGNEMMKTASVIPGPETEAFVRACREHKVWGIFSLTGEQHEDHPHKSPYNTLILRDDKGNVVQKYRKILPWCPIEGWYPGDMTYVSEGPKGMKISLIICDDGNYPEIWRDCAMKGAEIIVRPQAYMYPAKDQQILMAKTMAWSNNVYAAVANAAGFDGVYTYFGHSAIIGFDGRTLGECGEEEGGVQYAELSISSIRDARKNNQSQNNLFKLNHRGYTGFSYSREKGKHAAECPYDFYKNWVNDPEKTQKGVEAITREKVGTQECPYDFLPVEETYRKPPAWFLTEDRTLLGMTPSQLPIGNRAEARRPLTNGCIAASQMNLKRAAGLKPEE', score=[-493.91375732421875, -462.3254089355469, -493.4800109863281], name='generated-sequence-1')

Resuming work

You can reload a prompt, MSA or PoET job to resume where you left off:

[25]:
# Reload the MSA job from its ID and display the associated job record.
old_msa = session.poet.load_msa_job(msa.msa_id)
old_msa.job
[25]:
Job(model_config={'protected_namespaces': ()}, status=<JobStatus.SUCCESS: 'SUCCESS'>, job_id='b0a87413-053b-4607-af05-e2a571e3fb83', job_type='/align/align', created_date=datetime.datetime(2023, 8, 7, 4, 16, 27, 948640), start_date=None, end_date=datetime.datetime(2023, 8, 7, 4, 16, 27, 988728), prerequisite_job_id=None, progress_message=None, progress_counter=None, num_records=None)

The same functionality is present:

[26]:
# Sample a new prompt from the reloaded MSA and display its job record.
# NOTE(review): 10 is passed positionally — confirm which sample_prompt parameter it binds to.
new_prompt = old_msa.sample_prompt(10)
new_prompt.job
[26]:
PromptJob(model_config={'protected_namespaces': ()}, status=<JobStatus.PENDING: 'PENDING'>, job_id='8b85c413-01d4-4a4b-9253-e5c208664900', job_type='/align/prompt', created_date=datetime.datetime(2023, 8, 7, 4, 26, 34, 595942), start_date=None, end_date=None, prerequisite_job_id=None, progress_message=None, progress_counter=None, num_records=None, msa_id=None, prompt_id='8b85c413-01d4-4a4b-9253-e5c208664900')
[27]:
# Reload the earlier prompt job from its prompt ID and display the job record.
oldprompt = session.poet.load_prompt_job(prompt.prompt_id)
oldprompt.job
[27]:
Job(model_config={'protected_namespaces': ()}, status=<JobStatus.SUCCESS: 'SUCCESS'>, job_id='87b127eb-d043-43ad-9f18-c52f0abad0f9', job_type='/align/prompt', created_date=datetime.datetime(2023, 8, 7, 4, 16, 30, 556160), start_date=datetime.datetime(2023, 8, 7, 4, 17, 25, 505727), end_date=datetime.datetime(2023, 8, 7, 4, 17, 29, 851708), prerequisite_job_id=None, progress_message=None, progress_counter=None, num_records=None)
[28]:
# Reload the single-site PoET job by its job ID and fetch its results; show the first three.
old_job = session.poet.load_poet_job(sspjob.job.job_id)
old_job.get()[0:3]
[28]:
[PoetSSPResult(model_config={'protected_namespaces': ()}, sequence=b'input', score=[-28.7412109375, -28.05859375, -28.6044921875], name=None),
 PoetSSPResult(model_config={'protected_namespaces': ()}, sequence=b'A1R', score=[-30.0703125, -29.6171875, -29.744140625], name=None),
 PoetSSPResult(model_config={'protected_namespaces': ()}, sequence=b'A1N', score=[-30.44921875, -30.638671875, -31.3056640625], name=None)]
[ ]: