ayyuce committed on
Commit
1ae49ba
·
verified ·
1 Parent(s): 4f07e20

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +83 -40
app.py CHANGED
@@ -1,47 +1,90 @@
1
  import streamlit as st
2
- from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, AutoConfig
3
- import json
4
- import os
5
-
6
- model_name = "ayyuce/NeoProtein-GPT"
7
- config_path = os.path.join(os.path.expanduser("~"), f".cache/huggingface/hub/models--{model_name.replace('/', '--')}/snapshots/d462becc43e0c3e4792cfa78efd029bed5dcfeb8/config.json")
8
-
9
- if not os.path.isfile(config_path):
10
- config = {
11
- "model_type": "gpt2",
12
- "architectures": ["GPT2LMHeadModel"],
13
- "vocab_size": 50257,
14
- "n_positions": 1024,
15
- "n_ctx": 1024,
16
- "n_embd": 768,
17
- "n_layer": 12,
18
- "n_head": 12,
19
- "activation_function": "gelu",
20
- "initializer_range": 0.02,
21
- "layer_norm_epsilon": 1e-5,
22
- "bos_token_id": 50256,
23
- "eos_token_id": 50256,
24
- }
25
- os.makedirs(os.path.dirname(config_path), exist_ok=True)
26
- with open(config_path, "w") as f:
27
- json.dump(config, f)
28
 
29
  @st.cache_resource(show_spinner=False)
30
- def load_generator():
31
- tokenizer = AutoTokenizer.from_pretrained(model_name)
32
- model = AutoModelForCausalLM.from_pretrained(model_name, config=AutoConfig.from_pretrained(model_name))
33
- gen_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)
34
- return gen_pipeline
 
 
 
 
 
35
 
36
- st.title("NeoProtein-GPT")
37
- st.write("Welcome to the NeoProtein-GPT interface. Enter a protein prompt and generate new protein sequences!")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
- user_prompt = st.text_area("Enter your prompt", value="Design a novel protein sequence with a unique binding site:")
 
 
 
 
 
41
 
42
- if st.button("Generate Protein Sequence"):
43
- with st.spinner("Generating sequence..."):
44
- outputs = load_generator()(user_prompt, max_new_tokens=200, do_sample=True, temperature=0.7)
45
- generated_text = outputs[0]["generated_text"]
46
- st.subheader("Generated Sequence:")
47
- st.code(generated_text, language="python")
 
1
  import streamlit as st
2
+ from transformers import AutoModelForCausalLM, AutoTokenizer
3
+ import torch
4
+
5
# --- Page chrome -----------------------------------------------------------
# Browser tab title/icon, the on-page heading, and a short intro that links
# to the model card on Hugging Face.
st.set_page_config(page_title="NeoProtein Designer", page_icon="🧬")
st.title("🧬 NeoProtein-GPT Protein Designer")
st.markdown("""
### Design novel protein sequences with unique binding sites
*Using the [NeoProtein-GPT](https://huggingface.co/ayyuce/NeoProtein-GPT) model from Hugging Face*
""")

# --- Sidebar controls ------------------------------------------------------
# These four module-level values are read later by generate_sequences().
# Sliders are given as (min, max, default).
st.sidebar.header("Parameters")
binding_motif = st.sidebar.text_input(
    "Binding site motif (e.g., AXXC):",
    help="Use X for wildcard positions",
)
seq_length = st.sidebar.slider(
    "Sequence length", min_value=50, max_value=500, value=150
)
temperature = st.sidebar.slider(
    "Temperature (creativity)", min_value=0.1, max_value=2.0, value=1.0
)
num_sequences = st.sidebar.slider(
    "Number of sequences", min_value=1, max_value=5, value=3
)
 
 
 
 
 
 
 
 
 
 
18
 
19
@st.cache_resource(show_spinner=False)
def load_model():
    """Load the NeoProtein-GPT causal language model and its tokenizer.

    The function is wrapped in ``st.cache_resource`` so the loaded objects
    can be reused instead of being rebuilt on every script rerun; Streamlit's
    own loading spinner is turned off.

    Returns:
        A ``(model, tokenizer)`` tuple. The model is requested on CPU in
        float32.
    """
    repo_id = "ayyuce/NeoProtein-GPT"
    load_options = {"device_map": "cpu", "torch_dtype": torch.float32}

    model = AutoModelForCausalLM.from_pretrained(repo_id, **load_options)
    tokenizer = AutoTokenizer.from_pretrained(repo_id)
    return model, tokenizer


# Module-level handles used by generate_sequences().
model, tokenizer = load_model()
30
 
31
def generate_sequences():
    """Sample protein sequences conditioned on the sidebar's binding motif.

    Reads the module-level values ``binding_motif``, ``seq_length``,
    ``temperature`` and ``num_sequences`` (set by the sidebar widgets) and the
    module-level ``model`` / ``tokenizer`` pair.

    Returns:
        list[str]: One decoded continuation per requested sequence, with the
        prompt tokens and special tokens removed. An empty list is returned
        when no motif was entered or when generation fails; in both cases the
        problem is reported to the user through ``st.error``.
    """
    # Treat a whitespace-only motif the same as an empty one, and keep stray
    # spaces out of the prompt.
    motif = binding_motif.strip() if binding_motif else ""
    if not motif:
        st.error("Please enter a binding motif")
        # Always return a list so callers get one consistent type.
        return []

    prompt = f"<start>BindingMotif:{motif}<start>Seq:"

    try:
        inputs = tokenizer(prompt, return_tensors="pt")
        input_ids = inputs["input_ids"]
        input_length = input_ids.shape[1]

        outputs = model.generate(
            input_ids,
            # Forward the tokenizer's mask (if it produced one) rather than
            # leaving generate() to infer it; pad_token_id below is the EOS
            # id, which makes that inference ambiguous.
            attention_mask=inputs.get("attention_mask"),
            # Budget counts newly generated tokens only, i.e. the same limit
            # as max_length = prompt length + seq_length.
            max_new_tokens=seq_length,
            temperature=temperature,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            num_return_sequences=num_sequences,
            pad_token_id=tokenizer.eos_token_id,
        )

        # Each output row starts with the prompt; keep only the continuation.
        return [
            tokenizer.decode(output[input_length:], skip_special_tokens=True)
            for output in outputs
        ]

    except Exception as e:
        # UI boundary: show the failure in the app instead of crashing it.
        st.error(f"Generation failed: {e}")
        return []
63
 
64
# --- Main action -----------------------------------------------------------
# NOTE(review): nesting below is reconstructed from a flattened diff view;
# confirm against the real app.py that the results are rendered after the
# spinner closes and that the divider follows every sequence.
if st.button("Generate Protein Sequences"):
    with st.spinner("Designing novel proteins..."):
        sequences = generate_sequences()

    # generate_sequences() yields nothing usable on bad input or failure,
    # in which case only its own error message is shown.
    if sequences:
        st.subheader("Generated Sequences")
        for number, seq in enumerate(sequences, start=1):
            st.markdown(f"""
            **Sequence #{number}**
            ```fasta
            {seq}
            ```
            """)
            st.divider()

# --- Usage help (always shown below the results) ----------------------------
st.markdown("""
### How to use:
1. Enter your target binding motif using single-letter amino acid codes
2. Adjust parameters in the sidebar
3. Click the generate button
4. Results will appear in FASTA format

**Example motifs:**
- `GHXXXH` for histidine-rich motifs
- `CXXC` for disulfide bond motifs
- `DE` for acidic patches
""")