Oscarshih committed on
Commit
c8decf2
·
1 Parent(s): 82e6639

Create README.md

Browse files
Files changed (1) hide show
  1. README.md +27 -0
README.md ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ```python
2
+ import nlp2
3
+ import json
4
+ from datasets import load_dataset
5
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
6
+ from asrp.code2voice_model.hubert import hifigan_hubert_layer6_code100
7
+ import IPython.display as ipd
8
+
9
+ tokenizer = AutoTokenizer.from_pretrained("Oscarshih/long-t5-base-SQA-15ep")
10
+ model = AutoModelForSeq2SeqLM.from_pretrained("Oscarshih/long-t5-base-SQA-15ep")
11
+ dataset = load_dataset("voidful/NMSQA-CODE")
12
+ cs = hifigan_hubert_layer6_code100()
13
+
14
+ qa_item = dataset['dev'][0]
15
+ question_unit = json.loads(qa_item['hubert_100_question_unit'])[0]["merged_code"]
16
+ context_unit = json.loads(qa_item['hubert_100_context_unit'])[0]["merged_code"]
17
+ answer_unit = json.loads(qa_item['hubert_100_answer_unit'])[0]["merged_code"]
18
+
19
+ # ground-truth answer audio
20
+ ipd.Audio(data=cs(answer_unit), autoplay=False, rate=cs.sample_rate)
21
+
22
+ # predicted answer audio
23
+ inputs = tokenizer("".join([f"v_tok_{i}" for i in question_unit]) + "".join([f"v_tok_{i}" for i in context_unit]), return_tensors="pt")
24
+ code = tokenizer.batch_decode(model.generate(**inputs,max_length=1024))[0]
25
+ code = [int(i) for i in code.replace("</s>","").replace("<s>","").split("v_tok_")[1:]]
26
+ ipd.Audio(data=cs(code), autoplay=False, rate=cs.sample_rate)
27
+ ```