ArmelRandy committed on
Commit
b3d1867
·
1 Parent(s): 320fb86

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -12
app.py CHANGED
@@ -3,14 +3,19 @@ import json
3
  import shutil
4
  import gradio as gr
5
  from datasets import load_dataset
6
- from huggingface_hub import Repository
 
 
 
7
 
8
  HF_TOKEN = os.environ.get("HF_TOKEN", None)
 
9
 
10
  def load_data():
11
  dataset = load_dataset("ArmelR/oasst1_guanaco_english", use_auth_token=HF_TOKEN)
12
  return dataset
13
 
 
14
  samples = load_data()
15
  splits = list(samples.keys())
16
  languages = ["Wolof"]
@@ -36,19 +41,24 @@ def identity(index, split):
36
  return ds["prompt"], ds["completion"]
37
 
38
  def save(index, language, split, prompt, completion):
 
 
 
39
  if len(prompt) != 0 and len(completion) != 0 :
40
  print("Saving ...")
41
- with open("/home/user/app/output.jsonl", "a") as fout :
42
- fout.write(
43
- json.dumps(
44
- {
45
- "prompt" : prompt,
46
- "completion" : completion,
47
- "language" : language,
48
- "index" : index
49
- }
50
- )+"\n"
51
- )
 
 
52
  next_index = min(1+index, len(samples[split])-1)
53
  return next_index, samples[split][next_index]["prompt"], samples[split][next_index]["completion"], "", ""
54
  else :
 
3
  import shutil
4
  import gradio as gr
5
  from datasets import load_dataset
6
+ from huggingface_hub import upload_file
7
+ from io import StringIO
8
+ import pandas as pd
9
+ import datetime
10
 
11
  HF_TOKEN = os.environ.get("HF_TOKEN", None)
12
+ DIALOGUES_DATASET = "ArmelRandy/MT_dialogues"
13
 
14
  def load_data():
15
  dataset = load_dataset("ArmelR/oasst1_guanaco_english", use_auth_token=HF_TOKEN)
16
  return dataset
17
 
18
+
19
  samples = load_data()
20
  splits = list(samples.keys())
21
  languages = ["Wolof"]
 
41
  return ds["prompt"], ds["completion"]
42
 
43
  def save(index, language, split, prompt, completion):
44
+ buffer = StringIO()
45
+ timestamp = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S.%f")
46
+ file_name = f"prompts_{timestamp}.jsonl"
47
  if len(prompt) != 0 and len(completion) != 0 :
48
  print("Saving ...")
49
+ data = {"prompt": prompt, "completion": completion, "language": language, "index": index}
50
+ pd.DataFrame([data]).to_json(buffer, orient="records", lines=True)
51
+ # Push to Hub
52
+ upload_file(
53
+ path_in_repo=f"{now.date()}/{now.hour}/{file_name}",
54
+ path_or_fileobj=buffer.getvalue().encode(),
55
+ repo_id=DIALOGUES_DATASET,
56
+ token=HF_TOKEN,
57
+ repo_type="dataset",
58
+ )
59
+
60
+ # Clean and rerun
61
+ buffer.close()
62
  next_index = min(1+index, len(samples[split])-1)
63
  return next_index, samples[split][next_index]["prompt"], samples[split][next_index]["completion"], "", ""
64
  else :