Spaces:
Runtime error
Runtime error
Nick Canu
committed on
Commit
·
b0829c1
1
Parent(s):
394d881
add app
Browse files- .gitattributes +2 -33
- .gitignore +1 -0
- .streamlit/config.toml +6 -0
- .vscode/launch.json +16 -0
- Home.py +348 -0
- Model_Constants_Template.py +7 -0
- Model_Step_Data/slim_df.parquet.gzip +3 -0
- Model_Step_Data/vector_df.parquet.gzip +3 -0
- Persistent Objects/current_keys.gz +0 -0
- Persistent Objects/token_search.gz +0 -0
- README.md +48 -13
- Stream_to_Output/GameCleaner.py +144 -0
- Stream_to_Output/requirements.txt +6 -0
- __pycache__/Model_Constants.cpython-39.pyc +0 -0
- __pycache__/description_generator.cpython-39.pyc +0 -0
- __pycache__/title_generator.cpython-39.pyc +0 -0
- description_generator.py +120 -0
- requirements.txt +11 -0
- t5_model/config.json +60 -0
- t5_model/generation_config.json +7 -0
- t5_model/pytorch_model.bin +3 -0
- t5_model/special_tokens_map.json +107 -0
- t5_model/spiece.model +0 -0
- t5_model/tokenizer_config.json +114 -0
- title_generator.py +148 -0
.gitattributes
CHANGED
@@ -1,34 +1,3 @@
|
|
1 |
-
*.
|
2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
-
*.
|
23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
29 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
30 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
31 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
32 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
1 |
+
*.gzip filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
*.pkl filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
Model_Constants.py
|
.streamlit/config.toml
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[theme]
|
2 |
+
primaryColor="#e76020"
|
3 |
+
backgroundColor="#FDFFFC"
|
4 |
+
secondaryBackgroundColor="#6E896A"
|
5 |
+
textColor="#0f0f0d"
|
6 |
+
font="monospace"
|
.vscode/launch.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
// Use IntelliSense to learn about possible attributes.
|
3 |
+
// Hover to view descriptions of existing attributes.
|
4 |
+
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
|
5 |
+
"version": "0.2.0",
|
6 |
+
"configurations": [
|
7 |
+
{
|
8 |
+
"name": "Python: Module",
|
9 |
+
"type": "python",
|
10 |
+
"request": "launch",
|
11 |
+
"module": "streamlit",
|
12 |
+
"args": ["run", "Home.py"],
|
13 |
+
"justMyCode": true
|
14 |
+
}
|
15 |
+
]
|
16 |
+
}
|
Home.py
ADDED
@@ -0,0 +1,348 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
|
3 |
+
st.set_page_config(page_title='Auto-BG: The Game Concept Generator', layout='wide')
|
4 |
+
|
5 |
+
def application():
|
6 |
+
###Imports
|
7 |
+
import pandas as pd
|
8 |
+
import numpy as np
|
9 |
+
import re
|
10 |
+
import urllib
|
11 |
+
import pickle
|
12 |
+
import spacy
|
13 |
+
from spacy.tokens import DocBin
|
14 |
+
from title_generator import Title_Generator
|
15 |
+
import gzip
|
16 |
+
import io
|
17 |
+
from description_generator import input_manager, model_control
|
18 |
+
|
19 |
+
#UI Session Variables
|
20 |
+
if 'desc_iter' not in st.session_state:
|
21 |
+
st.session_state.desc_iter = 0
|
22 |
+
if 'title_iter' not in st.session_state:
|
23 |
+
st.session_state.title_iter = 0
|
24 |
+
if 'output_dict' not in st.session_state:
|
25 |
+
st.session_state.output_dict = {}
|
26 |
+
if 'inputs' not in st.session_state:
|
27 |
+
st.session_state.inputs = []
|
28 |
+
if 'cur_pair' not in st.session_state:
|
29 |
+
st.session_state.cur_pair = ("","Run me!")
|
30 |
+
if 'f_d' not in st.session_state:
|
31 |
+
st.session_state.f_d = None
|
32 |
+
if 'g_d' not in st.session_state:
|
33 |
+
st.session_state.g_d = None
|
34 |
+
if 'm_d' not in st.session_state:
|
35 |
+
st.session_state.m_d = None
|
36 |
+
if 'c_d' not in st.session_state:
|
37 |
+
st.session_state.c_d = None
|
38 |
+
if 'coop_d' not in st.session_state:
|
39 |
+
st.session_state.coop_d = 0
|
40 |
+
|
41 |
+
#non-ui helper functions
|
42 |
+
#reader code extended from https://gist.github.com/thearn/5424244 for alternate load format
|
43 |
+
def reader(url):
    """Download a gzip-compressed pickle from `url` and return the unpickled object.

    Extended from https://gist.github.com/thearn/5424244 for the alternate
    load format used by the app's persistent objects.

    NOTE(security): pickle.loads on remote bytes executes arbitrary code if the
    source is ever compromised — only use with trusted, pinned URLs.
    """
    # `import urllib` alone does not guarantee the `request` submodule is
    # loaded; import it explicitly so this helper is self-sufficient.
    import urllib.request

    with urllib.request.urlopen(url) as resp:
        url_file = io.BytesIO(resp.read())
    # Context manager closes the GzipFile even if pickle.loads raises
    # (the original leaked the handle on error).
    with gzip.GzipFile(fileobj=url_file) as f:
        return pickle.loads(f.read())
|
50 |
+
|
51 |
+
def token_expand(url):
    """Fetch a gzip-compressed, pickled spaCy DocBin from `url` and split its
    docs into the four tag-class groups used for unknown-input search.

    Returns a tuple of doc lists: (game_type, mechanic, category, family).
    The slice boundaries mirror the layout written by GameCleaner.py —
    presumably fixed for the shipped token_search.gz; verify if data changes.
    """
    # Explicit submodule import: file-level `import urllib` does not
    # guarantee urllib.request is available.
    import urllib.request

    nlp = spacy.blank("en")
    # Close both the HTTP response and the gzip stream deterministically
    # (the original left the urlopen handle open).
    with urllib.request.urlopen(url) as resp, gzip.GzipFile(fileobj=resp) as f:
        # NOTE(security): pickle on remote bytes — trusted source required.
        obj = pickle.loads(f.read())
    doc_bin = DocBin().from_bytes(obj)
    docs = list(doc_bin.get_docs(nlp.vocab))
    return (docs[1:9], docs[9:192], docs[192:276], docs[276:3901])
|
61 |
+
|
62 |
+
def revert_cats(gt, mec, cat, fam, coop):
    """Re-attach the one-hot column prefixes to user-selected tag names and
    flatten everything into the single key list the pipeline expects.

    `fam` entries equal to "Game: [redacted]" are dropped; `coop == 1` adds
    the cooperative flag plus its mechanic tag.
    """
    keys = ["game_type_" + g for g in gt]
    keys += ["mechanic_" + m for m in mec]
    keys += ["category_" + c for c in cat]
    keys += ["family_" + f for f in fam if f != "Game: [redacted]"]
    if coop == 1:
        keys += ["cooperative", "mechanic_Cooperative Game"]
    return keys
|
74 |
+
|
75 |
+
def builder(ip):
    """Run the full generation pipeline for one set of input keys.

    Parses the keys, formats the GPT prompt, then produces three
    description/title candidate sets, reporting progress in the sidebar and
    storing the results in st.session_state.output_dict.
    """
    keys = iman.input_parser(iman.set_input(ip))
    mctrl.prompt_formatter(keys)

    outputs = []
    for idx in np.arange(0, 3):
        raw = mctrl.call_api(status=idx)
        cleaned = mctrl.resp_cleanup(raw)
        candidates = Tgen.candidate_generator(cleaned)
        outputs.append(Tgen.candidate_score(candidates, ex_check))
        st.sidebar.success("Prompt " + str(idx + 1) + " generated!")

    st.session_state.output_dict = {0: outputs[0], 1: outputs[1], 2: outputs[2]}
|
87 |
+
|
88 |
+
|
89 |
+
|
90 |
+
def title_check(next=0):
    """Advance (+1), rewind (-1), or reset (0) the title cursor for the
    currently selected description, wrapping at either end.

    Returns the (title, description) pair with the '__' placeholder in the
    description text replaced by the current title.
    """
    entry = st.session_state.output_dict[st.session_state.desc_iter]
    count = len(entry['titles'])

    if next == 1 or next == -1:
        # Modular step reproduces the original explicit wrap-around branches.
        st.session_state.title_iter = (st.session_state.title_iter + next) % count
    else:
        st.session_state.title_iter = 0

    cur_title = entry['titles'][st.session_state.title_iter][0]
    # re.sub kept (not str.replace): replacement semantics must match original.
    desc = re.sub(re.compile("__"), cur_title, entry['text'])
    return (cur_title, desc.lstrip())
|
108 |
+
|
109 |
+
def show_title(val):
    """Refresh the displayed (title, description) pair after a cursor move."""
    st.session_state.cur_pair = title_check(next=val)

def PT_button_clicked():
    """'See Previous Title' callback."""
    show_title(-1)

def NT_button_clicked():
    """'See Next Title' callback."""
    show_title(1)

def PD_button_clicked():
    """'See Previous Description' callback: step back, wrapping 0 -> 2, and
    reset the title cursor for the newly selected description."""
    if st.session_state.desc_iter == 0:
        st.session_state.desc_iter = 2
    else:
        st.session_state.desc_iter -= 1
    st.session_state.title_iter = 0
    show_title(0)

def ND_button_clicked():
    """'See Next Description' callback: step forward, wrapping 2 -> 0, and
    reset the title cursor for the newly selected description."""
    if st.session_state.desc_iter == 2:
        st.session_state.desc_iter = 0
    else:
        st.session_state.desc_iter += 1
    st.session_state.title_iter = 0
    show_title(0)
|
136 |
+
|
137 |
+
|
138 |
+
|
139 |
+
###Variables
|
140 |
+
|
141 |
+
###Data
|
142 |
+
@st.cache_resource
def fetch_data():
    """Download and cache (via st.cache_resource) the app's data artifacts.

    Returns a 5-tuple: slim metadata frame, spaCy search tokens, one-hot
    vector frame, category key lists, and the cooperative-flag domain [1, 0].
    """
    # Parquet frames served straight from the repo (?raw=true returns bytes).
    slim_df = pd.read_parquet('https://github.com/canunj/Auto-BoardGame/blob/main/Model_Step_Data/slim_df.parquet.gzip?raw=true')
    vector_df = pd.read_parquet('https://github.com/canunj/Auto-BoardGame/blob/main/Model_Step_Data/vector_df.parquet.gzip?raw=true')
    # Pickled artifacts go through the gzip helpers defined above.
    search_tokens = token_expand("https://github.com/canunj/Auto-BoardGame/blob/main/Persistent%20Objects/token_search.gz?raw=true")
    category_keys = reader("https://github.com/canunj/Auto-BoardGame/blob/main/Persistent%20Objects/current_keys.gz?raw=true")
    st.sidebar.success("Fetched Data!")
    return slim_df, search_tokens, vector_df, category_keys, [1, 0]
|
151 |
+
|
152 |
+
slim_df, search_tokens, vector_df, category_keys, coop = fetch_data()
|
153 |
+
|
154 |
+
ex_check = ["[Ee]verquest","[Cc]ivilization [Ii][IiVv]","[Cc]ivilization(?=:)","[Cc]ivilization [Ii][Ii]",
|
155 |
+
"[Cc]ivilization [Ii][Ii][Ii]","[Cc]ivilization V","[Aa]ge [Oo]f [Ee]mpires [Ii][Ii2]([Ii]|\b)", "[Rr]avenloft|[Cc]astle [Rr]avenloft",
|
156 |
+
"[Ss]cythe(?=:|\b)","[Dd]ungeons [&Aa][ n][Dd ][ Ddr][Ddra][rg][oa][gn][os](ns|\b)",
|
157 |
+
"[Aa]ge [Oo]f [Ee]mpires [Ii][Ii]: [Tt]he [Aa]ge [Oo]f [Kk]ings","[Aa]ge [Oo]f [Ee]mpires 2: [Tt]he [Aa]ge [Oo]f [Kk]ings",
|
158 |
+
"[Aa]ge [Oo]f [Ee]mpires","Doctor Who"]
|
159 |
+
|
160 |
+
###Models
|
161 |
+
@st.cache_resource
|
162 |
+
def setup_models():
|
163 |
+
return Title_Generator('./t5_model', slim_df), input_manager(vector_df, slim_df, search_tokens), model_control(apikey=st.secrets.key,model_id=st.secrets.model)
|
164 |
+
|
165 |
+
Tgen, iman, mctrl = setup_models()
|
166 |
+
|
167 |
+
|
168 |
+
|
169 |
+
#UI
|
170 |
+
|
171 |
+
#Intro
|
172 |
+
st.title("""Auto-BG: The Game Concept Generator""")
|
173 |
+
|
174 |
+
with st.expander("How to use", expanded=True):
|
175 |
+
st.write(
|
176 |
+
"""
|
177 |
+
Discover the concept for your next favorite game!
|
178 |
+
|
179 |
+
How do you use Auto-BG?
|
180 |
+
|
181 |
+
Pick any set of tags from four selectors below: Family, Game, Mechanic, and Category.
|
182 |
+
If you are looking to lose together - activate the cooperative toggle.
|
183 |
+
|
184 |
+
See ? icons for detailed information on each type of tag.
|
185 |
+
|
186 |
+
Select any pre-configured demo below to see how Auto-BG works on the tag set for a popular board game.
|
187 |
+
"""
|
188 |
+
)
|
189 |
+
|
190 |
+
results = st.empty()
|
191 |
+
|
192 |
+
with st.expander('Demos'):
|
193 |
+
|
194 |
+
st.write("""These buttons run Auto-BG on the tag set for real games you might be familiar with,
|
195 |
+
choose a button and the corresponding tags automatically fill the selectors below.
|
196 |
+
Press run and see how Auto-BG creates an alternate concept for these hit titles!
|
197 |
+
""")
|
198 |
+
|
199 |
+
b1, b2, b3 = st.columns(3)
|
200 |
+
|
201 |
+
with b1:
|
202 |
+
SoC = st.button('Catan', use_container_width=True)
|
203 |
+
if SoC:
|
204 |
+
st.session_state.f_d = [
|
205 |
+
'Animals: Sheep',
|
206 |
+
'Components: Hexagonal Tiles',
|
207 |
+
'Components: Wooden pieces & boards'
|
208 |
+
]
|
209 |
+
st.session_state.g_d = ['Family Game', 'Strategy Game']
|
210 |
+
st.session_state.m_d = [
|
211 |
+
'Hexagon Grid',
|
212 |
+
'Network and Route Building',
|
213 |
+
'Random Production',
|
214 |
+
'Trading',
|
215 |
+
'Variable Set-up'
|
216 |
+
]
|
217 |
+
st.session_state.c_d = [
|
218 |
+
'Economic',
|
219 |
+
'Negotiation'
|
220 |
+
]
|
221 |
+
st.session_state.coop_d = 0
|
222 |
+
|
223 |
+
with b2:
|
224 |
+
TtR = st.button('Ticket to Ride', use_container_width=True)
|
225 |
+
if TtR:
|
226 |
+
st.session_state.f_d = [
|
227 |
+
'Components: Map (Continental / National scale)',
|
228 |
+
'Continents: North America',
|
229 |
+
'Country: USA'
|
230 |
+
]
|
231 |
+
st.session_state.g_d = ['Family Game']
|
232 |
+
st.session_state.m_d = [
|
233 |
+
'Contracts',
|
234 |
+
'End Game Bonuses',
|
235 |
+
'Network and Route Building',
|
236 |
+
'Push Your Luck',
|
237 |
+
'Set Collection'
|
238 |
+
]
|
239 |
+
st.session_state.c_d = [
|
240 |
+
'Trains'
|
241 |
+
]
|
242 |
+
st.session_state.coop_d = 0
|
243 |
+
|
244 |
+
with b3:
|
245 |
+
P = st.button('Pandemic', use_container_width=True)
|
246 |
+
if P:
|
247 |
+
st.session_state.f_d = [
|
248 |
+
'Components: Map (Global Scale)',
|
249 |
+
'Components: Multi-Use Cards',
|
250 |
+
'Medical: Diseases',
|
251 |
+
'Region: The World',
|
252 |
+
'Theme: Science'
|
253 |
+
]
|
254 |
+
st.session_state.g_d = ['Family Game', 'Strategy Game']
|
255 |
+
st.session_state.m_d = [
|
256 |
+
'Action Points',
|
257 |
+
'Point to Point Movement',
|
258 |
+
'Trading',
|
259 |
+
'Variable Player Powers'
|
260 |
+
]
|
261 |
+
st.session_state.c_d = [
|
262 |
+
'Medical'
|
263 |
+
]
|
264 |
+
st.session_state.coop_d = 1
|
265 |
+
|
266 |
+
#Form
|
267 |
+
with st.expander("Auto-BG", expanded=True):
|
268 |
+
|
269 |
+
col1, col2 = st.columns(2)
|
270 |
+
|
271 |
+
with col1:
|
272 |
+
Family_v = st.multiselect("Family", options=pd.Series(category_keys[4][8:]), key='Family', default=st.session_state.f_d, max_selections=6, help='Descriptive niches for groupings of games.\n Maximum of six choices.')
|
273 |
+
|
274 |
+
with col2:
|
275 |
+
Game_v = st.multiselect("Game", options=pd.Series(category_keys[1]), key='Game', default=st.session_state.g_d, max_selections=2, help='Top level genres - Family, Strategy, etc.\n Maximum of two choices.')
|
276 |
+
|
277 |
+
col3, col4 = st.columns(2)
|
278 |
+
|
279 |
+
with col3:
|
280 |
+
Category_v = st.multiselect("Category", options=pd.Series(category_keys[3]), key='Category', default=st.session_state.c_d, max_selections=3, help='Expanded genre tags.\n Maximum of three choices.')
|
281 |
+
|
282 |
+
with col4:
|
283 |
+
Mechanics_v = st.multiselect("Mechanics", options=pd.Series([x for x in category_keys[2] if x != "Cooperative Game"]), key='Mechanic', default=st.session_state.m_d, max_selections=5, help='Game rules!\n Maximum of five choices.')
|
284 |
+
|
285 |
+
Cooperative_v = st.checkbox('Cooperative?', value=st.session_state.coop_d, key='CoopCheck')
|
286 |
+
|
287 |
+
run = st.button("Run Model", use_container_width=True)
|
288 |
+
|
289 |
+
if run:
|
290 |
+
if st.session_state.inputs == revert_cats(Game_v, Mechanics_v, Category_v, Family_v, Cooperative_v):
|
291 |
+
st.write('Inputs did not change, results currently loaded.')
|
292 |
+
else:
|
293 |
+
|
294 |
+
st.session_state.desc_iter = 0
|
295 |
+
st.session_state.title_iter = 0
|
296 |
+
st.session_state.output_dict = {}
|
297 |
+
|
298 |
+
if Cooperative_v == True:
|
299 |
+
Mechanics_v.append('Cooperative Game')
|
300 |
+
|
301 |
+
st.session_state.inputs = revert_cats(Game_v, Mechanics_v, Category_v, Family_v, Cooperative_v)
|
302 |
+
builder(st.session_state.inputs)
|
303 |
+
st.session_state.cur_pair = title_check()
|
304 |
+
|
305 |
+
if st.session_state.output_dict == {}:
|
306 |
+
results.empty()
|
307 |
+
else:
|
308 |
+
with results.expander('Results', expanded=True):
|
309 |
+
|
310 |
+
st.write(
|
311 |
+
"""
|
312 |
+
#### Title:
|
313 |
+
""")
|
314 |
+
|
315 |
+
|
316 |
+
|
317 |
+
st.write(st.session_state.cur_pair[0])
|
318 |
+
|
319 |
+
|
320 |
+
t_col1, t_col2 = st.columns(2)
|
321 |
+
with t_col1:
|
322 |
+
st.button("See Previous Title", on_click=PT_button_clicked, use_container_width=True)
|
323 |
+
|
324 |
+
with t_col2:
|
325 |
+
st.button("See Next Title", on_click=NT_button_clicked, use_container_width=True)
|
326 |
+
|
327 |
+
st.write(
|
328 |
+
"""
|
329 |
+
#### Description:
|
330 |
+
""")
|
331 |
+
st.write(st.session_state.cur_pair[1].replace('$','\$'))
|
332 |
+
|
333 |
+
d_col1, d_col2 = st.columns(2)
|
334 |
+
with d_col1:
|
335 |
+
st.button("See Previous Description", on_click=PD_button_clicked, use_container_width=True)
|
336 |
+
|
337 |
+
with d_col2:
|
338 |
+
st.button("See Next Description", on_click=ND_button_clicked, use_container_width=True)
|
339 |
+
|
340 |
+
|
341 |
+
|
342 |
+
page_names_to_funcs = {
|
343 |
+
"Application": application
|
344 |
+
}
|
345 |
+
|
346 |
+
demo_name = st.sidebar.selectbox("Choose a page:", page_names_to_funcs.keys())
|
347 |
+
page_names_to_funcs[demo_name]()
|
348 |
+
|
Model_Constants_Template.py
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
def SEND_KEY():
    """Return the OpenAI API key (template placeholder — fill in locally)."""
    return ""

def SEND_MODEL():
    """Return the fine-tuned OpenAI model id (template placeholder)."""
    return ""
|
Model_Step_Data/slim_df.parquet.gzip
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8eb032341c8bacc24ffee96e2a1b3201a0ab6c2837567956ba1ddb9492e056dc
|
3 |
+
size 16243764
|
Model_Step_Data/vector_df.parquet.gzip
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:eaf463f341982a460862da6ee77bbed38ad92ad36c4aef10bc031828681ef83f
|
3 |
+
size 3803902
|
Persistent Objects/current_keys.gz
ADDED
Binary file (39.7 kB). View file
|
|
Persistent Objects/token_search.gz
ADDED
Binary file (144 kB). View file
|
|
README.md
CHANGED
@@ -1,13 +1,48 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[icon banner image placeholder]
|
2 |
+
|
3 |
+
# Auto-BG
|
4 |
+
LLM-based text generation tool for creating board game concepts (description & title)
|
5 |
+
|
6 |
+
The Auto-BG (Board Game) tool is a text generation tool for creating board game concepts. It utilizes multiple large-language models to generate board game titles and descriptions tailored from user-input tags based on BoardGameGeek.com. The models used in this project include a trained T5 sequence-to-sequence model, primarily for title generation, and a robust GPT3 model for board game description generation. The T5 model was initially presented by Raffel et al. in ["Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer"](https://arxiv.org/pdf/1910.10683.pdf). The GPT3 model builds from Brown et al.'s work in ["Language Models are Few-Shot Learners"](https://arxiv.org/pdf/2005.14165.pdf).
|
7 |
+
|
8 |
+
|
9 |
+
## Table of Contents
|
10 |
+
- Features and Demo
|
11 |
+
- Examples
|
12 |
+
- Project Structure
|
13 |
+
- Customizing Auto-BG
|
14 |
+
- Citations and Licensing
|
15 |
+
|
16 |
+
## Features and Demo
|
17 |
+
The main features of this application include:
|
18 |
+
|
19 |
+
A user-friendly interface for Auto-BG can be found at (homepage).
|
20 |
+
|
21 |
+
## Examples
|
22 |
+
|
23 |
+
## Project Structure
|
24 |
+
|
25 |
+
## Customizing Auto-BG
|
26 |
+
NOTE: Auto-BG uses a fine-tuned GPT-3 Curie model that will be inaccessible without an organizational API key,
|
27 |
+
the below instructions are for advanced users interested in remixing Auto-BG with a new generator model.
|
28 |
+
|
29 |
+
In order to run this application, you will need the following:
|
30 |
+
1. An OpenAI account and API key
|
31 |
+
2. All libraries specified in both the primary and data processing requirements.txt files
|
32 |
+
3. A raw stream JSON file of BoardGameGeek data, formatted to match output from the Recommend.Games scraper
|
33 |
+
|
34 |
+
To implement a new instance of Auto-BG, follow these steps:
|
35 |
+
1. Clone the repository onto your local machine
|
36 |
+
2. Install the required packages listed in both 'requirements.txt' files using pip
|
37 |
+
3. Download the trained T5 model or provide a path to an alternate T5 model.
|
38 |
+
4. Placing the JSON data file in Stream_to_Output, run GameCleaner.py - this provides all required data files.
|
39 |
+
|
40 |
+
5. Prepare training prompts - convert all active keys to period stopped tokens in a string for each game.
|
41 |
+
6. Fine-tune a selected model following the instructions at: https://platform.openai.com/docs/guides/fine-tuning
|
42 |
+
NOTE: Auto-BG uses a Curie model with a lowered learning rate running for fewer epochs.
|
43 |
+
|
44 |
+
7. Create a Model_Constants.py file with your personal API key and model instance based on the template above.
|
45 |
+
8. You now have a customized instance of Auto-BG!
|
46 |
+
|
47 |
+
## Citations and Licensing
|
48 |
+
Auto-BG is licensed under CC BY-NC-SA 2.0, original data sourced from Recommend.Games @GitLab
|
Stream_to_Output/GameCleaner.py
ADDED
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import numpy as np
|
3 |
+
import re
|
4 |
+
import nltk
|
5 |
+
from nltk.corpus import stopwords
|
6 |
+
from gensim.parsing import preprocess_string, strip_tags, strip_numeric, strip_multiple_whitespaces, stem_text, strip_punctuation, remove_stopwords
|
7 |
+
import spacy
|
8 |
+
from langdetect import detect
|
9 |
+
import pickle
|
10 |
+
import gzip
|
11 |
+
nltk.download('stopwords')
|
12 |
+
|
13 |
+
#function definitions
|
14 |
+
|
15 |
+
#strips values out of encoded stream lists
|
16 |
+
def text_col_cleaner(frame, cols, pattern):
    """Strip the value portion out of encoded stream lists.

    For every column in `cols`, each cell (a list of "Label: value" strings)
    is mapped to the list of labels — the first regex match per entry,
    whitespace-trimmed. NaN cells are left untouched via na_action='ignore'.
    """
    compiled = re.compile(pattern)

    def _labels(cell):
        return [re.findall(compiled, entry)[0].strip() for entry in cell]

    for name in cols:
        frame[name] = frame[name].map(_labels, na_action='ignore')
    return frame
|
23 |
+
|
24 |
+
#converts specified columns to one-hot
|
25 |
+
def encode_columns(frame):
    """One-hot encode every (list-valued) column of `frame`.

    Each column's lists are exploded, dummied with the column name as prefix,
    summed back per original row index, and concatenated onto the frame
    (original columns are kept alongside the indicators).
    """
    for col in list(frame.columns):
        exploded = frame[col].apply(pd.Series).stack()
        indicators = pd.get_dummies(exploded, prefix=col).groupby(level=0).sum()
        frame = pd.concat([frame, indicators], axis=1)
    return frame
|
31 |
+
|
32 |
+
#custom text processor for tokenizing descriptions by Kuan Chen & Nick Canu
|
33 |
+
def doc_text_preprocessing(ser):
    """Tokenize a Series of descriptions for language detection / modeling.

    Lemmatizes with spaCy, runs the gensim cleanup filters, then drops
    English stopwords plus domain-specific ones. Custom text processor by
    Kuan Chen & Nick Canu.
    """
    nlp = spacy.load("en_core_web_sm", exclude=['parser', 'ner', 'textcat'])

    stop_words = set(stopwords.words('english'))
    stop_words.update(['game', 'player', 'players', 'games', 'also',
                       'description', 'publisher'])

    drop_single_letters = lambda c: re.sub("\s+\w{1}\s+|\n|-|—", '', c)
    to_lower = lambda c: c.lower()
    filters = [remove_stopwords, strip_numeric, strip_punctuation, strip_tags,
               strip_multiple_whitespaces, drop_single_letters, to_lower]

    lemma_text = []
    for doc in ser.apply(lambda x: nlp(x)):
        lemmas = ' '.join(token.lemma_ for token in doc)
        lemma_text.append(preprocess_string(lemmas, filters))

    # Stopword pass runs after gensim filtering, matching the original order.
    return [[word for word in toks if word not in stop_words] for toks in lemma_text]
|
53 |
+
|
54 |
+
#performs english language detection on the descriptions w/langdetect then additionally drops games using non-english characters in the name
|
55 |
+
def lang_cleanup(frame):
    """English-language filtering for the scraped game data.

    Fills missing descriptions, runs langdetect on the cleaned/tokenized
    descriptions, keeps rows detected as English, then additionally drops
    games whose names contain non-ASCII characters.
    """
    nlp = spacy.load("en_core_web_sm")  # kept for parity with original (unused here)
    frame['description'] = frame['description'].fillna('no words')
    frame = frame[frame['description'] != 'no words']
    frame['cleaned_descriptions'] = doc_text_preprocessing(frame['description'])

    # langdetect wants plain text, so re-join the token lists per row.
    frame['lang'] = [detect(', '.join(words)) for words in frame.cleaned_descriptions]
    frame = frame[frame['lang'] == 'en']

    ascii_names = ~frame['name'].str.contains('[^\x00-\x7f]', flags=re.IGNORECASE)
    return frame[ascii_names]
|
70 |
+
|
71 |
+
|
72 |
+
#column name stripper for creating key values
|
73 |
+
def column_fixer(frame, targ):
    """Strip the `targ` prefix (and surrounding double quotes) from every
    column name starting with it, yielding the bare key values."""
    cleaned = []
    for col in frame.columns:
        if col.startswith(targ):
            cleaned.append(col.replace(targ, "").strip('"'))
    return cleaned
|
75 |
+
|
76 |
+
#creates key list for defining web app lists & nlp tokens of the same unknown input search
|
77 |
+
def key_collator(frame):
    """Build the two key artifacts consumed by the web app.

    Returns (current_keys, search_tokens):
    - current_keys: string key lists per feature class, with the fixed
      ['cooperative'] entry first;
    - search_tokens: the same lists run through spaCy, used for
      unknown-input similarity search.
    """
    nlp = spacy.load("en_core_web_sm")
    fam = column_fixer(frame, 'family_')
    gt = column_fixer(frame, 'game_type_')
    mec = column_fixer(frame, 'mechanic_')
    cat = column_fixer(frame, 'category_')

    current_keys = (['cooperative'], gt, mec, cat, fam)
    search_tokens = tuple([nlp(w) for w in group] for group in (gt, mec, cat, fam))

    return current_keys, search_tokens
|
94 |
+
|
95 |
+
|
96 |
+
#-----------
|
97 |
+
|
98 |
+
#reading in raw file & removing unranked and compilation game items
|
99 |
+
df = pd.read_json(r'./bgg_GameItem.jl', lines=True)
|
100 |
+
df['rank'] = df['rank'].fillna(0).astype(int)
|
101 |
+
df = df[(df['rank']>0) & (df['compilation']!=1)]
|
102 |
+
|
103 |
+
#separating and cleaning the one-hot target columns
|
104 |
+
in_df = text_col_cleaner(frame = df[['game_type','mechanic','category','family']],
|
105 |
+
cols = ['game_type','mechanic','category','family'],
|
106 |
+
pattern = re.compile("([\S ]+)(?=:)"))
|
107 |
+
|
108 |
+
print('Text has been cleaned, now encoding one-hot columns')
|
109 |
+
|
110 |
+
#encoding one-hot columns and rejoining to features for output
|
111 |
+
proc_df = encode_columns(in_df)
|
112 |
+
step = df[['name','description','cooperative']]
|
113 |
+
join_df = pd.concat([step,proc_df.drop(['game_type','mechanic','category','family',
|
114 |
+
'game_type_Amiga','game_type_Arcade','game_type_Atari ST',
|
115 |
+
'game_type_Commodore 64'],axis=1)],axis=1)
|
116 |
+
|
117 |
+
print('Columns encoded, now performing english language detection and cleanup')
|
118 |
+
|
119 |
+
#english language detection steps & first data save
|
120 |
+
eng_df = lang_cleanup(join_df)
|
121 |
+
eng_df = eng_df.loc[:,~eng_df.columns.duplicated()].copy().reset_index(drop=True).fillna(0)
|
122 |
+
|
123 |
+
print('Creating vector-only dataframe & saving output')
|
124 |
+
|
125 |
+
#vector only data for operations
|
126 |
+
vector_df = eng_df.copy().drop(['name','description','cleaned_descriptions','lang'],axis=1)
|
127 |
+
|
128 |
+
eng_df.to_parquet('game_data.parquet.gzip',compression='gzip')
|
129 |
+
vector_df.to_parquet('game_vectors.parquet.gzip',compression='gzip')
|
130 |
+
|
131 |
+
print('Creating key lists')
|
132 |
+
|
133 |
+
#creating key lists - 1. string list of values by feature class for defining input selections & 2. nlp processed list for unknown input search
|
134 |
+
keys, search_toks = key_collator(vector_df)
|
135 |
+
|
136 |
+
with gzip.open("current_keys.gz", "wb") as f:
|
137 |
+
pickle.dump(keys, f)
|
138 |
+
f.close()
|
139 |
+
|
140 |
+
with gzip.open("key_search_tokens.gz", "wb") as f:
|
141 |
+
pickle.dump(search_toks, f)
|
142 |
+
f.close()
|
143 |
+
|
144 |
+
print('File creation is complete')
|
Stream_to_Output/requirements.txt
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
gensim==4.3.1
|
2 |
+
langdetect==1.0.9
|
3 |
+
nltk==3.8.1
|
4 |
+
numpy==1.24.2
|
5 |
+
pandas==1.3.2
|
6 |
+
spacy==3.5.1
|
__pycache__/Model_Constants.cpython-39.pyc
ADDED
Binary file (457 Bytes). View file
|
|
__pycache__/description_generator.cpython-39.pyc
ADDED
Binary file (4.62 kB). View file
|
|
__pycache__/title_generator.cpython-39.pyc
ADDED
Binary file (6.8 kB). View file
|
|
description_generator.py
ADDED
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import numpy as np
|
3 |
+
import re
|
4 |
+
import spacy
|
5 |
+
import openai
|
6 |
+
from operator import itemgetter
|
7 |
+
#user input manager class
|
8 |
+
class input_manager:
    """Translates user tag selections into the one-hot key vector the
    description pipeline consumes.

    Unknown tags are resolved to the most similar known tag within the same
    feature class using spaCy vector similarity.
    """

    # Tag prefix -> index into self.search_tokens, checked in this order.
    _PREFIXES = (("game_type_", 0), ("mechanic_", 1),
                 ("category_", 2), ("family_", 3))

    def __init__(self, key_df, slim_df, search_tokens, top_n=10):
        """Initialize the key dictionary from the vector data frame columns
        and set the community top-N size."""
        self.key_df = key_df
        self.slim_df = slim_df
        self.search_tokens = search_tokens
        # Every known key starts deactivated (0.0).
        self.key = dict(zip(list(key_df.columns), np.zeros(len(key_df.columns))))
        self.top_n = top_n

    def set_input(self, input_cats):
        """Return a copy of the key vector with `input_cats` activated (1.0).

        Categories not present in the key dictionary are matched to the most
        similar known token of their feature class; non-matches (similarity
        <= 0, i.e. the model doesn't recognize the word) are discarded —
        other solutions were performance prohibitive.
        """
        nlp = spacy.load("en_core_web_md")
        # Separate known from unknown features.
        k_flags = [cat for cat in input_cats if cat in self.key]
        unk_flags = [cat for cat in input_cats if cat not in self.key]

        if len(unk_flags) > 0:
            outs = []
            for word in unk_flags:
                for prefix, idx in self._PREFIXES:
                    if re.match(prefix, word):
                        tok = nlp(word.split("_")[-1])
                        mtch = max(((key, key.similarity(tok))
                                    for key in self.search_tokens[idx]),
                                   key=itemgetter(1))
                        if mtch[1] > 0:
                            # BUGFIX: mtch[0] is a spaCy Doc; the original
                            # concatenated it directly to the prefix string
                            # (TypeError) in every branch except family_.
                            outs.append(prefix + str(mtch[0]))
                        break
            # Rejoin nearest matches to the known set, de-duplicated.
            k_flags = list(set(k_flags + outs))

        # Preserve global key; output a copy with input keys set to 1.0.
        d = self.key.copy()
        for cat in k_flags:
            d[cat] = 1.0
        return d

    def input_parser(self, in_vec):
        """Extract the activated key names from a processed key vector."""
        return [k for k, v in in_vec.items() if v == 1]
|
69 |
+
|
70 |
+
class model_control:
    """Wraps a fine-tuned OpenAI completion model for description generation."""

    def __init__(self, apikey, model_id):
        """Store the API key and resolve the fine-tuned model name.

        apikey:   OpenAI API key.
        model_id: fine-tune job id; the resulting model name is retrieved
                  from the API.
        """
        self.api_key = apikey
        openai.api_key = self.api_key

        # Built later by prompt_formatter().
        self.prompt = None

        self.model = openai.FineTune.retrieve(id=model_id).fine_tuned_model

    def prompt_formatter(self, ks):
        """Join the activated keys into the fine-tune prompt format."""
        self.prompt = ". ".join(ks) + "\n\n###\n\n"

    def call_api(self, status=0):
        """Call the completion endpoint and return the generated text.

        status selects the sampling parameters: 0 = first attempt,
        1 = conservative retry, 2 = diverse retry.  Any other value falls
        back to the status-0 settings (the original left temp/pres unbound
        and raised UnboundLocalError for unexpected values).
        """
        if status == 1:
            temp, pres = 0.4, 0.6
        elif status == 2:
            temp, pres = 0.5, 0.8
        else:
            # status 0 and any unexpected value.
            temp, pres = 0.5, 0.7

        answer = openai.Completion.create(
            model=self.model,
            prompt=self.prompt,
            max_tokens=512,
            temperature=temp,
            stop=["END"],
            presence_penalty=pres,
            frequency_penalty=0.5
        )
        return answer['choices'][0]['text']

    def resp_cleanup(self, text):
        """Strip a trailing unfinished sentence and credit boilerplate.

        Removes sentences mentioning designers/artists/publishers and the
        doubled punctuation their removal leaves behind.
        """
        # Guard: the original indexed text[-1] and crashed on an empty
        # completion.
        if not text:
            return text

        # Drop a trailing fragment that does not end in sentence punctuation.
        if ((text[-1] != "!") & (text[-1] != ".") & (text[-1] != "?")):
            text = " ".join([e + '.' for e in text.split('.')[0:-1] if e])

        sent = re.split(r'([.?!:])', text)
        phrases = ["[Dd]esigned by", "[Dd]esigner of", "[Aa]rt by", "[Aa]rtist of", "[Pp]ublished", "[Pp]ublisher of"]

        pat = re.compile("(?:" + "|".join(phrases) + ")")
        # Collapses the doubled punctuation left when a credit sentence is
        # removed between two terminators.
        fix = re.compile("(?<=[.!?])[.!?]")

        text = re.sub(fix, '', ''.join([s for s in sent if pat.search(s) == None]))

        return text
requirements.txt
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
gensim==4.3.1
|
2 |
+
langdetect==1.0.9
|
3 |
+
nltk==3.8.1
|
4 |
+
numpy==1.24.2
|
5 |
+
openai==0.27.2
|
6 |
+
pandas==1.3.2
|
7 |
+
scikit_learn==1.2.2
|
8 |
+
spacy==3.5.1
|
9 |
+
streamlit==1.20.0
|
10 |
+
torch==2.0.0
|
11 |
+
transformers==4.27.3
|
t5_model/config.json
ADDED
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "Michau/t5-base-en-generate-headline",
|
3 |
+
"architectures": [
|
4 |
+
"T5ForConditionalGeneration"
|
5 |
+
],
|
6 |
+
"d_ff": 3072,
|
7 |
+
"d_kv": 64,
|
8 |
+
"d_model": 768,
|
9 |
+
"decoder_start_token_id": 0,
|
10 |
+
"dense_act_fn": "relu",
|
11 |
+
"dropout_rate": 0.1,
|
12 |
+
"eos_token_id": 1,
|
13 |
+
"feed_forward_proj": "relu",
|
14 |
+
"initializer_factor": 1.0,
|
15 |
+
"is_encoder_decoder": true,
|
16 |
+
"is_gated_act": false,
|
17 |
+
"layer_norm_epsilon": 1e-06,
|
18 |
+
"model_type": "t5",
|
19 |
+
"n_positions": 512,
|
20 |
+
"num_decoder_layers": 12,
|
21 |
+
"num_heads": 12,
|
22 |
+
"num_layers": 12,
|
23 |
+
"output_past": true,
|
24 |
+
"pad_token_id": 0,
|
25 |
+
"relative_attention_max_distance": 128,
|
26 |
+
"relative_attention_num_buckets": 32,
|
27 |
+
"task_specific_params": {
|
28 |
+
"summarization": {
|
29 |
+
"early_stopping": true,
|
30 |
+
"length_penalty": 2.0,
|
31 |
+
"max_length": 200,
|
32 |
+
"min_length": 30,
|
33 |
+
"no_repeat_ngram_size": 3,
|
34 |
+
"num_beams": 4,
|
35 |
+
"prefix": "summarize: "
|
36 |
+
},
|
37 |
+
"translation_en_to_de": {
|
38 |
+
"early_stopping": true,
|
39 |
+
"max_length": 300,
|
40 |
+
"num_beams": 4,
|
41 |
+
"prefix": "translate English to German: "
|
42 |
+
},
|
43 |
+
"translation_en_to_fr": {
|
44 |
+
"early_stopping": true,
|
45 |
+
"max_length": 300,
|
46 |
+
"num_beams": 4,
|
47 |
+
"prefix": "translate English to French: "
|
48 |
+
},
|
49 |
+
"translation_en_to_ro": {
|
50 |
+
"early_stopping": true,
|
51 |
+
"max_length": 300,
|
52 |
+
"num_beams": 4,
|
53 |
+
"prefix": "translate English to Romanian: "
|
54 |
+
}
|
55 |
+
},
|
56 |
+
"torch_dtype": "float32",
|
57 |
+
"transformers_version": "4.26.1",
|
58 |
+
"use_cache": true,
|
59 |
+
"vocab_size": 32128
|
60 |
+
}
|
t5_model/generation_config.json
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_from_model_config": true,
|
3 |
+
"decoder_start_token_id": 0,
|
4 |
+
"eos_token_id": 1,
|
5 |
+
"pad_token_id": 0,
|
6 |
+
"transformers_version": "4.26.1"
|
7 |
+
}
|
t5_model/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e3f73b04bb3e12b9bd1f02b88f98648da9c317f734a61e9805ae385c1c57671d
|
3 |
+
size 891702929
|
t5_model/special_tokens_map.json
ADDED
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"additional_special_tokens": [
|
3 |
+
"<extra_id_0>",
|
4 |
+
"<extra_id_1>",
|
5 |
+
"<extra_id_2>",
|
6 |
+
"<extra_id_3>",
|
7 |
+
"<extra_id_4>",
|
8 |
+
"<extra_id_5>",
|
9 |
+
"<extra_id_6>",
|
10 |
+
"<extra_id_7>",
|
11 |
+
"<extra_id_8>",
|
12 |
+
"<extra_id_9>",
|
13 |
+
"<extra_id_10>",
|
14 |
+
"<extra_id_11>",
|
15 |
+
"<extra_id_12>",
|
16 |
+
"<extra_id_13>",
|
17 |
+
"<extra_id_14>",
|
18 |
+
"<extra_id_15>",
|
19 |
+
"<extra_id_16>",
|
20 |
+
"<extra_id_17>",
|
21 |
+
"<extra_id_18>",
|
22 |
+
"<extra_id_19>",
|
23 |
+
"<extra_id_20>",
|
24 |
+
"<extra_id_21>",
|
25 |
+
"<extra_id_22>",
|
26 |
+
"<extra_id_23>",
|
27 |
+
"<extra_id_24>",
|
28 |
+
"<extra_id_25>",
|
29 |
+
"<extra_id_26>",
|
30 |
+
"<extra_id_27>",
|
31 |
+
"<extra_id_28>",
|
32 |
+
"<extra_id_29>",
|
33 |
+
"<extra_id_30>",
|
34 |
+
"<extra_id_31>",
|
35 |
+
"<extra_id_32>",
|
36 |
+
"<extra_id_33>",
|
37 |
+
"<extra_id_34>",
|
38 |
+
"<extra_id_35>",
|
39 |
+
"<extra_id_36>",
|
40 |
+
"<extra_id_37>",
|
41 |
+
"<extra_id_38>",
|
42 |
+
"<extra_id_39>",
|
43 |
+
"<extra_id_40>",
|
44 |
+
"<extra_id_41>",
|
45 |
+
"<extra_id_42>",
|
46 |
+
"<extra_id_43>",
|
47 |
+
"<extra_id_44>",
|
48 |
+
"<extra_id_45>",
|
49 |
+
"<extra_id_46>",
|
50 |
+
"<extra_id_47>",
|
51 |
+
"<extra_id_48>",
|
52 |
+
"<extra_id_49>",
|
53 |
+
"<extra_id_50>",
|
54 |
+
"<extra_id_51>",
|
55 |
+
"<extra_id_52>",
|
56 |
+
"<extra_id_53>",
|
57 |
+
"<extra_id_54>",
|
58 |
+
"<extra_id_55>",
|
59 |
+
"<extra_id_56>",
|
60 |
+
"<extra_id_57>",
|
61 |
+
"<extra_id_58>",
|
62 |
+
"<extra_id_59>",
|
63 |
+
"<extra_id_60>",
|
64 |
+
"<extra_id_61>",
|
65 |
+
"<extra_id_62>",
|
66 |
+
"<extra_id_63>",
|
67 |
+
"<extra_id_64>",
|
68 |
+
"<extra_id_65>",
|
69 |
+
"<extra_id_66>",
|
70 |
+
"<extra_id_67>",
|
71 |
+
"<extra_id_68>",
|
72 |
+
"<extra_id_69>",
|
73 |
+
"<extra_id_70>",
|
74 |
+
"<extra_id_71>",
|
75 |
+
"<extra_id_72>",
|
76 |
+
"<extra_id_73>",
|
77 |
+
"<extra_id_74>",
|
78 |
+
"<extra_id_75>",
|
79 |
+
"<extra_id_76>",
|
80 |
+
"<extra_id_77>",
|
81 |
+
"<extra_id_78>",
|
82 |
+
"<extra_id_79>",
|
83 |
+
"<extra_id_80>",
|
84 |
+
"<extra_id_81>",
|
85 |
+
"<extra_id_82>",
|
86 |
+
"<extra_id_83>",
|
87 |
+
"<extra_id_84>",
|
88 |
+
"<extra_id_85>",
|
89 |
+
"<extra_id_86>",
|
90 |
+
"<extra_id_87>",
|
91 |
+
"<extra_id_88>",
|
92 |
+
"<extra_id_89>",
|
93 |
+
"<extra_id_90>",
|
94 |
+
"<extra_id_91>",
|
95 |
+
"<extra_id_92>",
|
96 |
+
"<extra_id_93>",
|
97 |
+
"<extra_id_94>",
|
98 |
+
"<extra_id_95>",
|
99 |
+
"<extra_id_96>",
|
100 |
+
"<extra_id_97>",
|
101 |
+
"<extra_id_98>",
|
102 |
+
"<extra_id_99>"
|
103 |
+
],
|
104 |
+
"eos_token": "</s>",
|
105 |
+
"pad_token": "<pad>",
|
106 |
+
"unk_token": "<unk>"
|
107 |
+
}
|
t5_model/spiece.model
ADDED
Binary file (792 kB). View file
|
|
t5_model/tokenizer_config.json
ADDED
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"additional_special_tokens": [
|
3 |
+
"<extra_id_0>",
|
4 |
+
"<extra_id_1>",
|
5 |
+
"<extra_id_2>",
|
6 |
+
"<extra_id_3>",
|
7 |
+
"<extra_id_4>",
|
8 |
+
"<extra_id_5>",
|
9 |
+
"<extra_id_6>",
|
10 |
+
"<extra_id_7>",
|
11 |
+
"<extra_id_8>",
|
12 |
+
"<extra_id_9>",
|
13 |
+
"<extra_id_10>",
|
14 |
+
"<extra_id_11>",
|
15 |
+
"<extra_id_12>",
|
16 |
+
"<extra_id_13>",
|
17 |
+
"<extra_id_14>",
|
18 |
+
"<extra_id_15>",
|
19 |
+
"<extra_id_16>",
|
20 |
+
"<extra_id_17>",
|
21 |
+
"<extra_id_18>",
|
22 |
+
"<extra_id_19>",
|
23 |
+
"<extra_id_20>",
|
24 |
+
"<extra_id_21>",
|
25 |
+
"<extra_id_22>",
|
26 |
+
"<extra_id_23>",
|
27 |
+
"<extra_id_24>",
|
28 |
+
"<extra_id_25>",
|
29 |
+
"<extra_id_26>",
|
30 |
+
"<extra_id_27>",
|
31 |
+
"<extra_id_28>",
|
32 |
+
"<extra_id_29>",
|
33 |
+
"<extra_id_30>",
|
34 |
+
"<extra_id_31>",
|
35 |
+
"<extra_id_32>",
|
36 |
+
"<extra_id_33>",
|
37 |
+
"<extra_id_34>",
|
38 |
+
"<extra_id_35>",
|
39 |
+
"<extra_id_36>",
|
40 |
+
"<extra_id_37>",
|
41 |
+
"<extra_id_38>",
|
42 |
+
"<extra_id_39>",
|
43 |
+
"<extra_id_40>",
|
44 |
+
"<extra_id_41>",
|
45 |
+
"<extra_id_42>",
|
46 |
+
"<extra_id_43>",
|
47 |
+
"<extra_id_44>",
|
48 |
+
"<extra_id_45>",
|
49 |
+
"<extra_id_46>",
|
50 |
+
"<extra_id_47>",
|
51 |
+
"<extra_id_48>",
|
52 |
+
"<extra_id_49>",
|
53 |
+
"<extra_id_50>",
|
54 |
+
"<extra_id_51>",
|
55 |
+
"<extra_id_52>",
|
56 |
+
"<extra_id_53>",
|
57 |
+
"<extra_id_54>",
|
58 |
+
"<extra_id_55>",
|
59 |
+
"<extra_id_56>",
|
60 |
+
"<extra_id_57>",
|
61 |
+
"<extra_id_58>",
|
62 |
+
"<extra_id_59>",
|
63 |
+
"<extra_id_60>",
|
64 |
+
"<extra_id_61>",
|
65 |
+
"<extra_id_62>",
|
66 |
+
"<extra_id_63>",
|
67 |
+
"<extra_id_64>",
|
68 |
+
"<extra_id_65>",
|
69 |
+
"<extra_id_66>",
|
70 |
+
"<extra_id_67>",
|
71 |
+
"<extra_id_68>",
|
72 |
+
"<extra_id_69>",
|
73 |
+
"<extra_id_70>",
|
74 |
+
"<extra_id_71>",
|
75 |
+
"<extra_id_72>",
|
76 |
+
"<extra_id_73>",
|
77 |
+
"<extra_id_74>",
|
78 |
+
"<extra_id_75>",
|
79 |
+
"<extra_id_76>",
|
80 |
+
"<extra_id_77>",
|
81 |
+
"<extra_id_78>",
|
82 |
+
"<extra_id_79>",
|
83 |
+
"<extra_id_80>",
|
84 |
+
"<extra_id_81>",
|
85 |
+
"<extra_id_82>",
|
86 |
+
"<extra_id_83>",
|
87 |
+
"<extra_id_84>",
|
88 |
+
"<extra_id_85>",
|
89 |
+
"<extra_id_86>",
|
90 |
+
"<extra_id_87>",
|
91 |
+
"<extra_id_88>",
|
92 |
+
"<extra_id_89>",
|
93 |
+
"<extra_id_90>",
|
94 |
+
"<extra_id_91>",
|
95 |
+
"<extra_id_92>",
|
96 |
+
"<extra_id_93>",
|
97 |
+
"<extra_id_94>",
|
98 |
+
"<extra_id_95>",
|
99 |
+
"<extra_id_96>",
|
100 |
+
"<extra_id_97>",
|
101 |
+
"<extra_id_98>",
|
102 |
+
"<extra_id_99>"
|
103 |
+
],
|
104 |
+
"eos_token": "</s>",
|
105 |
+
"extra_ids": 100,
|
106 |
+
"model_max_length": 512,
|
107 |
+
"name_or_path": "Michau/t5-base-en-generate-headline",
|
108 |
+
"pad_token": "<pad>",
|
109 |
+
"sp_model_kwargs": {},
|
110 |
+
"special_tokens_map_file": "/root/.cache/huggingface/hub/models--Michau--t5-base-en-generate-headline/snapshots/f526532f788c45b6b6288286e5ef929fa768ef6a/special_tokens_map.json",
|
111 |
+
"tokenizer_class": "T5Tokenizer",
|
112 |
+
"truncate": true,
|
113 |
+
"unk_token": "<unk>"
|
114 |
+
}
|
title_generator.py
ADDED
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import re
|
3 |
+
from nltk.corpus import stopwords
|
4 |
+
from gensim.parsing import preprocess_string, strip_tags, strip_numeric, strip_multiple_whitespaces, stem_text, strip_punctuation, remove_stopwords
|
5 |
+
import spacy
|
6 |
+
import torch
|
7 |
+
from transformers import T5ForConditionalGeneration,T5Tokenizer
|
8 |
+
|
9 |
+
#Custom text tokenizer from https://github.com/canunj/deconstructing_games by N Canu & K Chen
def doc_text_preprocessing(ser):
    """Lemmatize, clean and tokenize a pandas Series of document strings.

    ser: pandas Series of raw text documents.
    Returns a list of token lists (one per document) with stop words,
    numbers, punctuation, tags, single letters and extra whitespace removed.
    """
    # Parser/NER/textcat are excluded: only tagging/lemmatization is needed.
    nlp = spacy.load("en_core_web_sm", exclude=['parser', 'ner', 'textcat'])

    stop_words = set(stopwords.words('english'))

    # Raw string avoids invalid-escape warnings for \s / \w (the original
    # used a plain string).  Removes isolated single letters, newlines and
    # dashes.
    single_letter_replace = lambda c: re.sub(r"\s+\w{1}\s+|\n|-|—", '', c)
    to_lower_func = lambda c: c.lower()

    # Lemmatize each document with spaCy, then run the gensim filter chain
    # over the joined lemmas.
    lemma_text = [preprocess_string(
        ' '.join([token.lemma_ for token in desc]),
        [remove_stopwords, strip_numeric, strip_punctuation, strip_tags,
         strip_multiple_whitespaces, single_letter_replace, to_lower_func]
    ) for desc in ser.apply(lambda x: nlp(x))]

    # Second stop-word pass catches words reintroduced by lemmatization.
    tokenize_text = [[word for word in string if word not in stop_words] for string in lemma_text]

    return tokenize_text
class Title_Generator:
    """Generates candidate titles for a board-game description with a
    fine-tuned T5 headline model, then ranks them by spaCy similarity
    against the description text.
    """

    def __init__(self, path, df):
        """Load the T5 model/tokenizer and keep the known-game data frame.

        path: directory containing the fine-tuned T5 model files.
        df:   data frame of known games; its ``name`` column is used to
              reject candidate titles that already exist.
        """
        self.model = T5ForConditionalGeneration.from_pretrained(path)
        self.tokenizer = T5Tokenizer.from_pretrained(path)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        self.game_df = df

        # Iteration state consumed by the UI layer.
        self.title_iter = -1
        self.out_titles = None
        self.best_title = None
        self.description = None

    def candidate_generator(self, description):
        """Return (candidate_titles, description) from diverse beam search."""
        text = "headline: " + description

        encoding = self.tokenizer.encode_plus(text, return_tensors="pt")
        input_ids = encoding["input_ids"].to(self.device)
        attention_masks = encoding["attention_mask"].to(self.device)

        candidates = []

        # 16 beams in 4 diversity groups -> 8 reasonably distinct titles.
        beam_outputs = self.model.generate(
            input_ids=input_ids,
            attention_mask=attention_masks,
            max_length=64,
            num_beams=16,
            num_beam_groups=4,
            num_return_sequences=8,
            diversity_penalty=.1,
            repetition_penalty=.9,
            early_stopping=True)

        for result in beam_outputs:
            # Strip the T5 pad/eos markers from the decoded sequence.
            res = self.tokenizer.decode(result).replace('<pad> ', '').replace('</s>', '').replace('<pad>', '')
            candidates.append(res)

        return candidates, description

    def candidate_score(self, candidates, ex_check=None):
        """Score candidate titles against the description.

        candidates: (title_list, description) as returned by
                    candidate_generator.
        ex_check:   optional list of regex fragments for titles the user
                    already rejected.
        Returns {'text': cleaned description,
                 'titles': [(title, score), ...]} sorted by descending score.
        """
        import random
        from operator import itemgetter

        # Mask candidate titles (and excluded ones) that appear verbatim in
        # the description so they cannot inflate their own similarity score.
        if ex_check != None:
            pat = re.compile("((?:" + "|".join(map(re.escape, candidates[0]+[cand.upper() for cand in candidates[0]])) + "|" + "|".join(ex_check) + "))")
        else:
            pat = re.compile("((?:" + "|".join(map(re.escape, candidates[0]+[cand.upper() for cand in candidates[0]])) + "))")
        desc = re.sub(pat, "__", candidates[1])

        # A title leaked into the description: regenerate candidates from the
        # masked text and drop any that match the exclusion list.
        if re.search(re.compile(re.escape("__")), desc):
            hold = candidates[0]
            gen_desc = re.sub(re.compile(re.escape("__")), "", desc)
            candidates = self.candidate_generator(gen_desc)
            if ex_check:
                # BUG FIX: the original compiled this regex unconditionally,
                # raising TypeError ("|".join(None)) when ex_check was None.
                reg = re.compile("(" + "|".join(ex_check) + ")")
                merged = [cand for cand in candidates[0] + hold if not reg.search(cand)]
            else:
                merged = candidates[0] + hold
            candidates = (merged, desc)

        #backup load function, will refactor
        nlp = spacy.load("en_core_web_md")

        #check for existing games and duplicates
        #transform function from https://stackoverflow.com/questions/42165779/python-how-to-remove-duplicate-valuescase-insensitive-from-a-list-with-same-o
        def transform(L):
            S = set(L)
            return [item.title() for item in L if item.lower() not in S and not S.add(item.lower())]

        # Keep only titles that do not already exist in the game data frame.
        clean_cand_step = list(set([game[0] for game in list(zip(candidates[0], [len(self.game_df[self.game_df.name.isin([x])]) for x in candidates[0]])) if game[1] == 0]))
        clean_cand_step = transform(clean_cand_step)

        # Normalize title-cased minor words ("And"/"Of"/"'S") and strip
        # second/third-edition suffixes.
        clean_cand_step = [re.sub(re.compile("(?<=[ ])And(?=[ ])"), 'and',
                           re.sub(re.compile(r'(?<=\S) (([(]|\b)[Ss]econd [Ee]dition([)]|\b)|[Ss]econd [Ee]dition|2[Nn][Dd] [Ee]dition|([(]|\b)[Tt]hird [Ee]dition([)]|\b)|3[Rr][Dd] [Ee]dition)|["]Second Edition["]'), "",
                           re.sub(re.compile("(?<=[a-z])'S"), "'s",
                           re.sub(re.compile("(?<=[ ])Of(?=[ ])"), "of", x))))
                           for x in clean_cand_step]

        # Collapse self-referential "X: X" titles down to "X".
        clean_cand = []
        for cand in clean_cand_step:
            try:
                inter = cand.split(":")
                if inter[0].lower() == inter[1].lower():
                    clean_cand.append(inter[0])
                else:
                    clean_cand.append(cand)
            except IndexError:
                # No ":" in the title (original used a bare except).
                clean_cand.append(cand)

        #text processing
        token_cand = doc_text_preprocessing(pd.Series(clean_cand))
        token_art = doc_text_preprocessing(pd.Series([candidates[1]]))
        sim = [nlp(title) for title in [" ".join(title) for title in token_cand]]
        doc = nlp(" ".join(token_art[0]))

        #scores cosine similarity between generated titles and body text; if a
        #word is unknown (the generator knows it but spaCy doesn't) a random
        #probability is assigned to populate the score
        scores = [x if x != 0 else random.uniform(.3, .7) for x in [tok.similarity(doc) for tok in sim]]

        out_titles = sorted(list(zip(clean_cand, scores)), key=itemgetter(1), reverse=True)

        # Final description cleanup: insert missing space after sentence
        # punctuation, remove publisher boilerplate lead-ins, and simplify
        # awkward "__" constructions left by title masking.
        pat = re.compile(r"(?<=[!.?])(?=[^\s])")
        pat2 = re.compile("([Ff]rom the [Pp]ublisher[: ]|[Ff]rom the [Dd]esigner[: ]|[Gg]ame [Dd]escription)")
        pat3 = re.compile(": [Tt]he [Gg]ame: [Tt]he [Gg]ame|: [Tt]he [Gg]ame")
        pat4 = re.compile("[Tt]he __")
        pat5 = re.compile("__ [Gg]ame")
        pat6 = re.compile("[Tt]he [Gg]ame [Oo]f __")

        desc = re.sub(pat, " ", candidates[1])
        desc = re.sub(pat2, "", desc)
        desc = re.sub(pat3, "", desc)
        desc = re.sub(pat4, "__", desc)
        desc = re.sub(pat5, "__", desc)
        desc = re.sub(pat6, "__", desc)

        return {'text': desc, 'titles': out_titles}