Spaces:
Runtime error
Runtime error
Nick Canu
committed on
Commit
·
b0829c1
1
Parent(s):
394d881
add app
Browse files- .gitattributes +2 -33
- .gitignore +1 -0
- .streamlit/config.toml +6 -0
- .vscode/launch.json +16 -0
- Home.py +348 -0
- Model_Constants_Template.py +7 -0
- Model_Step_Data/slim_df.parquet.gzip +3 -0
- Model_Step_Data/vector_df.parquet.gzip +3 -0
- Persistent Objects/current_keys.gz +0 -0
- Persistent Objects/token_search.gz +0 -0
- README.md +48 -13
- Stream_to_Output/GameCleaner.py +144 -0
- Stream_to_Output/requirements.txt +6 -0
- __pycache__/Model_Constants.cpython-39.pyc +0 -0
- __pycache__/description_generator.cpython-39.pyc +0 -0
- __pycache__/title_generator.cpython-39.pyc +0 -0
- description_generator.py +120 -0
- requirements.txt +11 -0
- t5_model/config.json +60 -0
- t5_model/generation_config.json +7 -0
- t5_model/pytorch_model.bin +3 -0
- t5_model/special_tokens_map.json +107 -0
- t5_model/spiece.model +0 -0
- t5_model/tokenizer_config.json +114 -0
- title_generator.py +148 -0
.gitattributes
CHANGED
@@ -1,34 +1,3 @@
|
|
1 |
-
*.
|
2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
-
*.
|
23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
29 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
30 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
31 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
32 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
1 |
+
*.gzip filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
*.pkl filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
Model_Constants.py
|
.streamlit/config.toml
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[theme]
|
2 |
+
primaryColor="#e76020"
|
3 |
+
backgroundColor="#FDFFFC"
|
4 |
+
secondaryBackgroundColor="#6E896A"
|
5 |
+
textColor="#0f0f0d"
|
6 |
+
font="monospace"
|
.vscode/launch.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
// Use IntelliSense to learn about possible attributes.
|
3 |
+
// Hover to view descriptions of existing attributes.
|
4 |
+
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
|
5 |
+
"version": "0.2.0",
|
6 |
+
"configurations": [
|
7 |
+
{
|
8 |
+
"name": "Python: Module",
|
9 |
+
"type": "python",
|
10 |
+
"request": "launch",
|
11 |
+
"module": "streamlit",
|
12 |
+
"args": ["run", "Home.py"],
|
13 |
+
"justMyCode": true
|
14 |
+
}
|
15 |
+
]
|
16 |
+
}
|
Home.py
ADDED
@@ -0,0 +1,348 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
|
3 |
+
st.set_page_config(page_title='Auto-BG: The Game Concept Generator', layout='wide')
|
4 |
+
|
5 |
+
def application():
|
6 |
+
###Imports
|
7 |
+
import pandas as pd
|
8 |
+
import numpy as np
|
9 |
+
import re
|
10 |
+
import urllib
|
11 |
+
import pickle
|
12 |
+
import spacy
|
13 |
+
from spacy.tokens import DocBin
|
14 |
+
from title_generator import Title_Generator
|
15 |
+
import gzip
|
16 |
+
import io
|
17 |
+
from description_generator import input_manager, model_control
|
18 |
+
|
19 |
+
#UI Session Variables
|
20 |
+
if 'desc_iter' not in st.session_state:
|
21 |
+
st.session_state.desc_iter = 0
|
22 |
+
if 'title_iter' not in st.session_state:
|
23 |
+
st.session_state.title_iter = 0
|
24 |
+
if 'output_dict' not in st.session_state:
|
25 |
+
st.session_state.output_dict = {}
|
26 |
+
if 'inputs' not in st.session_state:
|
27 |
+
st.session_state.inputs = []
|
28 |
+
if 'cur_pair' not in st.session_state:
|
29 |
+
st.session_state.cur_pair = ("","Run me!")
|
30 |
+
if 'f_d' not in st.session_state:
|
31 |
+
st.session_state.f_d = None
|
32 |
+
if 'g_d' not in st.session_state:
|
33 |
+
st.session_state.g_d = None
|
34 |
+
if 'm_d' not in st.session_state:
|
35 |
+
st.session_state.m_d = None
|
36 |
+
if 'c_d' not in st.session_state:
|
37 |
+
st.session_state.c_d = None
|
38 |
+
if 'coop_d' not in st.session_state:
|
39 |
+
st.session_state.coop_d = 0
|
40 |
+
|
41 |
+
#non-ui helper functions
|
42 |
+
#reader code extended from https://gist.github.com/thearn/5424244 for alternate load format
|
43 |
+
def reader(url):
    """Download a gzip-compressed pickle from `url` and return the unpickled object.

    Extended from https://gist.github.com/thearn/5424244 for the alternate
    load format used by the app's persistent objects.

    NOTE(security): pickle.loads on remote bytes executes arbitrary code if the
    source is ever compromised — only use with trusted, pinned URLs.
    """
    # `import urllib` alone does not guarantee the `request` submodule is
    # loaded; import it explicitly so this helper is self-sufficient.
    import urllib.request

    with urllib.request.urlopen(url) as resp:
        url_file = io.BytesIO(resp.read())
    # Context manager closes the GzipFile even if pickle.loads raises
    # (the original leaked the handle on error).
    with gzip.GzipFile(fileobj=url_file) as f:
        return pickle.loads(f.read())
|
50 |
+
|
51 |
+
def token_expand(url):
    """Fetch a gzip-compressed, pickled spaCy DocBin from `url` and split its
    docs into the four tag-class groups used for unknown-input search.

    Returns a tuple of doc lists: (game_type, mechanic, category, family).
    The slice boundaries mirror the layout written by GameCleaner.py —
    presumably fixed for the shipped token_search.gz; verify if data changes.
    """
    # Explicit submodule import: file-level `import urllib` does not
    # guarantee urllib.request is available.
    import urllib.request

    nlp = spacy.blank("en")
    # Close both the HTTP response and the gzip stream deterministically
    # (the original left the urlopen handle open).
    with urllib.request.urlopen(url) as resp, gzip.GzipFile(fileobj=resp) as f:
        # NOTE(security): pickle on remote bytes — trusted source required.
        obj = pickle.loads(f.read())
    doc_bin = DocBin().from_bytes(obj)
    docs = list(doc_bin.get_docs(nlp.vocab))
    return (docs[1:9], docs[9:192], docs[192:276], docs[276:3901])
|
61 |
+
|
62 |
+
def revert_cats(gt, mec, cat, fam, coop):
    """Re-attach the one-hot column prefixes to user-selected tag names and
    flatten everything into the single key list the pipeline expects.

    `fam` entries equal to "Game: [redacted]" are dropped; `coop == 1` adds
    the cooperative flag plus its mechanic tag.
    """
    keys = ["game_type_" + g for g in gt]
    keys += ["mechanic_" + m for m in mec]
    keys += ["category_" + c for c in cat]
    keys += ["family_" + f for f in fam if f != "Game: [redacted]"]
    if coop == 1:
        keys += ["cooperative", "mechanic_Cooperative Game"]
    return keys
|
74 |
+
|
75 |
+
def builder(ip):
    """Run the full generation pipeline for one set of input keys.

    Parses the keys, formats the GPT prompt, then produces three
    description/title candidate sets, reporting progress in the sidebar and
    storing the results in st.session_state.output_dict.
    """
    keys = iman.input_parser(iman.set_input(ip))
    mctrl.prompt_formatter(keys)

    outputs = []
    for idx in np.arange(0, 3):
        raw = mctrl.call_api(status=idx)
        cleaned = mctrl.resp_cleanup(raw)
        candidates = Tgen.candidate_generator(cleaned)
        outputs.append(Tgen.candidate_score(candidates, ex_check))
        st.sidebar.success("Prompt " + str(idx + 1) + " generated!")

    st.session_state.output_dict = {0: outputs[0], 1: outputs[1], 2: outputs[2]}
|
87 |
+
|
88 |
+
|
89 |
+
|
90 |
+
def title_check(next=0):
    """Advance (+1), rewind (-1), or reset (0) the title cursor for the
    currently selected description, wrapping at either end.

    Returns the (title, description) pair with the '__' placeholder in the
    description text replaced by the current title.
    """
    entry = st.session_state.output_dict[st.session_state.desc_iter]
    count = len(entry['titles'])

    if next == 1 or next == -1:
        # Modular step reproduces the original explicit wrap-around branches.
        st.session_state.title_iter = (st.session_state.title_iter + next) % count
    else:
        st.session_state.title_iter = 0

    cur_title = entry['titles'][st.session_state.title_iter][0]
    # re.sub kept (not str.replace): replacement semantics must match original.
    desc = re.sub(re.compile("__"), cur_title, entry['text'])
    return (cur_title, desc.lstrip())
|
108 |
+
|
109 |
+
def show_title(val):
    """Refresh the displayed (title, description) pair after a cursor move."""
    st.session_state.cur_pair = title_check(next=val)

def PT_button_clicked():
    """'See Previous Title' callback."""
    show_title(-1)

def NT_button_clicked():
    """'See Next Title' callback."""
    show_title(1)

def PD_button_clicked():
    """'See Previous Description' callback: step back, wrapping 0 -> 2, and
    reset the title cursor for the newly selected description."""
    if st.session_state.desc_iter == 0:
        st.session_state.desc_iter = 2
    else:
        st.session_state.desc_iter -= 1
    st.session_state.title_iter = 0
    show_title(0)

def ND_button_clicked():
    """'See Next Description' callback: step forward, wrapping 2 -> 0, and
    reset the title cursor for the newly selected description."""
    if st.session_state.desc_iter == 2:
        st.session_state.desc_iter = 0
    else:
        st.session_state.desc_iter += 1
    st.session_state.title_iter = 0
    show_title(0)
|
136 |
+
|
137 |
+
|
138 |
+
|
139 |
+
###Variables
|
140 |
+
|
141 |
+
###Data
|
142 |
+
@st.cache_resource
def fetch_data():
    """Download and cache (via st.cache_resource) the app's data artifacts.

    Returns a 5-tuple: slim metadata frame, spaCy search tokens, one-hot
    vector frame, category key lists, and the cooperative-flag domain [1, 0].
    """
    # Parquet frames served straight from the repo (?raw=true returns bytes).
    slim_df = pd.read_parquet('https://github.com/canunj/Auto-BoardGame/blob/main/Model_Step_Data/slim_df.parquet.gzip?raw=true')
    vector_df = pd.read_parquet('https://github.com/canunj/Auto-BoardGame/blob/main/Model_Step_Data/vector_df.parquet.gzip?raw=true')
    # Pickled artifacts go through the gzip helpers defined above.
    search_tokens = token_expand("https://github.com/canunj/Auto-BoardGame/blob/main/Persistent%20Objects/token_search.gz?raw=true")
    category_keys = reader("https://github.com/canunj/Auto-BoardGame/blob/main/Persistent%20Objects/current_keys.gz?raw=true")
    st.sidebar.success("Fetched Data!")
    return slim_df, search_tokens, vector_df, category_keys, [1, 0]
|
151 |
+
|
152 |
+
slim_df, search_tokens, vector_df, category_keys, coop = fetch_data()
|
153 |
+
|
154 |
+
ex_check = ["[Ee]verquest","[Cc]ivilization [Ii][IiVv]","[Cc]ivilization(?=:)","[Cc]ivilization [Ii][Ii]",
|
155 |
+
"[Cc]ivilization [Ii][Ii][Ii]","[Cc]ivilization V","[Aa]ge [Oo]f [Ee]mpires [Ii][Ii2]([Ii]|\b)", "[Rr]avenloft|[Cc]astle [Rr]avenloft",
|
156 |
+
"[Ss]cythe(?=:|\b)","[Dd]ungeons [&Aa][ n][Dd ][ Ddr][Ddra][rg][oa][gn][os](ns|\b)",
|
157 |
+
"[Aa]ge [Oo]f [Ee]mpires [Ii][Ii]: [Tt]he [Aa]ge [Oo]f [Kk]ings","[Aa]ge [Oo]f [Ee]mpires 2: [Tt]he [Aa]ge [Oo]f [Kk]ings",
|
158 |
+
"[Aa]ge [Oo]f [Ee]mpires","Doctor Who"]
|
159 |
+
|
160 |
+
###Models
|
161 |
+
@st.cache_resource
|
162 |
+
def setup_models():
|
163 |
+
return Title_Generator('./t5_model', slim_df), input_manager(vector_df, slim_df, search_tokens), model_control(apikey=st.secrets.key,model_id=st.secrets.model)
|
164 |
+
|
165 |
+
Tgen, iman, mctrl = setup_models()
|
166 |
+
|
167 |
+
|
168 |
+
|
169 |
+
#UI
|
170 |
+
|
171 |
+
#Intro
|
172 |
+
st.title("""Auto-BG: The Game Concept Generator""")
|
173 |
+
|
174 |
+
with st.expander("How to use", expanded=True):
|
175 |
+
st.write(
|
176 |
+
"""
|
177 |
+
Discover the concept for your next favorite game!
|
178 |
+
|
179 |
+
How do you use Auto-BG?
|
180 |
+
|
181 |
+
Pick any set of tags from four selectors below: Family, Game, Mechanic, and Category.
|
182 |
+
If you are looking to lose together - activate the cooperative toggle.
|
183 |
+
|
184 |
+
See ? icons for detailed information on each type of tag.
|
185 |
+
|
186 |
+
Select any pre-configured demo below to see how Auto-BG works on the tag set for a popular board game.
|
187 |
+
"""
|
188 |
+
)
|
189 |
+
|
190 |
+
results = st.empty()
|
191 |
+
|
192 |
+
with st.expander('Demos'):
|
193 |
+
|
194 |
+
st.write("""These buttons run Auto-BG on the tag set for real games you might be familiar with,
|
195 |
+
choose a button and the corresponding tags automatically fill the selectors below.
|
196 |
+
Press run and see how Auto-BG creates an alternate concept for these hit titles!
|
197 |
+
""")
|
198 |
+
|
199 |
+
b1, b2, b3 = st.columns(3)
|
200 |
+
|
201 |
+
with b1:
|
202 |
+
SoC = st.button('Catan', use_container_width=True)
|
203 |
+
if SoC:
|
204 |
+
st.session_state.f_d = [
|
205 |
+
'Animals: Sheep',
|
206 |
+
'Components: Hexagonal Tiles',
|
207 |
+
'Components: Wooden pieces & boards'
|
208 |
+
]
|
209 |
+
st.session_state.g_d = ['Family Game', 'Strategy Game']
|
210 |
+
st.session_state.m_d = [
|
211 |
+
'Hexagon Grid',
|
212 |
+
'Network and Route Building',
|
213 |
+
'Random Production',
|
214 |
+
'Trading',
|
215 |
+
'Variable Set-up'
|
216 |
+
]
|
217 |
+
st.session_state.c_d = [
|
218 |
+
'Economic',
|
219 |
+
'Negotiation'
|
220 |
+
]
|
221 |
+
st.session_state.coop_d = 0
|
222 |
+
|
223 |
+
with b2:
|
224 |
+
TtR = st.button('Ticket to Ride', use_container_width=True)
|
225 |
+
if TtR:
|
226 |
+
st.session_state.f_d = [
|
227 |
+
'Components: Map (Continental / National scale)',
|
228 |
+
'Continents: North America',
|
229 |
+
'Country: USA'
|
230 |
+
]
|
231 |
+
st.session_state.g_d = ['Family Game']
|
232 |
+
st.session_state.m_d = [
|
233 |
+
'Contracts',
|
234 |
+
'End Game Bonuses',
|
235 |
+
'Network and Route Building',
|
236 |
+
'Push Your Luck',
|
237 |
+
'Set Collection'
|
238 |
+
]
|
239 |
+
st.session_state.c_d = [
|
240 |
+
'Trains'
|
241 |
+
]
|
242 |
+
st.session_state.coop_d = 0
|
243 |
+
|
244 |
+
with b3:
|
245 |
+
P = st.button('Pandemic', use_container_width=True)
|
246 |
+
if P:
|
247 |
+
st.session_state.f_d = [
|
248 |
+
'Components: Map (Global Scale)',
|
249 |
+
'Components: Multi-Use Cards',
|
250 |
+
'Medical: Diseases',
|
251 |
+
'Region: The World',
|
252 |
+
'Theme: Science'
|
253 |
+
]
|
254 |
+
st.session_state.g_d = ['Family Game', 'Strategy Game']
|
255 |
+
st.session_state.m_d = [
|
256 |
+
'Action Points',
|
257 |
+
'Point to Point Movement',
|
258 |
+
'Trading',
|
259 |
+
'Variable Player Powers'
|
260 |
+
]
|
261 |
+
st.session_state.c_d = [
|
262 |
+
'Medical'
|
263 |
+
]
|
264 |
+
st.session_state.coop_d = 1
|
265 |
+
|
266 |
+
#Form
|
267 |
+
with st.expander("Auto-BG", expanded=True):
|
268 |
+
|
269 |
+
col1, col2 = st.columns(2)
|
270 |
+
|
271 |
+
with col1:
|
272 |
+
Family_v = st.multiselect("Family", options=pd.Series(category_keys[4][8:]), key='Family', default=st.session_state.f_d, max_selections=6, help='Descriptive niches for groupings of games.\n Maximum of six choices.')
|
273 |
+
|
274 |
+
with col2:
|
275 |
+
Game_v = st.multiselect("Game", options=pd.Series(category_keys[1]), key='Game', default=st.session_state.g_d, max_selections=2, help='Top level genres - Family, Strategy, etc.\n Maximum of two choices.')
|
276 |
+
|
277 |
+
col3, col4 = st.columns(2)
|
278 |
+
|
279 |
+
with col3:
|
280 |
+
Category_v = st.multiselect("Category", options=pd.Series(category_keys[3]), key='Category', default=st.session_state.c_d, max_selections=3, help='Expanded genre tags.\n Maximum of three choices.')
|
281 |
+
|
282 |
+
with col4:
|
283 |
+
Mechanics_v = st.multiselect("Mechanics", options=pd.Series([x for x in category_keys[2] if x != "Cooperative Game"]), key='Mechanic', default=st.session_state.m_d, max_selections=5, help='Game rules!\n Maximum of five choices.')
|
284 |
+
|
285 |
+
Cooperative_v = st.checkbox('Cooperative?', value=st.session_state.coop_d, key='CoopCheck')
|
286 |
+
|
287 |
+
run = st.button("Run Model", use_container_width=True)
|
288 |
+
|
289 |
+
if run:
|
290 |
+
if st.session_state.inputs == revert_cats(Game_v, Mechanics_v, Category_v, Family_v, Cooperative_v):
|
291 |
+
st.write('Inputs did not change, results currently loaded.')
|
292 |
+
else:
|
293 |
+
|
294 |
+
st.session_state.desc_iter = 0
|
295 |
+
st.session_state.title_iter = 0
|
296 |
+
st.session_state.output_dict = {}
|
297 |
+
|
298 |
+
if Cooperative_v == True:
|
299 |
+
Mechanics_v.append('Cooperative Game')
|
300 |
+
|
301 |
+
st.session_state.inputs = revert_cats(Game_v, Mechanics_v, Category_v, Family_v, Cooperative_v)
|
302 |
+
builder(st.session_state.inputs)
|
303 |
+
st.session_state.cur_pair = title_check()
|
304 |
+
|
305 |
+
if st.session_state.output_dict == {}:
|
306 |
+
results.empty()
|
307 |
+
else:
|
308 |
+
with results.expander('Results', expanded=True):
|
309 |
+
|
310 |
+
st.write(
|
311 |
+
"""
|
312 |
+
#### Title:
|
313 |
+
""")
|
314 |
+
|
315 |
+
|
316 |
+
|
317 |
+
st.write(st.session_state.cur_pair[0])
|
318 |
+
|
319 |
+
|
320 |
+
t_col1, t_col2 = st.columns(2)
|
321 |
+
with t_col1:
|
322 |
+
st.button("See Previous Title", on_click=PT_button_clicked, use_container_width=True)
|
323 |
+
|
324 |
+
with t_col2:
|
325 |
+
st.button("See Next Title", on_click=NT_button_clicked, use_container_width=True)
|
326 |
+
|
327 |
+
st.write(
|
328 |
+
"""
|
329 |
+
#### Description:
|
330 |
+
""")
|
331 |
+
st.write(st.session_state.cur_pair[1].replace('$','\$'))
|
332 |
+
|
333 |
+
d_col1, d_col2 = st.columns(2)
|
334 |
+
with d_col1:
|
335 |
+
st.button("See Previous Description", on_click=PD_button_clicked, use_container_width=True)
|
336 |
+
|
337 |
+
with d_col2:
|
338 |
+
st.button("See Next Description", on_click=ND_button_clicked, use_container_width=True)
|
339 |
+
|
340 |
+
|
341 |
+
|
342 |
+
page_names_to_funcs = {
|
343 |
+
"Application": application
|
344 |
+
}
|
345 |
+
|
346 |
+
demo_name = st.sidebar.selectbox("Choose a page:", page_names_to_funcs.keys())
|
347 |
+
page_names_to_funcs[demo_name]()
|
348 |
+
|
Model_Constants_Template.py
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
def SEND_KEY():
    """Return the OpenAI API key (template placeholder — fill in locally)."""
    return ""

def SEND_MODEL():
    """Return the fine-tuned OpenAI model id (template placeholder)."""
    return ""
|
Model_Step_Data/slim_df.parquet.gzip
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8eb032341c8bacc24ffee96e2a1b3201a0ab6c2837567956ba1ddb9492e056dc
|
3 |
+
size 16243764
|
Model_Step_Data/vector_df.parquet.gzip
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:eaf463f341982a460862da6ee77bbed38ad92ad36c4aef10bc031828681ef83f
|
3 |
+
size 3803902
|
Persistent Objects/current_keys.gz
ADDED
Binary file (39.7 kB). View file
|
|
Persistent Objects/token_search.gz
ADDED
Binary file (144 kB). View file
|
|
README.md
CHANGED
@@ -1,13 +1,48 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[icon banner image placeholder]
|
2 |
+
|
3 |
+
# Auto-BG
|
4 |
+
LLM-based text generation tool for creating board game concepts (description & title)
|
5 |
+
|
6 |
+
The Auto-BG (Board Game) tool is a text generation tool for creating board game concepts. It utilizes multiple large-language models to generate board game titles and descriptions tailored from user-input tags based on BoardGameGeek.com. The models used in this project include a trained T5 sequence-to-sequence model, primarily for title generation, and a robust GPT3 model for board game description generation. The T5 model was initially presented by Raffel et al. in ["Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer"](https://arxiv.org/pdf/1910.10683.pdf). The GPT3 model builds from Brown et al.'s work in ["Language Models are Few-Shot Learners"](https://arxiv.org/pdf/2005.14165.pdf).
|
7 |
+
|
8 |
+
|
9 |
+
## Table of Contents
|
10 |
+
- Features and Demo
|
11 |
+
- Examples
|
12 |
+
- Project Structure
|
13 |
+
- Customizing Auto-BG
|
14 |
+
- Citations and Licensing
|
15 |
+
|
16 |
+
## Features and Demo
|
17 |
+
The main features of this application include:
|
18 |
+
|
19 |
+
A user-friendly interface for Auto-BG can be found at (homepage).
|
20 |
+
|
21 |
+
## Examples
|
22 |
+
|
23 |
+
## Project Structure
|
24 |
+
|
25 |
+
## Customizing Auto-BG
|
26 |
+
NOTE: Auto-BG uses a fine-tuned GPT-3 Curie model that will be inaccessible without an organizational API key,
|
27 |
+
the below instructions are for advanced users interested in remixing Auto-BG with a new generator model.
|
28 |
+
|
29 |
+
In order to run this application, you will need the following:
|
30 |
+
1. An OpenAI account and API key
|
31 |
+
2. All libraries specified in both the primary and data processing requirements.txt files
|
32 |
+
3. A raw stream JSON file of BoardGameGeek data, formatted to match output from the Recommend.Games scraper
|
33 |
+
|
34 |
+
To implement a new instance of Auto-BG, follow these steps:
|
35 |
+
1. Clone the repository onto your local machine
|
36 |
+
2. Install the required packages listed in both 'requirements.txt' files using pip
|
37 |
+
3. Download the trained T5 model or provide a path to an alternate T5 model.
|
38 |
+
4. Placing the JSON data file in Stream_to_Output, run GameCleaner.py - this provides all required data files.
|
39 |
+
|
40 |
+
5. Prepare training prompts - convert all active keys to period stopped tokens in a string for each game.
|
41 |
+
6. Fine-tune a selected model following the instructions at: https://platform.openai.com/docs/guides/fine-tuning
|
42 |
+
NOTE: Auto-BG uses a Curie model with a lowered learning rate running for fewer epochs.
|
43 |
+
|
44 |
+
7. Create a Model_Constants.py file with your personal API key and model instance based on the template above.
|
45 |
+
8. You now have a customized instance of Auto-BG!
|
46 |
+
|
47 |
+
## Citations and Licensing
|
48 |
+
Auto-BG is licensed under CC BY-NC-SA 2.0, original data sourced from Recommend.Games @GitLab
|
Stream_to_Output/GameCleaner.py
ADDED
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import numpy as np
|
3 |
+
import re
|
4 |
+
import nltk
|
5 |
+
from nltk.corpus import stopwords
|
6 |
+
from gensim.parsing import preprocess_string, strip_tags, strip_numeric, strip_multiple_whitespaces, stem_text, strip_punctuation, remove_stopwords
|
7 |
+
import spacy
|
8 |
+
from langdetect import detect
|
9 |
+
import pickle
|
10 |
+
import gzip
|
11 |
+
nltk.download('stopwords')
|
12 |
+
|
13 |
+
#function definitions
|
14 |
+
|
15 |
+
#strips values out of encoded stream lists
|
16 |
+
def text_col_cleaner(frame, cols, pattern):
    """Strip the value portion out of encoded stream lists.

    For every column in `cols`, each cell (a list of "Label: value" strings)
    is mapped to the list of labels — the first regex match per entry,
    whitespace-trimmed. NaN cells are left untouched via na_action='ignore'.
    """
    compiled = re.compile(pattern)

    def _labels(cell):
        return [re.findall(compiled, entry)[0].strip() for entry in cell]

    for name in cols:
        frame[name] = frame[name].map(_labels, na_action='ignore')
    return frame
|
23 |
+
|
24 |
+
#converts specified columns to one-hot
|
25 |
+
def encode_columns(frame):
    """One-hot encode every (list-valued) column of `frame`.

    Each column's lists are exploded, dummied with the column name as prefix,
    summed back per original row index, and concatenated onto the frame
    (original columns are kept alongside the indicators).
    """
    for col in list(frame.columns):
        exploded = frame[col].apply(pd.Series).stack()
        indicators = pd.get_dummies(exploded, prefix=col).groupby(level=0).sum()
        frame = pd.concat([frame, indicators], axis=1)
    return frame
|
31 |
+
|
32 |
+
#custom text processor for tokenizing descriptions by Kuan Chen & Nick Canu
|
33 |
+
def doc_text_preprocessing(ser):
    """Tokenize a Series of descriptions for language detection / modeling.

    Lemmatizes with spaCy, runs the gensim cleanup filters, then drops
    English stopwords plus domain-specific ones. Custom text processor by
    Kuan Chen & Nick Canu.
    """
    nlp = spacy.load("en_core_web_sm", exclude=['parser', 'ner', 'textcat'])

    stop_words = set(stopwords.words('english'))
    stop_words.update(['game', 'player', 'players', 'games', 'also',
                       'description', 'publisher'])

    drop_single_letters = lambda c: re.sub("\s+\w{1}\s+|\n|-|—", '', c)
    to_lower = lambda c: c.lower()
    filters = [remove_stopwords, strip_numeric, strip_punctuation, strip_tags,
               strip_multiple_whitespaces, drop_single_letters, to_lower]

    lemma_text = []
    for doc in ser.apply(lambda x: nlp(x)):
        lemmas = ' '.join(token.lemma_ for token in doc)
        lemma_text.append(preprocess_string(lemmas, filters))

    # Stopword pass runs after gensim filtering, matching the original order.
    return [[word for word in toks if word not in stop_words] for toks in lemma_text]
|
53 |
+
|
54 |
+
#performs english language detection on the descriptions w/langdetect then additionally drops games using non-english characters in the name
|
55 |
+
def lang_cleanup(frame):
    """English-language filtering for the scraped game data.

    Fills missing descriptions, runs langdetect on the cleaned/tokenized
    descriptions, keeps rows detected as English, then additionally drops
    games whose names contain non-ASCII characters.
    """
    nlp = spacy.load("en_core_web_sm")  # kept for parity with original (unused here)
    frame['description'] = frame['description'].fillna('no words')
    frame = frame[frame['description'] != 'no words']
    frame['cleaned_descriptions'] = doc_text_preprocessing(frame['description'])

    # langdetect wants plain text, so re-join the token lists per row.
    frame['lang'] = [detect(', '.join(words)) for words in frame.cleaned_descriptions]
    frame = frame[frame['lang'] == 'en']

    ascii_names = ~frame['name'].str.contains('[^\x00-\x7f]', flags=re.IGNORECASE)
    return frame[ascii_names]
|
70 |
+
|
71 |
+
|
72 |
+
#column name stripper for creating key values
|
73 |
+
def column_fixer(frame, targ):
    """Strip the `targ` prefix (and surrounding double quotes) from every
    column name starting with it, yielding the bare key values."""
    cleaned = []
    for col in frame.columns:
        if col.startswith(targ):
            cleaned.append(col.replace(targ, "").strip('"'))
    return cleaned
|
75 |
+
|
76 |
+
#creates key list for defining web app lists & nlp tokens of the same unknown input search
|
77 |
+
def key_collator(frame):
    """Build the two key artifacts consumed by the web app.

    Returns (current_keys, search_tokens):
    - current_keys: string key lists per feature class, with the fixed
      ['cooperative'] entry first;
    - search_tokens: the same lists run through spaCy, used for
      unknown-input similarity search.
    """
    nlp = spacy.load("en_core_web_sm")
    fam = column_fixer(frame, 'family_')
    gt = column_fixer(frame, 'game_type_')
    mec = column_fixer(frame, 'mechanic_')
    cat = column_fixer(frame, 'category_')

    current_keys = (['cooperative'], gt, mec, cat, fam)
    search_tokens = tuple([nlp(w) for w in group] for group in (gt, mec, cat, fam))

    return current_keys, search_tokens
|
94 |
+
|
95 |
+
|
96 |
+
#-----------
|
97 |
+
|
98 |
+
#reading in raw file & removing unranked and compilation game items
|
99 |
+
df = pd.read_json(r'./bgg_GameItem.jl', lines=True)
|
100 |
+
df['rank'] = df['rank'].fillna(0).astype(int)
|
101 |
+
df = df[(df['rank']>0) & (df['compilation']!=1)]
|
102 |
+
|
103 |
+
#separating and cleaning the one-hot target columns
|
104 |
+
in_df = text_col_cleaner(frame = df[['game_type','mechanic','category','family']],
|
105 |
+
cols = ['game_type','mechanic','category','family'],
|
106 |
+
pattern = re.compile("([\S ]+)(?=:)"))
|
107 |
+
|
108 |
+
print('Text has been cleaned, now encoding one-hot columns')
|
109 |
+
|
110 |
+
#encoding one-hot columns and rejoining to features for output
|
111 |
+
proc_df = encode_columns(in_df)
|
112 |
+
step = df[['name','description','cooperative']]
|
113 |
+
join_df = pd.concat([step,proc_df.drop(['game_type','mechanic','category','family',
|
114 |
+
'game_type_Amiga','game_type_Arcade','game_type_Atari ST',
|
115 |
+
'game_type_Commodore 64'],axis=1)],axis=1)
|
116 |
+
|
117 |
+
print('Columns encoded, now performing english language detection and cleanup')
|
118 |
+
|
119 |
+
#english language detection steps & first data save
|
120 |
+
eng_df = lang_cleanup(join_df)
|
121 |
+
eng_df = eng_df.loc[:,~eng_df.columns.duplicated()].copy().reset_index(drop=True).fillna(0)
|
122 |
+
|
123 |
+
print('Creating vector-only dataframe & saving output')
|
124 |
+
|
125 |
+
#vector only data for operations
|
126 |
+
vector_df = eng_df.copy().drop(['name','description','cleaned_descriptions','lang'],axis=1)
|
127 |
+
|
128 |
+
eng_df.to_parquet('game_data.parquet.gzip',compression='gzip')
|
129 |
+
vector_df.to_parquet('game_vectors.parquet.gzip',compression='gzip')
|
130 |
+
|
131 |
+
print('Creating key lists')
|
132 |
+
|
133 |
+
#creating key lists - 1. string list of values by feature class for defining input selections & 2. nlp processed list for unknown input search
|
134 |
+
keys, search_toks = key_collator(vector_df)
|
135 |
+
|
136 |
+
with gzip.open("current_keys.gz", "wb") as f:
|
137 |
+
pickle.dump(keys, f)
|
138 |
+
f.close()
|
139 |
+
|
140 |
+
with gzip.open("key_search_tokens.gz", "wb") as f:
|
141 |
+
pickle.dump(search_toks, f)
|
142 |
+
f.close()
|
143 |
+
|
144 |
+
print('File creation is complete')
|
Stream_to_Output/requirements.txt
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
gensim==4.3.1
|
2 |
+
langdetect==1.0.9
|
3 |
+
nltk==3.8.1
|
4 |
+
numpy==1.24.2
|
5 |
+
pandas==1.3.2
|
6 |
+
spacy==3.5.1
|
__pycache__/Model_Constants.cpython-39.pyc
ADDED
Binary file (457 Bytes). View file
|
|
__pycache__/description_generator.cpython-39.pyc
ADDED
Binary file (4.62 kB). View file
|
|
__pycache__/title_generator.cpython-39.pyc
ADDED
Binary file (6.8 kB). View file
|
|
description_generator.py
ADDED
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import numpy as np
|
3 |
+
import re
|
4 |
+
import spacy
|
5 |
+
import openai
|
6 |
+
from operator import itemgetter
|
7 |
+
#user input manager class
|
8 |
+
class input_manager:
    """Translates user tag selections into the one-hot key vector the
    description pipeline consumes.

    Unknown tags are resolved to the most similar known tag within the same
    feature class using spaCy vector similarity.
    """

    # Tag prefix -> index into self.search_tokens, checked in this order.
    _PREFIXES = (("game_type_", 0), ("mechanic_", 1),
                 ("category_", 2), ("family_", 3))

    def __init__(self, key_df, slim_df, search_tokens, top_n=10):
        """Initialize the key dictionary from the vector data frame columns
        and set the community top-N size."""
        self.key_df = key_df
        self.slim_df = slim_df
        self.search_tokens = search_tokens
        # Every known key starts deactivated (0.0).
        self.key = dict(zip(list(key_df.columns), np.zeros(len(key_df.columns))))
        self.top_n = top_n

    def set_input(self, input_cats):
        """Return a copy of the key vector with `input_cats` activated (1.0).

        Categories not present in the key dictionary are matched to the most
        similar known token of their feature class; non-matches (similarity
        <= 0, i.e. the model doesn't recognize the word) are discarded —
        other solutions were performance prohibitive.
        """
        nlp = spacy.load("en_core_web_md")
        # Separate known from unknown features.
        k_flags = [cat for cat in input_cats if cat in self.key]
        unk_flags = [cat for cat in input_cats if cat not in self.key]

        if len(unk_flags) > 0:
            outs = []
            for word in unk_flags:
                for prefix, idx in self._PREFIXES:
                    if re.match(prefix, word):
                        tok = nlp(word.split("_")[-1])
                        mtch = max(((key, key.similarity(tok))
                                    for key in self.search_tokens[idx]),
                                   key=itemgetter(1))
                        if mtch[1] > 0:
                            # BUGFIX: mtch[0] is a spaCy Doc; the original
                            # concatenated it directly to the prefix string
                            # (TypeError) in every branch except family_.
                            outs.append(prefix + str(mtch[0]))
                        break
            # Rejoin nearest matches to the known set, de-duplicated.
            k_flags = list(set(k_flags + outs))

        # Preserve global key; output a copy with input keys set to 1.0.
        d = self.key.copy()
        for cat in k_flags:
            d[cat] = 1.0
        return d

    def input_parser(self, in_vec):
        """Extract the activated key names from a processed key vector."""
        return [k for k, v in in_vec.items() if v == 1]
|
69 |
+
|
70 |
+
class model_control:
    """Wraps a fine-tuned OpenAI completion model for description generation."""

    def __init__(self, apikey, model_id):
        """Store the API key and resolve the fine-tuned model name.

        apikey:   OpenAI API key.
        model_id: fine-tune job id; the resulting model name is retrieved
                  from the API.
        """
        self.api_key = apikey
        openai.api_key = self.api_key

        # Built later by prompt_formatter().
        self.prompt = None

        self.model = openai.FineTune.retrieve(id=model_id).fine_tuned_model

    def prompt_formatter(self, ks):
        """Join the activated keys into the fine-tune prompt format."""
        self.prompt = ". ".join(ks) + "\n\n###\n\n"

    def call_api(self, status=0):
        """Call the completion endpoint and return the generated text.

        status selects the sampling parameters: 0 = first attempt,
        1 = conservative retry, 2 = diverse retry.  Any other value falls
        back to the status-0 settings (the original left temp/pres unbound
        and raised UnboundLocalError for unexpected values).
        """
        if status == 1:
            temp, pres = 0.4, 0.6
        elif status == 2:
            temp, pres = 0.5, 0.8
        else:
            # status 0 and any unexpected value.
            temp, pres = 0.5, 0.7

        answer = openai.Completion.create(
            model=self.model,
            prompt=self.prompt,
            max_tokens=512,
            temperature=temp,
            stop=["END"],
            presence_penalty=pres,
            frequency_penalty=0.5
        )
        return answer['choices'][0]['text']

    def resp_cleanup(self, text):
        """Strip a trailing unfinished sentence and credit boilerplate.

        Removes sentences mentioning designers/artists/publishers and the
        doubled punctuation their removal leaves behind.
        """
        # Guard: the original indexed text[-1] and crashed on an empty
        # completion.
        if not text:
            return text

        # Drop a trailing fragment that does not end in sentence punctuation.
        if ((text[-1] != "!") & (text[-1] != ".") & (text[-1] != "?")):
            text = " ".join([e + '.' for e in text.split('.')[0:-1] if e])

        sent = re.split(r'([.?!:])', text)
        phrases = ["[Dd]esigned by", "[Dd]esigner of", "[Aa]rt by", "[Aa]rtist of", "[Pp]ublished", "[Pp]ublisher of"]

        pat = re.compile("(?:" + "|".join(phrases) + ")")
        # Collapses the doubled punctuation left when a credit sentence is
        # removed between two terminators.
        fix = re.compile("(?<=[.!?])[.!?]")

        text = re.sub(fix, '', ''.join([s for s in sent if pat.search(s) == None]))

        return text
requirements.txt
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
gensim==4.3.1
|
2 |
+
langdetect==1.0.9
|
3 |
+
nltk==3.8.1
|
4 |
+
numpy==1.24.2
|
5 |
+
openai==0.27.2
|
6 |
+
pandas==1.3.2
|
7 |
+
scikit_learn==1.2.2
|
8 |
+
spacy==3.5.1
|
9 |
+
streamlit==1.20.0
|
10 |
+
torch==2.0.0
|
11 |
+
transformers==4.27.3
|
t5_model/config.json
ADDED
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "Michau/t5-base-en-generate-headline",
|
3 |
+
"architectures": [
|
4 |
+
"T5ForConditionalGeneration"
|
5 |
+
],
|
6 |
+
"d_ff": 3072,
|
7 |
+
"d_kv": 64,
|
8 |
+
"d_model": 768,
|
9 |
+
"decoder_start_token_id": 0,
|
10 |
+
"dense_act_fn": "relu",
|
11 |
+
"dropout_rate": 0.1,
|
12 |
+
"eos_token_id": 1,
|
13 |
+
"feed_forward_proj": "relu",
|
14 |
+
"initializer_factor": 1.0,
|
15 |
+
"is_encoder_decoder": true,
|
16 |
+
"is_gated_act": false,
|
17 |
+
"layer_norm_epsilon": 1e-06,
|
18 |
+
"model_type": "t5",
|
19 |
+
"n_positions": 512,
|
20 |
+
"num_decoder_layers": 12,
|
21 |
+
"num_heads": 12,
|
22 |
+
"num_layers": 12,
|
23 |
+
"output_past": true,
|
24 |
+
"pad_token_id": 0,
|
25 |
+
"relative_attention_max_distance": 128,
|
26 |
+
"relative_attention_num_buckets": 32,
|
27 |
+
"task_specific_params": {
|
28 |
+
"summarization": {
|
29 |
+
"early_stopping": true,
|
30 |
+
"length_penalty": 2.0,
|
31 |
+
"max_length": 200,
|
32 |
+
"min_length": 30,
|
33 |
+
"no_repeat_ngram_size": 3,
|
34 |
+
"num_beams": 4,
|
35 |
+
"prefix": "summarize: "
|
36 |
+
},
|
37 |
+
"translation_en_to_de": {
|
38 |
+
"early_stopping": true,
|
39 |
+
"max_length": 300,
|
40 |
+
"num_beams": 4,
|
41 |
+
"prefix": "translate English to German: "
|
42 |
+
},
|
43 |
+
"translation_en_to_fr": {
|
44 |
+
"early_stopping": true,
|
45 |
+
"max_length": 300,
|
46 |
+
"num_beams": 4,
|
47 |
+
"prefix": "translate English to French: "
|
48 |
+
},
|
49 |
+
"translation_en_to_ro": {
|
50 |
+
"early_stopping": true,
|
51 |
+
"max_length": 300,
|
52 |
+
"num_beams": 4,
|
53 |
+
"prefix": "translate English to Romanian: "
|
54 |
+
}
|
55 |
+
},
|
56 |
+
"torch_dtype": "float32",
|
57 |
+
"transformers_version": "4.26.1",
|
58 |
+
"use_cache": true,
|
59 |
+
"vocab_size": 32128
|
60 |
+
}
|
t5_model/generation_config.json
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_from_model_config": true,
|
3 |
+
"decoder_start_token_id": 0,
|
4 |
+
"eos_token_id": 1,
|
5 |
+
"pad_token_id": 0,
|
6 |
+
"transformers_version": "4.26.1"
|
7 |
+
}
|
t5_model/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e3f73b04bb3e12b9bd1f02b88f98648da9c317f734a61e9805ae385c1c57671d
|
3 |
+
size 891702929
|
t5_model/special_tokens_map.json
ADDED
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"additional_special_tokens": [
|
3 |
+
"<extra_id_0>",
|
4 |
+
"<extra_id_1>",
|
5 |
+
"<extra_id_2>",
|
6 |
+
"<extra_id_3>",
|
7 |
+
"<extra_id_4>",
|
8 |
+
"<extra_id_5>",
|
9 |
+
"<extra_id_6>",
|
10 |
+
"<extra_id_7>",
|
11 |
+
"<extra_id_8>",
|
12 |
+
"<extra_id_9>",
|
13 |
+
"<extra_id_10>",
|
14 |
+
"<extra_id_11>",
|
15 |
+
"<extra_id_12>",
|
16 |
+
"<extra_id_13>",
|
17 |
+
"<extra_id_14>",
|
18 |
+
"<extra_id_15>",
|
19 |
+
"<extra_id_16>",
|
20 |
+
"<extra_id_17>",
|
21 |
+
"<extra_id_18>",
|
22 |
+
"<extra_id_19>",
|
23 |
+
"<extra_id_20>",
|
24 |
+
"<extra_id_21>",
|
25 |
+
"<extra_id_22>",
|
26 |
+
"<extra_id_23>",
|
27 |
+
"<extra_id_24>",
|
28 |
+
"<extra_id_25>",
|
29 |
+
"<extra_id_26>",
|
30 |
+
"<extra_id_27>",
|
31 |
+
"<extra_id_28>",
|
32 |
+
"<extra_id_29>",
|
33 |
+
"<extra_id_30>",
|
34 |
+
"<extra_id_31>",
|
35 |
+
"<extra_id_32>",
|
36 |
+
"<extra_id_33>",
|
37 |
+
"<extra_id_34>",
|
38 |
+
"<extra_id_35>",
|
39 |
+
"<extra_id_36>",
|
40 |
+
"<extra_id_37>",
|
41 |
+
"<extra_id_38>",
|
42 |
+
"<extra_id_39>",
|
43 |
+
"<extra_id_40>",
|
44 |
+
"<extra_id_41>",
|
45 |
+
"<extra_id_42>",
|
46 |
+
"<extra_id_43>",
|
47 |
+
"<extra_id_44>",
|
48 |
+
"<extra_id_45>",
|
49 |
+
"<extra_id_46>",
|
50 |
+
"<extra_id_47>",
|
51 |
+
"<extra_id_48>",
|
52 |
+
"<extra_id_49>",
|
53 |
+
"<extra_id_50>",
|
54 |
+
"<extra_id_51>",
|
55 |
+
"<extra_id_52>",
|
56 |
+
"<extra_id_53>",
|
57 |
+
"<extra_id_54>",
|
58 |
+
"<extra_id_55>",
|
59 |
+
"<extra_id_56>",
|
60 |
+
"<extra_id_57>",
|
61 |
+
"<extra_id_58>",
|
62 |
+
"<extra_id_59>",
|
63 |
+
"<extra_id_60>",
|
64 |
+
"<extra_id_61>",
|
65 |
+
"<extra_id_62>",
|
66 |
+
"<extra_id_63>",
|
67 |
+
"<extra_id_64>",
|
68 |
+
"<extra_id_65>",
|
69 |
+
"<extra_id_66>",
|
70 |
+
"<extra_id_67>",
|
71 |
+
"<extra_id_68>",
|
72 |
+
"<extra_id_69>",
|
73 |
+
"<extra_id_70>",
|
74 |
+
"<extra_id_71>",
|
75 |
+
"<extra_id_72>",
|
76 |
+
"<extra_id_73>",
|
77 |
+
"<extra_id_74>",
|
78 |
+
"<extra_id_75>",
|
79 |
+
"<extra_id_76>",
|
80 |
+
"<extra_id_77>",
|
81 |
+
"<extra_id_78>",
|
82 |
+
"<extra_id_79>",
|
83 |
+
"<extra_id_80>",
|
84 |
+
"<extra_id_81>",
|
85 |
+
"<extra_id_82>",
|
86 |
+
"<extra_id_83>",
|
87 |
+
"<extra_id_84>",
|
88 |
+
"<extra_id_85>",
|
89 |
+
"<extra_id_86>",
|
90 |
+
"<extra_id_87>",
|
91 |
+
"<extra_id_88>",
|
92 |
+
"<extra_id_89>",
|
93 |
+
"<extra_id_90>",
|
94 |
+
"<extra_id_91>",
|
95 |
+
"<extra_id_92>",
|
96 |
+
"<extra_id_93>",
|
97 |
+
"<extra_id_94>",
|
98 |
+
"<extra_id_95>",
|
99 |
+
"<extra_id_96>",
|
100 |
+
"<extra_id_97>",
|
101 |
+
"<extra_id_98>",
|
102 |
+
"<extra_id_99>"
|
103 |
+
],
|
104 |
+
"eos_token": "</s>",
|
105 |
+
"pad_token": "<pad>",
|
106 |
+
"unk_token": "<unk>"
|
107 |
+
}
|
t5_model/spiece.model
ADDED
Binary file (792 kB). View file
|
|
t5_model/tokenizer_config.json
ADDED
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"additional_special_tokens": [
|
3 |
+
"<extra_id_0>",
|
4 |
+
"<extra_id_1>",
|
5 |
+
"<extra_id_2>",
|
6 |
+
"<extra_id_3>",
|
7 |
+
"<extra_id_4>",
|
8 |
+
"<extra_id_5>",
|
9 |
+
"<extra_id_6>",
|
10 |
+
"<extra_id_7>",
|
11 |
+
"<extra_id_8>",
|
12 |
+
"<extra_id_9>",
|
13 |
+
"<extra_id_10>",
|
14 |
+
"<extra_id_11>",
|
15 |
+
"<extra_id_12>",
|
16 |
+
"<extra_id_13>",
|
17 |
+
"<extra_id_14>",
|
18 |
+
"<extra_id_15>",
|
19 |
+
"<extra_id_16>",
|
20 |
+
"<extra_id_17>",
|
21 |
+
"<extra_id_18>",
|
22 |
+
"<extra_id_19>",
|
23 |
+
"<extra_id_20>",
|
24 |
+
"<extra_id_21>",
|
25 |
+
"<extra_id_22>",
|
26 |
+
"<extra_id_23>",
|
27 |
+
"<extra_id_24>",
|
28 |
+
"<extra_id_25>",
|
29 |
+
"<extra_id_26>",
|
30 |
+
"<extra_id_27>",
|
31 |
+
"<extra_id_28>",
|
32 |
+
"<extra_id_29>",
|
33 |
+
"<extra_id_30>",
|
34 |
+
"<extra_id_31>",
|
35 |
+
"<extra_id_32>",
|
36 |
+
"<extra_id_33>",
|
37 |
+
"<extra_id_34>",
|
38 |
+
"<extra_id_35>",
|
39 |
+
"<extra_id_36>",
|
40 |
+
"<extra_id_37>",
|
41 |
+
"<extra_id_38>",
|
42 |
+
"<extra_id_39>",
|
43 |
+
"<extra_id_40>",
|
44 |
+
"<extra_id_41>",
|
45 |
+
"<extra_id_42>",
|
46 |
+
"<extra_id_43>",
|
47 |
+
"<extra_id_44>",
|
48 |
+
"<extra_id_45>",
|
49 |
+
"<extra_id_46>",
|
50 |
+
"<extra_id_47>",
|
51 |
+
"<extra_id_48>",
|
52 |
+
"<extra_id_49>",
|
53 |
+
"<extra_id_50>",
|
54 |
+
"<extra_id_51>",
|
55 |
+
"<extra_id_52>",
|
56 |
+
"<extra_id_53>",
|
57 |
+
"<extra_id_54>",
|
58 |
+
"<extra_id_55>",
|
59 |
+
"<extra_id_56>",
|
60 |
+
"<extra_id_57>",
|
61 |
+
"<extra_id_58>",
|
62 |
+
"<extra_id_59>",
|
63 |
+
"<extra_id_60>",
|
64 |
+
"<extra_id_61>",
|
65 |
+
"<extra_id_62>",
|
66 |
+
"<extra_id_63>",
|
67 |
+
"<extra_id_64>",
|
68 |
+
"<extra_id_65>",
|
69 |
+
"<extra_id_66>",
|
70 |
+
"<extra_id_67>",
|
71 |
+
"<extra_id_68>",
|
72 |
+
"<extra_id_69>",
|
73 |
+
"<extra_id_70>",
|
74 |
+
"<extra_id_71>",
|
75 |
+
"<extra_id_72>",
|
76 |
+
"<extra_id_73>",
|
77 |
+
"<extra_id_74>",
|
78 |
+
"<extra_id_75>",
|
79 |
+
"<extra_id_76>",
|
80 |
+
"<extra_id_77>",
|
81 |
+
"<extra_id_78>",
|
82 |
+
"<extra_id_79>",
|
83 |
+
"<extra_id_80>",
|
84 |
+
"<extra_id_81>",
|
85 |
+
"<extra_id_82>",
|
86 |
+
"<extra_id_83>",
|
87 |
+
"<extra_id_84>",
|
88 |
+
"<extra_id_85>",
|
89 |
+
"<extra_id_86>",
|
90 |
+
"<extra_id_87>",
|
91 |
+
"<extra_id_88>",
|
92 |
+
"<extra_id_89>",
|
93 |
+
"<extra_id_90>",
|
94 |
+
"<extra_id_91>",
|
95 |
+
"<extra_id_92>",
|
96 |
+
"<extra_id_93>",
|
97 |
+
"<extra_id_94>",
|
98 |
+
"<extra_id_95>",
|
99 |
+
"<extra_id_96>",
|
100 |
+
"<extra_id_97>",
|
101 |
+
"<extra_id_98>",
|
102 |
+
"<extra_id_99>"
|
103 |
+
],
|
104 |
+
"eos_token": "</s>",
|
105 |
+
"extra_ids": 100,
|
106 |
+
"model_max_length": 512,
|
107 |
+
"name_or_path": "Michau/t5-base-en-generate-headline",
|
108 |
+
"pad_token": "<pad>",
|
109 |
+
"sp_model_kwargs": {},
|
110 |
+
"special_tokens_map_file": "/root/.cache/huggingface/hub/models--Michau--t5-base-en-generate-headline/snapshots/f526532f788c45b6b6288286e5ef929fa768ef6a/special_tokens_map.json",
|
111 |
+
"tokenizer_class": "T5Tokenizer",
|
112 |
+
"truncate": true,
|
113 |
+
"unk_token": "<unk>"
|
114 |
+
}
|
title_generator.py
ADDED
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import re
|
3 |
+
from nltk.corpus import stopwords
|
4 |
+
from gensim.parsing import preprocess_string, strip_tags, strip_numeric, strip_multiple_whitespaces, stem_text, strip_punctuation, remove_stopwords
|
5 |
+
import spacy
|
6 |
+
import torch
|
7 |
+
from transformers import T5ForConditionalGeneration,T5Tokenizer
|
8 |
+
|
9 |
+
#Custom text tokenizer from https://github.com/canunj/deconstructing_games by N Canu & K Chen
def doc_text_preprocessing(ser):
    """Lemmatize, clean and tokenize a pandas Series of document strings.

    ser: pandas Series of raw text documents.
    Returns a list of token lists (one per document) with stop words,
    numbers, punctuation, tags, single letters and extra whitespace removed.
    """
    # Parser/NER/textcat are excluded: only tagging/lemmatization is needed.
    nlp = spacy.load("en_core_web_sm", exclude=['parser', 'ner', 'textcat'])

    stop_words = set(stopwords.words('english'))

    # Raw string avoids invalid-escape warnings for \s / \w (the original
    # used a plain string).  Removes isolated single letters, newlines and
    # dashes.
    single_letter_replace = lambda c: re.sub(r"\s+\w{1}\s+|\n|-|—", '', c)
    to_lower_func = lambda c: c.lower()

    # Lemmatize each document with spaCy, then run the gensim filter chain
    # over the joined lemmas.
    lemma_text = [preprocess_string(
        ' '.join([token.lemma_ for token in desc]),
        [remove_stopwords, strip_numeric, strip_punctuation, strip_tags,
         strip_multiple_whitespaces, single_letter_replace, to_lower_func]
    ) for desc in ser.apply(lambda x: nlp(x))]

    # Second stop-word pass catches words reintroduced by lemmatization.
    tokenize_text = [[word for word in string if word not in stop_words] for string in lemma_text]

    return tokenize_text
class Title_Generator:
    """Generates candidate titles for a board-game description with a
    fine-tuned T5 headline model, then ranks them by spaCy similarity
    against the description text.
    """

    def __init__(self, path, df):
        """Load the T5 model/tokenizer and keep the known-game data frame.

        path: directory containing the fine-tuned T5 model files.
        df:   data frame of known games; its ``name`` column is used to
              reject candidate titles that already exist.
        """
        self.model = T5ForConditionalGeneration.from_pretrained(path)
        self.tokenizer = T5Tokenizer.from_pretrained(path)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        self.game_df = df

        # Iteration state consumed by the UI layer.
        self.title_iter = -1
        self.out_titles = None
        self.best_title = None
        self.description = None

    def candidate_generator(self, description):
        """Return (candidate_titles, description) from diverse beam search."""
        text = "headline: " + description

        encoding = self.tokenizer.encode_plus(text, return_tensors="pt")
        input_ids = encoding["input_ids"].to(self.device)
        attention_masks = encoding["attention_mask"].to(self.device)

        candidates = []

        # 16 beams in 4 diversity groups -> 8 reasonably distinct titles.
        beam_outputs = self.model.generate(
            input_ids=input_ids,
            attention_mask=attention_masks,
            max_length=64,
            num_beams=16,
            num_beam_groups=4,
            num_return_sequences=8,
            diversity_penalty=.1,
            repetition_penalty=.9,
            early_stopping=True)

        for result in beam_outputs:
            # Strip the T5 pad/eos markers from the decoded sequence.
            res = self.tokenizer.decode(result).replace('<pad> ', '').replace('</s>', '').replace('<pad>', '')
            candidates.append(res)

        return candidates, description

    def candidate_score(self, candidates, ex_check=None):
        """Score candidate titles against the description.

        candidates: (title_list, description) as returned by
                    candidate_generator.
        ex_check:   optional list of regex fragments for titles the user
                    already rejected.
        Returns {'text': cleaned description,
                 'titles': [(title, score), ...]} sorted by descending score.
        """
        import random
        from operator import itemgetter

        # Mask candidate titles (and excluded ones) that appear verbatim in
        # the description so they cannot inflate their own similarity score.
        if ex_check != None:
            pat = re.compile("((?:" + "|".join(map(re.escape, candidates[0]+[cand.upper() for cand in candidates[0]])) + "|" + "|".join(ex_check) + "))")
        else:
            pat = re.compile("((?:" + "|".join(map(re.escape, candidates[0]+[cand.upper() for cand in candidates[0]])) + "))")
        desc = re.sub(pat, "__", candidates[1])

        # A title leaked into the description: regenerate candidates from the
        # masked text and drop any that match the exclusion list.
        if re.search(re.compile(re.escape("__")), desc):
            hold = candidates[0]
            gen_desc = re.sub(re.compile(re.escape("__")), "", desc)
            candidates = self.candidate_generator(gen_desc)
            if ex_check:
                # BUG FIX: the original compiled this regex unconditionally,
                # raising TypeError ("|".join(None)) when ex_check was None.
                reg = re.compile("(" + "|".join(ex_check) + ")")
                merged = [cand for cand in candidates[0] + hold if not reg.search(cand)]
            else:
                merged = candidates[0] + hold
            candidates = (merged, desc)

        #backup load function, will refactor
        nlp = spacy.load("en_core_web_md")

        #check for existing games and duplicates
        #transform function from https://stackoverflow.com/questions/42165779/python-how-to-remove-duplicate-valuescase-insensitive-from-a-list-with-same-o
        def transform(L):
            S = set(L)
            return [item.title() for item in L if item.lower() not in S and not S.add(item.lower())]

        # Keep only titles that do not already exist in the game data frame.
        clean_cand_step = list(set([game[0] for game in list(zip(candidates[0], [len(self.game_df[self.game_df.name.isin([x])]) for x in candidates[0]])) if game[1] == 0]))
        clean_cand_step = transform(clean_cand_step)

        # Normalize title-cased minor words ("And"/"Of"/"'S") and strip
        # second/third-edition suffixes.
        clean_cand_step = [re.sub(re.compile("(?<=[ ])And(?=[ ])"), 'and',
                           re.sub(re.compile(r'(?<=\S) (([(]|\b)[Ss]econd [Ee]dition([)]|\b)|[Ss]econd [Ee]dition|2[Nn][Dd] [Ee]dition|([(]|\b)[Tt]hird [Ee]dition([)]|\b)|3[Rr][Dd] [Ee]dition)|["]Second Edition["]'), "",
                           re.sub(re.compile("(?<=[a-z])'S"), "'s",
                           re.sub(re.compile("(?<=[ ])Of(?=[ ])"), "of", x))))
                           for x in clean_cand_step]

        # Collapse self-referential "X: X" titles down to "X".
        clean_cand = []
        for cand in clean_cand_step:
            try:
                inter = cand.split(":")
                if inter[0].lower() == inter[1].lower():
                    clean_cand.append(inter[0])
                else:
                    clean_cand.append(cand)
            except IndexError:
                # No ":" in the title (original used a bare except).
                clean_cand.append(cand)

        #text processing
        token_cand = doc_text_preprocessing(pd.Series(clean_cand))
        token_art = doc_text_preprocessing(pd.Series([candidates[1]]))
        sim = [nlp(title) for title in [" ".join(title) for title in token_cand]]
        doc = nlp(" ".join(token_art[0]))

        #scores cosine similarity between generated titles and body text; if a
        #word is unknown (the generator knows it but spaCy doesn't) a random
        #probability is assigned to populate the score
        scores = [x if x != 0 else random.uniform(.3, .7) for x in [tok.similarity(doc) for tok in sim]]

        out_titles = sorted(list(zip(clean_cand, scores)), key=itemgetter(1), reverse=True)

        # Final description cleanup: insert missing space after sentence
        # punctuation, remove publisher boilerplate lead-ins, and simplify
        # awkward "__" constructions left by title masking.
        pat = re.compile(r"(?<=[!.?])(?=[^\s])")
        pat2 = re.compile("([Ff]rom the [Pp]ublisher[: ]|[Ff]rom the [Dd]esigner[: ]|[Gg]ame [Dd]escription)")
        pat3 = re.compile(": [Tt]he [Gg]ame: [Tt]he [Gg]ame|: [Tt]he [Gg]ame")
        pat4 = re.compile("[Tt]he __")
        pat5 = re.compile("__ [Gg]ame")
        pat6 = re.compile("[Tt]he [Gg]ame [Oo]f __")

        desc = re.sub(pat, " ", candidates[1])
        desc = re.sub(pat2, "", desc)
        desc = re.sub(pat3, "", desc)
        desc = re.sub(pat4, "__", desc)
        desc = re.sub(pat5, "__", desc)
        desc = re.sub(pat6, "__", desc)

        return {'text': desc, 'titles': out_titles}