Andreas99 committed
Commit 3647b6e · verified · Parent(s): 4b13db1

Upload 30 files
.gitattributes ADDED
@@ -0,0 +1,7 @@
+ datasets/economics.gexf filter=lfs diff=lfs merge=lfs -text
+ datasets/finance.gexf filter=lfs diff=lfs merge=lfs -text
+ datasets/large_language_models.gexf filter=lfs diff=lfs merge=lfs -text
+ datasets/quantative_biology.gexf filter=lfs diff=lfs merge=lfs -text
+ datasets/quantum_physics.gexf filter=lfs diff=lfs merge=lfs -text
+ datasets/robotics.gexf filter=lfs diff=lfs merge=lfs -text
+ datasets/telecommunications.gexf filter=lfs diff=lfs merge=lfs -text
configs/alpaca.json ADDED
@@ -0,0 +1,6 @@
+ {
+     "description": "Template used by Alpaca-LoRA.",
+     "prompt_input": "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n",
+     "prompt_no_input": "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:\n",
+     "response_split": "### Response:"
+ }
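For reference, the pipeline fills this template with plain str.format (see the tasks/ helpers and litbench_pipeline.py later in this commit); a minimal sketch, assuming the file sits at configs/alpaca.json:

import json

# Load the Alpaca-LoRA prompt template added above.
with open("configs/alpaca.json") as fp:
    template = json.load(fp)

# Fill the {instruction}/{input} placeholders the same way the tasks/ helpers do.
prompt = template["prompt_input"].format(
    instruction="Please generate the title of a paper based on its abstract",
    input="Abstract: We study large-scale citation graphs ...",
)
print(prompt)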
configs/cleaning_config.yaml ADDED
@@ -0,0 +1,203 @@
+ patterns_and_insertions:
+   [
+     {
+       "pattern" : '(?:\\figcomp{\s*)(?P<first>.*?)\s*}\s*{\s*(?P<second>.*?)\s*}\s*{\s*(?P<third>.*?)\s*}',
+       "insertion" : '\parbox[c]{{ {second} \linewidth}} {{ \includegraphics[width= {third} \linewidth]{{figures/{first} }} }}',
+       "description" : "Replace figcomp",
+     },
+   ]
+
+ verbose: False
+
+ commands_to_delete: [
+   'footnote',
+   'footnote ',
+   'crdata',
+   'appendixhead',
+   'selectlanguage',
+   'name',
+   'expandafter',
+   'copyrightyear',
+   'acmYear',
+   'acmBooktitle',
+   'acmPrice',
+   'authorcontributions',
+   'thanksref',
+   'funding',
+   'conflictsofinterest',
+   'externalbibliography',
+   'acmDOI',
+   'acmISBN',
+   'acmConference',
+   'titleheader',
+   'affil',
+   'authorrunning',
+   'pagenumbering',
+   'enlargethispage',
+   'author',
+   'AuthorNames',
+   'author\n',
+   'Author\n',
+   'Author',
+   'fntext',
+   'icmlauthor',
+   'icmlauthor\n',
+   'icmladdress',
+   'icmladdress\n',
+   'received',
+   'runninghead',
+   'bstctlcite',
+   'slugger',
+   'tocauthor',
+   'author\*',
+   'vspace\*',
+   '\write18',
+   'hspace\*',
+   'vspace',
+   'hspace',
+   'maketitle',
+   'institute',
+   'label',
+   'urlstyle',
+   'acks',
+   'tnoteref',
+   'Appendix',
+   'urlstyle',
+   'url',
+   'editor',
+   'ccsdesc',
+   'cortext',
+   'bibliography',
+   'permission',
+   'usetikzlibrary',
+   'thanks',
+   'thispagestyle',
+   'abovedisplayskip',
+   'belowdisplayskip',
+   'bibliographystyle',
+   'IfSubStringInString',
+   'hyphenation',
+   'theoremstyle',
+   'colorbox',
+   'textcolor',
+   'color',
+   'caption',
+   'thlabel',
+   'fancyfoot',
+   'captionof',
+   'settopmatter',
+   'IEEEtriggeratref',
+   'IEEEauthorblockN',
+   'IEEEauthorblockA',
+   'IEEEauthorblockN\n',
+   'IEEEauthorblockA\n',
+   'IEEEauthorrefmark',
+   'orcid',
+   'typeout',
+   'fancyhead',
+   'pagestyle',
+   'biboptions',
+   'affiliation',
+   'address',
+   'institution',
+   'printalgoIEEE',
+   'date',
+   'authornote',
+   'numberofauthors',
+   'footnotetext',
+   'email',
+   'reftitle',
+   'setcopyright',
+   'ead',
+   'deleted',
+   'includegraphics',
+   'comment',
+   'abstract',
+   'replaced',
+   'xspace',
+ ]
+
+ commands_only_to_delete: [
+   'titlerunning',
+   'runningtitle',
+   'title',
+   'title\*',
+   'accept',
+   'added',
+   'icmltitle',
+   'textsuperscript',
+   'texttt',
+   'textsc',
+   'textit',
+   'mathit',
+   'makebox',
+   'mbox',
+   'textbf',
+   'acl',
+   'textnormal',
+   'texttt ',
+   'textsc ',
+   'textit ',
+   'mathit ',
+   'textbf ',
+   'gls',
+   'Gls',
+   'glspl',
+   'textnormal ',
+   'inlinetitle',
+   'mbox',
+   'hl',
+   'highlight',
+   'IEEEraisesectionheading',
+   'IEEEtitleabstractindextext',
+   'IEEEPARstart',
+   'it',
+   'added',
+   'paragraph',
+   'paragraph\*',
+   'MakeLowercase',
+   'emph',
+   'emph ',
+   'text',
+   'acp',
+   'ac'
+ ]
+
+ environments_to_delete: [
+   'icmlauthorlist',
+   'tikzpicture',
+   'groupplot',
+   'biography',
+   'IEEEbiographynophoto',
+   'acronym',
+   'MSCcodes',
+   'IEEEbiography',
+   'figure',
+   'AMS',
+   'acknowledgement',
+   'acknowledgments',
+   'acknowledgements',
+   'figure\*',
+   'minipage',
+   'table',
+   'table\*',
+   'glist',
+   'tabular',
+   'tabular\*',
+   'center',
+   'remark',
+   'algorithm',
+   'algorithmic',
+   'CCSXML',
+   'acks',
+   'lstlisting',
+   'tabu',
+   'algorithm\*',
+   'algorithmic\*',
+   'longtable',
+   'sidewaystable\*',
+   'sidewaystable',
+   'appendices',
+   'wrapfigure',
+   'appendix'
+ ]
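The patterns_and_insertions rule above pairs a regex with named groups and an insertion string in which {{ escapes a literal brace; a minimal sketch of applying such a rule (the repo's actual cleaning code is not part of this commit, so treat this as an assumption about its behavior):

import re
import yaml

with open("configs/cleaning_config.yaml") as fp:
    cfg = yaml.safe_load(fp)

tex = r"\figcomp{plot.png}{0.5}{0.9}"
for rule in cfg["patterns_and_insertions"]:
    # Named groups (first/second/third) fill the {first}/{second}/{third} slots.
    tex = re.sub(rule["pattern"], lambda m: rule["insertion"].format(**m.groupdict()), tex)
print(tex)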
configs/config.yaml ADDED
@@ -0,0 +1,55 @@
+ data_downloading:
+   download_directory: "quant_bio_retrieval/" # directory where the papers will be downloaded and the graph will be saved
+   gexf_file: "test_graph.gexf" # name of the graph file that will be created, only if the downloading option is true
+   processing:
+     random_seed: 10
+     keep_unstructured_content: false # keep the unstructured content of the papers as a graph node attribute if true
+     arxiv_rate_limit: 3 # time in seconds to wait between arXiv API calls to avoid a ban
+
+ retriever:
+   embedder: BAAI/bge-large-en-v1.5
+   num_retrievals: 30000
+   load_arxiv_embeds: True # load arXiv embeddings from Hugging Face if true, else generate them
+
+ inference:
+   base_model: meta-llama/Meta-Llama-3-8B
+   pretrained_model: "models/Robotics/Meta-LLama-3-8B-Quantative-Robotics" # used only if the training option is false
+   generation_args:
+     max_new_tokens: 1000
+     do_sample: True
+     top_p: 0.9
+     top_k: 50
+     temperature: 0.7
+     no_repeat_ngram_size: 2
+     num_beams: 1
+   gen_related_work_instruct_model: meta-llama/Llama-3.1-8B-Instruct # model assisting with the generation of related work instructions
+
+ training:
+   predefined_graph_path: "robotics.gexf" # path to the graph dataset used for fine-tuning, only if the downloading option is false
+   trainer_args:
+     per_device_train_batch_size: 4
+     warmup_steps: 100
+     num_train_epochs: 1
+     learning_rate: 0.0002
+     lr_scheduler_type: 'cosine'
+     fp16: true
+     logging_steps: 1
+     save_steps: 50
+   trainer_output_dir: trainer_outputs/
+   tokenizer:
+     max_length: 1024
+   qlora:
+     rank: 8
+     lora_alpha: 32
+     lora_dropout: 0.05
+     target_modules: # modules for which to train LoRA adapters
+       - q_proj
+       - k_proj
+       - v_proj
+       - o_proj
+
+ # Used only if the training option is true, to save and load the fine-tuned model
+ model_saving:
+   model_name: llama_1b_qlora_uncensored
+   model_output_dir: models # model saved in {model_output_dir}/{model_name}_{index} after fine-tuning completion
+   index: 1
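litbench_pipeline.py reads this file through utils.utils.read_yaml_file; a plain yaml.safe_load sketch showing the access paths the code below relies on:

import yaml

with open("configs/config.yaml") as fp:
    config = yaml.safe_load(fp)

print(config["inference"]["base_model"])                              # meta-llama/Meta-Llama-3-8B
print(config["data_downloading"]["processing"]["arxiv_rate_limit"])  # 3
print(config["training"]["qlora"]["target_modules"])                 # ['q_proj', 'k_proj', 'v_proj', 'o_proj']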
configs/config_noUI.yaml ADDED
@@ -0,0 +1,37 @@
+ # Note: in order to train a model and then evaluate that same model, make sure
+ # that base_model is the same in both the training and eval sections.
+
+ # Evaluation configuration
+ eval:
+   base_model: meta-llama/Llama-3.2-1B
+   graph_path: datasets/quantum_graph.gexf
+   model_name: llama_1b_qlora_uncensored
+
+ # Training configuration
+ training:
+   graph_path: datasets/quantum_graph.gexf # path to the graph file to train on
+   base_model: meta-llama/Llama-3.2-1B
+   trainer_args:
+     per_device_train_batch_size: 4
+     warmup_steps: 100
+     num_train_epochs: 1
+     learning_rate: 0.0002
+     lr_scheduler_type: 'cosine'
+     fp16: true
+     logging_steps: 1
+     save_steps: 50
+   trainer_output_dir: trainer_outputs/
+   tokenizer:
+     max_length: 1024
+   qlora:
+     rank: 8
+     lora_alpha: 32
+     lora_dropout: 0.05
+     target_modules: # modules for which to train LoRA adapters
+       - q_proj
+       - k_proj
+       - v_proj
+       - o_proj
+ model_saving:
+   model_output_dir: models # model saved in {model_output_dir}/{model_name} after fine-tuning completion
+   model_name: llama_1b_qlora_uncensored
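The qlora block maps naturally onto a PEFT LoraConfig; a sketch of that mapping (an assumption about how the trainer consumes it, not the QloraTrainer_CS code itself):

from peft import LoraConfig

qlora = config["training"]["qlora"]  # `config` loaded as in the sketch above
lora_config = LoraConfig(
    r=qlora["rank"],
    lora_alpha=qlora["lora_alpha"],
    lora_dropout=qlora["lora_dropout"],
    target_modules=qlora["target_modules"],
    task_type="CAUSAL_LM",
)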
configs/latex_commands.yaml ADDED
@@ -0,0 +1,162 @@
+ verbatim_to_delete: [
+   '\IEEEpeerreviewmaketitle',
+   '\normalcolor',
+   '\ifCLASSOPTIONcaptionsoff',
+   '\pagebreak',
+   '\makeatletter',
+   '\makeatother',
+   '\maketitle',
+   '\preface',
+   '\eShell',
+   '\medskip',
+   '\tableofcontents',
+   '\begin{@twocolumnfalse}',
+   '\end{@twocolumnfalse}',
+   '\bgroup',
+   '\egroup',
+   '\ifnalpaper1',
+   '\let\thefootnote',
+   '\begin{spacing}{2.0}',
+   '\end{landscape}',
+   '\begin{landscape}',
+   '\begin{doublespacing}',
+   '\end{doublespacing}',
+   '\begin{spacing}',
+   '\end{spacing}',
+   '\printbibliography',
+   '\begin{sloppypar}',
+   '\end{sloppypar}',
+   '\ifbd',
+   '\iftr',
+   '\fussy',
+   '\sloppy',
+   '\emergencystretch',
+   '\hideLIPIcs',
+   '\tolerance',
+   '\hbadness',
+   '\bShell',
+   '\glsresetall',
+   '\copyrightnotice',
+   '\copyright',
+   '\centering',
+   '\immediate',
+   '\doublespacing',
+   '\flushbottom',
+   '\printAffiliationsAndNotice',
+   '\IEEEpubid',
+   '\twocolumn',
+   '\noindent',
+   '\indent',
+   '\onecolumn',
+   '\ignore',
+   '\selectfont',
+   '\raggedbottom',
+   '\IEEEoverridecommandlockouts',
+   '\newline',
+   '\tiny',
+   '\break',
+   '\mainmatter',
+   '\let\rc\rangle',
+   '\let\lc\langle',
+   '\acresetall',
+   '\acknowledgments',
+   '\begin{section}',
+   '\begin{small}',
+   '\end{small}',
+   '\relax',
+   '\ninept',
+   '\FloatBarrier',
+   '\boldmath',
+   '\end{section}',
+   '\Huge',
+   '\fancyhf',
+   '\fancyhead',
+   '\begin{frontmatter}',
+   '\end{frontmatter}',
+   '\clearpage',
+   '\huge',
+   '\newpage',
+   '\IEEEdisplaynontitleabstractindextext',
+   '\DontPrintSemicolon',
+   '\Large',
+   '\LARGE',
+   '\ifCLASSOPTIONcompsoc',
+   '\ifCLASSOPTIONonecolumn',
+   '\xspace',
+   '\large',
+   '\acmcopyr',
+   '\flushleft',
+   '\newpage',
+   '\protect',
+   '\begingroup',
+   '\endgroup',
+   '\bigskip',
+   '\smallskip',
+   '\small',
+   '\left',
+   '\right',
+   '\vfill',
+   '\hfill',
+   '\begin{appendices}',
+   '\IEEEQED',
+   '\leavevmode',
+   '\footnotesize',
+   '\nonumber',
+   '\scriptsize',
+   '\IEEEpubidadjcol',
+   '\balance',
+   '\normalsize',
+   '\ifloguseIEEEConf',
+   '\else',
+   '\fi',
+   '\bf ',
+   '\it ',
+   '\verb ',
+   '\tt ',
+   '\em ',
+   '\par '
+ ]
+
+ two_arguments: [
+   'IEEEPARstart',
+   'pgfdeclareplotmark',
+   'setcounter',
+   'texorpdfstring',
+   'fontsize',
+   'addtocounter',
+   'addtolength'
+ ]
+
+ three_arguments: [
+   'definecolor'
+ ]
+
+ two_arguments_elaborate: [
+   'markboth',
+   'setlength',
+   'pgfdeclareplotmark',
+   'icmlsetsymbol',
+   'texorpdfstring',
+   'conferenceinfo',
+   'acrodef',
+   'icmlcorrespondingauthor',
+   'pdfbookmark',
+   'icmlaffiliation',
+   'icmlcorrespondingauthor'
+ ]
+
+ three_arguments_elaborate: [
+   'ifthenelse',
+   'addcontentsline'
+ ]
+
+ replace_comments: [
+   'def\\',
+   'def ',
+   'newglossaryentry',
+   'newtheorem',
+   'newcommand',
+   'renewcommand',
+   'newenvironment',
+   'renewenvironment'
+ ]
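A sketch of how the verbatim_to_delete tokens could be stripped from raw LaTeX (again an assumption about the cleaning step, which this commit only configures):

import yaml

with open("configs/latex_commands.yaml") as fp:
    latex_cfg = yaml.safe_load(fp)

def strip_verbatim(tex: str) -> str:
    # Plain substring removal; order follows the config list.
    for token in latex_cfg["verbatim_to_delete"]:
        tex = tex.replace(token, "")
    return tex

print(strip_verbatim(r"\maketitle\noindent Hello \newline world"))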
datasets/computer_vision.gexf ADDED
The diff for this file is too large to render. See raw diff
 
datasets/economics.gexf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bb70ebc1810a487c967dc7d196b4b32ffb8237870502d85496633c7ec639e3a3
+ size 17470884
datasets/finance.gexf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4739b443038f71850022a93ecf64c97d2fde08c492f716ee0d3e7a9d657302a2
+ size 18446416
datasets/large_language_models.gexf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d6b8fa29877039cb8b4665889421cdd4488752bcad6f512ac0b05678a73107c1
+ size 18262076
datasets/quantative_biology.gexf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:99666f9881f04850511a4e424243ed28228ae7cfd28859fc2f9104faa4e8bb4b
+ size 20476813
datasets/quantum_physics.gexf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c7abcb0d7aec2edb5461161ff613fe87176a4eb9c1522c2b0c1903fbccf66f74
+ size 20678301
datasets/robotics.gexf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:05681fb68e8168d530cc5bd037f5a6236135446be05862e2be9bf61b548d2c98
+ size 19601009
datasets/telecommunications.gexf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d0f89f57398abcc70065c2756e15fcd2792c34f6ac9137ba1a8521a5dd8cd22c
+ size 14749926
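These .gexf files are Git LFS pointers; after `git lfs pull` they load as ordinary GEXF citation graphs, which the pipeline reads with networkx:

import networkx as nx

G = nx.read_gexf("datasets/robotics.gexf")
print(G.number_of_nodes(), G.number_of_edges())
# Node attributes include title/abstract (see litbench_pipeline.py below).
some_node = next(iter(G.nodes))
print(G.nodes[some_node].get("title"))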
litbench_pipeline.py ADDED
@@ -0,0 +1,746 @@
+ from train import *
+ from utils.utils import *
+ from utils.graph_utils import *
+ from utils.gradio_utils import *
+ from retriever.retriever import retriever
+ from tasks.abs_2_title import abs_2_title
+ from tasks.abs_completion import abs_completion
+ from tasks.citation_sentence import citation_sentence
+ from tasks.intro_2_abs import intro_2_abs
+ from tasks.link_pred import link_pred
+ from tasks.paper_retrieval import paper_retrieval
+ from tasks.influential_papers import influential_papers
+ from tasks.gen_related_work import gen_related_work
+ import random
+ import json
+ import os
+ import re
+ import networkx as nx
+ import tarfile
+ import gzip
+ import time
+ import urllib.request
+ from tqdm import tqdm
+ from colorama import Fore
+ import wandb
+ import gradio as gr
+ from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteriaList, TextIteratorStreamer, pipeline
+ from threading import Thread
+ import signal
+ import torch
+ from peft.peft_model import PeftModel
+ from datasets import load_dataset
+
+
+ # Function to determine the chatbot's first message based on user choices
+ def setup(download_option, train_option):
+     download_papers.value = (download_option == "Download Paper")
+     train_model.value = (train_option == "Train")
+
+     if download_option == "Download Paper":
+         initial_message = [{"role": "assistant", "content": "Hello, what domain are you interested in?"}]
+     elif train_option == "Train":
+         initial_message = [{"role": "assistant", "content": "What domain is your graph about?"}]
+     else:
+         initial_message = [{"role": "assistant", "content": "Please provide your task prompt."}]
+
+     return gr.update(visible=False), gr.update(visible=True), f"Download: {download_option}\nTrain: {train_option}", initial_message
+
+
+ # Function to toggle the selected task based on user input
+ def update_button_styles(selected_task):
+     """Update button styles based on selection."""
+     return [gr.update(variant="primary" if selected_task == prompt else "secondary") for prompt in task_list]
+
+
+ # Fetch and store arXiv source files
+ def fetch_arxiv_papers(papers_to_download):
+     # Download the arXiv metadata file if it doesn't exist
+     dataset = 'datasets/arxiv-metadata-oai-snapshot.json'
+     data = []
+     if not os.path.exists(dataset):
+         os.system("wget https://huggingface.co/spaces/ddiddu/simsearch/resolve/main/arxiv-metadata-oai-snapshot.json -P ./datasets")
+
+     with open(dataset, 'r') as f:
+         for line in f:
+             data.append(json.loads(line))
+
+     papers = [d for d in data]
+     paper_ids = [d['id'] for d in data]
+     paper_titles = [
+         (
+             re.sub(r' +', ' ', re.sub(r'[\n]+', ' ', paper['title']))
+             .replace("\\emph", "")
+             .replace("\\emp", "")
+             .replace("\\em", "")
+             .replace(",", "")
+             .replace("{", "")
+             .replace("}", "")
+             .strip(".")
+             .strip()
+             .strip(".")
+             .lower()
+         )
+         for paper in papers
+     ]
+     paper_dict = {
+         k: v
+         for k, v in zip(paper_titles, paper_ids)
+     }
+
+     total_papers = len(papers_to_download)
+     download_progress_bar = gr.Progress()
+
+     llm_resp = []
+     results = {
+         "Number of papers": 0,
+         "Number of latex papers": 0,
+         "Number of bib files": 0,
+         "Number of bbl files": 0,
+         "Number of inline files": 0,
+         "Number of introductions found": 0,
+         "Number of related works found": 0,
+         "Number of successful finding of extracts": 0
+     }
+     num_papers, num_edges, t, iter_ind = 0, 0, 0, 0
+     graph = {}
+
+     arxiv_rate_lim = config['data_downloading']['processing']['arxiv_rate_limit']
+     for paper_name in tqdm(papers_to_download):
+         results["Number of papers"] += 1
+         print(
+             Fore.BLUE + "Number of papers processed: {} \n Number of edges found: {} \n Time of previous iter: {} \n Now processing paper: {} \n\n"
+             .format(num_papers, num_edges, time.time() - t, paper_name) + Fore.RESET
+         )
+         t = time.time()
+         num_papers += 1
+
+         # Prepare the paper name for downloading and saving
+         paper_name_download = paper_name
+         if re.search(r'[a-zA-Z]', paper_name) is not None:
+             paper_name = "".join(paper_name.split('/'))
+         tar_file_path = save_zip_directory + paper_name + '.tar.gz'
+
+         # Attempt to download the paper source files from arXiv
+         try:
+             # Track start time for download
+             t1 = time.time()
+             urllib.request.urlretrieve(
+                 "https://arxiv.org/src/" + paper_name_download,
+                 tar_file_path)
+         except Exception as e:
+             print("Couldn't download paper {}".format(paper_name))
+             # Skip to the next paper if the download fails
+             continue
+
+         # Define the directory where the paper will be extracted
+         extracted_dir = save_directory + paper_name + '/'
+         if not os.path.exists(extracted_dir):
+             os.makedirs(extracted_dir)
+
+         # Attempt to extract the tar.gz archive
+         try:
+             tar = tarfile.open(tar_file_path)
+             tar.extractall(extracted_dir)
+             tar.close()
+         except Exception as e:
+             # If tar extraction fails, attempt to read and extract using gzip
+             try:
+                 with gzip.open(tar_file_path, 'rb') as f:
+                     file_content = f.read()
+
+                 # Save the extracted content as a .tex file
+                 with open(extracted_dir + paper_name + '.tex', 'w') as f:
+                     f.write(file_content.decode())
+             except Exception as e:
+                 print("Could not extract paper id: {}".format(paper_name))
+                 # Skip this paper if extraction fails
+                 continue
+
+         try:
+             # Perform initial cleaning and get the main TeX file
+             initial_clean(extracted_dir, config=False)
+             main_file = get_main(extracted_dir)
+
+             # If no main TeX file is found, remove the downloaded archive and continue
+             if main_file is None:
+                 print("No tex files found")
+                 os.remove(tar_file_path)
+                 continue
+
+             # Check if the main TeX file contains a valid LaTeX document
+             if check_begin(main_file):
+                 results["Number of latex papers"] += 1
+                 # Flag to check for an internal bibliography
+                 check_internal = 0
+                 # Dictionary to store bibliographic references
+                 final_library = {}
+
+                 # Identify bibliography files (.bib or .bbl)
+                 bib_files = find_bib(extracted_dir)
+                 if bib_files == []:
+                     bbl_files = find_bbl(extracted_dir)
+                     if bbl_files == []:
+                         # No external bibliography found
+                         check_internal = 1
+                     else:
+                         final_library = get_library_bbl(bbl_files)
+                         results["Number of bbl files"] += 1
+                 else:
+                     results["Number of bib files"] += 1
+                     final_library = get_library_bib(bib_files)
+
+                 # Apply post-processing to clean the TeX document
+                 main_file = post_processing(extracted_dir, main_file)
+
+                 # Read the cleaned LaTeX document content
+                 content = read_tex_file(main_file)
+
+                 # If configured, store the raw content in the graph
+                 if config['data_downloading']['processing']['keep_unstructured_content']:
+                     graph[paper_name] = {'content': content}
+                 else:
+                     graph[paper_name] = {}
+
+                 # Check for an inline bibliography within the LaTeX document
+                 if check_internal == 1:
+                     beginning_bib = '\\begin{thebibliography}'
+                     end_bib = '\\end{thebibliography}'
+
+                     if content.find(beginning_bib) != -1 and content.find(end_bib) != -1:
+                         bibliography = content[content.find(beginning_bib):content.find(end_bib) + len(end_bib)]
+                         save_bbl = os.path.join(extracted_dir, "bibliography.bbl")
+
+                         results["Number of inline files"] += 1
+                         with open(save_bbl, "w") as f:
+                             f.write(bibliography)
+
+                         final_library = get_library_bbl([save_bbl])
+
+                 # If no valid bibliography is found, skip processing citations
+                 if final_library == {}:
+                     print("No library found...")
+                     continue
+
+                 # Extract relevant sections such as "Related Work" and "Introduction"
+                 related_works = get_related_works(content)
+                 if related_works != '':
+                     graph[paper_name]['Related Work'] = related_works
+                     results["Number of related works found"] += 1
+
+                 intro = get_intro(content)
+                 if intro != '':
+                     graph[paper_name]['Introduction'] = intro
+                     results["Number of introductions found"] += 1
+
+                 # Extract citation sentences from the introduction and related works
+                 sentences_citing = get_citing_sentences(intro + '\n' + related_works)
+
+                 # Map citations to corresponding papers
+                 raw_sentences_citing = {}
+                 for k, v in sentences_citing.items():
+                     new_values = []
+                     for item in v:
+                         try:
+                             new_values.append(paper_dict[final_library[item]['title']])
+                         except Exception as e:
+                             pass
+                     if new_values != []:
+                         raw_sentences_citing[k] = new_values
+
+                 # Construct citation edges
+                 edges_set = []
+                 for k, v in raw_sentences_citing.items():
+                     for item in v:
+                         edges_set.append((paper_name_download, item, {"sentence": k}))
+
+                 iter_ind += 1
+                 if len(edges_set) != 0:
+                     results["Number of successful finding of extracts"] += 1
+                     graph[paper_name]['Citations'] = edges_set
+                     num_edges += len(edges_set)
+
+                 # Save progress after every 10 iterations
+                 if iter_ind % 10 == 0:
+                     print("Saving graph now")
+                     with open(save_path, 'w') as f:
+                         json.dump(results, f)
+                     with open(save_graph, 'w') as f:
+                         json.dump(graph, f)
+
+         except Exception as e:
+             print("Could not get main paper {}".format(paper_name))
+
+         # Update the progress bar after processing each paper
+         download_progress_bar(num_papers / total_papers)
+
+         # Enforce the configured minimum time gap between iterations to avoid bans from arXiv
+         t2 = time.time()  # End time
+         elapsed_time = t2 - t1
+         if elapsed_time < arxiv_rate_lim:
+             time.sleep(arxiv_rate_lim - elapsed_time)
+
+     # Final saving of processed data
+     with open(save_graph, 'w') as f:
+         json.dump(graph, f)
+     with open(save_path, 'w') as f:
+         json.dump(results, f)
+
+     # Log the final completion message
+     llm_resp.append("✅ Successfully downloaded and cleaned {} papers.".format(results["Number of latex papers"]))
+     return "\n".join(llm_resp)
+
+
+ # Chat prediction function
+ def predict(message, history, selected_task):
+     global model
+     # Initialize the conversation string
+     conversation = ""
+
+     # Parse the history: Gradio `type="messages"` uses dictionaries with 'role' and 'content'
+     for item in history:
+         if item["role"] == "assistant":
+             conversation += f"<bot>: {item['content']}\n"
+         elif item["role"] == "user":
+             conversation += f"<human>: {item['content']}\n"
+
+     # Add the user's current message to the conversation
+     conversation += f"<human>: {message}\n<bot>:"
+
+     # Handle preferences
+     if len(history) == 0:
+         if not download_papers.value and not train_model.value:
+             yield "✅ Using model from configuration file..."
+
+             adapter_path = config["inference"]["pretrained_model"]
+             peft_model = PeftModel.from_pretrained(model, adapter_path, torch_dtype=torch.float16)
+
+             # Replace the global model with the PEFT model
+             model = peft_model
+
+             time.sleep(2.5)
+
+     if not (len(history) == 0 and (train_model.value or download_papers.value)):
+         # Streamer for generating responses
+         streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
+         stop = StopOnTokens()
+
+         generate_kwargs = {
+             "streamer": streamer,
+             "max_new_tokens": config['inference']['generation_args']["max_new_tokens"],
+             "do_sample": config['inference']['generation_args']["do_sample"],
+             "top_p": config['inference']['generation_args']["top_p"],
+             "top_k": config['inference']['generation_args']["top_k"],
+             "temperature": config['inference']['generation_args']["temperature"],
+             "no_repeat_ngram_size": config['inference']['generation_args']["no_repeat_ngram_size"],
+             "num_beams": config['inference']['generation_args']["num_beams"],
+             "stopping_criteria": StoppingCriteriaList([stop]),
+         }
+
+         def generate_response(model, generate_kwargs, selected_task):
+             global advanced_tasks_out
+             has_predefined_template = generate_kwargs["streamer"].tokenizer.chat_template is not None
+
+             if selected_task == "Abstract Completion":
+                 prompt = abs_completion(message, template, has_predefined_template)
+             elif selected_task == "Title Generation":
+                 prompt = abs_2_title(message, template, has_predefined_template)
+             elif selected_task == "Citation Recommendation":
+                 prompt = paper_retrieval(message, template, has_predefined_template)
+             elif selected_task == "Citation Sentence Generation":
+                 prompt = citation_sentence(message, template, has_predefined_template)
+             elif selected_task == "Citation Link Prediction":
+                 prompt = link_pred(message, template, has_predefined_template)
+             elif selected_task == "Introduction to Abstract":
+                 prompt = intro_2_abs(message, template, tokenizer.model_max_length, has_predefined_template)
+             elif selected_task == "Influential Papers Recommendation":
+                 graph = nx.read_gexf(gexf_file if download_papers.value else predef_graph)
+                 advanced_tasks_out = influential_papers(message, graph)
+             elif selected_task == "Related Work Generation":
+                 adapter_path = (
+                     f"{config['model_saving']['model_output_dir']}/{config['model_saving']['model_name']}_{config['model_saving']['index']}_adapter_test_graph"
+                     if train_model.value else config['inference']['pretrained_model']
+                 )
+                 if download_papers.value:
+                     advanced_tasks_out = gen_related_work(message, gexf_file, adapter_path)
+                 else:
+                     advanced_tasks_out = gen_related_work(message, predef_graph, adapter_path)
+             else:
+                 # Free-form chat: the conversation already ends with the current message
+                 prompt = conversation
+
+             if selected_task != "Influential Papers Recommendation" and selected_task != "Related Work Generation":
+                 if tokenizer.chat_template is not None:
+                     response = model_pipeline(prompt, **generate_kwargs)
+                     streamer.put(response[0]['generated_text'][-1])
+                 else:
+                     model_inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
+                     generate_kwargs["inputs"] = model_inputs["input_ids"]
+                     generate_kwargs["attention_mask"] = model_inputs["attention_mask"]
+
+                     response = model.generate(**generate_kwargs)
+                     streamer.put(response)
+
+         # Generate the response in a separate thread
+         t = Thread(target=generate_response,
+                    kwargs={
+                        "model": model,
+                        "generate_kwargs": generate_kwargs,
+                        "selected_task": selected_task
+                    })
+
+         global advanced_tasks_out
+         advanced_tasks_out = None
+         t.start()
+
+         # Stream the partial response
+         if selected_task != "Influential Papers Recommendation" and selected_task != "Related Work Generation":
+             partial_message = ""
+             for new_token in streamer:
+                 if new_token != '<':  # Ignore placeholder tokens
+                     partial_message += new_token
+                     yield partial_message
+         else:
+             if selected_task == "Related Work Generation":
+                 yield "🔍 Generating related work..."
+             while advanced_tasks_out is None:
+                 time.sleep(0.1)
+             yield advanced_tasks_out
+
+     # Fetch arXiv papers if the user opted to download them
+     if len(history) == 0:
+         if download_papers.value:
+             # Fetch relevant papers
+             yield "🔍 Retrieving relevant papers..."
+
+             retrieve_progress = gr.Progress()
+             for percent in retriever(message, retrieval_nodes_path):
+                 retrieve_progress(percent)
+
+             with open(retrieval_nodes_path, "r") as f:
+                 data_download = json.load(f)
+
+             papers_to_download = list(data_download.keys())
+
+             yield f"📥 Fetching {len(papers_to_download)} arXiv papers' source files... Please wait."
+
+             content = fetch_arxiv_papers(papers_to_download)
+             yield content
+             time.sleep(2.5)
+
+     # Train the model with the retrieved graph
+     if len(history) == 0:
+         if train_model.value:
+             training_progress = gr.Progress()
+
+             training_progress(0.0)
+
+             # If the user opted to download papers, use the retrieved graph, else use the predefined graph
+             if download_papers.value:
+                 yield "🚀 Training the model with the retrieved graph..."
+
+                 with open(save_graph, "r") as f:
+                     data_graph = json.load(f)
+
+                 renamed_data = {
+                     "/".join(re.match(r"([a-z-]+)([0-9]+)", key, re.I).groups()) if re.match(r"([a-z-]+)([0-9]+)", key, re.I) else key: value
+                     for key, value in data_graph.items()
+                 }
+
+                 concept_data = load_dataset("AliMaatouk/arXiv_Topics", cache_dir="datasets/arxiv_topics")
+                 id2topics = {
+                     entry["paper_id"]: [entry["Level 1"], entry["Level 2"], entry["Level 3"]]
+                     for entry in concept_data["train"]
+                 }
+
+                 dataset = 'datasets/arxiv-metadata-oai-snapshot.json'
+                 data = []
+                 if not os.path.exists(dataset):
+                     os.system("wget https://huggingface.co/spaces/ddiddu/simsearch/resolve/main/arxiv-metadata-oai-snapshot.json -P ./datasets")
+                 with open(dataset, 'r') as f:
+                     for line in f:
+                         data.append(json.loads(line))
+                 papers = {d['id']: d for d in data}
+
+                 G = nx.DiGraph()
+                 for k in renamed_data:
+                     if k not in G and k in papers:
+                         if config['data_downloading']['processing']['keep_unstructured_content']:
+                             G.add_node(
+                                 k,
+                                 title=papers[k]['title'],
+                                 abstract=papers[k]['abstract'],
+                                 introduction=renamed_data[k].get('Introduction', '') if renamed_data[k].get('Introduction', '') != '\n' else '',
+                                 related=renamed_data[k].get('Related Work', '') if renamed_data[k].get('Related Work', '') != '\n' else '',
+                                 concepts=", ".join(list(set(item for sublist in id2topics[k] for item in sublist))) if k in id2topics else '',
+                                 content=renamed_data[k].get('content', '') if k in renamed_data else ''
+                             )
+                         else:
+                             G.add_node(
+                                 k,
+                                 title=papers[k]['title'],
+                                 abstract=papers[k]['abstract'],
+                                 introduction=renamed_data[k].get('Introduction', '') if renamed_data[k].get('Introduction', '') != '\n' else '',
+                                 related=renamed_data[k].get('Related Work', '') if renamed_data[k].get('Related Work', '') != '\n' else '',
+                                 concepts=", ".join(list(set(item for sublist in id2topics[k] for item in sublist))) if k in id2topics else ''
+                             )
+                     if 'Citations' in renamed_data[k]:
+                         for citation in renamed_data[k]['Citations']:
+                             source, target, metadata = citation
+                             sentence = metadata.get('sentence', '')  # Extract the sentence or default to an empty string
+
+                             if target not in G and target in papers:
+                                 if config['data_downloading']['processing']['keep_unstructured_content']:
+                                     G.add_node(
+                                         target,
+                                         title=papers[target]['title'],
+                                         abstract=papers[target]['abstract'],
+                                         introduction=renamed_data[target].get('Introduction', '') if target in renamed_data and renamed_data[target].get('Introduction', '') != '\n' else '',
+                                         related=renamed_data[target].get('Related Work', '') if target in renamed_data and renamed_data[target].get('Related Work', '') != '\n' else '',
+                                         concepts=", ".join(list(set(item for sublist in id2topics[target] for item in sublist))) if target in id2topics else '',
+                                         content=renamed_data[target].get('content', '') if target in renamed_data else ''
+                                     )
+                                 else:
+                                     G.add_node(
+                                         target,
+                                         title=papers[target]['title'],
+                                         abstract=papers[target]['abstract'],
+                                         introduction=renamed_data[target].get('Introduction', '') if target in renamed_data and renamed_data[target].get('Introduction', '') != '\n' else '',
+                                         related=renamed_data[target].get('Related Work', '') if target in renamed_data and renamed_data[target].get('Related Work', '') != '\n' else '',
+                                         concepts=", ".join(list(set(item for sublist in id2topics[target] for item in sublist))) if target in id2topics else ''
+                                     )
+
+                             G.add_edge(source, target, sentence=sentence)
+
+                 G.remove_nodes_from(list(nx.isolates(G)))
+
+                 nx.write_gexf(G, gexf_file)
+                 print(f"Processed graph written to {gexf_file}")
+             else:
+                 yield f"✅ Using predefined graph: {predef_graph}"
+
+             wandb.init(project='qlora_train')
+
+             if download_papers.value:
+                 trainer = QloraTrainer_CS(config=config, use_predefined_graph=False)
+             else:
+                 trainer = QloraTrainer_CS(config=config, use_predefined_graph=True)
+
+             print("Load base model")
+             trainer.load_base_model()
+
+             print("Start training")
+             def update_progress():
+                 # Wait for the trainer to be initialized
+                 while trainer.transformer_trainer is None:
+                     time.sleep(0.5)
+
+                 time.sleep(1.5)
+                 # Update the progress bar until training is complete
+                 while trainer.transformer_trainer.state.global_step != trainer.transformer_trainer.state.max_steps:
+                     progress_bar = (
+                         trainer.transformer_trainer.state.global_step /
+                         trainer.transformer_trainer.state.max_steps
+                     )
+                     training_progress(progress_bar)
+                     time.sleep(0.5)
+                 training_progress(1.0)
+
+             t1 = Thread(target=trainer.train)
+             t1.start()
+             t2 = Thread(target=update_progress)  # pass the function itself, don't call it
+             t2.start()
+             t1.join()
+             t2.join()
+
+             yield "🎉 Model training complete! Please provide your task prompt."
+
+             adapter_path = f"{config['model_saving']['model_output_dir']}/{config['model_saving']['model_name']}_{config['model_saving']['index']}_adapter_test_graph"
+             peft_model = PeftModel.from_pretrained(model, adapter_path, torch_dtype=torch.float16)
+
+             # Replace the global model with the PEFT model
+             model = peft_model
+
+
+ if __name__ == "__main__":
+     print("This is running in a virtual environment: {}".format(is_venv()))
+
+     config = read_yaml_file("configs/config.yaml")
+     template_file_path = 'configs/alpaca.json'
+     template = json.load(open(template_file_path, "r"))
+
+     seed_no = config['data_downloading']['processing']['random_seed']
+     model_name = config['inference']['base_model']
+     working_dir = config['data_downloading']['download_directory']
+     save_zip_directory = working_dir + 'research_papers_zip/'
+     save_directory = working_dir + 'research_papers/'
+     save_description = working_dir + 'description/'
+     save_path = save_description + 'results.json'
+     save_graph = save_description + 'test_graph.json'
+     gexf_file = save_description + config['data_downloading']['gexf_file']
+     predef_graph = 'datasets/' + config['training']['predefined_graph_path']
+     retrieval_nodes_path = 'datasets/retrieval_nodes.json'
+
+     for directory in (save_zip_directory, save_directory, save_description):
+         if not os.path.exists(directory):
+             os.makedirs(directory)
+
+     random.seed(seed_no)
+
+     # Load the model and tokenizer
+     bnb_config = BitsAndBytesConfig(
+         load_in_8bit=True,
+         bnb_8bit_use_double_quant=True,
+         bnb_8bit_quant_type="nf8",
+         bnb_8bit_compute_dtype=torch.bfloat16
+     )
+     tokenizer = AutoTokenizer.from_pretrained(model_name)
+     model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config)
+     if model.device.type != 'cuda':
+         model.to('cuda')
+
+     if tokenizer.chat_template is not None:
+         model_pipeline = pipeline(
+             "text-generation",
+             model=model_name,
+             model_kwargs={"torch_dtype": torch.bfloat16},
+             device_map="auto",
+         )
+
+     signal.signal(signal.SIGINT, signal_handler)
+
+     # Global states for user preferences
+     download_papers = gr.State(value=True)  # Default: download papers
+     train_model = gr.State(value=True)  # Default: train the model
+
+     # Categorized recommended prompts
+     task_list = {
+         "Abstract Completion",
+         "Introduction to Abstract",
+         "Title Generation",
+         "Citation Recommendation",
+         "Citation Sentence Generation",
+         "Citation Link Prediction",
+         "Influential Papers Recommendation",
+         "Related Work Generation",
+     }
+
+     # CSS for styling
+     css = """
+     body { background-color: #E0F7FA; margin: 0; padding: 0; }
+     .gradio-container { background-color: #E0F7FA; border-radius: 10px; }
+     #logo-container { display: flex; justify-content: center; align-items: center; margin: 0 auto; padding: 0; max-width: 120px; height: 120px; border-radius: 10px; overflow: hidden; }
+     #scroll-menu { max-height: 310px; overflow-y: auto; padding: 10px; background-color: #fff; margin-top: 10px;}
+     #task-header { background-color: #0288d1; color: white; font-size: 18px; padding: 8px; text-align: center; margin-bottom: 5px; margin-top: 40px; }
+     #category-header { background-color: #ecb939; font-size: 16px; padding: 8px; margin: 10px 0; }
+     """
+
+     # State to store the selected task
+     selected_task = gr.State(value="")
+
+     # Gradio interface
+     with gr.Blocks(theme="soft", css=css) as demo:
+         gr.HTML('<div id="logo-container"><img src="https://static.thenounproject.com/png/6480915-200.png" alt="Logo"></div>')
+         gr.Markdown("# LitBench Interface")
+
+         # Setup row for user preferences
+         with gr.Row(visible=True) as setup_row:
+             with gr.Column():
+                 gr.Markdown("### Setup Your Preferences")
+                 download_option = gr.Dropdown(
+                     choices=["Download Paper", "Don't Download"],
+                     value="Download Paper",
+                     label="Download Option"
+                 )
+                 train_option = gr.Dropdown(
+                     choices=["Train", "Don't Train"],
+                     value="Train",
+                     label="Training Option"
+                 )
+                 setup_button = gr.Button("Set Preferences and Proceed")
+
+         # Chatbot row for user interaction
+         with gr.Row(visible=False) as chatbot_row:
+             with gr.Column(scale=3):
+                 gr.Markdown("### Start Chatting!")
+                 chatbot = gr.ChatInterface(
+                     predict,
+                     chatbot=gr.Chatbot(
+                         height=400,
+                         type="messages",
+                         avatar_images=[
+                             "https://icons.veryicon.com/png/o/miscellaneous/user-avatar/user-avatar-male-5.png",
+                             "https://cdn-icons-png.flaticon.com/512/8649/8649595.png"
+                         ],
+                     ),
+                     textbox=gr.Textbox(placeholder="Type your message here..."),
+                     additional_inputs=selected_task,
+                     additional_inputs_accordion=gr.Accordion(visible=False, label="Additional Inputs"),
+                 )
+
+                 # Store user preferences and the selected task for display
+                 preferences_output = gr.Textbox(value="", interactive=False, label="Your Preferences")
+
+             # Task selection buttons for user interaction
+             with gr.Column(scale=1):
+                 gr.HTML('<div id="task-header">Tasks:</div>')
+                 with gr.Column(elem_id="scroll-menu"):
+                     # Create buttons
+                     button_map = {prompt: gr.Button(prompt) for prompt in task_list}
+
+                     for prompt in task_list:
+                         button_map[prompt].click(
+                             toggle_selection,
+                             inputs=[selected_task, gr.State(value=prompt)],  # Toggle task selection
+                             outputs=selected_task
+                         ).then(
+                             update_button_styles,  # Update button appearances
+                             inputs=[selected_task],
+                             outputs=[button_map[p] for p in task_list]  # Update all buttons
+                         )
+
+         # Setup button to finalize user preferences and start the chatbot
+         setup_button.click(
+             setup,
+             inputs=[download_option, train_option],
+             outputs=[setup_row, chatbot_row, preferences_output, chatbot.chatbot]
+         )
+
+     # Launch the interface
+     demo.launch(server_port=7880)
retriever/retriever.py ADDED
@@ -0,0 +1,129 @@
+ from transformers import AutoTokenizer, AutoModel
+ from sklearn.metrics.pairwise import cosine_similarity
+ import json
+ import torch
+ from tqdm import tqdm
+ import os
+ import pandas as pd
+ import numpy as np
+ from datasets import load_dataset
+ from utils.utils import read_yaml_file
+
+
+ def generate_topic_level_embeddings(model, tokenizer, paper_list, tmp_id_2_abs):
+     id2topics = {
+         entry["paper_id"]: [entry["Level 1"], entry["Level 2"], entry["Level 3"]]
+         for entry in tmp_id_2_abs['train']
+     }
+
+     for topic_level in ['Level 1', 'Level 2', 'Level 3']:
+         i = 0
+         batch_size = 2048
+         candidate_emb_list = []
+         pbar = tqdm(total=len(paper_list))
+         while i < len(paper_list):
+             # Report overall progress: each topic level covers a third of the work
+             if topic_level == 'Level 1':
+                 yield i / len(paper_list) / 3
+             elif topic_level == 'Level 2':
+                 yield 0.33 + i / len(paper_list) / 3
+             else:
+                 yield 0.66 + i / len(paper_list) / 3
+             paper_batch = paper_list[i:i + batch_size]
+             paper_text_batch = []
+             for paper_id in paper_batch:
+                 topics = id2topics[paper_id][int(topic_level[6]) - 1]
+                 topic_text = ''
+                 for t in topics:
+                     topic_text += t + ','
+                 paper_text_batch.append(topic_text)
+             inputs = tokenizer(paper_text_batch, return_tensors='pt', padding=True, truncation=True)
+             with torch.no_grad():
+                 outputs = model(**inputs.to('cuda'))
+                 candidate_embeddings = outputs.last_hidden_state[:, 0, :].cpu()
+             candidate_embeddings = candidate_embeddings.reshape(-1, 1024)
+             candidate_emb_list.append(candidate_embeddings)
+
+             i += len(candidate_embeddings)
+             pbar.update(len(candidate_embeddings))
+
+         all_candidate_embs = torch.cat(candidate_emb_list, 0)
+
+         df = pd.DataFrame({
+             "paper_id": paper_list,
+             "embedding": list(all_candidate_embs.numpy())
+         })
+
+         if not os.path.exists('datasets/topic_level_embeds'):
+             os.makedirs('datasets/topic_level_embeds')
+
+         df.to_parquet(f'datasets/topic_level_embeds/{topic_level}_emb.parquet', engine='pyarrow', compression='snappy')
+
+     # Sum the three per-level embeddings into a single embedding per paper
+     all_candidate_embs_L1 = torch.tensor(np.array(pd.read_parquet('datasets/topic_level_embeds/Level 1_emb.parquet')['embedding'].tolist()))
+     all_candidate_embs_L2 = torch.tensor(np.array(pd.read_parquet('datasets/topic_level_embeds/Level 2_emb.parquet')['embedding'].tolist()))
+     all_candidate_embs_L3 = torch.tensor(np.array(pd.read_parquet('datasets/topic_level_embeds/Level 3_emb.parquet')['embedding'].tolist()))
+     all_candidate_embs = all_candidate_embs_L1 + all_candidate_embs_L2 + all_candidate_embs_L3
+
+     df = pd.DataFrame({
+         "paper_id": paper_list,
+         "embedding": list(all_candidate_embs.numpy())
+     })
+
+     df.to_parquet('datasets/topic_level_embeds/arxiv_papers_embeds.parquet', engine='pyarrow', compression='snappy')
+
+
+ def retriever(query, retrieval_nodes_path):
+     yield 0
+     config = read_yaml_file('configs/config.yaml')
+
+     # Load the model and tokenizer used to generate the embeddings
+     embedder_name = config['retriever']['embedder']
+     tokenizer = AutoTokenizer.from_pretrained(embedder_name)
+     model = AutoModel.from_pretrained(embedder_name).to(device='cuda', dtype=torch.float16)
+
+     # Load the arXiv dataset
+     tmp_id_2_abs = load_dataset("AliMaatouk/arXiv_Topics", cache_dir="datasets/arxiv_topics")
+     paper_list = list(tmp_id_2_abs['train']['paper_id'])
+
+     # Generate the query embeddings
+     inputs = tokenizer([query], return_tensors='pt', padding=True, truncation=True)
+     with torch.no_grad():
+         outputs = model(**inputs.to('cuda'))
+         query_embeddings = outputs.last_hidden_state[:, 0, :].cpu()
+
+     # Generate the candidate embeddings:
+     # load them from the published dataset, otherwise generate and save them
+     if config['retriever']['load_arxiv_embeds']:
+         dataset = load_dataset("AliMaatouk/arXiv-Topics-Embeddings", cache_dir="datasets/topic_level_embeds")
+         table = dataset["train"].data  # Get the PyArrow table
+         all_candidate_embs = table.column("embedding").to_numpy()
+     else:
+         # If the file does not exist, generate the embeddings; otherwise load them
+         if not os.path.exists('datasets/topic_level_embeds/arxiv_papers_embeds.parquet'):
+             yield from generate_topic_level_embeddings(model, tokenizer, paper_list, tmp_id_2_abs)
+
+         all_candidate_embs = torch.tensor(np.array(pd.read_parquet('datasets/topic_level_embeds/arxiv_papers_embeds.parquet')['embedding'].tolist()))
+         all_candidate_embs = all_candidate_embs.cpu().numpy()
+
+     all_candidate_embs = np.stack(all_candidate_embs)
+
+     # Compute the cosine similarity between the query and all candidate embeddings
+     query_embeddings = np.array(query_embeddings)
+     similarity_scores = cosine_similarity(query_embeddings, all_candidate_embs)[0]
+
+     # Sort the papers by similarity score and select the top K papers
+     id_score_list = []
+     for i in range(len(paper_list)):
+         id_score_list.append([paper_list[i], similarity_scores[i]])
+
+     sorted_scores = sorted(id_score_list, key=lambda i: i[-1], reverse=True)
+     top_K_paper = [sample[0] for sample in sorted_scores[:config['retriever']['num_retrievals']]]
+
+     papers_results = {
+         paper: True
+         for paper in top_K_paper
+     }
+
+     with open(retrieval_nodes_path, 'w') as f:
+         json.dump(papers_results, f)
+
+     yield 1.0
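retriever() is a generator: it yields progress values in [0, 1] and writes the selected paper ids to a JSON file as a side effect, which is exactly how litbench_pipeline.py drives it:

from retriever.retriever import retriever

for percent in retriever("quantum error correction", "datasets/retrieval_nodes.json"):
    print(f"retrieval progress: {percent:.0%}")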
tasks/abs_2_title.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Generate a prompt for generating the title of a paper based on its abstract.
3
+
4
+ Args:
5
+ usr_input (str): A string containing the title and abstract of the paper in the format "Title: <title> Abstract: <abstract>".
6
+ template (dict): A dictionary containing the template for the prompt with a key "prompt_input".
7
+
8
+ Returns:
9
+ str: A formatted string with the instruction and abstract to be used as input for generating the title.
10
+ """
11
+
12
+ def abs_2_title(usr_input, template, has_predefined_template=False):
13
+ instruction = "Please generate the title of paper based on its abstract"
14
+
15
+ if has_predefined_template:
16
+ res = [
17
+ {"role": "system", "content": instruction},
18
+ {"role": "user", "content": usr_input},
19
+ ]
20
+ else:
21
+ res = template["prompt_input"].format(instruction=instruction, input=usr_input)
22
+
23
+ return res
tasks/abs_completion.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Generates a formatted prompt for completing the abstract of a paper.
3
+
4
+ Args:
5
+ usr_input (str): The user input containing the title and part of the abstract.
6
+ Expected format:
7
+ "Title: <title>\nAbstract: <abstract>"
8
+ template (dict): A dictionary containing the template for the prompt.
9
+ Expected format:
10
+ {"prompt_input": "<template_string>"}
11
+ The template string should contain placeholders for
12
+ 'instruction' and 'input'.
13
+
14
+ Returns:
15
+ str: A formatted string with the instruction and the input embedded in the template.
16
+ """
17
+
18
+ def abs_completion(usr_input, template, has_predefined_template=False):
19
+ instruction = "Please complete the abstract of a paper."
20
+
21
+ if has_predefined_template:
22
+ res = [
23
+ {"role": "system", "content": instruction},
24
+ {"role": "user", "content": usr_input},
25
+ ]
26
+ else:
27
+ res = template["prompt_input"].format(instruction=instruction, input=usr_input)
28
+
29
+ return res
tasks/citation_sentence.py ADDED
@@ -0,0 +1,25 @@
+ """
+ Generates a prompt for producing the citation sentence with which Paper A cites Paper B in its related work section.
+
+ Args:
+     usr_input (str): A string containing the titles and abstracts of Paper A and Paper B.
+         The format should be:
+             "Title A: <title of paper A>\nAbstract A: <abstract of paper A>\nTitle B: <title of paper B>\nAbstract B: <abstract of paper B>"
+     template (dict): A dictionary containing a template for the prompt input. The key "prompt_input" should map to a string with placeholders for the instruction and input.
+
+ Returns:
+     str or list: A formatted prompt combining the instruction with the provided titles and abstracts.
+ """
+
+ def citation_sentence(usr_input, template, has_predefined_template=False):
+     instruction = "Please generate the citation sentence of how Paper A cites Paper B in its related work section. \n"
+
+     if has_predefined_template:
+         res = [
+             {"role": "system", "content": instruction},
+             {"role": "user", "content": usr_input},
+         ]
+     else:
+         res = template["prompt_input"].format(instruction=instruction, input=usr_input)
+
+     return res
tasks/gen_related_work.py ADDED
@@ -0,0 +1,430 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Generates the related work section for a given paper.
3
+
4
+ The input
5
+ - The input prompt is a string that contains the information of the paper for which the related work section needs to be generated.
6
+ - The input prompt should be in the following format:
7
+ Title of Paper: <title of the paper>
8
+
9
+ Abstract of Paper: <abstract of the paper>
10
+ The output
11
+ - The output is a string that contains the related work section for the given paper.
12
+ """
13
+
14
+ import torch
15
+ import json
16
+ import networkx as nx
17
+ import numpy as np
18
+ from tqdm import tqdm
19
+ from peft import PeftModel
20
+ from transformers import (AutoModel, AutoTokenizer, AutoModelForCausalLM, pipeline)
21
+ from tqdm import tqdm
22
+ import re
23
+ import pandas as pd
24
+ import os
25
+ from sklearn.metrics.pairwise import cosine_similarity
26
+ from utils.utils import read_yaml_file
27
+ import datetime
28
+
29
+
30
+ class LitFM():
31
+ def __init__(self, graph_path, adapter_path):
32
+ self.graph_name = graph_path.split('.')[0].split('/')[-1] if '/' in graph_path else graph_path.split('.')[0]
33
+ self.batch_size = 32
34
+ self.neigh_num = 4
35
+
36
+ config = read_yaml_file('configs/config.yaml')
37
+ retrieval_graph_path = graph_path
38
+
39
+ self.pretrained_model = config['retriever']['embedder']
40
+
41
+ # define generation model
42
+ model_path = config['inference']["base_model"]
43
+ self.generation_tokenizer = AutoTokenizer.from_pretrained(model_path)
44
+ self.generation_tokenizer.model_max_length = 2048
45
+ if self.generation_tokenizer.pad_token is None:
46
+ self.generation_tokenizer.pad_token = self.generation_tokenizer.eos_token
47
+ self.generation_model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16, device_map="auto")
48
+ self.generation_model = PeftModel.from_pretrained(self.generation_model, adapter_path, adapter_name="instruction", torch_dtype=torch.float16)
49
+ self.model_pipeline = None
50
+ if self.generation_tokenizer.chat_template is not None:
51
+ self.model_pipeline = pipeline(
52
+ "text-generation",
53
+ model=model_path,
54
+ model_kwargs={"torch_dtype": torch.bfloat16},
55
+ device_map="auto",
56
+ )
57
+
58
+ # define instruction models
59
+ self.instruction_pipe = pipeline(
60
+ "text-generation",
61
+ model=config["inference"]["gen_related_work_instruct_model"],
62
+ model_kwargs={"torch_dtype": torch.bfloat16},
63
+ device_map="auto",
64
+ )
65
+
66
+ # load graph data for retrieval
67
+ def translate_graph(graph):
68
+ all_nodes = list(graph.nodes())
69
+ raw_id_2_id_dict = {}
70
+ id_2_raw_id_dict = {}
71
+
72
+ num = 0
73
+ for node in all_nodes:
74
+ raw_id_2_id_dict[node] = num
75
+ id_2_raw_id_dict[num] = node
76
+ num += 1
77
+
78
+ return raw_id_2_id_dict, id_2_raw_id_dict
79
+
80
+ whole_graph_data_raw = nx.read_gexf(retrieval_graph_path, node_type=None, relabel=False, version='1.2draft')
81
+ self.whole_graph_raw_id_2_id_dict, self.whole_graph_id_2_raw_id_dict = translate_graph(whole_graph_data_raw)
82
+
83
+ self.whole_graph_id_2_title_abs = dict()
84
+ for paper_id in whole_graph_data_raw.nodes():
85
+ title = whole_graph_data_raw.nodes()[paper_id]['title']
86
+ abstract = whole_graph_data_raw.nodes()[paper_id]['abstract']
87
+ self.whole_graph_id_2_title_abs[self.whole_graph_raw_id_2_id_dict[paper_id]] = [title, abstract]
88
+
89
+ # define prompt template
90
+ template_file_path = 'configs/alpaca.json'
91
+ with open(template_file_path) as fp:
92
+ self.template = json.load(fp)
93
+ self.human_instruction = ['### Input:', '### Response:']
94
+
95
+
96
+ def _generate_retrieval_prompt(self, data_point: dict):
97
+ instruction = "Please select the paper that is more likely to be cited by the paper from the list of candidate papers. Your answer MUST be **only the exact title** of the selected paper without generating ANY other text or section. Your answer MUST belong to the list of candidate papers.\n"
98
+ prompt_input = ""
99
+ prompt_input = prompt_input + data_point['usr_prompt'] + "\n"
100
+ prompt_input = prompt_input + "candidate papers: " + "\n"
101
+ for i in range(len(data_point['nei_titles'])):
102
+ prompt_input = prompt_input + str(i) + '. ' + data_point['nei_titles'][i] + "\n"
103
+
104
+ if self.model_pipeline is not None:
105
+ res = [
106
+ {"role": "system", "content": instruction},
107
+ {"role": "user", "content": prompt_input},
108
+ ]
109
+ else:
110
+ res = self.template["prompt_input"].format(instruction=instruction, input=prompt_input)
111
+
112
+ return res
113
+
114
+ def _generate_sentence_prompt(self, data_point):
115
+ instruction = "Please generate the citation sentence of how the Paper cites paper B in its related work section."
116
+
117
+ prompt_input = ""
118
+ prompt_input = prompt_input + data_point['usr_prompt'] + "\n"
119
+ prompt_input = prompt_input + "Title of Paper B: " + (data_point['t_title'] if data_point['t_title'] != None else 'Unknown') + "\n"
120
+ prompt_input = prompt_input + "Abstract of Paper B: " + (data_point['t_abs'] if data_point['t_abs'] != None else 'Unknown') + "\n"
121
+
122
+ if self.model_pipeline is not None:
123
+ res = [
124
+ {"role": "system", "content": instruction},
125
+ {"role": "user", "content": prompt_input},
126
+ ]
127
+ else:
128
+ res = self.template["prompt_input"].format(instruction=instruction, input=prompt_input)
129
+
130
+ return res
131
+
132
+ def _generate_topic_prompt(self, data_point):
133
+ prompt_input = ""
134
+ prompt_input = prompt_input + "Here are the information of the paper: \n"
135
+ prompt_input = prompt_input + data_point['usr_prompt'] + '\n'
136
+ prompt_input = prompt_input + "Directlty give me the topics you select.\n"
137
+
138
+ res = [
139
+ {"role": "system", "content": "I need to write the related work section for this paper. Could you suggest three most relevant topics to discuss in the related work section? Your answer should be strictly one topic after the other line by line with nothing else being generated and no further explanation/information.\n"},
140
+ {"role": "user", "content": prompt_input},
141
+ ]
142
+
143
+ return res
144
+
145
+ def _generate_paragraph_prompt(self, data_point):
146
+ prompt_input = ""
147
+ prompt_input = prompt_input + data_point['usr_prompt'] + "\n"
148
+ prompt_input = prompt_input + "Topic of this paragraph: " + data_point['topic'] + "\n"
149
+ prompt_input = prompt_input + "Papers that should be cited in paragraph: \n"
150
+
151
+ i = data_point['paper_citation_indicator']
152
+ for paper_idx in range(len(data_point['nei_title'])):
153
+ prompt_input = prompt_input + "[" + str(i) + "]. " + data_point['nei_title'][paper_idx][0] + '.' + " Citation sentence of this paper in the paragraph: " + data_point['nei_sentence'][paper_idx] + '\n'
154
+ i += 1
155
+
156
+ prompt_input = prompt_input + "All the above cited papers should be included and each cited paper should be indicated with its index number. Note that you should not include the title of any paper\n"
157
+
158
+ res = [
159
+ {"role": "system", "content": "Please write a paragraph that review the research relationships between this paper and other cited papers.\n"},
160
+ {"role": "user", "content": prompt_input},
161
+ ]
162
+
163
+ return res
164
+
165
+ def _generate_summary_prompt(self, data_point):
166
+ prompt_input = ""
167
+ prompt_input = prompt_input + data_point['usr_prompt'] + "\n"
168
+ prompt_input = prompt_input + "Paragraphs that should be combined: " + "\n"
169
+
170
+ i = 1
171
+ for para in data_point['paragraphs']:
172
+ prompt_input = prompt_input + " Paragraph " + str(i) + ": " + para + '\n'
173
+ i += 1
174
+
175
+ res = [
176
+ {"role": "system", "content": "Please combine the following paragraphs in a cohenrent way that also keeps the citations and make the flow between paragraphs more smoothly\nAdd a sentence at the beginning of each paragraph to clarify its connection to the previous ones. Do not include any other surrounding text and not add a references list at all\n"},
177
+ {"role": "user", "content": prompt_input},
178
+ ]
179
+
180
+ return res
181
+
182
+
183
+ @staticmethod
184
+ def generate_text(prompt, tokenizer, model, temperature, top_p, repetition_penalty, max_new_tokens):
185
+ inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
186
+
187
+ with torch.no_grad():
188
+ output = model.generate(
189
+ **inputs,
190
+ do_sample=True,
191
+ temperature=temperature,
192
+ top_p=top_p,
193
+ repetition_penalty=repetition_penalty,
194
+ max_new_tokens=max_new_tokens,
195
+ pad_token_id=tokenizer.pad_token_id,
196
+ eos_token_id=tokenizer.eos_token_id,
197
+ use_cache=True,
198
+ )
199
+ output_text = tokenizer.decode(output[0], skip_special_tokens=True)
200
+ return output_text
201
+
202
+ def get_llm_response(self, prompt, model_type):
203
+ self.generation_model.set_adapter('instruction')
204
+ if model_type == 'zeroshot':
205
+ raw_output = self.instruction_pipe(
206
+ prompt,
207
+ max_new_tokens=8096,
208
+ temperature=0.9,
209
+ top_p=0.95,
210
+ repetition_penalty=1.15,
211
+ )[0]['generated_text'][-1]
212
+
213
+ elif model_type == 'zeroshot_short':
214
+ raw_output = self.instruction_pipe(
215
+ prompt,
216
+ max_new_tokens=256,
217
+ temperature=0.9,
218
+ top_p=0.95,
219
+ repetition_penalty=1.15,
220
+ )[0]['generated_text'][-1]
221
+
222
+ elif model_type == 'instruction':
223
+ self.generation_model.set_adapter('instruction')
224
+ if self.model_pipeline is not None:
225
+ raw_output = self.model_pipeline(
226
+ prompt,
227
+ temperature=0.9,
228
+ top_p=0.95,
229
+ repetition_penalty=1.15,
230
+ )[0]['generated_text'][-1]
231
+ else:
232
+ raw_output = self.generate_text(
233
+ prompt,
234
+ self.generation_tokenizer,
235
+ self.generation_model,
236
+ temperature=0.9,
237
+ top_p=0.95,
238
+ repetition_penalty=1.15,
239
+ max_new_tokens=256,
240
+ )
241
+
242
+ return raw_output
243
+
244
+ def single_paper_sentence_test(self, usr_prompt, t_title, t_abs):
245
+ datapoint = {'usr_prompt':usr_prompt, 't_title':t_title, 't_abs':t_abs}
246
+ prompt = self._generate_sentence_prompt(datapoint)
247
+ ans = self.get_llm_response(prompt, 'instruction')
248
+ res = ans['content'] if isinstance(ans, dict) else ans.strip().split(self.human_instruction[1])[-1]
249
+ return res
250
+
251
+ def single_paper_retrieval_test(self, usr_prompt, candidates):
252
+ datapoint = {'usr_prompt':usr_prompt, 'nei_titles':list(candidates), 't_title': ''}
253
+ prompt = self._generate_retrieval_prompt(datapoint)
254
+ ans = self.get_llm_response(prompt, 'instruction')
255
+ res = ans['content'] if isinstance(ans, dict) else ans.strip().split(self.human_instruction[1])[-1]
256
+ return res
257
+
258
+ def single_paper_topic_test(self, usr_prompt):
259
+ datapoint = {'usr_prompt': usr_prompt}
260
+ prompt = self._generate_topic_prompt(datapoint)
261
+ ans = self.get_llm_response(prompt, 'zeroshot_short')
262
+ res = ans['content']
263
+ res = res.replace('\n\n', '\n')
264
+ return res
265
+
266
+ def retrieval_for_one_query(self, id_2_title_abs, prompt):
267
+ if os.path.exists(f'datasets/{self.graph_name}_embeddings.parquet'):
268
+ all_query_embs = torch.tensor(np.array(pd.read_parquet(f'datasets/{self.graph_name}_embeddings.parquet')))
269
+ else:
270
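+ # Assumption: candidate papers are embedded with the hard-coded BAAI/bge-large-en-v1.5,
+ # while the query below is embedded with config['retriever']['embedder']; the two
+ # models must match (1024-dim CLS embeddings) for the cosine scores to be meaningful.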
+ tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-large-en-v1.5")
271
+ model = AutoModel.from_pretrained("BAAI/bge-large-en-v1.5").to(device='cuda', dtype=torch.float16)
272
+ model.eval()
273
+
274
+ paper_list = list(id_2_title_abs.keys())
275
+
276
+ all_query_embs = torch.zeros(len(paper_list), 1024)
277
+ i = 0
278
+ batch_size = 200
279
+ candidate_emb_list = []
280
+ pbar = tqdm(total=len(paper_list))
281
+ while i < len(paper_list):
282
+ paper_batch = paper_list[i:i+batch_size]
283
+ paper_text_batch = []
284
+ for paper_id in paper_batch:
285
+ paper_text = id_2_title_abs[paper_id][0] + id_2_title_abs[paper_id][1]  # do not shadow the query `prompt`
286
+ paper_text_batch.append(paper_text)
287
+ inputs = tokenizer(paper_text_batch, return_tensors='pt', padding=True, truncation=True)
288
+
289
+ with torch.no_grad():
290
+ outputs = model(**inputs.to('cuda'))
291
+ candidate_embeddings = outputs.last_hidden_state[:, 0, :].cpu()
292
+ candidate_embeddings = candidate_embeddings.reshape(-1, 1024)
293
+ candidate_emb_list.append(candidate_embeddings)
294
+
295
+ i += len(candidate_embeddings)
296
+ pbar.update(len(candidate_embeddings))
297
+
298
+ all_query_embs = torch.cat(candidate_emb_list, 0)
299
+ pd.DataFrame(all_query_embs.numpy()).to_parquet(f'datasets/{self.graph_name}_embeddings.parquet')
300
+
301
+ # get the embeddings of the prompt
302
+ pretrained_model_name = self.pretrained_model
303
+ tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)
304
+ LLM_model = AutoModel.from_pretrained(pretrained_model_name).cuda()
305
+ LLM_model.eval()
306
+
307
+ encoded_input = tokenizer([prompt], padding = True, truncation=True, max_length=512 , return_tensors='pt')
308
+ with torch.no_grad():
309
+ output = LLM_model(**encoded_input.to('cuda'), output_hidden_states=True).hidden_states[-1]
310
+ sentence_embedding = output[:, 0, :]
311
+
312
+ tmp_scores = cosine_similarity(sentence_embedding.to("cpu"), all_query_embs.to("cpu"))[0]
313
+ _, idxs = torch.sort(torch.tensor(tmp_scores), descending=True)
314
+ top_10 = [int(k) for k in idxs[:10]]
315
+
316
+ return [id_2_title_abs[i][0] for i in top_10], [self.whole_graph_id_2_raw_id_dict[i] for i in top_10]
317
+
318
+
319
+ def single_paper_related_work_generation(self, usr_prompt):
320
+ citation_papers = []
321
+ nei_sentence = []
322
+
323
+ # Get topics
324
+ retrieval_query = self.single_paper_topic_test(usr_prompt)
325
+
326
+ # Split topics
327
+ topic_num = 3
328
+ try:
329
+ split_topics = retrieval_query.strip().split('\n')
330
+ if split_topics[0] == '':
331
+ split_topics = split_topics[1:]
332
+ split_topics = split_topics[:topic_num]
333
+ except Exception:
334
+ # Fallback: take the text after the last colon and split on semicolons
+ split_topics = retrieval_query.strip().split(':')[-1]
335
+ split_topics = split_topics.strip().split(';')
336
+ split_topics = split_topics[:topic_num]
337
+ if len(split_topics) > topic_num:
338
+ return ["too many topics", split_topics]
339
+
340
+
341
+ # Get top-5 papers for each topic
342
+ for retrieval_query in split_topics:
343
+ # retrieve papers
344
+ candidate_citation_papers, candidate_raw_ids = self.retrieval_for_one_query(self.whole_graph_id_2_title_abs, retrieval_query)
+ # Map titles to raw ids once: the candidate list is rebuilt with set
+ # operations below, which scrambles its order relative to candidate_raw_ids
+ title_2_raw_id = dict(zip(candidate_citation_papers, candidate_raw_ids))
345
+ topic_specific_citation_papers = []
346
+ # select top-5 papers
347
+ for _ in range(5):
348
+ # picking most likely to be cited paper
349
+ selected_paper = self.single_paper_retrieval_test(usr_prompt, candidate_citation_papers).replace(' \n','').replace('\n','')
350
+
351
+ words = selected_paper.strip().split(' ')
352
+ index = -1
353
+ for w in words:
354
+ try:
355
+ index = int(w)
356
+ except ValueError:
357
+ pass
358
+
359
+ if index != -1 and index < len(candidate_citation_papers):
360
+ paper_title = candidate_citation_papers[index]
361
+ candidate_citation_papers = list(set(candidate_citation_papers) - set([paper_title]))
362
+ topic_specific_citation_papers.append([paper_title, title_2_raw_id[paper_title]])
363
+ else:
364
+ for i, paper_title in enumerate(list(candidate_citation_papers)):
365
+ if paper_title.lower().replace(' ', '') in selected_paper.lower().replace(' ', '') or selected_paper.lower().replace(' ', '') in paper_title.lower().replace(' ', ''):
366
+ candidate_citation_papers = list(set(candidate_citation_papers) - set([paper_title]))
367
+ topic_specific_citation_papers.append([paper_title, title_2_raw_id[paper_title]])
368
+ break
369
+ citation_papers.append(topic_specific_citation_papers)
370
+
371
+
372
+ # Remove empty lists
373
+ citation_papers = [x for x in citation_papers if x != []]
374
+
375
+
376
+ # Generate citation sentences
377
+ for topic_idx in range(len(citation_papers)):
378
+ topic_specific_nei_sentence = []
379
+ for paper_idx in range(len(citation_papers[topic_idx])):
380
+ sentence = self.single_paper_sentence_test(usr_prompt, citation_papers[topic_idx][paper_idx][0], "")
381
+ # Strip any \cite{...} commands from the generated sentence
382
+ sentence = re.sub(r'\\cite\{[^{}]+\}', "", sentence)
383
+ topic_specific_nei_sentence.append(sentence)
384
+ nei_sentence.append(topic_specific_nei_sentence)
385
+
386
+
387
+ # Generate paragraphs
388
+ paragraphs = []
389
+ references = [] # Store references for citation
390
+ paper_citation_indicator = 1 # Indicator for citation paper
391
+ for topic_idx in range(len(citation_papers)):
392
+ datapoint = {'usr_prompt': usr_prompt,
393
+ 'nei_title': citation_papers[topic_idx],
394
+ 'nei_sentence': nei_sentence[topic_idx],
395
+ 'topic': split_topics[topic_idx],
396
+ 'paper_citation_indicator': paper_citation_indicator}
397
+
398
+ prompt = self._generate_paragraph_prompt(datapoint)
399
+ ans = self.get_llm_response(prompt, 'zeroshot')
400
+ res = ans['content']
401
+ paragraphs.append(res)
402
+
403
+ # Store references
404
+ for ref_idx, paper in enumerate(citation_papers[topic_idx]):
405
+ # Extract year and month from raw_id
406
+ raw_id = re.sub(r'[a-zA-Z/]+', '', paper[1])
407
+ year = raw_id[:2]
408
+ year = '19' + year if int(year) > 70 else '20' + year
409
+ month = datetime.date(1900, int(raw_id[2:4]), 1).strftime('%B')
410
+
411
+ references.append(f"[{paper_citation_indicator + ref_idx}] {paper[0]}, arXiv {raw_id}, {month} {year}")
412
+ # Update paper_citation_indicator
413
+ paper_citation_indicator = paper_citation_indicator + len(nei_sentence[topic_idx])
414
+
415
+
416
+ # Generate summary
417
+ datapoint = {'usr_prompt': usr_prompt, 'paragraphs': paragraphs}
418
+ prompt = self._generate_summary_prompt(datapoint)
419
+ ans = self.get_llm_response(prompt, 'zeroshot')
420
+ summary = ans['content']
421
+
422
+ # Append references to summary
423
+ summary_with_references = summary + "\n\n### References\n" + "\n".join(references)
424
+
425
+ return summary_with_references
426
+
427
+
428
+ def gen_related_work(message, graph_path, adapter_path):
429
+ litfm_instance = LitFM(graph_path, adapter_path)
430
+ return litfm_instance.single_paper_related_work_generation(message)
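+
+
+ if __name__ == "__main__":
+     # Minimal usage sketch (illustrative): the graph comes from datasets/ and
+     # the LoRA adapter path is a placeholder for an adapter trained by train.py.
+     related_work = gen_related_work(
+         "Title of Paper: <your title>\n\nAbstract of Paper: <your abstract>",
+         "datasets/large_language_models.gexf",
+         "<path-to-instruction-adapter>",
+     )
+     print(related_work)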
tasks/influential_papers.py ADDED
@@ -0,0 +1,41 @@
1
+ """
2
+ Influential Papers Task
3
+
4
+ This module provides functionality to identify the most influential papers in a citation graph.
5
+
6
+ Functions:
7
+ influential_papers(message, graph):
8
+ Given a message containing an integer K and a citation graph, returns the K most cited papers (by in-degree).
9
+ The result is a formatted string with the title, arXiv id, date, and abstract of each paper.
10
+
11
+ Usage:
12
+ The script reads configuration from a YAML file, loads a citation graph from a GEXF file, and prints the K most influential papers.
13
+ """
14
+
15
+ import datetime
16
+ import re
17
+
18
+ def influential_papers(message, graph):
19
+ # Get integer number from message
20
+ K = int(re.search(r'\d+', message).group())
21
+
22
+ in_degree = dict(graph.in_degree())
23
+ sorted_in_degree = sorted(in_degree.items(), key=lambda x: x[1], reverse=True)
24
+
25
+ most_cited_papers = []
26
+ for i in range(min(K, len(sorted_in_degree))):
27
+ node = sorted_in_degree[i]
28
+ paper = graph.nodes[node[0]]
29
+ most_cited_papers.append(paper)
30
+
31
+ resp = "Here are the most influential papers:\n"
32
+ for i, paper in enumerate(most_cited_papers):
33
+ full_paper_id = paper['label']
34
+ paper_id = re.sub(r'[a-zA-Z/]+', '', full_paper_id)
35
+ year = paper_id[:2]
36
+ year = '19' + year if int(year) > 70 else '20' + year
37
+ month = datetime.date(1900, int(paper_id[2:4]), 1).strftime('%B')
38
+
39
+ resp += f"{i+1}. Title: {paper['title']}, arXiv {full_paper_id}, {month} {year} \nAbstract: {paper['abstract']}\n"
40
+
41
+ return resp
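+
+
+ if __name__ == "__main__":
+     # Minimal usage sketch (illustrative): load one of the citation graphs
+     # shipped under datasets/ and ask for the five most cited papers.
+     import networkx as nx
+     graph = nx.read_gexf("datasets/robotics.gexf")
+     print(influential_papers("Show me the 5 most influential papers", graph))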
tasks/intro_2_abs.py ADDED
@@ -0,0 +1,28 @@
1
+ """
2
+ Generate the abstract of a paper based on its introduction section.
3
+
4
+ Args:
5
+ usr_prompt (str): The user-provided prompt containing the introduction section of the paper.
6
+ template (dict): A dictionary containing the template for generating the abstract.
7
+ context_window (int): The maximum length of the context window for the prompt input.
8
+
9
+ Returns:
10
+ str: The generated abstract based on the introduction section.
11
+ """
12
+
13
+
14
+ def intro_2_abs(usr_prompt, template, context_window, has_predefined_template=False):
15
+ instruction = "Please generate the abstract of paper based on its introduction section."
16
+
17
+ # Reduce it to make it fit
18
+ prompt_input = usr_prompt[:int(context_window*2)]
19
+
20
+ if has_predefined_template:
21
+ res = [
22
+ {"role": "system", "content": instruction},
23
+ {"role": "user", "content": prompt_input},
24
+ ]
25
+ else:
26
+ res = template["prompt_input"].format(instruction=instruction, input=prompt_input)
27
+
28
+ return res
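+
+
+ if __name__ == "__main__":
+     # Minimal usage sketch (illustrative placeholders); 2048 matches the
+     # tokenizer limit used elsewhere in this repo.
+     import json
+     with open("configs/alpaca.json") as fp:
+         template = json.load(fp)
+     usr_prompt = "Introduction: <introduction text of the paper>"
+     print(intro_2_abs(usr_prompt, template, context_window=2048))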
tasks/link_pred.py ADDED
@@ -0,0 +1,23 @@
1
+ """
2
+ Determine if paper A will cite paper B.
3
+
4
+ Args:
5
+ usr_input (str): The user-provided input containing the titles and abstracts of papers A and B.
6
+ template (dict): A dictionary containing the template for generating the link prediction task.
7
+
8
+ Returns:
9
+ str: The generated link prediction task based on the user input.
10
+ """
11
+
12
+ def link_pred(usr_input, template, has_predefined_template=False):
13
+ instruction = "Determine if paper A will cite paper B."
14
+
15
+ if has_predefined_template:
16
+ res = [
17
+ {"role": "system", "content": instruction},
18
+ {"role": "user", "content": usr_input},
19
+ ]
20
+ else:
21
+ res = template["prompt_input"].format(instruction=instruction, input=usr_input)
22
+
23
+ return res
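+
+
+ if __name__ == "__main__":
+     # Minimal usage sketch (illustrative placeholders for titles/abstracts).
+     import json
+     with open("configs/alpaca.json") as fp:
+         template = json.load(fp)
+     usr_input = ("Title of Paper A: <title A>\nAbstract of Paper A: <abstract A>\n"
+                  "Title of Paper B: <title B>\nAbstract of Paper B: <abstract B>\n")
+     print(link_pred(usr_input, template))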
tasks/paper_retrieval.py ADDED
@@ -0,0 +1,21 @@
1
+ """
2
+ Retrieves the most likely paper to be cited by Paper A from a list of candidate papers based on user input.
3
+ Args:
4
+ usr_input (str): A string containing the title and abstract of Paper A followed by the titles and abstracts of candidate papers.
5
+ template (dict): A dictionary containing a template for formatting the prompt input.
6
+ Returns:
7
+ str: A string containing the prompt input for the user.
8
+ """
9
+
10
+ def paper_retrieval(usr_input, template, has_predefined_template=False):
11
+ instruction = "Please select the paper that is more likely to be cited by paper A from candidate papers."
12
+
13
+ if has_predefined_template:
14
+ res = [
15
+ {"role": "system", "content": instruction},
16
+ {"role": "user", "content": usr_input},
17
+ ]
18
+ else:
19
+ res = template["prompt_input"].format(instruction=instruction, input=usr_input)
20
+
21
+ return res
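+
+
+ if __name__ == "__main__":
+     # Minimal usage sketch (illustrative), mirroring the candidate-list format
+     # used by the training prompts in train.py.
+     import json
+     with open("configs/alpaca.json") as fp:
+         template = json.load(fp)
+     usr_input = ("Title of the Paper A: <title A>\n"
+                  "Abstract of the Paper A: <abstract A>\n"
+                  "candidate papers: \n"
+                  "0. <candidate title 0>\n"
+                  "1. <candidate title 1>\n")
+     print(paper_retrieval(usr_input, template))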
train.py ADDED
@@ -0,0 +1,385 @@
1
+ import json
2
+ import torch
3
+ import random
4
+ import transformers
5
+ import networkx as nx
6
+ from tqdm import tqdm
7
+ from peft import (LoraConfig, get_peft_model,
8
+ prepare_model_for_kbit_training)
9
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
10
+
11
+
12
+
13
+ class QloraTrainer_CS:
14
+ def __init__(self, config: dict, use_predefined_graph=False):
15
+ self.config = config
16
+ self.use_predefined_graph = use_predefined_graph
17
+ self.tokenizer = None
18
+ self.base_model = None
19
+ self.adapter_model = None
20
+ self.merged_model = None
21
+ self.transformer_trainer = None
22
+ self.test_data = None
23
+
24
+ template_file_path = 'configs/alpaca.json'
25
+ with open(template_file_path) as fp:
26
+ self.template = json.load(fp)
27
+
28
+
29
+ def load_base_model(self):
30
+ model_id = self.config['inference']["base_model"]
31
+ print(model_id)
32
+
33
+ # Standard QLoRA 4-bit NF4 quantization config. Note: the original bnb_8bit_*
+ # keyword arguments do not exist on BitsAndBytesConfig and were silently ignored.
+ bnb_config = BitsAndBytesConfig(
34
+ load_in_4bit=True,
35
+ bnb_4bit_use_double_quant=True,
36
+ bnb_4bit_quant_type="nf4",
37
+ bnb_4bit_compute_dtype=torch.bfloat16
38
+ )
39
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
40
+ tokenizer.model_max_length = self.config['training']['tokenizer']["max_length"]
41
+ if not tokenizer.pad_token:
42
+ tokenizer.pad_token = tokenizer.eos_token
43
+ model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, torch_dtype=torch.bfloat16)
44
+ if model.device.type != 'cuda':
45
+ model.to('cuda')
46
+
47
+ model.gradient_checkpointing_enable()
48
+ model = prepare_model_for_kbit_training(model)
49
+
50
+ self.tokenizer = tokenizer
51
+ self.base_model = model
52
+
53
+
54
+ def train(self):
55
+ # Set up lora config or load pre-trained adapter
56
+ lora_config = LoraConfig(
57
+ r=self.config['training']['qlora']['rank'],
58
+ lora_alpha=self.config['training']['qlora']['lora_alpha'],
59
+ target_modules=self.config['training']['qlora']['target_modules'],
60
+ lora_dropout=self.config['training']['qlora']['lora_dropout'],
61
+ bias="none",
62
+ task_type="CAUSAL_LM",
63
+ )
64
+ model = get_peft_model(self.base_model, lora_config)
65
+ self._print_trainable_parameters(model)
66
+
67
+ print("Start data preprocessing")
68
+ train_data = self._process_data_instruction()
69
+
70
+ print('Length of dataset: ', len(train_data))
71
+
72
+ print("Start training")
73
+ self.transformer_trainer = transformers.Trainer(
74
+ model=model,
75
+ train_dataset=train_data,
76
+ args=transformers.TrainingArguments(
77
+ per_device_train_batch_size=self.config["training"]['trainer_args']["per_device_train_batch_size"],
78
+ gradient_accumulation_steps=self.config['model_saving']['index'],
79
+ warmup_steps=self.config["training"]['trainer_args']["warmup_steps"],
80
+ num_train_epochs=self.config["training"]['trainer_args']["num_train_epochs"],
81
+ learning_rate=self.config["training"]['trainer_args']["learning_rate"],
82
+ lr_scheduler_type=self.config["training"]['trainer_args']["lr_scheduler_type"],
83
+ fp16=self.config["training"]['trainer_args']["fp16"],
84
+ logging_steps=self.config["training"]['trainer_args']["logging_steps"],
85
+ output_dir=self.config["training"]['trainer_args']["trainer_output_dir"],
86
+ report_to="wandb",
87
+ save_steps=self.config["training"]['trainer_args']["save_steps"],
88
+ ),
89
+ data_collator=transformers.DataCollatorForLanguageModeling(self.tokenizer, mlm=False),
90
+ )
91
+
92
+ model.config.use_cache = False
93
+
94
+ self.transformer_trainer.train()
95
+
96
+ model_save_path = f"{self.config['model_saving']['model_output_dir']}/{self.config['model_saving']['model_name']}_{self.config['model_saving']['index']}_adapter_test_graph"
97
+ self.transformer_trainer.save_model(model_save_path)
98
+
99
+ self.adapter_model = model
100
+ print(f"Training complete, adapter model saved in {model_save_path}")
101
+
102
+
103
+ def _print_trainable_parameters(self, model):
104
+ """
105
+ Prints the number of trainable parameters in the model.
106
+ """
107
+ trainable_params = 0
108
+ all_param = 0
109
+ for _, param in model.named_parameters():
110
+ all_param += param.numel()
111
+ if param.requires_grad:
112
+ trainable_params += param.numel()
113
+ print(
114
+ f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
115
+ )
116
+
117
+
118
+ def _process_data_instruction(self):
119
+ context_window = self.tokenizer.model_max_length
120
+ if self.use_predefined_graph:
121
+ graph_data = nx.read_gexf('datasets/' + self.config["training"]["predefined_graph_path"], node_type=None, relabel=False, version='1.2draft')
122
+ else:
123
+ graph_path = self.config['data_downloading']['download_directory'] + 'description/' + self.config['data_downloading']['gexf_file']
124
+ graph_data = nx.read_gexf(graph_path, node_type=None, relabel=False, version='1.2draft')
125
+ raw_graph = graph_data
126
+
127
+ test_set_size = len(graph_data.nodes()) // 10
128
+
129
+ all_test_nodes = set(list(graph_data.nodes())[:test_set_size])
130
+ all_train_nodes = set(list(graph_data.nodes())[test_set_size:])
131
+
132
+ raw_id_2_title_abs = dict()
133
+ for paper_id in list(graph_data.nodes())[test_set_size:]:
134
+ title = graph_data.nodes()[paper_id]['title']
135
+ abstract = graph_data.nodes()[paper_id]['abstract']
136
+ raw_id_2_title_abs[paper_id] = [title, abstract]
137
+
138
+ raw_id_2_intro = dict()
139
+ for paper_id in list(graph_data.nodes())[test_set_size:]:
140
+ if graph_data.nodes[paper_id]['introduction'] != '':
141
+ intro = graph_data.nodes[paper_id]['introduction']
142
+ raw_id_2_intro[paper_id] = intro
143
+
144
+ raw_id_pair_2_sentence = dict()
145
+ for edge in list(graph_data.edges()):
146
+ sentence = graph_data.edges()[edge]['sentence']
147
+ raw_id_pair_2_sentence[edge] = sentence
148
+
149
+
150
+ test_data = []
151
+ edge_list = []
152
+ for edge in list(raw_graph.edges()):
153
+ src, tar = edge
154
+ if src not in all_test_nodes and tar not in all_test_nodes:
155
+ edge_list.append(edge)
156
+ else:
157
+ test_data.append(edge)
158
+ train_num = len(edge_list)
159
+
160
+ data_LP = []
161
+ data_abstract_2_title = []
162
+ data_paper_retrieval = []
163
+ data_citation_sentence = []
164
+ data_abs_completion = []
165
+ data_title_2_abs = []
166
+ data_intro_2_abs = []
167
+
168
+
169
+ for sample in tqdm(random.sample(edge_list, train_num)):
170
+ source, target = sample[0], sample[1]
171
+ source_title, source_abs = raw_id_2_title_abs[source]
172
+ target_title, target_abs = raw_id_2_title_abs[target]
173
+ # LP prompt
174
+ rand_ind = random.choice(list(raw_id_2_title_abs.keys()))
175
+ neg_title, neg_abs = raw_id_2_title_abs[rand_ind]
176
+ data_LP.append({'s_title':source_title, 's_abs':source_abs, 't_title':target_title, 't_abs':target_abs, 'label':'yes'})
177
+ data_LP.append({'s_title':source_title, 's_abs':source_abs, 't_title':neg_title, 't_abs':neg_abs, 'label':'no'})
178
+
179
+ for sample in tqdm(random.sample(edge_list, train_num)):
180
+ source, target = sample[0], sample[1]
181
+ source_title, source_abs = raw_id_2_title_abs[source]
182
+ target_title, target_abs = raw_id_2_title_abs[target]
183
+ # abs_2_title prompt
184
+ data_abstract_2_title.append({'title':source_title, 'abs':source_abs})
185
+ data_abstract_2_title.append({'title':target_title, 'abs':target_abs})
186
+
187
+ for sample in tqdm(random.sample(edge_list, train_num)):
188
+ source, target = sample[0], sample[1]
189
+ source_title, source_abs = raw_id_2_title_abs[source]
190
+ target_title, target_abs = raw_id_2_title_abs[target]
191
+ # paper_retrieval prompt
192
+ neighbors = list(nx.all_neighbors(raw_graph, source))
193
+ sample_node_list = list(all_train_nodes - set(neighbors) - set([source]) - set([target]))
194
+ sampled_neg_nodes = random.sample(sample_node_list, 5) + [target]
195
+ random.shuffle(sampled_neg_nodes)
196
+ data_paper_retrieval.append({'title':source_title, 'abs':source_abs, 'sample_title': [raw_id_2_title_abs[node][0] for node in sampled_neg_nodes], 'right_title':target_title})
197
+
198
+ for sample in tqdm(random.sample(edge_list, train_num)):
199
+ source, target = sample[0], sample[1]
200
+ source_title, source_abs = raw_id_2_title_abs[source]
201
+ target_title, target_abs = raw_id_2_title_abs[target]
202
+ # citation_sentence prompt
203
+ citation_sentence = raw_id_pair_2_sentence[(source, target)] if (source, target) in raw_id_pair_2_sentence.keys() else raw_id_pair_2_sentence[(target, source)]
204
+ data_citation_sentence.append({'s_title':source_title, 's_abs':source_abs, 't_title':target_title, 't_abs':target_abs, 'sentence': citation_sentence})
205
+
206
+ for sample in tqdm(random.sample(edge_list, train_num)):
207
+ source, target = sample[0], sample[1]
208
+ source_title, source_abs = raw_id_2_title_abs[source]
209
+ target_title, target_abs = raw_id_2_title_abs[target]
210
+ # abs_complete prompt
211
+ data_abs_completion.append({'title':source_title, 'abs':source_abs})
212
+ data_abs_completion.append({'title':target_title, 'abs':target_abs})
213
+
214
+ for sample in tqdm(random.sample(edge_list, train_num)):
215
+ source, target = sample[0], sample[1]
216
+ source_title, source_abs = raw_id_2_title_abs[source]
217
+ target_title, target_abs = raw_id_2_title_abs[target]
218
+ # title_2_abs prompt
219
+ data_title_2_abs.append({'title':source_title, 'right_abs':source_abs})
220
+ data_title_2_abs.append({'title':target_title, 'right_abs':target_abs})
221
+
222
+ for sample in tqdm(random.sample(edge_list, train_num)):
223
+ source, target = sample[0], sample[1]
224
+ if source in raw_id_2_intro:
225
+ source_intro = raw_id_2_intro[source]
226
+ _, source_abs = raw_id_2_title_abs[source]
227
+ data_intro_2_abs.append({'intro':source_intro, 'abs':source_abs})
228
+ if target in raw_id_2_intro:
229
+ target_intro = raw_id_2_intro[target]
230
+ _, target_abs = raw_id_2_title_abs[target]
231
+ data_intro_2_abs.append({'intro':target_intro, 'abs':target_abs})
232
+
233
+ data_prompt = []
234
+ data_prompt += [self._generate_paper_retrieval_prompt(data_point) for data_point in data_paper_retrieval]
235
+ data_prompt += [self._generate_LP_prompt(data_point) for data_point in data_LP]
236
+ data_prompt += [self._generate_abstract_2_title_prompt(data_point) for data_point in data_abstract_2_title]
237
+ data_prompt += [self._generate_citation_sentence_prompt(data_point) for data_point in data_citation_sentence]
238
+ data_prompt += [self._generate_abstract_completion_prompt(data_point) for data_point in data_abs_completion]
239
+ data_prompt += [self._generate_title_2_abstract_prompt(data_point) for data_point in data_title_2_abs]
240
+ data_prompt += [self._generate_intro_2_abstract_prompt(data_point, context_window) for data_point in data_intro_2_abs]
241
+
242
+ print("Total prompts:", len(data_prompt))
243
+ random.shuffle(data_prompt)
244
+ if self.tokenizer.chat_template is None:
245
+ data_tokenized = [self.tokenizer(sample, max_length=context_window, truncation=True) for sample in tqdm(data_prompt)]
246
+ else:
247
+ # Render the chat template to a string first, then tokenize so truncation applies
+ data_tokenized = [self.tokenizer(self.tokenizer.apply_chat_template(sample, tokenize=False), max_length=context_window, truncation=True) for sample in tqdm(data_prompt)]
248
+
249
+ return data_tokenized
250
+
251
+
252
+ def _generate_LP_prompt(self, data_point: dict):
253
+ instruction = "Determine if paper A will cite paper B."
254
+
255
+ prompt_input = ""
256
+ prompt_input = prompt_input + "Title of Paper A: " + (data_point['s_title'] if data_point['s_title'] != None else 'Unknown') + "\n"
257
+ prompt_input = prompt_input + "Abstract of Paper A: " + (data_point['s_abs'] if data_point['s_abs'] != None else 'Unknown') + "\n"
258
+ prompt_input = prompt_input + "Title of Paper B: " + (data_point['t_title'] if data_point['t_title'] != None else 'Unknown') + "\n"
259
+ prompt_input = prompt_input + "Abstract of Paper B: " + (data_point['t_abs'] if data_point['t_abs'] != None else 'Unknown') + "\n"
260
+
261
+ if self.tokenizer.chat_template is None:
262
+ res = self.template["prompt_input"].format(instruction=instruction, input=prompt_input)
263
+ res = f"{res}{data_point['label']}"
264
+ else:
265
+ res = [
266
+ {"role": "user", "content": self.template["prompt_input"].format(instruction=instruction, input=prompt_input)},
267
+ {"role": "assistant", "content": data_point['label']}
268
+ ]
269
+
270
+ return res
271
+
272
+ def _generate_abstract_2_title_prompt(self, data_point: dict):
273
+ instruction = "Please generate the title of paper based on its abstract."
274
+
275
+ prompt_input = ""
276
+ prompt_input = prompt_input + "Abstract: " + data_point['abs'] + "\n"
277
+
278
+ if self.tokenizer.chat_template is None:
279
+ res = self.template["prompt_input"].format(instruction=instruction, input=prompt_input)
280
+ res = f"{res}{data_point['title']}"
281
+ else:
282
+ res = [
283
+ {"role": "user", "content": self.template["prompt_input"].format(instruction=instruction, input=prompt_input)},
284
+ {"role": "assistant", "content": data_point['title']}
285
+ ]
286
+
287
+ return res
288
+
289
+ def _generate_paper_retrieval_prompt(self, data_point: dict):
290
+ instruction = "Please select the paper that is more likely to be cited by paper A from candidate papers."
291
+
292
+ prompt_input = ""
293
+ prompt_input = prompt_input + "Title of the Paper A: " + data_point['title'] + "\n"
294
+ prompt_input = prompt_input + "Abstract of the Paper A: " + data_point['abs'] + "\n"
295
+ prompt_input = prompt_input + "candidate papers: " + "\n"
296
+ for i in range(len(data_point['sample_title'])):
297
+ prompt_input = prompt_input + str(i) + '. ' + data_point['sample_title'][i] + "\n"
298
+
299
+ if self.tokenizer.chat_template is None:
300
+ res = self.template["prompt_input"].format(instruction=instruction, input=prompt_input)
301
+ res = f"{res}{data_point['right_title']}"
302
+ else:
303
+ res = [
304
+ {"role": "user", "content": self.template["prompt_input"].format(instruction=instruction, input=prompt_input)},
305
+ {"role": "assistant", "content": data_point['right_title']}
306
+ ]
307
+
308
+ return res
309
+
310
+ def _generate_citation_sentence_prompt(self, data_point: dict):
311
+ instruction = "Please generate the citation sentence of how Paper A cites paper B in its related work section."
312
+
313
+ prompt_input = ""
314
+ prompt_input = prompt_input + "Title of Paper A: " + (data_point['s_title'] if data_point['s_title'] != None else 'Unknown') + "\n"
315
+ prompt_input = prompt_input + "Abstract of Paper A: " + (data_point['s_abs'] if data_point['s_abs'] != None else 'Unknown') + "\n"
316
+ prompt_input = prompt_input + "Title of Paper B: " + (data_point['t_title'] if data_point['t_title'] != None else 'Unknown') + "\n"
317
+ prompt_input = prompt_input + "Abstract of Paper B: " + (data_point['t_abs'] if data_point['t_abs'] != None else 'Unknown') + "\n"
318
+
319
+ if self.tokenizer.chat_template is None:
320
+ res = self.template["prompt_input"].format(instruction=instruction, input=prompt_input)
321
+ res = f"{res}{data_point['sentence']}"
322
+ else:
323
+ res = [
324
+ {"role": "user", "content": self.template["prompt_input"].format(instruction=instruction, input=prompt_input)},
325
+ {"role": "assistant", "content": data_point['sentence']}
326
+ ]
327
+
328
+ return res
329
+
330
+ def _generate_abstract_completion_prompt(self, data_point: dict):
331
+ instruction = "Please complete the abstract of a paper."
332
+
333
+ prompt_input = ""
334
+ prompt_input = prompt_input + "Title: " + data_point['title'] if data_point['title'] != None else 'Unknown' + "\n"
335
+
336
+ split_abs = data_point['abs'][: int(0.3*len(data_point['abs']))]
337
+ prompt_input = prompt_input + "Part of abstract: " + split_abs + "\n"
338
+
339
+ if self.tokenizer.chat_template is None:
340
+ res = self.template["prompt_input"].format(instruction=instruction, input=prompt_input)
341
+ res = f"{res}{data_point['abs']}"
342
+ else:
343
+ res = [
344
+ {"role": "user", "content": self.template["prompt_input"].format(instruction=instruction, input=prompt_input)},
345
+ {"role": "assistant", "content": data_point['abs']}
346
+ ]
347
+
348
+ return res
349
+
350
+ def _generate_title_2_abstract_prompt(self, data_point: dict):
351
+ instruction = "Please generate the abstract of paper based on its title."
352
+
353
+ prompt_input = ""
354
+ prompt_input = prompt_input + "Title: " + data_point['title'] + "\n"
355
+
356
+ if self.tokenizer.chat_template is None:
357
+ res = self.template["prompt_input"].format(instruction=instruction, input=prompt_input)
358
+ res = f"{res}{data_point['right_abs']}"
359
+ else:
360
+ res = [
361
+ {"role": "user", "content": self.template["prompt_input"].format(instruction=instruction, input=prompt_input)},
362
+ {"role": "assistant", "content": data_point['right_abs']}
363
+ ]
364
+
365
+ return res
366
+
367
+ def _generate_intro_2_abstract_prompt(self, data_point: dict, context_window):
368
+ instruction = "Please generate the abstract of paper based on its introduction section."
369
+
370
+ prompt_input = ""
371
+ prompt_input = prompt_input + "Introduction: " + data_point['intro'] + "\n"
372
+
373
+ # Reduce it to make it fit
374
+ prompt_input = prompt_input[:int(context_window*2)]
375
+
376
+ if self.tokenizer.chat_template is None:
377
+ res = self.template["prompt_input"].format(instruction=instruction, input=prompt_input)
378
+ res = f"{res}{data_point['abs']}"
379
+ else:
380
+ res = [
381
+ {"role": "user", "content": self.template["prompt_input"].format(instruction=instruction, input=prompt_input)},
382
+ {"role": "assistant", "content": data_point['abs']}
383
+ ]
384
+
385
+ return res
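+
+
+ if __name__ == "__main__":
+     # Minimal usage sketch (assumes configs/config.yaml provides the keys read
+     # above, e.g. inference.base_model and the training/* sections).
+     from utils.utils import read_yaml_file
+     config = read_yaml_file("configs/config.yaml")
+     trainer = QloraTrainer_CS(config, use_predefined_graph=True)
+     trainer.load_base_model()
+     trainer.train()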
utils/de-macro.py ADDED
@@ -0,0 +1,1110 @@
1
+ #!/usr/bin/python -O
2
+
3
+ r"""
4
+ Copyright 2005-2020 Peter Gacs
5
+ Licensed under the Academic Free Licence version 2.1
6
+
7
+ DE-MACRO
8
+
9
+ Version 1.4.1 - A small typo corrected.
10
+
11
+ Version 1.4 - Luca Citi made it python2.7 and python3 compatible.
12
+ Peter Gacs improved the parsing of \input{<filename>},
13
+ and made @ a letter in the style files.
14
+ Version 1.3 - this version is much more conservative about deleting
15
+ comments and inserting or deleting blank space: tries to
16
+ leave in all comments, adds space only when necessary, and
17
+ tries not to delete space in the main text.
18
+ The motivating comments came from Daniel Webb.
19
+ Version 1.2 - a syntactical bug corrected, thanks Brian de Alwis!
20
+
21
+
22
+ PURPOSE
23
+
24
+ This program can eliminate most private macros from a LaTeX file.
25
+ Applications:
26
+ - your publisher has difficulty dealing with many private macros
27
+ - you cooperate with colleagues who do not understand your macros
28
+ - preprocessing before a system like latex2html, which is somewhat
29
+ unpredictable with private macros.
30
+
31
+ It cannot be used to eliminate more complex macros that rely on
32
+ more programming-like constructs in style files. In particular, it will
33
+ not replace style files that have options.
34
+
35
+ USAGE
36
+
37
+ de-macro [--defs <defs-db>] <tex-file-1>[.tex] [<tex-file-2>[.tex] ...]
38
+
39
+ Simplest example: de-macro testament
40
+
41
+ (As you see, the <> is used only in the notation of this documentation,
42
+ you should not type it.)
43
+
44
+ If <tex-file-i> contains a command \usepackage{<defs-file>-private}
45
+ then the file <defs-file>-private.sty will be read, and its macros will be
46
+ replaced in <tex-file-i> with their definitions.
47
+ The result is in <tex-file-i>-clean.tex.
48
+
49
+ Only newcommand, renewcommand, newenvironment, and renewenvironment are
50
+ understood (it does not matter, whether you write new or renew).
51
+ These can be nested but do not be too clever, since I do not
52
+ guarantee the same expansion order as in TeX.
53
+
54
+ FILES
55
+
56
+ <tex-file-1>.db
57
+ <tex-file>-clean.tex
58
+ <defs-file>-private.sty
59
+
60
+ For speed, a macro database file called <defs-file>.db is created.
61
+ If such a file exists already then it is used.
62
+ If <defs-file>-private.sty is older than <tex-file-1>.db then it will not
63
+ be used.
64
+
65
+ It is possible to specify another database filename via --defs <defs-db>.
66
+ Then <defs-db>.db will be used.
67
+
68
+ For each <tex-file-i>, a file <tex-file-i>-clean.tex will be produced.
69
+ If <tex-file-i>-clean.tex is newer than <tex-file-i>.tex then it stays.
70
+
71
+ INPUT COMMAND
72
+
73
+ If a tex file contains a command \input{<tex-file-j>} or \input <tex-file-j>
74
+ then <tex-file-j>.tex is processed recursively, and <tex-file-j>-clean.tex
75
+ will be inserted into the final output.
76
+ For speed, if <tex-file-j>-clean.tex is newer than <tex-file-j>.tex
77
+ then <tex-file-j>.tex will not be reprocessed.
78
+
79
+ The dependency checking is not sophisticated, so if you rewrite some macros
80
+ then remove all *-clean.tex files!
81
+
82
+ """
83
+
84
+ import sys, os, re, shelve
85
+
86
+ # Utilities
87
+
88
+ class No_detail:
89
+ strerror = ""
90
+
91
+ no_detail = No_detail()
92
+
93
+
94
+ class Error(Exception):
95
+ """Base class for exceptions in this module."""
96
+ pass
97
+
98
+ class Empty_text_error(Error):
99
+ """Exception raised for errors in the input.
100
+
101
+ Attributes:
102
+ data -- data that was found empty
103
+ message
104
+ """
105
+
106
+ def __init__(self, data, message):
107
+ self.data = data
108
+ self.message = message
109
+
110
+ def warn(error_message, detail = no_detail):
111
+ sys.stderr.write(error_message + "\n")
112
+ if no_detail != detail:
113
+ sys.stderr.write(detail.strerror + "\n")
114
+
115
+ def die(error_message, detail = no_detail):
116
+ warn(error_message, detail = detail)
117
+ sys.exit(1)
118
+
119
+ def getopt_map(one_letter_opts, long_optlist):
120
+ "Turns long options into an option map, using getopt."
121
+ import getopt
122
+ optlist, args = getopt.getopt(sys.argv[1:],
123
+ one_letter_opts, long_optlist)
124
+ opt_map = {}
125
+ for pair in optlist: opt_map[pair[0]] = pair[1] or 1
126
+ return opt_map, args
127
+
128
+ def newer(file1, file2):
129
+
130
+ if not os.path.isfile(file1):
131
+ return False
132
+
133
+ try:
134
+ stat_return = os.lstat(file1)
135
+ except OSError as detail:
136
+ die("lstat " + file1 + " failed:", detail)
137
+ time1 = stat_return.st_mtime
138
+
139
+ try:
140
+ stat_return = os.lstat(file2)
141
+ except OSError as detail:
142
+ die("lstat " + file2 + " failed:", detail)
143
+ time2 = stat_return.st_mtime
144
+
145
+ return time1 > time2
146
+
147
+ def cut_extension(filename, ext):
148
+ """
149
+ If filename has extension ext (including the possible dot),
150
+ it will be cut off.
151
+ """
152
+ file = filename
153
+ index = filename.rfind(ext)
154
+ if 0 <= index and len(file)-len(ext) == index:
155
+ file = file[:index]
156
+ return file
157
+
158
+
159
+ class Stream:
160
+ data = None
161
+ pos = None
162
+ item = None
163
+
164
+ def legal(self):
165
+ return 0 <= self.pos and self.pos < len(self.data)
166
+
167
+ def uplegal(self):
168
+ return self.pos < len(self.data)
169
+
170
+ def __init__(self, data_v = None):
171
+ self.data = data_v
172
+ if self.data:
173
+ self.pos = 0
174
+ self.item = self.data[self.pos]
175
+
176
+ def next(self):
177
+ self.pos += 1
178
+ if self.pos < len(self.data):
179
+ self.item = self.data[self.pos]
180
+ return self.item
181
+
182
+ def reset(self):
183
+ if self.data and 0 < len(self.data):
184
+ self.pos = 0
185
+ self.item = self.data[0]
186
+ return self.item
187
+
188
+
189
+ # Basic classes
190
+
191
+ blank_re = re.compile(r"\s")
192
+ blanked_filename_re = re.compile(r"^\s+(\w*)\s+")
193
+ braced_filename_re = re.compile(r"^\s*{\s*(\w*)\s*}")
194
+ blank_or_rbrace_re = re.compile(r"[\s}]")
195
+ pos_digit_re = re.compile(r"[1-9]")
196
+
197
+ def isletter(c, isatletter=False):
198
+ if "@" == c:
199
+ return isatletter
200
+ else:
201
+ return c.isalpha()
202
+
203
+ class Token:
204
+ """Type 0 means ordinary character, types 1,2 mean escape sequence
205
+ (without the \ ), type 3 means comment.
206
+ """
207
+ simple_ty = 0
208
+ esc_symb_ty = 1
209
+ esc_str_ty = 2
210
+ comment_ty = 3
211
+
212
+ type = simple_ty
213
+ val = " "
214
+
215
+ def __init__(self, type_v=simple_ty, val_v=" "):
216
+ self.type = type_v
217
+ self.val = val_v
218
+
219
+ def show(self):
220
+ out = ""
221
+ if simple_ty == self.type or comment_ty == self.type:
222
+ out = self.val
223
+ else:
224
+ out = "\\" + self.val
225
+ return out
226
+
227
+
228
+ # Constants
229
+
230
+ g_token = Token(0," ") # generic token
231
+ simple_ty = g_token.simple_ty
232
+ comment_ty = g_token.comment_ty
233
+ esc_symb_ty = g_token.esc_symb_ty
234
+ esc_str_ty = g_token.esc_str_ty
235
+
236
+
237
+ def detokenize(text, isatletter=False):
238
+ """
239
+ Input is a list of tokens.
240
+ Output is a string.
241
+ """
242
+ out = ""
243
+ if 0 == len(text):
244
+ return ""
245
+ pos = 0
246
+ out += text[pos].show()
247
+ pos += 1
248
+ while pos < len(text):
249
+ previtem = text[pos-1]
250
+ item = text[pos]
251
+ """Insert a separating space after an escape sequence if it is a
252
+ string and is followed by a letter."""
253
+ if (esc_str_ty == previtem.type
254
+ and simple_ty == item.type and isletter(item.val[0], isatletter)):
255
+ out += " "
256
+ out += item.show()
257
+ pos += 1
258
+ return out
259
+
260
+
261
+ def strip_comments(text):
262
+ """
263
+ Input is a list of tokens.
264
+ Output is the same list except the comment tokens.
265
+ """
266
+ out = []
267
+ for token in text:
268
+ if not comment_ty == token.type:
269
+ out.append(token)
270
+ return out
271
+
272
+ class Group:
273
+ """type 0 means a token, type 1 means contents of a group within {}
274
+ """
275
+ token_ty = 0
276
+ group_ty = 1
277
+ type = token_ty
278
+ val = [] # Value is a token list.
279
+
280
+ def __init__(self, type_v, val_v):
281
+ self.type = type_v
282
+ self.val = val_v
283
+
284
+ def show(self):
285
+ if token_ty == self.type:
286
+ return self.val.show()
287
+ else:
288
+ return "{%s}" % detokenize(self.val)
289
+
290
+ # Constants
291
+
292
+ g_group = Group(0, [])
293
+ token_ty = g_group.token_ty
294
+ group_ty = g_group.group_ty
295
+
296
+
297
+ def tokenize(in_str, isatletter=False):
298
+ """Returns a list of tokens.
299
+ """
300
+ text = []
301
+ cs = Char_stream(in_str)
302
+ cs.reset()
303
+ if not cs.legal():
304
+ raise Error("No string to tokenize.")
305
+ while cs.uplegal():
306
+ if "%" == cs.item:
307
+ comment = cs.scan_comment_token()
308
+ text.append(Token(comment_ty, comment))
309
+ elif "\\" != cs.item:
310
+ text.append(Token(simple_ty, cs.item))
311
+ cs.next()
312
+ else:
313
+ cs.next()
314
+ name = cs.scan_escape_token(isatletter)
315
+ if isletter(name[0], isatletter):
316
+ token = Token(esc_str_ty, name)
317
+ else:
318
+ token = Token(esc_symb_ty, name)
319
+ text.append(token)
320
+ if "makeatletter" == name:
321
+ isatletter=True
322
+ elif "makeatother" == name:
323
+ isatletter=False
324
+ return text
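+
+ # Illustrative note (not part of the original script): tokenize(r"\emph{hi} % x")
+ # yields one esc_str_ty token "emph", simple tokens "{", "h", "i", "}", " ",
+ # and a single comment_ty token for "% x" (comments run to the end of the line).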
325
+
326
+
327
+ class Command_def:
328
+ name = "1"
329
+ numargs = 0
330
+ body= ""
331
+
332
+ def __init__(self, name_v, numargs_v, body_v):
333
+ self.name = name_v
334
+ self.numargs = numargs_v
335
+ self.body = body_v
336
+
337
+ def show(self):
338
+ out = "\\newcommand{\\%s}" % (self.name)
339
+ if 0 < self.numargs:
340
+ out += "[%d]" % self.numargs
341
+ out += "{%s}" % detokenize(self.body)
342
+ return out
343
+
344
+
345
+ class Env_def:
346
+ name = "1"
347
+ numargs = 0
348
+ begin = ""
349
+ end = ""
350
+
351
+ def __init__(self, name_v, numargs_v, begin_v, end_v):
352
+ self.name = name_v
353
+ self.numargs = numargs_v
354
+ self.begin = begin_v
355
+ self.end = end_v
356
+
357
+ def show(self):
358
+ out = "\\newenvironment{%s}" % self.name
359
+ if 0 < self.numargs:
360
+ out += "[%d]" % self.numargs
361
+ out += "{%s}" % detokenize(self.begin)
362
+ out += "{%s}" % detokenize(self.end)
363
+ return out
364
+
365
+
366
+ class Command_instance:
367
+ name = "1"
368
+ args = []
369
+
370
+ def __init__(self, name_v, args_v):
371
+ self.name = name_v
372
+ self.args = args_v
373
+
374
+ def show(self):
375
+ out = "\\"+self.name
376
+ for arg in self.args:
377
+ out += "{%s}" % detokenize(arg)
378
+ return out
379
+
380
+
381
+ class Env_instance:
382
+ name = "1"
383
+ args = []
384
+
385
+ def __init__(self, name_v, args_v, body_v):
386
+ self.name = name_v
387
+ self.args = args_v
388
+ self.body = body_v
389
+
390
+ def show(self):
391
+ out = "\\begin{%s}" % self.name
392
+ for arg in self.args:
393
+ out += "{%s}" % detokenize(arg)
394
+ out += detokenize(self.body)
395
+ out += "\\end{%s}" % self.name
396
+ return out
397
+
398
+ class Char_stream(Stream):
399
+
400
+ def scan_escape_token(self, isatletter=False):
401
+ """
402
+ Starts after the escape sign, assumes that it is scanning a symbol.
403
+ Returns a token-string.
404
+ """
405
+ out = self.item # Continue only if this is a letter.
406
+ item = self.next()
407
+ if isletter(out, isatletter):
408
+ while self.uplegal() and isletter(item, isatletter):
409
+ out += item
410
+ item = self.next()
411
+ return out
412
+
413
+ def scan_comment_token(self):
414
+ """
415
+ Starts at the comment sign %, assumes that it is scanning a comment.
416
+ Returns the whole comment string,
417
+ including the % and all empty space after it.
418
+ """
419
+ comment = ""
420
+ while self.uplegal() and "\n" != self.item:
421
+ comment += self.item
422
+ self.next()
423
+ while self.uplegal() and blank_re.match(self.item):
424
+ comment += self.item
425
+ self.next()
426
+ return comment
427
+
428
+ def scan_input_filename(self):
429
+ """We have just read an \input token. The next group or word will be
430
+ interpreted as a filename (possibly without .tex). Filenames should not begin with spaces.
431
+ Return the filename.
432
+ """
433
+ item = self.item
434
+ file = ""
435
+ while self.uplegal() and blank_re.match(self.item):
436
+ item = self.next()
437
+ if "{" == item:
438
+ item = self.next()
439
+ while self.uplegal() and not "}" == item:
440
+ file += item
441
+ item = self.next()
442
+ self.next()
443
+ else:
444
+ while self.uplegal() and not blank_re.match(item):
445
+ file += item
446
+ item = self.next()
447
+ return file
448
+
449
+ def scan_package_filenames(self):
450
+ r"""We just read a \usepackage token. The next group will be
451
+ interpreted as a list of filenames (without .sty) separated by commas.
452
+ Return the list.
453
+ """
454
+ item = self.item
455
+ while self.uplegal() and blank_re.match(item):
456
+ item = self.next()
457
+ file = ""
458
+ if not "{" == item:
459
+ raise Error("\\usepackage not followed by brace.")
460
+ item = self.next()
461
+ while self.uplegal() and not blank_or_rbrace_re.match(item):
462
+ file += item
463
+ item = self.next()
464
+ self.next()
465
+ return file.split(",")
466
+
467
+
468
+ class Tex_stream(Stream):
469
+
470
+ defs = ({}, {})
471
+ defs_db = "x"
472
+ defs_db_file = "x.db"
473
+ debug = False
474
+
475
+ def smart_tokenize(self, in_str, handle_inputs=False, isatletter=False):
476
+ """Returns a list of tokens.
477
+ It may interpret and carry out all \input commands.
478
+ """
479
+ self.data = []
480
+ text = self.data
481
+ cs = Char_stream(in_str)
482
+ cs.reset()
483
+ if not cs.legal():
484
+ raise Error("No string to tokenize.")
485
+ while cs.uplegal():
486
+ if "%" == cs.item:
487
+ comment = cs.scan_comment_token()
488
+ text.append(Token(comment_ty, comment))
489
+ elif "\\" != cs.item:
490
+ text.append(Token(simple_ty, cs.item))
491
+ cs.next()
492
+ else:
493
+ cs.next()
494
+ name = cs.scan_escape_token(isatletter)
495
+ if "input" == name and handle_inputs:
496
+ file = cs.scan_input_filename()
497
+ to_add = self.process_if_newer(file)
498
+ text.extend(to_add)
499
+ elif "usepackage" == name:
500
+ while cs.uplegal() and blank_re.match(cs.item):
501
+ cs.next()
502
+ if "[" == cs.item: # Packages with options will not be processed.
503
+ text.extend([Token(esc_str_ty, "usepackage"),
504
+ Token(simple_ty, "[")])
505
+ cs.next()
506
+ continue
507
+ files = cs.scan_package_filenames()
508
+ i = 0
509
+ while i < len(files): # process private packages
510
+ file = files[i]
511
+ p = file.rfind("-private")
512
+ if p < 0 or not len(file) - len("-private") == p:
513
+ i += 1
514
+ continue
515
+ defs_db_file = file+".db"
516
+ self.add_defs(file)
517
+ del files[i:(i+1)]
518
+ if files: # non-private packages left
519
+ group_content = ",".join(files)
520
+ to_add_str = "\\usepackage{%s}" % (group_content)
521
+ to_add = tokenize(to_add_str,isatletter)
522
+ text.extend(to_add)
523
+ else:
524
+ if isletter(name[0], isatletter):
525
+ token = Token(esc_str_ty, name)
526
+ else:
527
+ token = Token(esc_symb_ty, name)
528
+ text.append(token)
529
+ if "makeatletter" == name:
530
+ isatletter=True
531
+ elif "makeatother" == name:
532
+ isatletter=False
533
+ self.reset()
534
+ return self.data
535
+
536
+ def smart_detokenize(self,isatletter=False):
537
+ """
538
+ Output is a string.
539
+ If the list contains an \input{file} then the content of file
540
+ file-clean.tex replaces it in the output.
541
+ """
542
+ self.reset()
543
+ if not self.legal():
544
+ return ""
545
+ out = ""
546
+ previtem = None
547
+ while self.uplegal():
548
+ item = self.item
549
+ """Insert a separating space after an escape sequence if it is a
550
+ string and is followed by a letter."""
551
+ if (None != previtem and esc_str_ty == previtem.type
552
+ and simple_ty == item.type and isletter(item.val[0], isatletter)):
553
+ out += " "
554
+ previtem = item
555
+ if not (esc_str_ty == item.type and "input" == item.val):
556
+ out += item.show()
557
+ self.next()
558
+ else:
559
+ self.next()
560
+ group = self.scan_group()
561
+ file = detokenize(group.val)
562
+ clean_file = "%s-clean.tex" % (file)
563
+ print("Reading file %s" % (clean_file))
564
+ fp = open(clean_file,"r")
565
+ content = fp.read()
566
+ fp.close()
567
+ out += content
568
+ return out
569
+
570
+ # Basic tex scanning
571
+
572
+ def skip_blank_tokens(self): # we also skip comment tokens.
573
+ item = self.item
574
+ while (self.uplegal() and
575
+ (comment_ty == item.type or
576
+ (simple_ty == item.type and blank_re.match(item.val)))):
577
+ item = self.next()
578
+ return item
579
+
580
+ def scan_group(self):
581
+ """Returns group.
582
+ """
583
+ if not self.legal():
584
+ raise Error("No group to scan.")
585
+ item = self.item
586
+ if not (simple_ty == item.type and "{" == item.val):
587
+ return Group(token_ty, [self.item])
588
+ count = 1
589
+ group = []
590
+ item = self.next()
591
+ while count and self.uplegal():
592
+ if simple_ty == item.type:
593
+ if "{" == item.val:
594
+ count += 1
595
+ elif "}" == item.val:
596
+ count -= 1
597
+ if count != 0:
598
+ group.append(item)
599
+ item = self.next()
600
+ return Group(group_ty, group)
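+
+ # Example: on the token stream of "{a{b}c}d", scan_group() consumes up
+ # to the balanced closing brace and returns a group_ty Group holding
+ # the tokens of "a{b}c"; if the stream does not start with "{", the
+ # current token is wrapped in a token_ty Group without advancing.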
601
+
602
+ # Command and environment definitions
603
+
604
+ def scan_command_name(self):
605
+ """Returns name.
606
+ """
607
+ if not self.legal():
608
+ raise Error("No command name to scan.")
609
+ item = self.item
610
+ name = ""
611
+ if item.type in [esc_symb_ty, esc_str_ty]:
612
+ name = item.val
613
+ else:
614
+ if not "{" == item.val:
615
+ raise Error("Command definition misses first {.")
616
+ self.next()
617
+ item = self.skip_blank_tokens()
618
+ if not item.type in [esc_symb_ty, esc_str_ty]:
619
+ raise Error("Command definition does not begin with control sequence.")
620
+ name = item.val
621
+ self.next()
622
+ item = self.skip_blank_tokens()
623
+ if not "}" == item.val:
624
+ raise Error("Definition for commmand %s misses first }., %s" %
625
+ (name, item.val))
626
+ self.next()
627
+ self.skip_blank_tokens()
628
+ return name
629
+
630
+ def scan_numargs(self, name):
631
+ """
632
+ name is the name of the command or environment definition being
633
+ scanned.
634
+ Starts on a nonblank token.
635
+ Returns numargs
636
+ where numargs is the number of arguments in a command or environment
637
+ definition,
638
+ """
639
+ if not self.legal():
640
+ raise Error("No numargs to scan.")
641
+ item = self.item
642
+ numargs = 0
643
+ if not simple_ty == item.type:
644
+ raise Error("Illegal command or environment definition: "+name)
645
+ if "[" == item.val:
646
+ if not 4 < len(self.data):
647
+ raise Error("Command or environment definition is illegal: "+name)
648
+ item = self.next()
649
+ if not simple_ty == item.type:
650
+ raise Error("Illegal command or environment definition: "+name)
651
+ numargs = item.val
652
+ if not pos_digit_re.match(numargs):
653
+ raise Error("%s must be argument number after %s" % (numargs, name))
654
+ numargs = int(numargs)
655
+ self.next()
656
+ item = self.skip_blank_tokens()
657
+ if not simple_ty == item.type:
658
+ raise Error("Illegal command definition: "+name)
659
+ if "]" != item.val:
660
+ raise Error("Illegal command definition: "+name)
661
+ self.next()
662
+ self.skip_blank_tokens()
663
+ return numargs
664
+
665
+ def scan_command_def(self):
666
+ """Scan a command definition.
667
+ Return command_def.
668
+ Assumes that the number of arguments is at most 9.
669
+ """
670
+ if not self.legal():
671
+ raise Error("No command definition to scan.")
672
+ item = self.item
673
+ if not 2 < len(self.data):
674
+ raise Error("Command definition is illegal.")
675
+ # newcommand or renewcommand
676
+ if not item.type in [esc_symb_ty, esc_str_ty]:
677
+ raise Error("Command definition should begin with control sequence: "+item.val)
678
+ if item.val not in ["newcommand", "renewcommand"]:
679
+ raise Error("Command definition should begin with control sequence.")
680
+ self.next()
681
+ self.skip_blank_tokens()
682
+
683
+ cmd_name = self.scan_command_name()
684
+ numargs = self.scan_numargs(cmd_name)
685
+
686
+ body_group = self.scan_group()
687
+ if group_ty != body_group.type:
688
+ raise Error("Command body missing: "+cmd_name)
689
+ body_val = strip_comments(body_group.val)
690
+ return Command_def(cmd_name, numargs, body_val)
691
+
692
+ def scan_env_name(self):
693
+ """Starts on a {.
694
+ Returns name.
695
+ """
696
+ if not self.legal():
697
+ raise Error("No environment name to scan.")
698
+ item = self.item
699
+ if not "{" == item.val:
700
+ raise Error("Env. definition begins with %s, not with {" % (item.val))
701
+ self.next()
702
+ item = self.skip_blank_tokens()
703
+ name = ""
704
+ if not simple_ty == item.type:
705
+ raise Error("1. Env. def. begins with cont. seq. %s, not with env.name."
706
+ % (item.val))
707
+ while self.uplegal() and not blank_or_rbrace_re.match(item.val):
708
+ name += item.val
709
+ item = self.next()
710
+ if not simple_ty == item.type:
711
+ raise Error("2. Env. def. begins with cont. seq. %s, not with env.name."
712
+ % (item.val))
713
+ item = self.skip_blank_tokens()
714
+ if not "}" == item.val:
715
+ raise Error("Command definition does not begin with control sequence.")
716
+ self.next()
717
+ self.skip_blank_tokens()
718
+ return name
719
+
720
+ def scan_env_def(self):
721
+ """Scan an environment definition.
722
+ Return env_def
723
+ Assumes that the number of arguments is at most 9.
724
+ """
725
+ if not self.legal():
726
+ raise Error("No environment definition to scan.")
727
+ item = self.item
728
+ if not 7 < len(self.data):
729
+ raise Error("Environment definition is illegal.")
730
+ pos = 0
731
+
732
+ if not item.type in [esc_symb_ty, esc_str_ty]:
733
+ raise Error("Env. definition does not begin with control sequence:"+
734
+ item.val)
735
+ if item.val not in ["newenvironment", "renewenvironment"]:
736
+ raise Error("Env. definition does not begin with control sequence.")
737
+ self.next()
738
+ self.skip_blank_tokens()
739
+
740
+ env_name = self.scan_env_name()
741
+ numargs = self.scan_numargs(env_name)
742
+ self.skip_blank_tokens()
743
+
744
+ begin_group = self.scan_group()
745
+ if group_ty != begin_group.type:
746
+ raise Error("Begin body missing: "+env_name)
747
+ begin_val = strip_comments(begin_group.val)
748
+
749
+ self.skip_blank_tokens()
750
+
751
+ end_group = self.scan_group()
752
+ if group_ty != end_group.type:
753
+ raise Error("End body missing:"+env_name)
754
+ end_val = strip_comments(end_group.val)
755
+
756
+ return Env_def(env_name, numargs, begin_val, end_val)
757
+
758
+ def scan_defs(self):
759
+ if not self.legal():
760
+ raise Error("No definitions to scan.")
761
+ self.reset()
762
+ command_defs, env_defs = self.defs
763
+ while self.uplegal():
764
+ if (esc_str_ty == self.item.type
765
+ and self.item.val in ["newcommand", "renewcommand"]):
766
+ def_start_pos = self.pos
767
+ command_def = self.scan_command_def()
768
+ command_defs[command_def.name] = command_def
769
+ def_end_pos = self.pos
770
+ for del_pos in range(def_start_pos,def_end_pos):
771
+ del self.data[def_start_pos]
772
+ self.pos = def_start_pos
773
+ self.item = self.data[self.pos]
774
+ elif (esc_str_ty == self.item.type and self.item.val
775
+ in ["newenvironment", "renewenvironment"]):
776
+ def_start_pos = self.pos
777
+ env_def = self.scan_env_def()
778
+ env_defs[env_def.name] = env_def
779
+ def_end_pos = self.pos
780
+ for del_pos in range(def_start_pos,def_end_pos):
781
+ del self.data[def_start_pos]
782
+ self.pos = def_start_pos
783
+ self.item = self.data[self.pos]
784
+ else:
785
+ self.next()
786
+
787
+ # Instances
788
+
789
+ def scan_args(self, command_or_env_def):
790
+ """Scan the arguments of a command or environment.
791
+ Return [args].
792
+ """
793
+ if not self.legal():
794
+ raise Error("No arguments to scan.")
795
+ numargs = command_or_env_def.numargs
796
+ name = command_or_env_def.name
797
+
798
+ args = []
799
+ for i in range(numargs):
800
+ arg = []
801
+ if not (simple_ty == self.item.type and "{" == self.item.val):
802
+ arg = [self.item]
803
+ self.next()
804
+ else:
805
+ group = self.scan_group()
806
+ arg = group.val
807
+ args.append(arg)
808
+ return args
809
+
810
+ def scan_command(self, command_def):
811
+ """Scan the arguments of a command.
812
+ Return command_instance
813
+ """
814
+ if not self.legal():
815
+ raise Error("No command to scan.")
816
+ if not self.item.type in [esc_symb_ty, esc_str_ty]:
817
+ raise Error("Command does not begin with control sequence.")
818
+ name = self.item.val
819
+ self.next()
820
+ if 0 < command_def.numargs:
821
+ self.skip_blank_tokens()
822
+ args = self.scan_args(command_def)
823
+ else:
824
+ args = []
825
+ return Command_instance(name, args)
826
+
827
+ def test_env_boundary(self, item):
828
+ """Check whether an environment begin or end follows.
829
+ Return 1 if \begin, -1 if \end, 0 otherwise.
830
+ """
831
+ d = 0
832
+ if esc_str_ty == item.type:
833
+ if "begin"==item.val:
834
+ d = 1
835
+ elif "end"==item.val:
836
+ d = -1
837
+ return d
838
+
839
+ def scan_env_begin(self):
840
+ """Scan an environment name.
841
+ Return env_name.
842
+ """
843
+ if not self.legal():
844
+ raise Error("No environment begin to scan.")
845
+ item = self.item
846
+ if not (esc_str_ty == item.type and "begin" == item.val):
847
+ raise Error("Environment does not begin with begin.")
848
+ self.next()
849
+ name_group = self.scan_group()
850
+ name = detokenize(name_group.val)
851
+ return name
852
+
853
+ def scan_env_end(self):
854
+ """Scan an environment end.
855
+ Return env_name.
856
+ """
857
+ if not self.legal():
858
+ raise Error("No environment end to scan.")
859
+ item = self.item
860
+ if not (esc_str_ty == item.type and "end" == item.val):
861
+ raise Error("Environment does not end with end.")
862
+ self.next()
863
+ name_group = self.scan_group()
864
+ name = detokenize(name_group.val)
865
+ return name
866
+
867
+ def scan_env_rest(self, env_def):
868
+ """Scanning starts after \begin{envname}.
869
+ Returns env_instance.
870
+ """
871
+ if not self.legal():
872
+ raise Error("No environment rest to scan.")
873
+ count = 1 # We are already within a boundary.
874
+ args = self.scan_args(env_def)
875
+ body = []
876
+ while count and self.uplegal():
877
+ old_pos = self.pos
878
+ d = self.test_env_boundary(self.item)
879
+ count += d
880
+ if 1 == d:
881
+ self.scan_env_begin()
882
+ elif -1 == d:
883
+ self.scan_env_end()
884
+ else:
885
+ self.next()
886
+ if 0 < count:
887
+ body.extend(self.data[old_pos : self.pos])
888
+ return Env_instance(env_def.name, args, body)
889
+
890
+ # Definitions
891
+
892
+ def restore_defs(self):
893
+ if os.path.isfile(self.defs_db_file):
894
+ print("Using defs db %s" % (self.defs_db_file))
895
+ db_h = shelve.open(self.defs_db)
896
+ self.defs = db_h["defs"]
897
+ db_h.close()
898
+
899
+ def save_defs(self):
900
+ db_h = shelve.open(self.defs_db)
901
+ if "defs" in db_h:
902
+ del db_h["defs"]
903
+ db_h["defs"] = self.defs
904
+ db_h.close()
905
+
906
+ def add_defs(self, defs_file):
907
+ defs_file_compl = defs_file + ".sty"
908
+ if not os.path.isfile(defs_file_compl):
909
+ raise Error("%s does not exist" % (defs_file_compl))
910
+
911
+ defs_db_file = self.defs_db_file
912
+ if newer(defs_db_file, defs_file_compl):
913
+ print("Using defs db %s for %s" % (defs_db_file, defs_file))
914
+ else:
915
+ defs_fp = open(defs_file_compl, "r")
916
+ defs_str = defs_fp.read()
917
+ defs_fp.close()
918
+ ds = Tex_stream()
919
+ ds.defs = self.defs
920
+ defs_text = ds.smart_tokenize(defs_str,isatletter=True)
921
+ # changing ds.defs will change self.defs
922
+ if self.debug:
923
+ defs_seen_file = "%s-seen.sty" % (defs_file)
924
+ defs_seen_fp = open(defs_seen_file, "w")
925
+ out = detokenize(defs_text,isatletter=True)
926
+ defs_seen_fp.write(out)
927
+ defs_seen_fp.close()
928
+ ds.scan_defs()
929
+ if self.debug:
930
+ out = ""
931
+ command_defs, env_defs = self.defs
932
+ for def_name in command_defs.keys():
933
+ out += command_defs[def_name].show() + "\n"
934
+ for def_name in env_defs.keys():
935
+ out += env_defs[def_name].show() +"\n"
936
+ print("Definitions after reading %s:" % (defs_file))
937
+ print(out)
938
+
939
+ # Applying definitions, recursively
940
+ # (maybe not quite in Knuth order, so avoid tricks!)
941
+
942
+ def subst_args(self, body, args):
943
+ out = []
944
+ pos = 0
945
+ while pos < len(body):
946
+ item = body[pos]
947
+ if not (simple_ty == item.type and "#" == item.val):
948
+ out.append(item)
949
+ pos += 1
950
+ continue
951
+ pos += 1
952
+ token = body[pos]
953
+ argnum = token.val
954
+ if not pos_digit_re.match(argnum):
955
+ raise Error("# is not followed by number.")
956
+ argnum = int(argnum)
957
+ if argnum > len(args):
958
+ raise Error("Too large argument number.")
959
+ arg = args[argnum-1]
960
+ out += arg
961
+ pos += 1
962
+ return out
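+
+ # Example: with body holding the tokens of "(#1,#2)" and args set to
+ # [tokens of "x", tokens of "y"], subst_args() returns the tokens of
+ # "(x,y)". Each "#" must be followed by a digit no larger than
+ # len(args), otherwise an Error is raised.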
963
+
964
+ def apply_command_recur(self, command_instance):
965
+ command_defs, env_defs = self.defs
966
+ name = command_instance.name
967
+ command_def = command_defs[name]
968
+
969
+ args = command_instance.args
970
+ body = command_def.body
971
+ result = self.subst_args(body, args)
972
+ try:
973
+ result = self.apply_all_recur(result)
974
+ except Empty_text_error as e:
975
+ raise Error("apply_all_recur fails on command instance %s: %s, %s" % \
976
+ (command_instance.show(), detokenize(e.data), e.message))
977
+ return result
978
+
979
+ def apply_env_recur(self, env_instance):
980
+ command_defs, env_defs = self.defs
981
+ name = env_instance.name
982
+ env_def = env_defs[name]
983
+
984
+ begin, end = env_def.begin, env_def.end
985
+ body, args = env_instance.body, env_instance.args
986
+ out = self.subst_args(begin, args) + body + self.subst_args(end, args)
987
+ return self.apply_all_recur(out)
988
+
989
+
990
+ def apply_all_recur(self, data, report=False):
991
+ ts = Tex_stream(data)
992
+ ts.defs = self.defs
993
+ command_defs, env_defs = self.defs
994
+ out = []
995
+ progress_step = 10000
996
+ progress = progress_step
997
+ if not ts.legal():
998
+ raise Empty_text_error(data, "No text to process.")
999
+ while ts.uplegal():
1000
+ if ts.pos > progress:
1001
+ if report:
1002
+ print(ts.pos)
1003
+ progress += progress_step
1004
+ if not ts.item.type in [esc_symb_ty, esc_str_ty]:
1005
+ out.append(ts.item)
1006
+ ts.next()
1007
+ continue
1008
+ if 1 == ts.test_env_boundary(ts.item):
1009
+ old_pos = ts.pos
1010
+ env_name = ts.scan_env_begin()
1011
+ if env_name not in env_defs:
1012
+ out.extend(ts.data[old_pos : ts.pos])
1013
+ continue
1014
+ else:
1015
+ env_def = env_defs[env_name]
1016
+ env_instance = ts.scan_env_rest(env_def)
1017
+ result = ts.apply_env_recur(env_instance)
1018
+ out.extend(result)
1019
+ elif ts.item.val not in command_defs:
1020
+ out.append(ts.item)
1021
+ ts.next()
1022
+ continue
1023
+ else:
1024
+ command_def = command_defs[ts.item.val]
1025
+ command_inst = ts.scan_command(command_def)
1026
+ result = ts.apply_command_recur(command_inst)
1027
+ out.extend(result)
1028
+ return out
1029
+
1030
+
1031
+ # Processing files
1032
+
1033
+ def process_file(self, file):
1034
+ """Returns the new defs.
1035
+ """
1036
+ file = cut_extension(file, ".tex")
1037
+ source_file = "%s.tex" % (file)
1038
+ print("File %s [" % (source_file))
1039
+ source_fp = open(source_file, "r")
1040
+ text_str = source_fp.read()
1041
+ source_fp.close()
1042
+
1043
+ self.smart_tokenize(text_str, handle_inputs=True)
1044
+ if not self.data:
1045
+ raise Error("Empty tokenization result.")
1046
+ self.reset()
1047
+
1048
+ if self.debug:
1049
+ source_seen_fname = "%s-seen.tex" % (file)
1050
+ source_seen_fp = open(source_seen_fname, "w")
1051
+ source_seen_fp.write(detokenize(self.data))
1052
+ source_seen_fp.close()
1053
+ self.scan_defs()
1054
+ self.data = self.apply_all_recur(self.data, report=True)
1055
+ result_fname = "%s-clean.tex" % (file)
1056
+ print("Writing %s [" % (result_fname))
1057
+ result_fp = open(result_fname, "w")
1058
+ result_fp.write(self.smart_detokenize())
1059
+ result_fp.close()
1060
+ print("] file %s" % (result_fname))
1061
+ print("] file %s" % (source_file))
1062
+
1063
+ def process_if_newer(self, file):
1064
+ """
1065
+ \input{file} is added to the token list.
1066
+ If the input file is newer than its -clean version, it is processed.
1067
+ Returns tokenized \input{file}.
1068
+ """
1069
+ file = cut_extension(file, ".tex")
1070
+ tex_file = file+".tex"
1071
+ clean_tex_file = file+"-clean.tex"
1072
+ if newer(clean_tex_file, tex_file):
1073
+ print("Using %s." % (clean_tex_file))
1074
+ else:
1075
+ ts = Tex_stream()
1076
+ ts.data = []
1077
+ ts.defs = self.defs
1078
+ ts.process_file(file)
1079
+ to_add = "\\input{%s}" % (file)
1080
+ return tokenize(to_add)
1081
+
1082
+ # Main
1083
+
1084
+ long_optlist = ["debug","defs="]
1085
+ options, restargs = getopt_map("x", long_optlist)
1086
+
1087
+ debug = False
1088
+ if "--debug" in options:
1089
+ debug = True
1090
+
1091
+ root = restargs[0]
1092
+ root = cut_extension(root, ".tex")
1093
+ if "--defs" in options:
1094
+ defs_root = options["--defs"]
1095
+ else:
1096
+ defs_root = "%s" % (root)
1097
+ defs_db = defs_root
1098
+ defs_db_file = defs_root+".db"
1099
+
1100
+ ts = Tex_stream()
1101
+ ts.defs_db = defs_db
1102
+ ts.defs_db_file = defs_db_file
1103
+ ts.debug = debug
1104
+
1105
+ ts.restore_defs()
1106
+ for root in restargs:
1107
+ ts.process_file(root)
1108
+
1109
+ print("(Re)creating defs db %s" % (defs_db))
1110
+ ts.save_defs()
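+
+ # Usage sketch (assumed invocation; argument handling is in the "Main"
+ # block above; the script name is illustrative):
+ #
+ #   python de_macro.py --defs mydefs paper.tex
+ #
+ # expands the definitions of every "*-private" package that paper.tex
+ # uses, writes paper-clean.tex, and caches the parsed definitions in
+ # the shelve db mydefs.db for later runs.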
utils/def_handle.py ADDED
@@ -0,0 +1,75 @@
1
+ import argparse
2
+ import re
3
+
4
+
5
+ def main():
6
+ args = parse_command_line()
7
+ data = read(args.input)
8
+ data = convert(data)
9
+ write(args.output, data)
10
+
11
+
12
+ def parse_command_line():
13
+ parser = argparse.ArgumentParser(
14
+ description='Replace \\def with \\newcommand where possible.',
15
+ )
16
+ parser.add_argument(
17
+ 'input',
18
+ help='TeX input file with \\def',
19
+ )
20
+ parser.add_argument(
21
+ '--output',
22
+ '-o',
23
+ required=True,
24
+ help='TeX output file with \\newcommand',
25
+ )
26
+
27
+ return parser.parse_args()
28
+
29
+ def read(path):
30
+ with open(path, mode='rb') as handle:
31
+ return handle.read()
32
+
33
+
34
+ def convert(data):
35
+ return re.sub(
36
+ rb'((?:\\(?:expandafter|global|long|outer|protected)'
37
+ rb'(?: +|\r?\n *)?)*)?'
38
+ rb'\\def *(\\[a-zA-Z]+) *(?:#+([0-9]))*\{',
39
+ replace,
40
+ data,
41
+ )
42
+
43
+
44
+ def replace(match):
45
+ prefix = match.group(1)
46
+ if (
47
+ prefix is not None and
48
+ (
49
+ b'expandafter' in prefix or
50
+ b'global' in prefix or
51
+ b'outer' in prefix or
52
+ b'protected' in prefix
53
+ )
54
+ ):
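+ # Note: the early return below is disabled, so a prefixed \def
+ # (e.g. \long\def or \global\def) is still converted and the
+ # prefix is dropped from the output.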
55
+ pass #return match.group(0)
56
+
57
+ result = rb'\newcommand'
58
+
59
+ result += b'{' + match.group(2) + b'}'
60
+ if match.lastindex == 3:
61
+ result += b'[' + match.group(3) + b']'
62
+
63
+ result += b'{'
64
+ return result
65
+
66
+
67
+ def write(path, data):
68
+ with open(path, mode='wb') as handle:
69
+ handle.write(data)
70
+
71
+ print('=> File written: {0}'.format(path))
72
+
73
+
74
+ if __name__ == '__main__':
75
+ main()
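+
+ # Example of the rewrite performed by convert() (illustrative input):
+ #
+ #   \def\vec#1{\mathbf{#1}}  ->  \newcommand{\vec}[1]{\mathbf{#1}}
+ #
+ # Only the "\def\name#1...{" head is rewritten; the body is copied
+ # through unchanged.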
utils/gradio_utils.py ADDED
@@ -0,0 +1,20 @@
1
+ from transformers import StoppingCriteria
2
+ import sys
3
+
4
+
5
+ # Handle termination signal
6
+ def signal_handler(sig, frame):
7
+ print("\nTermination signal received. Shutting down Gradio interface.")
8
+ sys.exit(0)
9
+
10
+ # Custom stopping criteria
11
+ class StopOnTokens(StoppingCriteria):
12
+ def __call__(self, input_ids, scores, **kwargs):
13
+ stop_ids = [29, 0]  # model-specific stop token IDs
14
+ return input_ids[0][-1] in stop_ids
15
+
16
+ # Toggle task selection
17
+ def toggle_selection(current_task, new_task):
18
+ """Toggle task selection: deselect if clicked again, otherwise update selection."""
19
+ updated_task = "" if current_task == new_task else new_task
20
+ return updated_task
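+
+ # Usage sketch (assumes a loaded `model` and prepared `input_ids`;
+ # those names are illustrative, not part of this module):
+ #
+ # import signal
+ # from transformers import StoppingCriteriaList
+ # signal.signal(signal.SIGINT, signal_handler)
+ # output = model.generate(
+ #     input_ids,
+ #     max_new_tokens=256,
+ #     stopping_criteria=StoppingCriteriaList([StopOnTokens()]),
+ # )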
utils/graph_utils.py ADDED
@@ -0,0 +1,111 @@
1
+ import regex
2
+ import re
3
+
4
+ def retrieve_text_cite(text, command):
5
+ base_pattern = (
6
+ r'\\' + command + r"(?:\[(?:.*?)\])*\{((?:[^{}]+|\{(?1)\})*)\}(?:\[(?:.*?)\])*"
7
+ )
8
+
9
+ def extract_text_inside_curly_braces(text):
10
+ pattern = r"\{((?:[^{}]|(?R))*)\}"
11
+
12
+ match = regex.search(pattern, text)
13
+
14
+ if match:
15
+ return match.group(1)
16
+ else:
17
+ return ""
18
+
19
+ found_texts = []
20
+ for match in regex.finditer(base_pattern, text):
21
+ temp_substring = text[match.span()[0] : match.span()[1]]
22
+ found_texts.append(extract_text_inside_curly_braces(temp_substring))
23
+
24
+ return found_texts
25
+
26
+ def get_citing_sentences(content):
27
+ content_new = re.sub(r'[\n]+', ' ', content) # collapse newline runs into a single space
28
+ content_new = re.sub(r'e\.g\.', 'eg', content_new) # drop abbreviation periods so the '.' split below keeps sentences whole
29
+ content_new = re.sub(r'i\.e\.', 'ie', content_new)
30
+ content_new = re.sub(r'etc\.' , 'etc', content_new)
31
+ content_new = re.sub(r' +', ' ', content_new)
32
+ sentences = [sentence + '.' for sentence in content_new.split('.')]
33
+ citing_sentences = [s for s in sentences if '\\cite' in s]
34
+ results = {}
35
+ for s in citing_sentences:
36
+ citations = retrieve_text_cite(s, 'cite')
37
+ final_citations = []
38
+ for cite in citations:
39
+ final_citations.extend(cite.split(','))
40
+ results[s] = final_citations
41
+ return results
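+
+ # Example (illustrative):
+ #   get_citing_sentences("Prior work \\cite{a, b} shows X. No cites here.")
+ #   -> {'Prior work \\cite{a, b} shows X.': ['a', ' b']}
+ # Keys keep their original spacing and values are split on bare commas,
+ # so citation keys may carry leading whitespace.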
42
+
43
+ def get_intro(content):
44
+ sections = retrieve_text_cite(content, 'section')
45
+ if sections == []:
46
+ return ''
47
+ try_intro = [x for x in sections if x.strip().lower() == 'introduction']
48
+ if try_intro == []:
49
+ return ''
50
+ else:
51
+ to_find = try_intro[0]
52
+ ind = sections.index(to_find)
53
+ if ind + 1 < len(sections):
54
+ start_marker = f'\\section{{{sections[ind]}}}'
55
+ end_marker = f'\\section{{{sections[ind+1]}}}'
56
+ start_point = content.find(start_marker)
57
+ end_point = content.find(end_marker)
58
+ return content[start_point+len(start_marker):end_point]
59
+ else:
60
+ return ''
61
+
62
+ def get_related_works(content):
63
+ sections = retrieve_text_cite(content, 'section')
64
+ if sections == []:
65
+ return ''
66
+ possible_related = [
67
+ "Literature Review",
68
+ "Related Work",
69
+ "Related Works",
70
+ "Prior Work",
71
+ "Prior Works",
72
+ "Related Research",
73
+ "Research Overview",
74
+ "Previous Work",
75
+ "Previous Works",
76
+ "Review of the Literature",
77
+ "Review of Related Literature",
78
+ "Survey of Related Work",
79
+ "Survey of Related Works",
80
+ "Background",
81
+ "Research Background",
82
+ "Review of Prior Research",
83
+ "Literature Survey",
84
+ "Overview of Literature",
85
+ "Existing Literature",
86
+ "Review of Existing Work",
87
+ "Review of Existing Works",
88
+ "Review of Previous Studies",
89
+ "Review of Prior Literature",
90
+ "Summary of Related Research",
91
+ "Survey of Existing Literature",
92
+ "Survey of Literature",
93
+ "Existing Research Overview",
94
+ "Prior Literature Review"
95
+ ]
96
+ possible_sections = [x for x in sections if any([True for y in possible_related if y.lower() == x.strip().lower()])]
97
+ if possible_sections == []:
98
+ return ''
99
+ else:
100
+ to_find = possible_sections[0]
101
+ ind = sections.index(to_find)
102
+
103
+ if ind + 1 < len(sections):
104
+ start_marker = f'\\section{{{sections[ind]}}}'
105
+ end_marker = f'\\section{{{sections[ind+1]}}}'
106
+ start_point = content.find(start_marker)
107
+ end_point = content.find(end_marker)
108
+ return content[start_point+len(start_marker):end_point]
109
+
110
+ else:
111
+ return ''
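+
+ # Example (illustrative): get_intro() and get_related_works() both
+ # slice the source between consecutive \section markers, e.g.
+ #   src = "\\section{Introduction} intro text \\section{Method} ..."
+ #   get_intro(src) -> " intro text "
+ # A matching section that is the last one in the file yields '' because
+ # no following \section closes the slice.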
utils/latexpand ADDED
@@ -0,0 +1,713 @@
1
+ #!/usr/bin/perl
2
+ # Inspired by latexpand by D. Musliner, University of Michigan
3
+ # 2012-2023: Matthieu Moy <[email protected]>
4
+ # BSD License
5
+
6
+ use strict;
7
+ use Cwd;
8
+ use Getopt::Long;
9
+ use IO::Handle;
10
+ use File::Spec;
11
+
12
+ my $TEXINPUTS = $ENV{'TEXINPUTS'};
13
+ # By default, search in current directory. We use '.' and not getcwd()
14
+ # to avoid issues if the working directory contains a ':' character.
15
+ if (!$TEXINPUTS) { $TEXINPUTS = '.'; }
16
+
17
+ my $verbose;
18
+ my $keep_comments;
19
+ my $keep_includes;
20
+ my $empty_comments;
21
+ my $help;
22
+ my $long_help;
23
+ my %defines = ();
24
+ my $output;
25
+ my $explain;
26
+ my $show_graphics;
27
+ my $graphics_extensions = ":.pdf:.png:.jpg:.eps";
28
+ my $expand_usepackage;
29
+ my $expand_bbl;
30
+ my $biber;
31
+ my $fatal;
32
+ my $version;
33
+ my $makeatletter;
34
+ my $inside_import;
35
+ my $in_enc = "bytes";
36
+ my $out_enc = "bytes";
37
+
38
+ GetOptions (
39
+ 'h' => \$help,
40
+ 'help' => \$long_help,
41
+ 'verbose|v' => \$verbose,
42
+ 'keep-comments' => \$keep_comments,
43
+ 'keep-includes' => \$keep_includes,
44
+ 'empty-comments' => \$empty_comments,
45
+ 'define|d=s%' => \%defines,
46
+ 'output|o=s' => \$output,
47
+ 'explain' => \$explain,
48
+ 'show-graphics' => \$show_graphics,
49
+ 'graphics-extensions' => \$graphics_extensions,
50
+ 'expand-usepackage' => \$expand_usepackage,
51
+ 'expand-bbl=s' => \$expand_bbl,
52
+ 'biber=s' => \$biber,
53
+ 'fatal' => \$fatal,
54
+ 'version' => \$version,
55
+ 'makeatletter' => \$makeatletter,
56
+ 'in-encoding=s' => \$in_enc,
57
+ 'out-encoding=s' => \$out_enc,
58
+ ) or pod2usage_wrapper(2);
59
+ version() if $version;
60
+ pod2usage_wrapper(0) if $help;
61
+ pod2usage_wrapper(-exitstatus => 0, -output => \*STDOUT, -verbose => 2) if $long_help;
62
+
63
+ sub pod2usage_wrapper
64
+ {
65
+ # Like pod2usage, but fall back to a simpler implem in case
66
+ # pod2usage can't be found.
67
+ if (eval {require Pod::Usage;1;} ne 1) {
68
+ print "Please install perldoc and Pod::Usage to get proper help.\n";
69
+ my $started = 0;
70
+ open (my $in, '<', "$0") or die $!;
71
+ while (<$in>) {
72
+ if ($started) {
73
+ print;
74
+ }
75
+ if (/^__END__$/) {
76
+ $started = 1;
77
+ }
78
+ }
79
+ } else {
80
+ Pod::Usage->import();
81
+ pod2usage(@_);
82
+ }
83
+ }
84
+
85
+ sub get_version
86
+ {
87
+ # $VERSION's value will be substituted by 'make dist', but the
88
+ # next line won't (the string has to be broken to avoid it).
89
+ my $VERSION = 'v1.7.2';
90
+ if ($VERSION eq '@LATEXPAND' . '_VERSION@') {
91
+ my($vol,$dir,$file) = File::Spec->splitpath($0);
92
+ chdir($dir);
93
+ $VERSION = `git describe --tags HEAD 2>/dev/null`;
94
+ }
95
+ if ($VERSION eq '') {
96
+ $VERSION = '<unknown version>';
97
+ }
98
+ $VERSION =~ s/^\s+|\s+$//g;
99
+ return $VERSION;
100
+ }
101
+
102
+ sub version
103
+ {
104
+ print "latexpand version ". get_version() .".\n";
105
+ exit(0);
106
+ }
107
+
108
+ my $nl = "";
109
+ if ($empty_comments) {
110
+ $nl = "%\n";
111
+ }
112
+
113
+ if ($output && $output ne "-") {
114
+ open (my $OUTPUT, '>', "$output") or die $!;
115
+ STDOUT->fdopen(\*$OUTPUT, 'w') or die $!;
116
+ }
117
+
118
+ sub say
119
+ {
120
+ if ($verbose) {
121
+ print STDERR "$_[0]";
122
+ }
123
+ }
124
+
125
+ my $makeatletter_found;
126
+ my $in_preamble;
127
+
128
+ use open IN => ":$in_enc", OUT => ":$out_enc";
129
+
130
+ foreach my $file (@ARGV)
131
+ {
132
+ say "processing $file\n";
133
+ $makeatletter_found = 0;
134
+ $in_preamble = 1;
135
+ $inside_import = "";
136
+ if ($file =~ /\.bib$/) {
137
+ warn "WARNING: latexpand is not meant to be used on BibTeX files like '$file'.\n" .
138
+ " Run latexpand on your main .tex file, using '--expand-bbl FILE'\n" .
139
+ " or '--biber FILE' if needed to inline the generated bbl file.\n";
140
+ } elsif (not $file =~ /\.tex$/) {
141
+ warn "WARNING: latexpand is meant to be used on .tex files, which $file isn't.\n";
142
+ }
143
+ process_file($file, " ");
144
+ }
145
+
146
+ sub cat_file
147
+ {
148
+ my $file = shift;
149
+ open (my $INFILE, "<", $file) || die "could not open input file '$file'\n";
150
+ while (<$INFILE>) {
151
+ print;
152
+ }
153
+ close ($INFILE);
154
+ }
155
+
156
+ sub process_file
157
+ {
158
+ my $file = shift;
159
+ my $prefix = (shift || "");
160
+ my $in_comment = 0;
161
+ open(my $FILE, "<", $file) or die "could not open input file '$file'\n";
162
+ my $commented_newline = 0;
163
+ while (my $line = <$FILE>) {
164
+ if ($line =~ /^[ \t]*\\endinput/) {
165
+ # Surprisingly, text after \endinput on the
166
+ # same line is kept in output. Also, add a
167
+ # space (before %), automatically inserted by
168
+ # TeX at the end of file.
169
+ $line =~ s/\\endinput(.*)\n?/$1 % /;
170
+ $in_comment = 1;
171
+ process_line($line, $prefix, \$commented_newline);
172
+ last;
173
+ }
174
+ while (my ($k, $v) = each (%defines))
175
+ {
176
+ $line=~s!\\$k!$v!g;
177
+ }
178
+ process_line($line, $prefix, \$commented_newline, $file);
179
+ if ($line =~ /^%.*[^\n]\z/ || $line =~ /[^\\]%.*[^\n]\z/) {
180
+ # file ends with a comment not ending with a newline
181
+ print "\n";
182
+ }
183
+ # Garbage at end of line after \end{document} is
184
+ # ignored by LaTeX, but we don't allow anything before
185
+ # to avoid e.g. \verb|\end{document}| from terminating
186
+ # the file.
187
+ if (!$keep_comments && $line =~ /^[ \t]*\\end\{document\}/) {
188
+ last;
189
+ }
190
+ }
191
+ close($FILE);
192
+ return $in_comment;
193
+ }
194
+
195
+ sub process_line
196
+ {
197
+ my ($line, $prefix, $commented_newline, $file) = @_;
198
+ $_ = $line;
199
+ if ($$commented_newline) {
200
+ # Leading whitespaces after a comment is ignored.
201
+ # There's no space in:
202
+ # Line 1%
203
+ # Line 2.
204
+ # Match just space and tabs (\s would match \n)
205
+ s/^[ \t]*//;
206
+ if (/^$/) {
207
+ # Deal with:
208
+ #
209
+ # Line 1 % comment
210
+ #
211
+ # Line 2
212
+ #
213
+ # The newline after Line 1 is commented, but we still
214
+ # want a new paragraph. We strip the comment together
215
+ # with its newline, but re-add a newline to change
216
+ # paragraph here if needed:
217
+ print "\n";
218
+ }
219
+ }
220
+ $$commented_newline = 0;
221
+ # Consider \makeatletter only in preamble, because we do want
222
+ # to warn on \someCommand{\makeatletter\command@with@arobase}.
223
+ if ($in_preamble && /^[^%]*\\makeatletter/) {
224
+ $makeatletter_found = 1;
225
+ }
226
+ if ($in_preamble && /^[^%]*\\makeatother/) {
227
+ $makeatletter_found = 0;
228
+ }
229
+ my $command;
230
+ if (!$makeatletter && !$makeatletter_found
231
+ && (($command) = /^[^%]*(\\[[:alpha:]]*@[[:alpha:]]*)/)
232
+ && ($command ne '\@')) {
233
+ print STDERR "Warning: command $command containing @ found in\n";
234
+ print STDERR "Warning: $file.\n";
235
+ print STDERR "Warning: consider using --makeatletter if the result is not compilable.\n";
236
+ }
237
+
238
+ # non-comment is a sequence of:
239
+ # - escaped character (\\.), including \% and \\
240
+ # - neither '%' nor '\'.
241
+ my $NON_COMMENT = '([^\\\\%]|\\\\.)*';
242
+
243
+ unless ($keep_comments) {
244
+ # Special-case for \url{} commands, which may contain '%'
245
+ # characters. It's hard to catch them in $NON_COMMENT since we'd
246
+ # need a regexp so that "\url{foo" can't match as non-comment in
247
+ # the line \url{foo%bar}, but "\url{foo%bar}" would match.
248
+ # Escaping these '%' is not mandatory, but allowed, hence we can
249
+ # pre-process the line by escaping them, and let latexpand work
250
+ # as normal afterwards.
251
+ # Known limitation: latexpand doesn't do balanced braces
252
+ # recognition, and just refuses both { and } within \url{}
253
+ # argument for %-detection to work ([^{}%] below). Fix should be
254
+ # possible using
255
+ # https://stackoverflow.com/questions/15301708/perl-regular-expression-match-nested-brackets
256
+ # but is it worth the trouble? (file an issue or send a merge
257
+ # request if you think it is)
258
+
259
+ # While there are \url{URL} with unescaped % in URL ...
260
+ my $NON_PERCENT = '([^\\}]%|[^{}%])*';
261
+ while (/^(?<before>.*\\url\{)(?<url>$NON_PERCENT[^\\}]%$NON_PERCENT)(?<after>\}.*)$/) {
262
+ my ($before, $url, $after) = ($+{before}, $+{url}, $+{after});
263
+ # escape unescaped % in URL, if any
264
+ $url =~ s/([^\\])%/$1\\%/g;
265
+ $_ = $before . $url . $after ."\n";
266
+ }
267
+ if (!$empty_comments) {
268
+ # Include \n in pattern to avoid matching
269
+ # comments at end of files
270
+
271
+ # remove comments + whitespace-only lines completely
272
+ if (s/^\s*%.*\n//) {
273
+ $$commented_newline = 1;
274
+ }
275
+
276
+ # Special-case commands at end of line. We
277
+ # don't want "\\foo%\nbar" to become
278
+ # "\\foobar" (but we still want \@% to result
279
+ # in no space!)
280
+ if (s/^($NON_COMMENT\\([[:alpha:]]|[[:alpha:]@]{2,}))%.*\n/$1 /) {
281
+ $$commented_newline = 1;
282
+ } elsif (s/^($NON_COMMENT)%.*\n/$1/) {
283
+ # remove only the comment if the line has actual content
284
+ $$commented_newline = 1;
285
+ }
286
+ }
287
+ # Apply the "empty comments" treatment unconditionally
288
+ # for comments not matched above (it doesn't harm to
289
+ # keep an empty comment sometimes, but it may harm to
290
+ # leave a real comment if the goal was to strip them).
291
+ s/^(([^\\%]|\\.)*)%.*$/$1%/;
292
+ }
293
+
294
+ unless ($keep_includes) {
295
+ # \input{foo.tex}
296
+ my $ARGBRACES = '\{\\s*([^"}\\s][^}]*)(\\s*)\}';
297
+ # \input{"foo bar.tex"}
298
+ my $ARGQUOTED = '\{\\s*"([^"]*)"(\\s*)\}';
299
+ # \input foo.tex
300
+ my $ARGSPACES = '\\s([^\{\\s][^\\s]+?)\\s()';
301
+ my $ARGUMENT = "\\s*?(?|$ARGBRACES|$ARGQUOTED|$ARGSPACES)";
302
+
303
+ if (my ($before, $ignored, $full_filename, $trailing, $after)
304
+ = /^($NON_COMMENT)\\include$ARGUMENT(.*)$/) {
305
+ $full_filename = find_tex_file($full_filename . ".tex");
306
+ if ($full_filename) {
307
+ say $prefix . "Found include for file: $full_filename\n";
308
+ print $before . $nl;
309
+ print '\clearpage{}' . $nl;
310
+ print "% start include $full_filename\n" if ($explain);
311
+ my $in_comment = process_file($full_filename, $prefix . " ");
312
+ if ($explain) {
313
+ print " % end include $full_filename\n";
314
+ } elsif ($in_comment) {
315
+ print "\n";
316
+ }
317
+ print '\clearpage{}' . $nl;
318
+ print $nl . $after . "\n";
319
+ $_ = "";
320
+ }
321
+ } elsif (my ($before, $ignored, $full_filename, $trailing, $after)
322
+ = /^($NON_COMMENT)\\input$ARGUMENT(.*)$/) {
323
+ if ($inside_import) {
324
+ $full_filename = $inside_import . $full_filename;
325
+ }
326
+ $full_filename = find_tex_file($full_filename, ":.tex");
327
+ if ($full_filename) {
328
+ say $prefix . "Found input for file: $full_filename\n";
329
+ # Apparently, in some versions of LaTeX, a space
330
+ # after filename in \input{foo.tex } is inserted
331
+ # _before_ the inclusion. That was the case for
332
+ # me when 31fa806 (deal with space after
333
+ # filename in \input and \include, 2019-12-11)
334
+ # was written, but is not anymore, hence we just
335
+ # throw $trailing away.
336
+ print $before . $nl;
337
+ print "% start input $full_filename\n" if ($explain);
338
+ my $in_comment = process_file($full_filename, $prefix . " ");
339
+ if ($explain) {
340
+ print " % end input $full_filename\n";
341
+ } elsif ($in_comment) {
342
+ print "\n";
343
+ }
344
+ if ($after =~ /[^\s]/) {
345
+ # LaTeX produces this space, so let's do it also
346
+ print " " . $nl . $after . "\n";
347
+ } else {
348
+ print " ";
349
+ }
350
+ $_ = "";
351
+ }
352
+ } elsif (my ($before, $ignored, $dir, $ignored, $full_filename, $ignored, $after)
353
+ = /^($NON_COMMENT)\\(?:sub)?import$ARGUMENT$ARGUMENT(.*)$/) {
354
+ if ($explain) {
355
+ print "% dir " . $dir ."\n";
356
+ print "% full_filename " . $full_filename ."\n";
357
+ print "% after " . $after ."\n";
358
+ print "% inside_import $inside_import\n";
359
+ }
360
+ $full_filename = $dir . $full_filename;
361
+ if ($inside_import) {
362
+ $full_filename = $inside_import . $full_filename;
363
+ }
364
+ print "% cat(inside_import,dir,full_filename) " . $full_filename ."\n" if ($explain);
365
+ $full_filename = find_tex_file($full_filename, ":.tex");
366
+ if ($full_filename) {
367
+ say $prefix . "Found input for file: $full_filename\n";
368
+ print $before . $nl;
369
+ print "% start input $full_filename\n" if ($explain);
370
+ my $previous_import_dir = $inside_import;
371
+ $inside_import = $inside_import . $dir;
372
+ my $in_comment = process_file($full_filename, $prefix . " ");
373
+ $inside_import = $previous_import_dir;
374
+ if ($explain) {
375
+ print " % end input $full_filename\n";
376
+ } elsif ($in_comment) {
377
+ print "\n";
378
+ }
379
+ if ($after =~ /[^\s]/) {
380
+ # LaTeX produces this space, so let's do it also
381
+ print " " . $nl . $after . "\n";
382
+ } else {
383
+ print " ";
384
+ }
385
+ $_ = "";
386
+ }
387
+ } elsif (my ($before, $ignored, $args, $full_filename, $ignored, $after)
388
+ = /^($NON_COMMENT)\\includegraphics(\[[^\]]*?\]|)$ARGUMENT(.*)$/) {
389
+ if ($explain) {
390
+ print "% inside_import " . $inside_import ."\n";
391
+ print "% before " . $before ."\n";
392
+ print "% ignored " . $ignored ."\n";
393
+ print "% args " . $args ."\n";
394
+ print "% full_filename " . $full_filename ."\n";
395
+ print "% after " . $after ."\n";
396
+ }
397
+ if ($inside_import) {
398
+ $full_filename = $inside_import . $full_filename;
399
+ print "$before\\includegraphics" . "$args" . "{$full_filename}$after\n";
400
+ $_ = "";
401
+ }
402
+ } elsif (my ($before, $ignored, $args, $full_filename, $ignored, $after)
403
+ = /^($NON_COMMENT)\\lstinputlisting(\[[^\]]*?\]|)$ARGUMENT(.*)$/) {
404
+ if ($explain) {
405
+ print "% inside_import " . $inside_import ."\n";
406
+ print "% before " . $before ."\n";
407
+ print "% ignored " . $ignored ."\n";
408
+ print "% args " . $args ."\n";
409
+ print "% full_filename " . $full_filename ."\n";
410
+ print "% after " . $after ."\n";
411
+ }
412
+ if ($inside_import) {
413
+ $full_filename = $inside_import . $full_filename;
414
+ print "$before\\lstinputlisting" . "$args" . "{$full_filename}$after\n";
415
+ $_ = "";
416
+ }
417
+ }
418
+ }
419
+ if ($expand_usepackage) {
420
+ # Don't bother with before and after text, we just require the
421
+ # usepackage to be alone on its line.
422
+ if (my ($package_name) = /^\s*\\usepackage\{([^\}]*)\}\s*(%.*)?$/) {
423
+ my $full = find_file($package_name . ".sty", $TEXINPUTS);
424
+ if ($full) {
425
+ say $prefix . "Found package file: $full\n";
426
+ process_file($full, $prefix . " ");
427
+ $_ = "";
428
+ # Forget about any commented newline
429
+ # before the \usepackage:
430
+ $$commented_newline = 0;
431
+ } else {
432
+ say $prefix . "Not including external package $package_name\n";
433
+ }
434
+ }
435
+ }
436
+ if ($expand_bbl) {
437
+ if (my ($before, $bib_name, $after)
438
+ = /^(.*)\\(?:bibliography|bibselect)\{([^\}]*)\}(.*)$/) {
439
+ # The BBL file is not necessarily $bib_name.
440
+ # Take it from the command-line.
441
+ print $before . $nl;
442
+ say $prefix . "Expanding BBL file: $expand_bbl\n";
443
+ process_file($expand_bbl, $prefix . " ");
444
+ print " " . $nl . $after . "\n";
445
+ $_ = "";
446
+ }
447
+ }
448
+ if ($biber) {
449
+ if (my ($before, $after)
450
+ = /^(.*)\\(?:addbibresource)\{[^\}]*\}(.*)$/) {
451
+ # See https://tex.stackexchange.com/questions/166518/biblatex-include-bbl-problem-with-verb-field/166526#166526
452
+ my $biber_noext = $biber;
453
+ $biber_noext =~ s/.bbl//;
454
+ print $before . $nl;
455
+ say $prefix . "Expanding Biber BBL file: $biber\n";
456
+ print '\begin{filecontents*}{' . $biber . '}' . "\n";
457
+ cat_file($biber);
458
+ print "\n";
459
+ print '\end{filecontents*}
460
+
461
+ \usepackage{xpatch}
462
+
463
+ %Patch the biblatex input command.
464
+ %replace "testinput-bbl" if you change the name above.
465
+ %disable if you want to run biblatex/biber normally
466
+ \makeatletter
467
+ \patchcmd\blx@bblinput{\blx@blxinit}
468
+ {\blx@blxinit
469
+ \def\jobname{' . $biber_noext . '}%new jobname
470
+ }{}{\fail}
471
+ \makeatother
472
+ ';
473
+ say $prefix . "End expansion of Biber BBL file: $biber\n";
474
+ print " " . $nl . $after . "\n";
475
+ $_ = "";
476
+ }
477
+ }
478
+ if ($show_graphics) {
479
+ if (/\\includegraphics(\[[^\]]*\])?{([^}]*)}/) {
480
+ my $full_filename = $2;
481
+ if ($inside_import) {
482
+ $full_filename = $inside_import . $full_filename;
483
+ }
484
+ my $full = find_tex_file($full_filename, $graphics_extensions);
485
+ say $prefix . "needs graphics file: ";
486
+ print STDERR "$full\n";
487
+ }
488
+ }
489
+ if (/^[ \t]*\\begin\{document\}/) {
490
+ $in_preamble = 0;
491
+ if ($makeatletter) {
492
+ print '\makeatletter' . $nl;
493
+ }
494
+ }
495
+ print;
496
+ }
497
+
498
+ sub unquote
499
+ {
500
+ my $str = shift;
501
+ my $x = substr($str, 0, 1);
502
+ my $y = substr($str, -1, 1);
503
+ if ($x eq $y && ($x eq '"' || $x eq "'")) {
504
+ $str = substr($str, 1, -1);
505
+ }
506
+ # There's a weird LaTeX syntax: \include{"file\space
507
+ # with\space spaces"}, so remove these \space when unquoting.
508
+ $str =~ s/\\space / /g;
509
+ return $str;
510
+ }
511
+
512
+ # search $1 in $TEXINPUTS, with possible extensions in $2
513
+ sub find_tex_file
514
+ {
515
+ my $file = unquote(shift);
516
+ my $extensions = (shift || ":");
517
+ foreach my $ext (split(':', $extensions, -1)) {
518
+ my $full = find_file_global($file . $ext);
519
+ if ($full) {
520
+ return $full;
521
+ }
522
+ }
523
+ if ($fatal) {
524
+ die "ERROR: Could not find file [$file]\n";
525
+ } else {
526
+ print STDERR "Warning: Could not find file [$file]\n";
527
+ return;
528
+ }
529
+ }
530
+
531
+ sub find_file_global
532
+ {
533
+ my $file = shift;
534
+ if (open(my $fh, "-|", "kpsewhich", $file)) {
535
+ my $full = <$fh>;
536
+ $full =~ s/\s+$//;
537
+ close($fh);
538
+ if ($full) {
539
+ return $full;
540
+ }
541
+ }
542
+ # Should be useless, but fall-back in case kpsewhich fails (or is not installed, or ...):
543
+ return find_file($file, $TEXINPUTS);
544
+ }
545
+
546
+ # Find files, not searching for global files (to allow not expanding global .sty packages)
547
+ sub find_file
548
+ {
549
+ my ($file, $path) = @_;
550
+ if (File::Spec->file_name_is_absolute($file)) {
551
+ if (-e "$file" && ! -d "$file") {
552
+ return $file;
553
+ } else {
554
+ return;
555
+ }
556
+ }
557
+
558
+ # TEXINPUTS=...: (trailing :) means "append default search
559
+ # directories". We don't want global directories here, but
560
+ # still add . that may be needed.
561
+ if (substr($path, -1) eq ':') {
562
+ $path .= '.';
563
+ }
564
+ foreach my $dir (split(':', $path)) {
565
+ if (-e "$dir/$file" && ! -d "$dir/$file") {
566
+ return("$dir/$file");
567
+ }
568
+ }
569
+ return;
570
+ }
571
+
572
+
573
+ __END__
574
+
575
+ =head1 NAME
576
+
577
+ latexpand - Flatten LaTeX file by expanding \include and \input, ... and remove comments
578
+
579
+ =head1 SYNOPSIS
580
+
581
+ latexpand [options] FILE...
582
+
583
+ =head2 Options:
584
+
585
+ --verbose show what's going on
586
+ --keep-comments don't strip comments (comments are lines
587
+ starting with %, and anything below
588
+ \end{document})
589
+ --empty-comments keep empty comments (i.e. % at end of lines) for clarity
590
+ --keep-includes don't expand \input and \include directives
591
+ --expand-usepackage
592
+ Expand \usepackage{...} directives if the
593
+ corresponding .sty file is found in
594
+ $TEXINPUTS (or the current directory if
595
+ $TEXINPUTS is not set)
596
+ --expand-bbl FILE
597
+ Expand the bibliography by inlining FILE
598
+ (should be a *.bbl file)
599
+ --biber FILE Include \bibliography{} with FILE's content,
600
+ as needed by biblatex with the biber backend.
601
+ (similar to --expand-bbl FILE, but for
602
+ biber+biblatex).
603
+ --help this help message
604
+ --define <key>=<val>, -d <key>=<val>
605
+ defines a macro key to be replaced by value, e.g.,
606
+ when called with -d foo=bar would replace all occurrences
607
+ of \foo in the code with bar. Can be supplied multiple times.
608
+ --output <file>, -o <file>
609
+ generate output in <file>
610
+ --explain generate explanatory comments in output
611
+ --show-graphics show included graphics
612
+ --graphics_extensions
613
+ colon-separated list of possible graphics extensions
614
+ (used by --show-graphics to find the actual graphics files)
615
+ --fatal Die in case a file can't be found.
616
+ --makeatletter Insert a \makeatletter in the preamble. In some
617
+ rare cases it may break your document, but it
618
+ may help fixing bad interactions between
619
+ @-commands and inclusion (see BUGS section).
620
+ --in-encoding FMT, --out-encoding FMT
621
+ File encoding used by input and output files.
622
+ This uses the same syntax as PerlIO's layers.
623
+ Example:
624
+ --in-encoding 'encoding(UTF-8)'
625
+ The default is 'bytes' and should always work.
626
+
627
+ =head1 USES
628
+
629
+ The most common use of latexpand is to simplify distribution of source
630
+ LaTeX files, typically to satisfy the requirement of editors and
631
+ archival sites (springer, arXiv.org, ...) who force the authors to
632
+ submit sources. One does not necessarily want to submit sources with
633
+ comments, and uploading a document made of several files including
634
+ each other is a bit painful. By default, latexpand answers both
635
+ problems by outputing a single LaTeX file that contain no comment.
636
+
637
+ =head1 GETTING LATEXPAND
638
+
639
+ The latest version of latexpand is available here:
640
+
641
+ https://gitlab.com/latexpand/latexpand
642
+
643
+ Versions are uploaded to ctan.org from time to time:
644
+
645
+ http://www.ctan.org/pkg/latexpand
646
+
647
+ =head1 BUGS
648
+
649
+ Please, report bugs on the issue tracker on the project site:
650
+
651
+ https://gitlab.com/latexpand/latexpand/issues
652
+
653
+ =head2 Known bugs
654
+
655
+ =head3 Verbatim
656
+
657
+ latexpand currently ignores \begin{verbatim} ... \end{verbatim}, and
658
+ will therefore process any \include, \input, ... directives that
659
+ appear within verbatim environments (while it shouldn't).
660
+
661
+ LaTeX comments inside verbatim environments are also incorrectly
662
+ stripped. You can use --keep-comments as a workaround to avoid this.
663
+
664
+ =head3 Comment environment
665
+
666
+ It would be nice to remove code between \begin{comment} and
667
+ \end{comment} too if \usepackage{comment} is used.
668
+
669
+ Code like
670
+
671
+ foo%
672
+ \begin{comment}
673
+
674
+ will produce the incorrect
675
+
676
+ foo\begin{comment}
677
+
678
+ A workaround is to use --empty-comments when such tricky usage of the
679
+ comments package is done.
680
+
681
+ =head3 \makeatletter and use with transfig/xfig with \scalebox{}
682
+
683
+ If \input{} or \include{} appears as argument to a command, and the
684
+ file included contains \makeatletter, then after expansion, the
685
+ \makeatletter and the @-command appear as argument to the command,
686
+ which is forbidden because the argument is parsed (and the @-command
687
+ badly tokenized) before being executed.
688
+
689
+ This happens with
690
+
691
+ \scalebox{ \input{file-generated-by-xfig.pdf_t} }
692
+
693
+ Workaround: add \makeatletter before the scalebox manually in your
694
+ code, like
695
+
696
+ \makeatletter{}
697
+ \scalebox{ \input{file-generated-by-xfig.pdf_t} }
698
+ \makeatother{}
699
+
700
+ In the case of xfig generated files, it is necessary only for the
701
+ first occurrence.
702
+
703
+ A more brute-force workaround is to use latexpand --makeatletter.
704
+
705
+ =head1 SEE ALSO
706
+
707
+ Instructions to include only the relevant .bib items (french):
708
+
709
+ https://lacl.fr/~caubert/notes/portabilite-du-tex.html#dependances
710
+
711
+ =head1 VERSION
712
+
713
+ This is latexpand version v1.7.2.
utils/utils.py ADDED
@@ -0,0 +1,701 @@
1
+ import sys
2
+ import regex
3
+ import yaml
4
+ import shutil
5
+ import bibtexparser
6
+ from charset_normalizer import from_path
7
+ from langdetect import detect
8
+ import os
9
+ import subprocess
10
+ import numpy as np
11
+ import networkx as nx
12
+ import re
13
+
14
+
15
+ def is_venv():
16
+ return (hasattr(sys, 'real_prefix') or
17
+ (hasattr(sys, 'base_prefix') and sys.base_prefix != sys.prefix))
18
+
19
+ def read_yaml_file(file_path):
20
+ with open(file_path, 'r') as file:
21
+ try:
22
+ data = yaml.safe_load(file)
23
+ return data
24
+ except yaml.YAMLError as e:
25
+ print(f"Error reading YAML file: {e}")
26
+
27
+ def read_tex_file(file_path):
28
+ with open(file_path, 'r', encoding='utf-8') as file:
29
+ tex_content = file.read()
30
+ return tex_content
31
+
32
+ def write_tex_file(file_path, s):
33
+ with open(file_path, 'w', encoding='utf-8') as file:
34
+ file.write(s)
35
+
36
+ def get_core(s):
37
+ start = '\\begin{document}'
38
+ end = '\\end{document}'
39
+ beginning_doc = s.find(start)
40
+ end_doc = s.rfind(end)
41
+ return s[beginning_doc+len(start):end_doc]
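+ # Example (illustrative):
+ #   get_core("\\documentclass{x}\\begin{document}Body\\end{document}")
+ #   -> "Body"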
42
+
43
+
44
+ def retrieve_text(text, command, keep_text=False):
45
+ """Removes '\\command{*}' from the string 'text'.
46
+
47
+ Regex `base_pattern` used to match balanced parentheses taken from:
48
+ https://stackoverflow.com/questions/546433/regular-expression-to-match-balanced-parentheses/35271017#35271017
49
+ """
50
+ base_pattern = (
51
+ r'\\' + command + r"(?:\[(?:.*?)\])*\{((?:[^{}]+|\{(?1)\})*)\}(?:\[(?:.*?)\])*"
52
+ )
53
+
54
+ def extract_text_inside_curly_braces(text):
55
+ """Extract text inside of {} from command string"""
56
+ pattern = r"\{((?:[^{}]|(?R))*)\}"
57
+
58
+ match = regex.search(pattern, text)
59
+
60
+ if match:
61
+ return match.group(1)
62
+ else:
63
+ return ""
64
+
65
+ # Loops in case of nested commands that need to retain text, e.g. \red{hello \red{world}}.
66
+ while True:
67
+ all_substitutions = []
68
+ has_match = False
69
+ for match in regex.finditer(base_pattern, text):
70
+ # If only spaces (or nothing) remain up to the following newline,
71
+ # a percent sign is inserted so the newline structure is preserved.
72
+ has_match = True
73
+
74
+ if not keep_text:
75
+ new_substring = ""
76
+ else:
77
+ temp_substring = text[match.span()[0] : match.span()[1]]
78
+ return extract_text_inside_curly_braces(temp_substring)
79
+
80
+ if match.span()[1] < len(text):
81
+ next_newline = text[match.span()[1] :].find("\n")
82
+ if next_newline != -1:
83
+ text_until_newline = text[
84
+ match.span()[1] : match.span()[1] + next_newline
85
+ ]
86
+ if (
87
+ not text_until_newline or text_until_newline.isspace()
88
+ ) and not keep_text:
89
+ new_substring = "%"
90
+ all_substitutions.append((match.span()[0], match.span()[1], new_substring))
91
+
92
+ for start, end, new_substring in reversed(all_substitutions):
93
+ text = text[:start] + new_substring + text[end:]
94
+
95
+ if not keep_text or not has_match:
96
+ break
+ return text  # the input with every \command{...} occurrence removed
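+
+ # Examples (illustrative):
+ #   retrieve_text(r"keep \red{this} text", "red", keep_text=True) -> "this"
+ #   retrieve_text(r"drop \todo{this} note", "todo") -> "drop  note"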
97
+
98
+
99
+ def reduce_linebreaks(s):
100
+ return re.sub(r'(\n[ \t]*)+(\n[ \t]*)+', '\n\n', s)
101
+
102
+
103
+ def replace_percentage(s):
104
+ return re.sub(r'% *\n', '\n', s)
105
+
106
+
107
+ def reduce_spaces(s):
108
+ return re.sub(' +', ' ', s)
109
+
110
+
111
+ def delete_urls(s):
112
+ return re.sub(r'http\S+', '', s)
113
+
114
+
115
+ def remove_tilde(s):
116
+ s1 = re.sub(r'[~ ]\.', '.', s)
117
+ s2 = re.sub(r'[~ ],', ',', s1)
118
+ return re.sub(r'{}', '', s2)
119
+
120
+
121
+ def remove_verbatim_words(s):
122
+ with open("configs/latex_commands.yaml", "r") as stream:
123
+ read_config = yaml.safe_load(stream)
124
+
125
+ for command in read_config['verbatim_to_delete']:
126
+ s = s.replace(command, '')
127
+
128
+ for command in read_config['two_arguments']:
129
+ pattern = r'\\' + command + r'{[^}]*}' + r'{[^}]*}'
130
+ s = re.sub(pattern, '', s)
131
+
132
+ for command in read_config['three_arguments']:
133
+ pattern = r'\\' + command + r'{[^}]*}' + r'{[^}]*}' + r'{[^}]*}'
134
+ s = re.sub(pattern, '', s)
135
+
136
+ for command in read_config['two_arguments_elaborate']:
137
+ s = remove_multargument(s, '\\' + command, 2)
138
+
139
+ for command in read_config['three_arguments_elaborate']:
140
+ s = remove_multargument(s, '\\' + command, 3)
141
+
142
+ for command in read_config['replace_comments']:
143
+ pattern = r'\\' + command
144
+ s = re.sub(pattern, '%', s)
145
+
146
+ s = re.sub(
147
+ r'\\end{[\s]*abstract[\s]*}',
148
+ '',
149
+ s,
150
+ flags=re.IGNORECASE
151
+ )
152
+
153
+ s = re.sub(
154
+ r'\\begin{[\s]*abstract[\s]*}',
155
+ 'Abstract\n\n',
156
+ s,
157
+ flags=re.IGNORECASE
158
+ )
159
+ return s
160
+
161
+
162
+ def yes_or_no(s):
163
+ return 1 if "Yes" == s[0:3] else 0 if "No" == s[0:2] else -1
164
+
165
+
166
+ def get_main(directory):
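+ # In brief: build a directed graph with an edge i -> j whenever .tex
+ # file i inputs/includes .tex file j, keep the largest weakly connected
+ # component, and return a root file that no other file inputs. Ties are
+ # broken by preferring roots that contain \begin{document} (and are
+ # detected as English), then by file size.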
167
+ file_paths = []
168
+ for root, _, files in os.walk(directory):
169
+ for file in files:
170
+ file_path = os.path.join(root, file)
171
+ file_paths.append(file_path)
172
+ latex_paths = [f for f in file_paths if f.endswith('.tex')]
173
+ number_tex = len(latex_paths)
174
+ if number_tex == 0:
175
+ return None
176
+ if number_tex == 1:
177
+ return latex_paths[0]
178
+ adjacency = np.zeros((number_tex, number_tex))
179
+ keys = [os.path.basename(path) for path in latex_paths]
180
+ reg_ex = r'\\input{(.*?)}|\\include{(.*?)}|\\import{(.*?)}|\\subfile{(.*?)}|\\include[*]{(.*?)}'
181
+ for i,file in enumerate(latex_paths):
182
+ content = read_tex_file(file)
183
+ find_pattern_input = re.findall(reg_ex, content)
184
+ find_pattern_input = [tup for tup in find_pattern_input if not all(element == "" for element in tup)]
185
+ number_matches = len(find_pattern_input)
186
+ if number_matches == 0:
187
+ continue
188
+ else:
189
+ content = replace_imports(file, content)
190
+ reg_ex_clean = r'\\input{(.*?)}|\\include{(.*?)}'
191
+ find_pattern_input = re.findall(reg_ex_clean, content)
192
+ number_matches = len(find_pattern_input)
193
+ for j in range(number_matches):
194
+ match = find_pattern_input[j]
195
+ non_empty_match = [t for t in match if t]
196
+ for non_empty in non_empty_match:
197
+ base_match = os.path.basename(non_empty)
198
+ if not base_match.endswith('.tex'):
199
+ base_match = base_match + '.tex'
200
+ if base_match not in keys:
201
+ continue
202
+ ind = keys.index(base_match)
203
+ adjacency[i][ind] = 1
204
+ G = nx.from_numpy_array(adjacency, create_using=nx.DiGraph)
205
+ connected_components = list(nx.weakly_connected_components(G))
206
+ size_connected = [len(x) for x in connected_components]
207
+ maximum_size = max(size_connected)
208
+ biggest_connected = [x for x in connected_components if len(x) == maximum_size]
209
+ if len(biggest_connected) > 1:
210
+ roots = [n for connected in biggest_connected for n in connected if not list(G.predecessors(n))]
211
+ _check = []
212
+ for r in roots:
213
+ try:
214
+ _check.append(check_begin(latex_paths[r]))
215
+ except Exception as e:
216
+ _check.append(False)
217
+ potential_files = [latex_paths[x] for x, y in zip(roots, _check) if y]
218
+ if not potential_files:
+ return None
+ sizes_files = [os.path.getsize(x) for x in potential_files]
219
+ return potential_files[sizes_files.index(max(sizes_files))]
220
+
221
+ else:
222
+ roots = [n for n in biggest_connected[0] if not list(G.predecessors(n))]
223
+ return latex_paths[roots[0]]
224
+
225
+
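+ # Worked sketch of the heuristic above (file names hypothetical): if main.tex
+ # contains \input{intro} and \input{results}, the adjacency matrix gains the
+ # edges main -> intro and main -> results, the largest weakly connected
+ # component is {main, intro, results}, and main.tex is returned as its only
+ # root (a node with no predecessors). When several components tie for the
+ # largest size, the largest (by file size) root that passes check_begin()
+ # is chosen.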
226
+ def initial_clean(directory, config):
227
+ config_cmd = ''
228
+ if config:
229
+ config_cmd = '--config configs/cleaning_config.yaml'
230
+ temp_dir = directory[:directory.rfind('/')] + '_temp' + '/'
231
+ shutil.copytree(directory, temp_dir)
232
+ try:
233
+ command_res = os.system('arxiv_latex_cleaner --keep_bib {} {}'.format(directory, config_cmd))
234
+ if command_res != 0:
235
+ raise Exception('Error cleaning')
236
+ else:
237
+ shutil.rmtree(temp_dir)
238
+
239
+ except Exception as e:
240
+ shutil.rmtree(directory)
241
+ os.rename(temp_dir, directory)
242
+ file_paths = []
243
+ for root, _, files in os.walk(directory):
244
+ for file in files:
245
+ file_path = os.path.join(root, file)
246
+ file_paths.append(file_path)
247
+ latex_paths = [f for f in file_paths if f.endswith('.tex')]
248
+ for p in latex_paths:
249
+ results = from_path(p)
250
+ with open(p, 'w', encoding='utf-8') as f:
251
+ f.write(str(results.best()))
252
+ os.system('arxiv_latex_cleaner --keep_bib {} {}'.format(directory, config_cmd))
253
+ cleaned_directory = directory[:directory.rfind('/')] + '_arXiv'
254
+ shutil.rmtree(directory)
255
+ os.rename(cleaned_directory, directory)
256
+
257
+
258
+ def check_begin(directory):
259
+ content = read_tex_file(directory)
260
+ english = detect(content) == 'en'
261
+ return english if re.findall(r'\\begin{document}', content) else False
262
+
263
+
264
+ def post_processing(extracted_dir, file):
265
+ _dir = os.path.dirname(file) + '/'
266
+ perl_expand(file)
267
+ file = _dir + 'merged_latexpand.tex'
268
+ try:
269
+ de_macro(file)
270
+ file = _dir + 'merged_latexpand-clean.tex'
271
+ except Exception as e:
272
+ pass
273
+ try:
274
+ def_handle(file)
275
+ except Exception as e:
276
+ pass
277
+ try:
278
+ declare_operator(file) # also normalizes titles, citation/ref commands, and section levels
279
+ except Exception as e:
280
+ pass
281
+ try:
282
+ de_macro(file)
283
+ file = _dir + os.path.splitext(os.path.basename(file))[0] + '-clean' + '.tex'
284
+ except Exception as e:
285
+ pass
286
+ initial_clean(_dir, config=True)
287
+ initial_clean(_dir, config=False)
288
+ tex_content = read_tex_file(file)
289
+ final_tex = reduce_spaces(
290
+ delete_urls(
291
+ remove_tilde(
292
+ reduce_linebreaks(
293
+ replace_percentage(
294
+ remove_verbatim_words(
295
+ tex_content
296
+ )
297
+ )
298
+ )
299
+ )
300
+ )
301
+ ).strip()
302
+ shutil.rmtree(extracted_dir)
303
+ os.makedirs(extracted_dir)
304
+ write_tex_file(extracted_dir + 'final_cleaned.tex', final_tex)
305
+ initial_clean(extracted_dir, config=False)
306
+ return extracted_dir + 'final_cleaned.tex'
307
+
308
+
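+ # Hedged usage sketch (paths hypothetical; assumes the src/utils scripts and
+ # arxiv_latex_cleaner are available):
+ #
+ #   main_tex = get_main('papers/2301.00001/')
+ #   if main_tex is not None:
+ #       cleaned_path = post_processing('papers/2301.00001/', main_tex)
+ #   # the source tree is replaced by a single final_cleaned.tex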
309
+ def perl_expand(file):
310
+ # Save the current working directory
311
+ oldpwd = os.getcwd()
312
+ target_dir = os.path.dirname(file) + '/'
313
+ # Construct the destination path for the latexpand copy
314
+ target = os.path.join(target_dir, 'latexpand')
315
+ src = './src/utils/latexpand'
316
+ # Copy the `latexpand` script to the target directory
317
+ shutil.copyfile(src, target)
318
+ # Change to the target directory
319
+ os.chdir(target_dir)
320
+
321
+ # Run the perl command without shell=True and handle redirection within Python
322
+ with open('merged_latexpand.tex', 'w') as output_file:
323
+ subprocess.run(['perl', 'latexpand', os.path.basename(file)],
324
+ stdout=output_file, stderr=subprocess.DEVNULL)
325
+
326
+ # Return to the original directory
327
+ os.chdir(oldpwd)
328
+
329
+
330
+ def de_macro(file):
331
+ # Save the current working directory
332
+ oldpwd = os.getcwd()
333
+ target_dir = os.path.dirname(file) + '/'
334
+ # Construct the target path
335
+ target = os.path.join(target_dir, 'de-macro.py')
336
+ src = './src/utils/de-macro.py'
337
+
338
+ # Copy the `de-macro.py` script to the target directory
339
+ shutil.copyfile(src, target)
340
+ # Change to the target directory
341
+ os.chdir(target_dir)
342
+
343
+ # Run the de-macro script without os.system and capture errors
344
+ try:
345
+ subprocess.run(['python3', 'de-macro.py', os.path.basename(file)],
346
+ stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True)
347
+ except subprocess.CalledProcessError as e:
348
+ raise Exception(f"Error de-macro: {e}") from e
349
+ finally:
350
+ # Always return to the original directory
351
+ os.chdir(oldpwd)
352
+
353
+
354
+ def def_handle(file):
355
+ h = os.system('python3 src/utils/def_handle.py {} --output {}'.format(file, file))
356
+ if h != 0:
357
+ raise Exception('Error def handle')
358
+
359
+
360
+ def declare_operator(file):
361
+ s = read_tex_file(file)
362
+ ## Operators
363
+ pattern = r'\\DeclareMathOperator'
364
+ s = re.sub(pattern, r'\\newcommand', s)
365
+ pattern = {
366
+ r'\\newcommand\*': r'\\newcommand',
367
+ r'\\providecommand\*': r'\\newcommand',
368
+ r'\\providecommand': r'\\newcommand',
369
+ r'\\renewcommand\*': r'\\renewcommand',
370
+ r'\\newenvironment\*': r'\\newenvironment',
371
+ r'\\renewenvironment\*': r'\\renewenvironment'
372
+ }
373
+ s = re.sub(r'\\end +', r'\\end', s)
374
+ for key in pattern:
375
+ s = re.sub(key, pattern[key], s)
376
+ ## Title
377
+ start = '\\begin{document}'
378
+ beginning_doc = s.find(start)
379
+ pattern = {
380
+ r'\\icmltitlerunning\*': r'\\title',
381
+ r'\\icmltitlerunning': r'\\title',
382
+ r'\\inlinetitle\*': r'\\title',
383
+ r'\\icmltitle\*': r'\\title',
384
+ r'\\inlinetitle': r'\\title',
385
+ r'\\icmltitle': r'\\title',
386
+ r'\\titlerunning\*': r'\\title',
387
+ r'\\titlerunning': r'\\title',
388
+ r'\\toctitle': r'\\title',
389
+ r'\\title\*': r'\\title',
390
+ r'\\TITLE\*': r'\\title',
391
+ r'\\TITLE': r'\\title',
392
+ r'\\Title\*': r'\\title',
393
+ r'\\Title': r'\\title',
394
+ }
395
+ for key in pattern:
396
+ s = re.sub(key, pattern[key], s)
397
+ find_potential = s.find('\\title')
398
+
399
+ ## Remove \\
400
+ title_content = retrieve_text(s, 'title', keep_text=True)
401
+ if title_content is not None:
402
+ cleaned_title = re.sub(r'\\\\', ' ', title_content)
403
+ cleaned_title = re.sub(r'\n',' ', cleaned_title)
404
+ cleaned_title = re.sub(r'\~',' ', cleaned_title)
405
+ s = s.replace(title_content, cleaned_title)
406
+ if find_potential != -1 and find_potential < beginning_doc:
407
+ s = s.replace('\\maketitle', cleaned_title)
408
+
409
+ ## Cite and ref commands
410
+ pattern = {
411
+ r'\\citep\*': r'\\cite',
412
+ r'\\citet\*': r'\\cite',
413
+ r'\\citep': r'\\cite',
414
+ r'\\citet': r'\\cite',
415
+ r'\\cite\*': r'\\cite',
416
+ r'\\citealt\*': r'\\cite',
417
+ r'\\citealt': r'\\cite',
418
+ r'\\citealtp\*': r'\\cite',
419
+ r'\\citealp': r'\\cite',
420
+ r'\\citeyear\*': r'\\cite',
421
+ r'\\citeyear': r'\\cite',
422
+ r'\\citeauthor\*': r'\\cite',
423
+ r'\\citeauthor': r'\\cite',
424
+ r'\\citenum\*': r'\\cite',
425
+ r'\\citenum': r'\\cite',
426
+ r'\\cref': r'\\ref',
427
+ r'\\Cref': r'\\ref',
428
+ r'\\factref': r'\\ref',
429
+ r'\\appref': r'\\ref',
430
+ r'\\thmref': r'\\ref',
431
+ r'\\secref': r'\\ref',
432
+ r'\\lemref': r'\\ref',
433
+ r'\\corref': r'\\ref',
434
+ r'\\eqref': r'\\ref',
435
+ r'\\autoref': r'\\ref',
436
+ r'begin{thm}': r'begin{theorem}',
437
+ r'begin{lem}': r'begin{lemma}',
438
+ r'begin{cor}': r'begin{corollary}',
439
+ r'begin{exm}': r'begin{example}',
440
+ r'begin{defi}': r'begin{definition}',
441
+ r'begin{rem}': r'begin{remark}',
442
+ r'begin{prop}': r'begin{proposition}',
443
+ r'end{thm}': r'end{theorem}',
444
+ r'end{lem}': r'end{lemma}',
445
+ r'end{cor}': r'end{corollary}',
446
+ r'end{exm}': r'end{example}',
447
+ r'end{defi}': r'end{definition}',
448
+ r'end{rem}': r'end{remark}',
449
+ r'end{prop}': r'end{proposition}',
450
+ }
451
+
452
+ for key in pattern:
453
+ s = re.sub(key, pattern[key], s)
454
+
455
+
456
+ pattern = {
457
+ r'subsubsection': r'section',
458
+ r'subsubsection ': r'section',
459
+ r'subsubsection\*': r'section',
460
+ r'subsubsection\* ': r'section',
461
+ r'subsection': r'section',
462
+ r'subsection ': r'section',
463
+ r'subsection\*': r'section',
464
+ r'subsection\* ': r'section',
465
+ r'section ': r'section',
466
+ r'section\*': r'section',
467
+ r'section\* ': r'section',
468
+ r'chapter': r'section',
469
+ r'chapter ': r'section',
470
+ r'chapter\*': r'section',
471
+ r'chapter\* ': r'section',
472
+ r'mysubsubsection': r'section',
473
+ r'mysubsection': r'section',
474
+ r'mysection': r'section',
475
+ }
476
+
477
+ for key in pattern:
478
+ s = re.sub(key, pattern[key], s)
479
+
480
+ # In case any new commands for appendix/appendices
481
+ s = re.sub(r'\\newcommand{\\appendix}', '', s)
482
+ s = re.sub(r'\\newcommand{\\appendices}', '', s)
483
+ s = get_core(s)
484
+
485
+ ## In case of double titles being defined
486
+ title_content = retrieve_text(s, 'title', keep_text=True)
487
+ if title_content is not None:
488
+ cleaned_title = re.sub(r'\\\\', ' ', title_content)
489
+ cleaned_title = re.sub(r'\n',' ', cleaned_title)
490
+ cleaned_title = re.sub(r'\~',' ', cleaned_title)
491
+ s = s.replace(title_content, cleaned_title)
492
+ write_tex_file(file, s)
493
+
494
+
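+ # Examples of the normalizations above (inputs hypothetical):
+ #   \DeclareMathOperator{\argmax}{arg\,max} -> \newcommand{\argmax}{arg\,max}
+ #   \icmltitle{A Study}                     -> \title{A Study}
+ #   \subsection*{Setup}                     -> \section{Setup}
+ #   \citep{xyz} / \cref{fig:a}              -> \cite{xyz} / \ref{fig:a}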
495
+ def replace_imports(file, s):
496
+ regex_p1 = r'\\import{(.*?)}{(.*?)}'
497
+ s = re.sub(regex_p1, r"\\input{\1\2}", s)
498
+ regex_p2 = r'\\subfile{(.*?)}'
499
+ s = re.sub(regex_p2, r"\\input{\1}", s)
500
+ regex_p3 = r'\\include[*]{(.*?)}'
501
+ s = re.sub(regex_p3, r"\\input{\1}", s)
502
+ write_tex_file(file, s)
503
+ return s
504
+
505
+
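+ # Examples of the rewrites above (inputs hypothetical):
+ #   \import{sections/}{intro.tex} -> \input{sections/intro.tex}
+ #   \subfile{appendix}            -> \input{appendix}
+ #   \include*{results}            -> \input{results}
+ # Note \import's two groups are simply concatenated, so the first argument
+ # must already end in a slash for the joined path to resolve.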
506
+ def remove_multargument(s, target, k):
507
+ ind = s.find(target)
508
+ while ind != -1:
509
+ start_ind = ind + len(target)
510
+ stack_open = 0
511
+ stack_close = 0
512
+ track_arg = 0
513
+ for i, char in enumerate(s[start_ind:]):
514
+ if char == '{':
515
+ stack_open += 1
516
+ if char == '}':
517
+ stack_close += 1
518
+ if stack_open != 0 and stack_close != 0:
519
+ if stack_open == stack_close:
520
+ track_arg += 1
521
+ stack_open = 0
522
+ stack_close = 0
523
+ if track_arg == k:
524
+ break
525
+ s = s[:ind] + s[start_ind + i + 1:]
526
+ ind = s.find(target)
527
+ return s
528
+
529
+
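+ # Worked example for the brace matcher above (input hypothetical):
+ #
+ #   s = r"before \texorpdfstring{$x^2$}{x squared} after"
+ #   remove_multargument(s, '\\texorpdfstring', 2)  # -> "before  after"
+ #
+ # The open/close counters only reset once a whole {...} group balances, so
+ # nested braces inside an argument are consumed correctly, which the plain
+ # {[^}]*} regex used for the simpler config entries cannot do.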
530
+ def fix_citations(s):
531
+ pattern = {
532
+ r'\\citep\*': r'\\cite',
533
+ r'\\citet\*': r'\\cite',
534
+ r'\\citep': r'\\cite',
535
+ r'\\citet': r'\\cite',
536
+ r'\\cite\*': r'\\cite',
537
+ r'\\citealt\*': r'\\cite',
538
+ r'\\citealt': r'\\cite',
539
+ r'\\citealtp\*': r'\\cite',
540
+ r'\\citealp': r'\\cite',
541
+ r'\\citeyear\*': r'\\cite',
542
+ r'\\citeyear': r'\\cite',
543
+ r'\\citeauthor\*': r'\\cite',
544
+ r'\\citeauthor': r'\\cite',
545
+ r'\\citenum\*': r'\\cite',
546
+ r'\\citenum': r'\\cite'
547
+ }
548
+ for key in pattern:
549
+ s = re.sub(key, pattern[key], s)
550
+ return s
551
+
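+ # Example of the normalization above (input hypothetical):
+ #   fix_citations(r"\citep{a} and \citeauthor*{b}") -> r"\cite{a} and \cite{b}"
+ # The starred variants are listed before their unstarred prefixes so that
+ # e.g. \citep* collapses to \cite rather than \cite*.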
552
+ def find_bib(directory):
553
+ file_paths = []
554
+ for root, _, files in os.walk(directory):
555
+ for file in files:
556
+ file_path = os.path.join(root, file)
557
+ file_paths.append(file_path)
558
+ bib_paths = [f for f in file_paths if f.endswith('.bib')]
559
+ return bib_paths
560
+
561
+ def create_bib_from_bbl(bibfile):
562
+ with open(bibfile, 'r') as f:
563
+ content = f.read()
564
+ library_raw = bibtexparser.parse_string(content)
565
+ library = {}
566
+ for block in library_raw.blocks:
567
+ if isinstance(
568
+ block,
569
+ (bibtexparser.model.DuplicateBlockKeyBlock, bibtexparser.model.ParsingFailedBlock, bibtexparser.model.ImplicitComment)
570
+ ):
571
+ continue
572
+ fields = {}
573
+ for field in block.fields:
574
+ fields[field.key] = field.value
575
+
576
+ ## Recover a clean title from the note field ##
577
+ field_content = fields["note"]
578
+ field_content = field_content.replace("\n", " ")
579
+ field_content = re.sub(" +", " ", field_content)
580
+ if field_content.find("``") != -1 and field_content.find("\'\'") != -1:
581
+ title = (
582
+ field_content[field_content.find("``") + 2 : field_content.find("\'\'")]
583
+ .replace("\\emph", "")
584
+ .replace("\\emp", "")
585
+ .replace("\\em", "")
586
+ .replace(",", "")
587
+ .replace("{", "")
588
+ .replace("}","")
589
+ .replace("``", "")
590
+ .replace("\'\'", "")
591
+ .strip(".")
592
+ .strip()
593
+ .strip(".")
594
+ .lower()
595
+ )
596
+ fields['title'] = title
597
+ else:
598
+ if field_content.count("\\newblock") == 2:
599
+ field_content = field_content.replace("\\newblock", "``", 1)
600
+ field_content = field_content.replace("\\newblock", "\'\'", 1)
601
+ if field_content.find("``") != -1 and field_content.find("\'\'") != -1:
602
+ title = (
603
+ field_content[field_content.find("``") + 2 : field_content.find("\'\'")]
604
+ .replace("\\emph", "")
605
+ .replace("\\emp", "")
606
+ .replace("\\em", "")
607
+ .replace(",", "")
608
+ .replace("{", "")
609
+ .replace("}","")
610
+ .replace("``", "")
611
+ .replace("\'\'", "")
612
+ .strip(".")
613
+ .strip()
614
+ .strip(".")
615
+ .lower()
616
+ )
617
+ fields['title'] = title
618
+ library[block.key] = fields
619
+ return library
620
+
621
+
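+ # Worked sketch of the title recovery above (entry hypothetical): a tex2bib
+ # entry whose note field is
+ #   "A. Author. \newblock Great {Results}, \newblock 2020."
+ # contains no ``...'' pair but exactly two \newblock markers, so they are
+ # rewritten to `` and '', and the extracted title becomes "great results".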
622
+ def create_bib(bibfile):
623
+ with open(bibfile, 'r') as f:
624
+ content = f.read()
625
+ library_raw = bibtexparser.parse_string(content)
626
+
627
+ library = {}
628
+ for block in library_raw.blocks:
629
+ if isinstance(
630
+ block,
631
+ (bibtexparser.model.DuplicateBlockKeyBlock, bibtexparser.model.ParsingFailedBlock, bibtexparser.model.ImplicitComment)
632
+ ):
633
+ continue
634
+ fields = {}
635
+ for field in block.fields:
636
+ fields[field.key] = field.value.replace('{', '').replace('}', '')
637
+ if field.key == 'title':
638
+ title = re.sub(r'[\n]+', ' ', field.value) # collapse newline runs into spaces
639
+ title = re.sub(r' +', ' ', title)
640
+ fields[field.key] = (
641
+ title.replace("\\emph", "")
642
+ .replace("\\emp", "")
643
+ .replace("\\em", "")
644
+ .replace(",", "")
645
+ .replace("{", "")
646
+ .replace("}", "")
647
+ .strip(".")
648
+ .strip()
649
+ .strip(".")
650
+ .lower()
651
+ )
652
+ if 'title' not in fields:
653
+ continue
654
+ library[block.key] = fields
655
+ return library
656
+
657
+
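+ # Hedged usage sketch (file name hypothetical; assumes bibtexparser v2, whose
+ # parse_string / block model the code above relies on):
+ #
+ #   library = create_bib('refs.bib')
+ #   # maps citation keys to field dicts with a normalized lower-cased
+ #   # 'title'; failed/duplicate blocks and entries without a title are skipped.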
658
+ def find_bbl(directory):
659
+ file_paths = []
660
+ for root, _, files in os.walk(directory):
661
+ for file in files:
662
+ file_path = os.path.join(root, file)
663
+ file_paths.append(file_path)
664
+ bib_paths = [f for f in file_paths if f.endswith('.bbl')]
665
+ return bib_paths
666
+
667
+
668
+ def textobib(file):
669
+ oldpwd = os.getcwd()
670
+ target_dir = os.path.dirname(file) + '/'
671
+ target = target_dir + 'tex2bib'
672
+ src = './tex2bib'
673
+ shutil.copyfile(src, target)
674
+ os.chdir(target_dir)
675
+ output_file = os.path.splitext(os.path.basename(file))[0] + '.bib'
676
+ os.system('perl tex2bib -i {} -o {}'.format(os.path.basename(file), output_file))
677
+ os.chdir(oldpwd)
678
+ return target_dir + output_file
679
+
680
+
681
+ def get_library_bib(bib_files):
682
+ library = []
683
+ for bib_file in bib_files:
684
+ library.append(create_bib(bib_file))
685
+ final_library = {}
686
+ for d in library:
687
+ final_library.update(d)
688
+ return final_library
689
+
690
+
691
+ def get_library_bbl(bbl_files):
692
+ bib_files = []
693
+ for bbl_file in bbl_files:
694
+ bib_files.append(textobib(bbl_file))
695
+ library = []
696
+ for bib_file in bib_files:
697
+ library.append(create_bib_from_bbl(bib_file))
698
+ final_library = {}
699
+ for d in library:
700
+ final_library.update(d)
701
+ return final_library