Spaces:
Amy Roberts
committed on
Commit 9b744c5
1 Parent(s): c9976b6
Draft
Browse files
- .gitignore +167 -0
- app.py +101 -0
- build_embeddings.py +117 -0
- build_issue_dict.py +19 -0
- defaults.py +5 -0
- find_similar_issues.py +91 -0
- get_issues.py +129 -0
- get_topic.py +43 -0
- retrieval.py +76 -0
- update_embeddings.py +109 -0
- update_stored_issues.py +142 -0
.gitignore
ADDED
@@ -0,0 +1,167 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

# Data
*.json
*.png
*.npy
*.jpg
*.pdf
app.py
ADDED
@@ -0,0 +1,101 @@
import os
import io

import gradio as gr
import requests
from html2image import Html2Image

from defaults import OWNER, REPO, TOKEN
from find_similar_issues import get_similar_issues

hti = Html2Image(size=(1920, 1080 * 3))


def get_query_issue_information(issue_no, token):
    headers = {
        "Accept": "application/vnd.github+json",
        # NOTE: GitHub expects a scheme here, i.e. "Bearer <token>" or "token <token>"
        "Authorization": f"{token}",
        "X-GitHub-Api-Version": "2022-11-28",
        "User-Agent": "amyeroberts",
    }
    request = requests.get(
        f"https://api.github.com/repos/{OWNER}/{REPO}/issues/{issue_no}",
        headers=headers,
    )
    if request.status_code != 200:
        raise ValueError(f"Request failed with status code {request.status_code} and message {request.text}")

    return request.json()


def find_similar_issues(issue, token):
    similar_issues = get_similar_issues(issue, token=token)
    similar_issues_summary = [f"#{issue['number']} - {issue['title']}" for issue in similar_issues]
    return similar_issues_summary


def render_issue_as_image(issue, filename="image.png"):
    url = issue["html_url"]
    print(url)
    hti.screenshot(url=url, save_as=filename)
    return filename


def run_find_similar_issues(issue, token, n_issues):
    issue_information = get_query_issue_information(issue, token=token)
    # issue_information_summary = f"#{issue_information['number']} - {issue_information['title']}\n\n{issue_information['body']}"
    similar_issues = get_similar_issues(issue, token=token, top_k=n_issues)
    # similar_issues_summary = [f"#{issue['number']} - {issue['title']}" for issue in similar_issues]

    issue_image = render_issue_as_image(issue_information, filename="query_issue.png")

    image_names = []
    for i, similar_issue in enumerate(similar_issues):
        image_names.append(render_issue_as_image(similar_issue, filename=f"image{i}.png"))

    # return issue_information_summary, image_names
    page_html = requests.get(issue_information["html_url"]).text

    return issue_image, page_html, image_names


with gr.Blocks(title="Github Bot") as demo:
    with gr.Tab("Find similar issues"):
        with gr.Row():
            with gr.Column():
                with gr.Row():
                    issue = gr.Textbox(label="Github Issue", placeholder="Github issue you want to find similar issues to")
                    token = gr.Textbox(label="Github Token", placeholder="Your github token for authentication. This is not stored anywhere.")
                with gr.Row():
                    n_issues = gr.Slider(1, 50, value=5, label="Number of similar issues", info="Choose between 1 and 50")

                with gr.Row():
                    submit_button = gr.Button(value="Submit")

        with gr.Row():
            with gr.Column():
                issue_image = gr.Image(type="filepath", label="Your issue")
            with gr.Column():
                similar_issues_screenshots = gr.Gallery(label="Similar Issues")
                issue_text = gr.HTML(label="Issue text", elem_id="issue_text")
        submit_button.click(run_find_similar_issues, outputs=[issue_image, issue_text, similar_issues_screenshots], inputs=[issue, token, n_issues])

    with gr.Tab("Search issues"):
        with gr.Row():
            query = gr.Textbox(label="Query", placeholder="Search for issues")
        with gr.Row():
            token = gr.Textbox(label="Github Token", placeholder="Your github token for authentication. This is not stored anywhere.")
        with gr.Row():
            pass

    with gr.Tab("Find maintainers to ping"):
        with gr.Row():
            issue = gr.Textbox(label="Github Issue / PR", placeholder="Issue or PR you want to find maintainers to ping for")
        with gr.Row():
            token = gr.Textbox(label="Github Token", placeholder="Your github token for authentication. This is not stored anywhere.")


if __name__ == "__main__":
    demo.launch()
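app.py assumes the data files built by the scripts below already exist on disk. A minimal preparation sketch, using only functions and default file names defined in this commit (the small n_pages run is an illustrative smoke test, not part of the app):

from get_issues import get_issues
from build_issue_dict import build_json_file
from build_embeddings import embed_issues

get_issues(overwrite=True)                          # writes issues.json, one JSON object per line
build_json_file("issues.json", "issues_dict.json")  # re-keys the records by issue number
embed_issues("issues_dict.json", "all-mpnet-base-v2", "issue")  # writes issue_embeddings.npy + embedding_index_to_issue.json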
build_embeddings.py
ADDED
@@ -0,0 +1,117 @@
import argparse
import json
import logging
import os

import numpy as np
from sentence_transformers import SentenceTransformer

logging.basicConfig(level=logging.INFO)

logger = logging.getLogger(__name__)


def load_model(model_id: str):
    return SentenceTransformer(model_id)


class EmbeddingWriter:
    def __init__(self, output_embedding_filename, output_index_filename, update, embedding_to_issue_index) -> None:
        self.output_embedding_filename = output_embedding_filename
        self.output_index_filename = output_index_filename
        self.embeddings = []
        self.embedding_to_issue_index = embedding_to_issue_index
        self.update = update

    def __enter__(self):
        return self.embeddings

    def __exit__(self, exc_type, exc_val, exc_tb):
        if len(self.embeddings) == 0:
            return

        embeddings = np.array(self.embeddings)

        if self.update and os.path.exists(self.output_embedding_filename):
            embeddings = np.concatenate([np.load(self.output_embedding_filename), embeddings])

        logger.info(f"Saving embeddings to {self.output_embedding_filename}")
        np.save(self.output_embedding_filename, embeddings)

        logger.info(f"Saving embedding index to {self.output_index_filename}")
        with open(self.output_index_filename, "w") as f:
            json.dump(self.embedding_to_issue_index, f, indent=4)


def embed_issues(
    input_filename: str,
    model_id: str,
    issue_type: str,
    n_issues: int = -1,
    update: bool = False
):
    model = load_model(model_id)

    output_embedding_filename = f"{issue_type}_embeddings.npy"
    output_index_filename = f"embedding_index_to_{issue_type}.json"

    with open(input_filename, "r") as f:
        issues = json.load(f)

    if update and os.path.exists(output_index_filename):
        with open(output_index_filename, "r") as f:
            embedding_to_issue_index = json.load(f)
        embedding_index = len(embedding_to_issue_index)
    else:
        embedding_to_issue_index = {}
        embedding_index = 0

    max_issues = n_issues if n_issues > 0 else len(issues)
    n_issues = 0

    with EmbeddingWriter(
        output_embedding_filename=output_embedding_filename,
        output_index_filename=output_index_filename,
        update=update,
        embedding_to_issue_index=embedding_to_issue_index
    ) as embeddings:  # , embedding_to_issue_index:
        for issue_id, issue in issues.items():
            if n_issues >= max_issues:
                break

            if issue_id in embedding_to_issue_index.values() and update:
                logger.info(f"Skipping issue {issue_id} as it is already embedded")
                continue

            if "body" not in issue:
                logger.info(f"Skipping issue {issue_id} as it has no body")
                continue

            if issue_type == "pull_request" and "pull_request" not in issue:
                logger.info(f"Skipping issue {issue_id} as it is not a pull request")
                continue

            elif issue_type == "issue" and "pull_request" in issue:
                logger.info(f"Skipping issue {issue_id} as it is a pull request")
                continue

            title = issue["title"] if issue["title"] is not None else ""
            body = issue["body"] if issue["body"] is not None else ""

            logger.info(f"Embedding issue {issue_id}")
            embedding = model.encode(title + "\n" + body)
            embedding_to_issue_index[embedding_index] = issue_id
            embeddings.append(embedding)
            embedding_index += 1
            n_issues += 1


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # choices must match the "issue"/"pull_request" checks above; nargs="?" makes the default reachable
    parser.add_argument("issue_type", nargs="?", choices=["issue", "pull_request"], default="issue")
    parser.add_argument("--input_filename", type=str, default="issues_dict.json")
    parser.add_argument("--model_id", type=str, default="all-mpnet-base-v2")
    parser.add_argument("--n_issues", type=int, default=-1)
    parser.add_argument("--update", action="store_true")
    args = parser.parse_args()
    embed_issues(**vars(args))
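EmbeddingWriter defers all disk I/O to __exit__: the with block yields a plain list, and whatever was appended to it (plus the index dict passed in) is persisted when the block closes, even if the loop body raised. A minimal sketch of the pattern; the demo file names are illustrative, and 768 is the output width of all-mpnet-base-v2:

import numpy as np
from build_embeddings import EmbeddingWriter

index = {}
with EmbeddingWriter("demo_embeddings.npy", "demo_index.json", update=False,
                     embedding_to_issue_index=index) as embs:
    embs.append(np.zeros(768))  # stand-in for model.encode(...)
    index[0] = "1234"           # hypothetical issue number
# on exit: demo_embeddings.npy holds a (1, 768) array, demo_index.json holds {"0": "1234"}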
build_issue_dict.py
ADDED
@@ -0,0 +1,19 @@
import argparse
import json


def build_json_file(input_filename, output_filename):
    with open(input_filename, "r") as f:
        json_lines = f.readlines()

    issues = [json.loads(line) for line in json_lines]
    json_dict = {issue["number"]: issue for issue in issues}

    with open(output_filename, "w") as f:
        json.dump(json_dict, f, indent=4)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_filename", type=str, default="issues.json")
    parser.add_argument("--output_filename", type=str, default="issues_dict.json")
    args = parser.parse_args()
    build_json_file(**vars(args))
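One subtlety: the dict is keyed by the integer issue number in memory, but json.dump writes object keys as strings, so readers of issues_dict.json (e.g. find_similar_issues.py) index it with str(number). A quick check, with a hypothetical record:

import json

record = {181: {"title": "Draft"}}  # hypothetical issue number
round_tripped = json.loads(json.dumps(record))
assert "181" in round_tripped and 181 not in round_tripped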
defaults.py
ADDED
@@ -0,0 +1,5 @@
import os

OWNER = "huggingface"
REPO = "transformers"
TOKEN = os.environ.get("GITHUB_TOKEN")
find_similar_issues.py
ADDED
@@ -0,0 +1,91 @@
import json
import argparse
import requests
from defaults import OWNER, REPO, TOKEN
from sentence_transformers import SentenceTransformer
import numpy as np

model_id = "all-mpnet-base-v2"
model = SentenceTransformer(model_id)


def load_embeddings():
    """
    Function to load embeddings from file
    """
    embeddings = np.load("issue_embeddings.npy")
    return embeddings


def load_issue_information():
    """
    Function to load issue information from file
    """
    with open("embedding_index_to_issue.json", "r") as f:
        embedding_index_to_issue = json.load(f)

    with open("issues_dict.json", "r") as f:
        issues = json.load(f)

    return embedding_index_to_issue, issues


def cosine_similarity(a, b):
    if a.ndim == 1:
        a = a.reshape(1, -1)

    if b.ndim == 1:
        b = b.reshape(1, -1)

    return np.dot(a, b.T) / (np.linalg.norm(a, axis=1) * np.linalg.norm(b, axis=1))


def get_similar_issues(issue_no, top_k=5, token=TOKEN, owner=OWNER, repo=REPO):
    """
    Function to find similar issues
    """
    headers = {
        "Accept": "application/vnd.github+json",
        "Authorization": f"{token}",  # was f"Authorization": "{token}", which sent the literal string "{token}"
        "X-GitHub-Api-Version": "2022-11-28",
        "User-Agent": "amyeroberts",
    }
    request = requests.get(
        # use the function's owner/repo arguments rather than the module-level defaults
        f"https://api.github.com/repos/{owner}/{repo}/issues/{issue_no}",
        headers=headers,
    )

    if request.status_code != 200:
        raise ValueError(f"Request failed with status code {request.status_code}")

    query_embedding = model.encode(request.json()["body"])
    query_embedding = query_embedding.reshape(1, -1)
    embeddings = load_embeddings()

    # Calculate the cosine similarity between the query and all the issues
    cosine_similarities = cosine_similarity(query_embedding, embeddings)

    # Get the indices of the most similar issues, best match first
    most_similar_indices = np.argsort(cosine_similarities)
    most_similar_indices = most_similar_indices[0][::-1]

    embedding_index_to_issue, issues = load_issue_information()

    similar_issues = []
    for i in most_similar_indices[:top_k]:
        similar_issue_no = embedding_index_to_issue[str(i)]
        similar_issues.append(issues[similar_issue_no])

    return similar_issues


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("issue_no", type=int)
    parser.add_argument("--top_k", type=int, default=5)
    parser.add_argument("--token", type=str, default=TOKEN)
    parser.add_argument("--owner", type=str, default=OWNER)
    parser.add_argument("--repo", type=str, default=REPO)
    args = parser.parse_args()
    get_similar_issues(args.issue_no, args.top_k, args.token, args.owner, args.repo)
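A minimal usage sketch; the issue number is hypothetical, and a valid GITHUB_TOKEN plus the embedding and index files from build_embeddings.py must already exist:

from find_similar_issues import get_similar_issues

for similar in get_similar_issues(28867, top_k=3):  # 28867 is a hypothetical issue number
    print(f"#{similar['number']} - {similar['title']}")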
get_issues.py
ADDED
@@ -0,0 +1,129 @@
import argparse
import datetime
import json
import logging
import os

import numpy as np
import requests

logging.basicConfig(level=logging.INFO)

logger = logging.getLogger(__name__)

today = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")

OWNER = "huggingface"
REPO = "transformers"
GITHUB_API_VERSION = "2022-11-28"
TOKEN = os.environ.get("GITHUB_TOKEN")
JSON_FILE = "issues.json"
UPDATE_FILE = False
OVERWRITE_FILE = True


def get_last_entry(file_path):
    with open(file_path, "r") as file:
        # Read the last line
        last_line = file.readlines()[-1]
        return json.loads(last_line)


def get_last_issue_number(file_path):
    if os.path.exists(file_path):
        last_entry = get_last_entry(file_path=file_path)
        return last_entry["number"]
    return 0


def get_issues(
    overwrite=OVERWRITE_FILE,
    update=UPDATE_FILE,
    output_filename=JSON_FILE,
    github_api_version=GITHUB_API_VERSION,
    owner=OWNER,
    repo=REPO,
    token=TOKEN,
    n_pages=-1,
):
    """
    Function to get the issues from the transformers repo and save them to a json file
    """

    # If the file exists and we want to overwrite it, delete it
    if os.path.exists(output_filename) and overwrite:
        logger.info(f"Deleting file {output_filename}")
        os.remove(output_filename)

    # Define the URL and headers
    url = f"https://api.github.com/repos/{owner}/{repo}/issues"
    headers = {
        "Accept": "application/vnd.github+json",
        "Authorization": f"{token}",
        "X-GitHub-Api-Version": f"{github_api_version}",
        "User-Agent": "amyeroberts",
    }
    last_issue_number = get_last_issue_number(file_path=output_filename)
    per_page = 100
    page = last_issue_number // per_page + 1
    query_params = {
        "state": "all",
        "per_page": per_page,
        "sort": "created",
        "direction": "asc",
        "page": page,
    }

    if os.path.exists(output_filename) and not update and not overwrite:
        raise ValueError(f"File {output_filename} already exists")

    page_limit = (n_pages + page) if n_pages > 0 else np.inf
    while True:
        if page >= page_limit:
            break

        # Send the GET request
        response = requests.get(url, headers=headers, params=query_params)

        if not response.status_code == 200:
            raise ValueError(
                f"Request failed with status code {response.status_code} and message {response.text}"
            )

        json_response = response.json()
        logger.info(f"Page: {page}, number of issues: {len(json_response)}")

        # If we get an empty response, we've reached the end of the issues
        if len(json_response) == 0:
            break

        with open(output_filename, "a") as f:
            for value in json_response:
                if value["number"] <= last_issue_number:
                    continue
                json.dump(value, f)
                f.write("\n")

        if len(json_response) < per_page:
            break

        page += 1
        query_params["page"] = page

    return output_filename


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # store_true with default=True made --update a no-op; default now matches UPDATE_FILE
    parser.add_argument("--update", action="store_true", default=UPDATE_FILE)
    parser.add_argument("--overwrite", action="store_true", default=False)
    parser.add_argument("--output_filename", type=str, default=JSON_FILE)
    parser.add_argument("--github_api_version", type=str, default=GITHUB_API_VERSION)
    parser.add_argument("--owner", type=str, default=OWNER)
    parser.add_argument("--repo", type=str, default=REPO)
    parser.add_argument("--token", type=str, default=TOKEN)
    parser.add_argument("--n_pages", type=int, default=-1)
    args = parser.parse_args()
    get_issues(**vars(args))
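A quick check of the resume arithmetic above, with a hypothetical last issue number. This leans on the endpoint returning both issues and PRs in ascending creation order, so a record's number roughly matches its position in the listing:

per_page = 100
last_issue_number = 250  # hypothetical
page = last_issue_number // per_page + 1
assert page == 3  # the page expected to contain issue #250; entries up to #250
                  # on it are then skipped by the `number <= last_issue_number` guard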
get_topic.py
ADDED
@@ -0,0 +1,43 @@
# NOTE: draft script; `issue` below is assumed to already hold one issue dict
# (e.g. an entry loaded from issues_dict.json).

from transformers import AutoTokenizer, LlamaForCausalLM

topic_maintainers_map = {
    "text models": ["@ArthurZucker", "@younesbelkada"],
    "vision models": "@amyeroberts",
    "speech models": "@sanchit-gandhi",
    "graph models": "@clefourrier",
    "flax": "@sanchit-gandhi",
    "generate": "@gante",
    "pipelines": "@Narsil",
    "tensorflow": ["@gante", "@Rocketknight1"],
    "tokenizers": "@ArthurZucker",
    "trainer": ["@muellerzr", "@pacman100"],
    "deepspeed": "@pacman100",
    "ray/raytune": ["@richardliaw", "@amogkam"],
    "Big Model Inference": "@SunMarc",
    "quantization (bitsandbytes, autogpt)": ["@SunMarc", "@younesbelkada"],
    "Documentation": ["@stevhliu", "@MKhalusova"],
    "accelerate": "different repo",
    "datasets": "different repo",
    "diffusers": "different repo",
    "rust tokenizers": "different repo",
    "Flax examples": "@sanchit-gandhi",
    "PyTorch vision examples": "@amyeroberts",
    "PyTorch text examples": "@ArthurZucker",
    "PyTorch speech examples": "@sanchit-gandhi",
    "PyTorch generate examples": "@gante",
    "TensorFlow": "@Rocketknight1",
    "Research projects and examples": "not maintained",
}


model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

prompt = f"Which of the following topics {list(topic_maintainers_map.keys())} is this issue about:\n{issue['body']}"
inputs = tokenizer(prompt, return_tensors="pt")

# Generate
generate_ids = model.generate(inputs.input_ids, max_length=30)
tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
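A minimal sketch of providing the `issue` this draft assumes; the issue number is hypothetical, and the keys in issues_dict.json are strings:

import json

with open("issues_dict.json", "r") as f:
    issues = json.load(f)

issue = issues["28867"]  # hypothetical issue number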
retrieval.py
ADDED
@@ -0,0 +1,76 @@
import argparse
import json

import numpy as np
from sentence_transformers import SentenceTransformer


def cosine_similarity(a, b):
    if a.ndim == 1:
        a = a.reshape(1, -1)

    if b.ndim == 1:
        b = b.reshape(1, -1)

    return np.dot(a, b.T) / (np.linalg.norm(a, axis=1) * np.linalg.norm(b, axis=1))


def retrieve_issue_rankings(
    query: str,
    model_id: str,
    input_embedding_filename: str,
):
    """
    Given a query, returns the list of issues sorted by similarity to the query
    according to their embedding index
    """
    model = SentenceTransformer(model_id)

    embeddings = np.load(input_embedding_filename)

    query_embedding = model.encode(query)

    # Calculate the cosine similarity between the query and all the issues
    cosine_similarities = cosine_similarity(query_embedding, embeddings)

    # Get the indices of the most similar issues, best match first
    most_similar_indices = np.argsort(cosine_similarities)
    most_similar_indices = most_similar_indices[0][::-1]
    return most_similar_indices


def print_issue(issues, issue_id):
    # Get the issue id of the most similar issue
    issue_info = issues[issue_id]

    print(f"#{issue_id}", issue_info["title"])
    print(issue_info["body"])


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("query", type=str)
    parser.add_argument("--model_id", type=str, default="all-mpnet-base-v2")
    parser.add_argument("--input_embedding_filename", type=str, default="issue_embeddings.npy")
    parser.add_argument("--input_index_filename", type=str, default="embedding_index_to_issue.json")

    args = parser.parse_args()

    issue_rankings = retrieve_issue_rankings(
        query=args.query,
        model_id=args.model_id,
        input_embedding_filename=args.input_embedding_filename,
    )

    with open("issues_dict.json", "r") as f:
        issues = json.load(f)

    with open(args.input_index_filename, "r") as f:
        embedding_index_to_issue = json.load(f)

    issue_ids = [embedding_index_to_issue[str(i)] for i in issue_rankings]

    for issue_id in issue_ids[:3]:
        print(issue_id)
        print_issue(issues, issue_id)
        print("\n\n\n")
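A minimal usage sketch with a hypothetical query; issue_embeddings.npy must already exist:

from retrieval import retrieve_issue_rankings

rankings = retrieve_issue_rankings(
    query="CUDA out of memory when fine-tuning",  # hypothetical query
    model_id="all-mpnet-base-v2",
    input_embedding_filename="issue_embeddings.npy",
)
print(rankings[:3])  # embedding indices of the three closest issues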
update_embeddings.py
ADDED
@@ -0,0 +1,109 @@
import argparse
import json
import logging
import os

import numpy as np
from sentence_transformers import SentenceTransformer

logging.basicConfig(level=logging.INFO)

logger = logging.getLogger(__name__)


def load_model(model_id: str):
    return SentenceTransformer(model_id)


class EmbeddingWriter:
    def __init__(
        self,
        output_embedding_filename,
        output_index_filename,
        update,
        embedding_to_issue_index,
        embeddings=None
    ) -> None:
        self.output_embedding_filename = output_embedding_filename
        self.output_index_filename = output_index_filename
        self.embeddings = [] if embeddings is None else list(embeddings)
        self.embedding_to_issue_index = embedding_to_issue_index
        self.update = update

    def __enter__(self):
        return self.embeddings

    def __exit__(self, exc_type, exc_val, exc_tb):
        embeddings = np.array(self.embeddings)

        if self.update and os.path.exists(self.output_embedding_filename):
            embeddings = np.concatenate([np.load(self.output_embedding_filename), embeddings])

        logger.info(f"Saving embeddings to {self.output_embedding_filename}")
        np.save(self.output_embedding_filename, embeddings)

        logger.info(f"Saving embedding index to {self.output_index_filename}")
        with open(self.output_index_filename, "w") as f:
            json.dump(self.embedding_to_issue_index, f, indent=4)


def embed_issues(
    input_filename: str,
    model_id: str,
    issue_type: str,
):
    output_embedding_filename = f"{issue_type}_embeddings.npy"
    output_index_filename = f"embedding_index_to_{issue_type}.json"
    model = load_model(model_id)

    with open(input_filename, "r") as f:
        updated_issues = json.load(f)

    with open(output_index_filename, "r") as f:
        embedding_to_issue_index = json.load(f)

    embeddings = np.load(output_embedding_filename)

    issue_to_embedding_index = {v: k for k, v in embedding_to_issue_index.items()}

    with EmbeddingWriter(
        output_embedding_filename=output_embedding_filename,
        output_index_filename=output_index_filename,
        update=False,
        embedding_to_issue_index=embedding_to_issue_index,
        embeddings=embeddings
    ) as embeddings:
        for issue_id, issue in updated_issues.items():
            if "body" not in issue:
                logger.info(f"Skipping issue {issue_id} as it has no body")
                continue

            if issue_type == "pull_request" and "pull_request" not in issue:
                logger.info(f"Skipping issue {issue_id} as it is not a pull request")
                continue

            elif issue_type == "issue" and "pull_request" in issue:
                logger.info(f"Skipping issue {issue_id} as it is a pull request")
                continue

            logger.info(f"Embedding issue {issue_id}")
            embedding = model.encode(issue["body"])

            if issue_id in issue_to_embedding_index:
                # JSON object keys are strings, so cast back to int before indexing the list
                index = int(issue_to_embedding_index[issue_id])
                embeddings[index] = embedding
            else:
                index = len(embeddings)
                # embeddings = np.concatenate([embeddings, embedding.reshape(1, -1)])
                embeddings.append(embedding)
            issue_to_embedding_index[issue_id] = index
            embedding_to_issue_index[str(index)] = issue_id  # string keys, matching the index loaded from JSON


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # choices must match the "issue"/"pull_request" checks above; nargs="?" makes the default reachable
    parser.add_argument("issue_type", nargs="?", choices=["issue", "pull_request"], default="issue")
    parser.add_argument("--input_filename", type=str, default="updated_issues.json")
    parser.add_argument("--model_id", type=str, default="all-mpnet-base-v2")
    args = parser.parse_args()
    embed_issues(**vars(args))
update_stored_issues.py
ADDED
@@ -0,0 +1,142 @@
"""
Module which updates the stored issues to reflect changes in the issue state
"""
import argparse
import datetime
import json
import logging
import os

import numpy as np
import requests

from defaults import TOKEN, OWNER, REPO

logging.basicConfig(level=logging.INFO)

logger = logging.getLogger(__name__)

today = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")

GITHUB_API_VERSION = "2022-11-28"
JSON_FILE = "issues.json"


# Get the issues that have been updated since the last update
def get_issues(
    input_filename=JSON_FILE,
    output_filename=JSON_FILE,
    github_api_version=GITHUB_API_VERSION,
    owner=OWNER,
    repo=REPO,
    token=TOKEN,
    n_pages=-1,
):
    """
    Function to fetch updated issues from the transformers repo and merge them into the json file
    """
    with open("issues_dict.json", "r") as f:
        issues = json.load(f)

    # Get the most recent updated_at information
    updated_at = [issue["updated_at"] for issue in issues.values()]
    most_recent = max(updated_at)

    if not os.path.exists(output_filename):
        raise ValueError(f"File {output_filename} does not exist")

    # Define the URL and headers
    url = f"https://api.github.com/repos/{owner}/{repo}/issues"
    headers = {
        "Accept": "application/vnd.github+json",
        "Authorization": f"{token}",
        "X-GitHub-Api-Version": f"{github_api_version}",
        "User-Agent": "amyeroberts",
    }
    per_page = 100
    page = 1
    query_params = {
        "state": "all",
        "since": "2024-02-01T11:33:35Z",
        # "since": most_recent,
        "sort": "created",
        "direction": "asc",
        "per_page": per_page,  # without this the API default (30) would end the loop after one page
        "page": page,
    }

    new_lines = []

    page_limit = (n_pages + page) if n_pages > 0 else np.inf
    while True:
        if page >= page_limit:
            break

        # Send the GET request
        response = requests.get(url, headers=headers, params=query_params)

        if not response.status_code == 200:
            raise ValueError(
                f"Request failed with status code {response.status_code} and message {response.text}"
            )

        json_response = response.json()
        logger.info(f"Page: {page}, number of issues: {len(json_response)}")

        # If we get an empty response, we've reached the end of the issues
        if len(json_response) == 0:
            break

        new_lines.extend(json_response)

        # If we get fewer than the number of issues per page, we've reached the end of the issues
        if len(json_response) < per_page:
            break

        page += 1
        query_params["page"] = page

    issue_lines_map = {issue["number"]: issue for issue in new_lines}

    with open(input_filename, "r") as f:
        with open("tmp_" + output_filename, "a") as g:
            for line in f:
                issue = json.loads(line)
                number = issue["number"]
                if number in issue_lines_map:
                    g.write(json.dumps(issue_lines_map[number]))
                    g.write("\n")
                else:
                    g.write(line)

    os.rename("tmp_" + output_filename, output_filename)

    with open("updated_issues.json", "w") as f:
        json.dump(issue_lines_map, f, indent=4, sort_keys=True)

    return output_filename


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_filename", type=str, default=JSON_FILE)
    parser.add_argument("--output_filename", type=str, default=JSON_FILE)
    parser.add_argument("--github_api_version", type=str, default=GITHUB_API_VERSION)
    parser.add_argument("--owner", type=str, default=OWNER)
    parser.add_argument("--repo", type=str, default=REPO)
    parser.add_argument("--token", type=str, default=TOKEN)
    parser.add_argument("--n_pages", type=int, default=-1)
    args = parser.parse_args()
    get_issues(**vars(args))
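Taken together with update_embeddings.py, the incremental path is: refresh issues.json and write updated_issues.json here, then re-embed only the changed issues. A minimal sketch driving both scripts through the CLIs defined above:

import subprocess

subprocess.run(["python", "update_stored_issues.py"], check=True)        # rewrites issues.json, writes updated_issues.json
subprocess.run(["python", "update_embeddings.py", "issue"], check=True)  # updates issue_embeddings.npy in place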