Spaces:
Amy Roberts
committed on
Commit 9b744c5
1 Parent(s): c9976b6
Draft
Browse files
- .gitignore +167 -0
- app.py +101 -0
- build_embeddings.py +117 -0
- build_issue_dict.py +19 -0
- defaults.py +5 -0
- find_similar_issues.py +91 -0
- get_issues.py +129 -0
- get_topic.py +43 -0
- retrieval.py +76 -0
- update_embeddings.py +109 -0
- update_stored_issues.py +142 -0
.gitignore
ADDED
@@ -0,0 +1,167 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

# Data
*.json
*.png
*.npy
*.jpg
*.pdf
app.py
ADDED
@@ -0,0 +1,101 @@
import os
import io

import gradio as gr
import requests
from html2image import Html2Image

from defaults import OWNER, REPO, TOKEN
from find_similar_issues import get_similar_issues

hti = Html2Image(size=(1920, 1080 * 3))


def get_query_issue_information(issue_no, token):
    headers = {
        "Accept": "application/vnd.github+json",
        # NOTE: GitHub expects a scheme here, i.e. "Bearer <token>" or "token <token>"
        "Authorization": f"{token}",
        "X-GitHub-Api-Version": "2022-11-28",
        "User-Agent": "amyeroberts",
    }
    request = requests.get(
        f"https://api.github.com/repos/{OWNER}/{REPO}/issues/{issue_no}",
        headers=headers,
    )
    if request.status_code != 200:
        raise ValueError(f"Request failed with status code {request.status_code} and message {request.text}")

    return request.json()


def find_similar_issues(issue, token):
    similar_issues = get_similar_issues(issue, token=token)
    similar_issues_summary = [f"#{issue['number']} - {issue['title']}" for issue in similar_issues]
    return similar_issues_summary


def render_issue_as_image(issue, filename="image.png"):
    url = issue["html_url"]
    print(url)
    hti.screenshot(url=url, save_as=filename)
    return filename


def run_find_similar_issues(issue, token, n_issues):
    issue_information = get_query_issue_information(issue, token=token)
    # issue_information_summary = f"#{issue_information['number']} - {issue_information['title']}\n\n{issue_information['body']}"
    similar_issues = get_similar_issues(issue, token=token, top_k=n_issues)
    # similar_issues_summary = [f"#{issue['number']} - {issue['title']}" for issue in similar_issues]

    issue_image = render_issue_as_image(issue_information, filename="query_issue.png")

    image_names = []
    for i, similar_issue in enumerate(similar_issues):
        image_names.append(render_issue_as_image(similar_issue, filename=f"image{i}.png"))

    # return issue_information_summary, image_names
    page_html = requests.get(issue_information["html_url"]).text

    return issue_image, page_html, image_names


with gr.Blocks(title="Github Bot") as demo:
    with gr.Tab("Find similar issues"):
        with gr.Row():
            with gr.Column():
                with gr.Row():
                    issue = gr.Textbox(label="Github Issue", placeholder="Github issue you want to find similar issues to")
                    token = gr.Textbox(label="Github Token", placeholder="Your github token for authentication. This is not stored anywhere.")
                with gr.Row():
                    n_issues = gr.Slider(1, 50, value=5, label="Number of similar issues", info="Choose between 1 and 50")

                with gr.Row():
                    submit_button = gr.Button(value="Submit")

        with gr.Row():
            with gr.Column():
                issue_image = gr.Image(type="filepath", label="Your issue")
            with gr.Column():
                similar_issues_screenshots = gr.Gallery(label="Similar Issues")
                issue_text = gr.HTML(label="Issue text", elem_id="issue_text")
        submit_button.click(run_find_similar_issues, outputs=[issue_image, issue_text, similar_issues_screenshots], inputs=[issue, token, n_issues])

    with gr.Tab("Search issues"):
        with gr.Row():
            query = gr.Textbox(label="Query", placeholder="Search for issues")
        with gr.Row():
            token = gr.Textbox(label="Github Token", placeholder="Your github token for authentication. This is not stored anywhere.")
        with gr.Row():
            pass

    with gr.Tab("Find maintainers to ping"):
        with gr.Row():
            issue = gr.Textbox(label="Github Issue / PR", placeholder="Issue or PR you want to find maintainers to ping for")
        with gr.Row():
            token = gr.Textbox(label="Github Token", placeholder="Your github token for authentication. This is not stored anywhere.")


if __name__ == "__main__":
    demo.launch()
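app.py assumes the data files built by the scripts below already exist on disk. A minimal preparation sketch, using only functions and default file names defined in this commit (the small n_pages run is an illustrative smoke test, not part of the app):

from get_issues import get_issues
from build_issue_dict import build_json_file
from build_embeddings import embed_issues

get_issues(overwrite=True)                          # writes issues.json, one JSON object per line
build_json_file("issues.json", "issues_dict.json")  # re-keys the records by issue number
embed_issues("issues_dict.json", "all-mpnet-base-v2", "issue")  # writes issue_embeddings.npy + embedding_index_to_issue.json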
build_embeddings.py
ADDED
@@ -0,0 +1,117 @@
import argparse
import json
import logging
import os

import numpy as np
from sentence_transformers import SentenceTransformer

logging.basicConfig(level=logging.INFO)

logger = logging.getLogger(__name__)


def load_model(model_id: str):
    return SentenceTransformer(model_id)


class EmbeddingWriter:
    def __init__(self, output_embedding_filename, output_index_filename, update, embedding_to_issue_index) -> None:
        self.output_embedding_filename = output_embedding_filename
        self.output_index_filename = output_index_filename
        self.embeddings = []
        self.embedding_to_issue_index = embedding_to_issue_index
        self.update = update

    def __enter__(self):
        return self.embeddings

    def __exit__(self, exc_type, exc_val, exc_tb):
        if len(self.embeddings) == 0:
            return

        embeddings = np.array(self.embeddings)

        if self.update and os.path.exists(self.output_embedding_filename):
            embeddings = np.concatenate([np.load(self.output_embedding_filename), embeddings])

        logger.info(f"Saving embeddings to {self.output_embedding_filename}")
        np.save(self.output_embedding_filename, embeddings)

        logger.info(f"Saving embedding index to {self.output_index_filename}")
        with open(self.output_index_filename, "w") as f:
            json.dump(self.embedding_to_issue_index, f, indent=4)


def embed_issues(
    input_filename: str,
    model_id: str,
    issue_type: str,
    n_issues: int = -1,
    update: bool = False
):
    model = load_model(model_id)

    output_embedding_filename = f"{issue_type}_embeddings.npy"
    output_index_filename = f"embedding_index_to_{issue_type}.json"

    with open(input_filename, "r") as f:
        issues = json.load(f)

    if update and os.path.exists(output_index_filename):
        with open(output_index_filename, "r") as f:
            embedding_to_issue_index = json.load(f)
        embedding_index = len(embedding_to_issue_index)
    else:
        embedding_to_issue_index = {}
        embedding_index = 0

    max_issues = n_issues if n_issues > 0 else len(issues)
    n_issues = 0

    with EmbeddingWriter(
        output_embedding_filename=output_embedding_filename,
        output_index_filename=output_index_filename,
        update=update,
        embedding_to_issue_index=embedding_to_issue_index
    ) as embeddings:  # , embedding_to_issue_index:
        for issue_id, issue in issues.items():
            if n_issues >= max_issues:
                break

            if issue_id in embedding_to_issue_index.values() and update:
                logger.info(f"Skipping issue {issue_id} as it is already embedded")
                continue

            if "body" not in issue:
                logger.info(f"Skipping issue {issue_id} as it has no body")
                continue

            if issue_type == "pull_request" and "pull_request" not in issue:
                logger.info(f"Skipping issue {issue_id} as it is not a pull request")
                continue

            elif issue_type == "issue" and "pull_request" in issue:
                logger.info(f"Skipping issue {issue_id} as it is a pull request")
                continue

            title = issue["title"] if issue["title"] is not None else ""
            body = issue["body"] if issue["body"] is not None else ""

            logger.info(f"Embedding issue {issue_id}")
            embedding = model.encode(title + "\n" + body)
            embedding_to_issue_index[embedding_index] = issue_id
            embeddings.append(embedding)
            embedding_index += 1
            n_issues += 1


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # choices must match the "issue"/"pull_request" checks above; nargs="?" makes the default reachable
    parser.add_argument("issue_type", nargs="?", choices=["issue", "pull_request"], default="issue")
    parser.add_argument("--input_filename", type=str, default="issues_dict.json")
    parser.add_argument("--model_id", type=str, default="all-mpnet-base-v2")
    parser.add_argument("--n_issues", type=int, default=-1)
    parser.add_argument("--update", action="store_true")
    args = parser.parse_args()
    embed_issues(**vars(args))
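EmbeddingWriter defers all disk I/O to __exit__: the with block yields a plain list, and whatever was appended to it (plus the index dict passed in) is persisted when the block closes, even if the loop body raised. A minimal sketch of the pattern; the demo file names are illustrative, and 768 is the output width of all-mpnet-base-v2:

import numpy as np
from build_embeddings import EmbeddingWriter

index = {}
with EmbeddingWriter("demo_embeddings.npy", "demo_index.json", update=False,
                     embedding_to_issue_index=index) as embs:
    embs.append(np.zeros(768))  # stand-in for model.encode(...)
    index[0] = "1234"           # hypothetical issue number
# on exit: demo_embeddings.npy holds a (1, 768) array, demo_index.json holds {"0": "1234"}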
build_issue_dict.py
ADDED
@@ -0,0 +1,19 @@
import argparse
import json


def build_json_file(input_filename, output_filename):
    with open(input_filename, "r") as f:
        json_lines = f.readlines()

    issues = [json.loads(line) for line in json_lines]
    json_dict = {issue["number"]: issue for issue in issues}

    with open(output_filename, "w") as f:
        json.dump(json_dict, f, indent=4)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_filename", type=str, default="issues.json")
    parser.add_argument("--output_filename", type=str, default="issues_dict.json")
    args = parser.parse_args()
    build_json_file(**vars(args))
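One subtlety: the dict is keyed by the integer issue number in memory, but json.dump writes object keys as strings, so readers of issues_dict.json (e.g. find_similar_issues.py) index it with str(number). A quick check, with a hypothetical record:

import json

record = {181: {"title": "Draft"}}  # hypothetical issue number
round_tripped = json.loads(json.dumps(record))
assert "181" in round_tripped and 181 not in round_tripped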
defaults.py
ADDED
@@ -0,0 +1,5 @@
import os

OWNER = "huggingface"
REPO = "transformers"
TOKEN = os.environ.get("GITHUB_TOKEN")
find_similar_issues.py
ADDED
@@ -0,0 +1,91 @@
import json
import argparse
import requests
from defaults import OWNER, REPO, TOKEN
from sentence_transformers import SentenceTransformer
import numpy as np

model_id = "all-mpnet-base-v2"
model = SentenceTransformer(model_id)


def load_embeddings():
    """
    Function to load embeddings from file
    """
    embeddings = np.load("issue_embeddings.npy")
    return embeddings


def load_issue_information():
    """
    Function to load issue information from file
    """
    with open("embedding_index_to_issue.json", "r") as f:
        embedding_index_to_issue = json.load(f)

    with open("issues_dict.json", "r") as f:
        issues = json.load(f)

    return embedding_index_to_issue, issues


def cosine_similarity(a, b):
    if a.ndim == 1:
        a = a.reshape(1, -1)

    if b.ndim == 1:
        b = b.reshape(1, -1)

    return np.dot(a, b.T) / (np.linalg.norm(a, axis=1) * np.linalg.norm(b, axis=1))


def get_similar_issues(issue_no, top_k=5, token=TOKEN, owner=OWNER, repo=REPO):
    """
    Function to find similar issues
    """
    headers = {
        "Accept": "application/vnd.github+json",
        "Authorization": f"{token}",  # was f"Authorization": "{token}", which sent the literal string "{token}"
        "X-GitHub-Api-Version": "2022-11-28",
        "User-Agent": "amyeroberts",
    }
    request = requests.get(
        # use the function's owner/repo arguments rather than the module-level defaults
        f"https://api.github.com/repos/{owner}/{repo}/issues/{issue_no}",
        headers=headers,
    )

    if request.status_code != 200:
        raise ValueError(f"Request failed with status code {request.status_code}")

    query_embedding = model.encode(request.json()["body"])
    query_embedding = query_embedding.reshape(1, -1)
    embeddings = load_embeddings()

    # Calculate the cosine similarity between the query and all the issues
    cosine_similarities = cosine_similarity(query_embedding, embeddings)

    # Get the indices of the most similar issues, best match first
    most_similar_indices = np.argsort(cosine_similarities)
    most_similar_indices = most_similar_indices[0][::-1]

    embedding_index_to_issue, issues = load_issue_information()

    similar_issues = []
    for i in most_similar_indices[:top_k]:
        similar_issue_no = embedding_index_to_issue[str(i)]
        similar_issues.append(issues[similar_issue_no])

    return similar_issues


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("issue_no", type=int)
    parser.add_argument("--top_k", type=int, default=5)
    parser.add_argument("--token", type=str, default=TOKEN)
    parser.add_argument("--owner", type=str, default=OWNER)
    parser.add_argument("--repo", type=str, default=REPO)
    args = parser.parse_args()
    get_similar_issues(args.issue_no, args.top_k, args.token, args.owner, args.repo)
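A minimal usage sketch; the issue number is hypothetical, and a valid GITHUB_TOKEN plus the embedding and index files from build_embeddings.py must already exist:

from find_similar_issues import get_similar_issues

for similar in get_similar_issues(28867, top_k=3):  # 28867 is a hypothetical issue number
    print(f"#{similar['number']} - {similar['title']}")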
get_issues.py
ADDED
@@ -0,0 +1,129 @@
import argparse
import datetime
import json
import logging
import os

import numpy as np
import requests

logging.basicConfig(level=logging.INFO)

logger = logging.getLogger(__name__)

today = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")

OWNER = "huggingface"
REPO = "transformers"
GITHUB_API_VERSION = "2022-11-28"
TOKEN = os.environ.get("GITHUB_TOKEN")
JSON_FILE = "issues.json"
UPDATE_FILE = False
OVERWRITE_FILE = True


def get_last_entry(file_path):
    with open(file_path, "r") as file:
        # Read the last line
        last_line = file.readlines()[-1]
        return json.loads(last_line)


def get_last_issue_number(file_path):
    if os.path.exists(file_path):
        last_entry = get_last_entry(file_path=file_path)
        return last_entry["number"]
    return 0


def get_issues(
    overwrite=OVERWRITE_FILE,
    update=UPDATE_FILE,
    output_filename=JSON_FILE,
    github_api_version=GITHUB_API_VERSION,
    owner=OWNER,
    repo=REPO,
    token=TOKEN,
    n_pages=-1,
):
    """
    Function to get the issues from the transformers repo and save them to a json file
    """

    # If the file exists and we want to overwrite it, delete it
    if os.path.exists(output_filename) and overwrite:
        logger.info(f"Deleting file {output_filename}")
        os.remove(output_filename)

    # Define the URL and headers
    url = f"https://api.github.com/repos/{owner}/{repo}/issues"
    headers = {
        "Accept": "application/vnd.github+json",
        "Authorization": f"{token}",
        "X-GitHub-Api-Version": f"{github_api_version}",
        "User-Agent": "amyeroberts",
    }
    last_issue_number = get_last_issue_number(file_path=output_filename)
    per_page = 100
    page = last_issue_number // per_page + 1
    query_params = {
        "state": "all",
        "per_page": per_page,
        "sort": "created",
        "direction": "asc",
        "page": page,
    }

    if os.path.exists(output_filename) and not update and not overwrite:
        raise ValueError(f"File {output_filename} already exists")

    page_limit = (n_pages + page) if n_pages > 0 else np.inf
    while True:
        if page >= page_limit:
            break

        # Send the GET request
        response = requests.get(url, headers=headers, params=query_params)

        if not response.status_code == 200:
            raise ValueError(
                f"Request failed with status code {response.status_code} and message {response.text}"
            )

        json_response = response.json()
        logger.info(f"Page: {page}, number of issues: {len(json_response)}")

        # If we get an empty response, we've reached the end of the issues
        if len(json_response) == 0:
            break

        with open(output_filename, "a") as f:
            for value in json_response:
                if value["number"] <= last_issue_number:
                    continue
                json.dump(value, f)
                f.write("\n")

        if len(json_response) < per_page:
            break

        page += 1
        query_params["page"] = page

    return output_filename


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # store_true with default=True made --update a no-op; default now matches UPDATE_FILE
    parser.add_argument("--update", action="store_true", default=UPDATE_FILE)
    parser.add_argument("--overwrite", action="store_true", default=False)
    parser.add_argument("--output_filename", type=str, default=JSON_FILE)
    parser.add_argument("--github_api_version", type=str, default=GITHUB_API_VERSION)
    parser.add_argument("--owner", type=str, default=OWNER)
    parser.add_argument("--repo", type=str, default=REPO)
    parser.add_argument("--token", type=str, default=TOKEN)
    parser.add_argument("--n_pages", type=int, default=-1)
    args = parser.parse_args()
    get_issues(**vars(args))
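A quick check of the resume arithmetic above, with a hypothetical last issue number. This leans on the endpoint returning both issues and PRs in ascending creation order, so a record's number roughly matches its position in the listing:

per_page = 100
last_issue_number = 250  # hypothetical
page = last_issue_number // per_page + 1
assert page == 3  # the page expected to contain issue #250; entries up to #250
                  # on it are then skipped by the `number <= last_issue_number` guard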
get_topic.py
ADDED
@@ -0,0 +1,43 @@
# NOTE: draft script; `issue` below is assumed to already hold one issue dict
# (e.g. an entry loaded from issues_dict.json).

from transformers import AutoTokenizer, LlamaForCausalLM

topic_maintainers_map = {
    "text models": ["@ArthurZucker", "@younesbelkada"],
    "vision models": "@amyeroberts",
    "speech models": "@sanchit-gandhi",
    "graph models": "@clefourrier",
    "flax": "@sanchit-gandhi",
    "generate": "@gante",
    "pipelines": "@Narsil",
    "tensorflow": ["@gante", "@Rocketknight1"],
    "tokenizers": "@ArthurZucker",
    "trainer": ["@muellerzr", "@pacman100"],
    "deepspeed": "@pacman100",
    "ray/raytune": ["@richardliaw", "@amogkam"],
    "Big Model Inference": "@SunMarc",
    "quantization (bitsandbytes, autogpt)": ["@SunMarc", "@younesbelkada"],
    "Documentation": ["@stevhliu", "@MKhalusova"],
    "accelerate": "different repo",
    "datasets": "different repo",
    "diffusers": "different repo",
    "rust tokenizers": "different repo",
    "Flax examples": "@sanchit-gandhi",
    "PyTorch vision examples": "@amyeroberts",
    "PyTorch text examples": "@ArthurZucker",
    "PyTorch speech examples": "@sanchit-gandhi",
    "PyTorch generate examples": "@gante",
    "TensorFlow": "@Rocketknight1",
    "Research projects and examples": "not maintained",
}


model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

prompt = f"Which of the following topics {list(topic_maintainers_map.keys())} is this issue about:\n{issue['body']}"
inputs = tokenizer(prompt, return_tensors="pt")

# Generate
generate_ids = model.generate(inputs.input_ids, max_length=30)
tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
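A minimal sketch of providing the `issue` this draft assumes; the issue number is hypothetical, and the keys in issues_dict.json are strings:

import json

with open("issues_dict.json", "r") as f:
    issues = json.load(f)

issue = issues["28867"]  # hypothetical issue number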
retrieval.py
ADDED
@@ -0,0 +1,76 @@
import argparse
import json

import numpy as np
from sentence_transformers import SentenceTransformer


def cosine_similarity(a, b):
    if a.ndim == 1:
        a = a.reshape(1, -1)

    if b.ndim == 1:
        b = b.reshape(1, -1)

    return np.dot(a, b.T) / (np.linalg.norm(a, axis=1) * np.linalg.norm(b, axis=1))


def retrieve_issue_rankings(
    query: str,
    model_id: str,
    input_embedding_filename: str,
):
    """
    Given a query, returns the list of issues sorted by similarity to the query
    according to their embedding index
    """
    model = SentenceTransformer(model_id)

    embeddings = np.load(input_embedding_filename)

    query_embedding = model.encode(query)

    # Calculate the cosine similarity between the query and all the issues
    cosine_similarities = cosine_similarity(query_embedding, embeddings)

    # Get the indices of the most similar issues, best match first
    most_similar_indices = np.argsort(cosine_similarities)
    most_similar_indices = most_similar_indices[0][::-1]
    return most_similar_indices


def print_issue(issues, issue_id):
    # Get the issue id of the most similar issue
    issue_info = issues[issue_id]

    print(f"#{issue_id}", issue_info["title"])
    print(issue_info["body"])


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("query", type=str)
    parser.add_argument("--model_id", type=str, default="all-mpnet-base-v2")
    parser.add_argument("--input_embedding_filename", type=str, default="issue_embeddings.npy")
    parser.add_argument("--input_index_filename", type=str, default="embedding_index_to_issue.json")

    args = parser.parse_args()

    issue_rankings = retrieve_issue_rankings(
        query=args.query,
        model_id=args.model_id,
        input_embedding_filename=args.input_embedding_filename,
    )

    with open("issues_dict.json", "r") as f:
        issues = json.load(f)

    with open(args.input_index_filename, "r") as f:
        embedding_index_to_issue = json.load(f)

    issue_ids = [embedding_index_to_issue[str(i)] for i in issue_rankings]

    for issue_id in issue_ids[:3]:
        print(issue_id)
        print_issue(issues, issue_id)
        print("\n\n\n")
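A minimal usage sketch with a hypothetical query; issue_embeddings.npy must already exist:

from retrieval import retrieve_issue_rankings

rankings = retrieve_issue_rankings(
    query="CUDA out of memory when fine-tuning",  # hypothetical query
    model_id="all-mpnet-base-v2",
    input_embedding_filename="issue_embeddings.npy",
)
print(rankings[:3])  # embedding indices of the three closest issues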
update_embeddings.py
ADDED
@@ -0,0 +1,109 @@
import argparse
import json
import logging
import os

import numpy as np
from sentence_transformers import SentenceTransformer

logging.basicConfig(level=logging.INFO)

logger = logging.getLogger(__name__)


def load_model(model_id: str):
    return SentenceTransformer(model_id)


class EmbeddingWriter:
    def __init__(
        self,
        output_embedding_filename,
        output_index_filename,
        update,
        embedding_to_issue_index,
        embeddings=None
    ) -> None:
        self.output_embedding_filename = output_embedding_filename
        self.output_index_filename = output_index_filename
        self.embeddings = [] if embeddings is None else list(embeddings)
        self.embedding_to_issue_index = embedding_to_issue_index
        self.update = update

    def __enter__(self):
        return self.embeddings

    def __exit__(self, exc_type, exc_val, exc_tb):
        embeddings = np.array(self.embeddings)

        if self.update and os.path.exists(self.output_embedding_filename):
            embeddings = np.concatenate([np.load(self.output_embedding_filename), embeddings])

        logger.info(f"Saving embeddings to {self.output_embedding_filename}")
        np.save(self.output_embedding_filename, embeddings)

        logger.info(f"Saving embedding index to {self.output_index_filename}")
        with open(self.output_index_filename, "w") as f:
            json.dump(self.embedding_to_issue_index, f, indent=4)


def embed_issues(
    input_filename: str,
    model_id: str,
    issue_type: str,
):
    output_embedding_filename = f"{issue_type}_embeddings.npy"
    output_index_filename = f"embedding_index_to_{issue_type}.json"
    model = load_model(model_id)

    with open(input_filename, "r") as f:
        updated_issues = json.load(f)

    with open(output_index_filename, "r") as f:
        embedding_to_issue_index = json.load(f)

    embeddings = np.load(output_embedding_filename)

    issue_to_embedding_index = {v: k for k, v in embedding_to_issue_index.items()}

    with EmbeddingWriter(
        output_embedding_filename=output_embedding_filename,
        output_index_filename=output_index_filename,
        update=False,
        embedding_to_issue_index=embedding_to_issue_index,
        embeddings=embeddings
    ) as embeddings:
        for issue_id, issue in updated_issues.items():
            if "body" not in issue:
                logger.info(f"Skipping issue {issue_id} as it has no body")
                continue

            if issue_type == "pull_request" and "pull_request" not in issue:
                logger.info(f"Skipping issue {issue_id} as it is not a pull request")
                continue

            elif issue_type == "issue" and "pull_request" in issue:
                logger.info(f"Skipping issue {issue_id} as it is a pull request")
                continue

            logger.info(f"Embedding issue {issue_id}")
            embedding = model.encode(issue["body"])

            if issue_id in issue_to_embedding_index:
                # JSON object keys are strings, so cast back to int before indexing the list
                index = int(issue_to_embedding_index[issue_id])
                embeddings[index] = embedding
            else:
                index = len(embeddings)
                # embeddings = np.concatenate([embeddings, embedding.reshape(1, -1)])
                embeddings.append(embedding)
            issue_to_embedding_index[issue_id] = index
            embedding_to_issue_index[str(index)] = issue_id  # string keys, matching the index loaded from JSON


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # choices must match the "issue"/"pull_request" checks above; nargs="?" makes the default reachable
    parser.add_argument("issue_type", nargs="?", choices=["issue", "pull_request"], default="issue")
    parser.add_argument("--input_filename", type=str, default="updated_issues.json")
    parser.add_argument("--model_id", type=str, default="all-mpnet-base-v2")
    args = parser.parse_args()
    embed_issues(**vars(args))
update_stored_issues.py
ADDED
@@ -0,0 +1,142 @@
"""
Module which updates the stored issues to reflect changes in the issue state
"""
import argparse
import datetime
import json
import logging
import os

import numpy as np
import requests

from defaults import TOKEN, OWNER, REPO

logging.basicConfig(level=logging.INFO)

logger = logging.getLogger(__name__)

today = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")

GITHUB_API_VERSION = "2022-11-28"
JSON_FILE = "issues.json"


# Get the issues that have been updated since the last update
def get_issues(
    input_filename=JSON_FILE,
    output_filename=JSON_FILE,
    github_api_version=GITHUB_API_VERSION,
    owner=OWNER,
    repo=REPO,
    token=TOKEN,
    n_pages=-1,
):
    """
    Function to fetch updated issues from the transformers repo and merge them into the json file
    """
    with open("issues_dict.json", "r") as f:
        issues = json.load(f)

    # Get the most recent updated_at information
    updated_at = [issue["updated_at"] for issue in issues.values()]
    most_recent = max(updated_at)

    if not os.path.exists(output_filename):
        raise ValueError(f"File {output_filename} does not exist")

    # Define the URL and headers
    url = f"https://api.github.com/repos/{owner}/{repo}/issues"
    headers = {
        "Accept": "application/vnd.github+json",
        "Authorization": f"{token}",
        "X-GitHub-Api-Version": f"{github_api_version}",
        "User-Agent": "amyeroberts",
    }
    per_page = 100
    page = 1
    query_params = {
        "state": "all",
        "since": "2024-02-01T11:33:35Z",
        # "since": most_recent,
        "sort": "created",
        "direction": "asc",
        "per_page": per_page,  # without this the API default (30) would end the loop after one page
        "page": page,
    }

    new_lines = []

    page_limit = (n_pages + page) if n_pages > 0 else np.inf
    while True:
        if page >= page_limit:
            break

        # Send the GET request
        response = requests.get(url, headers=headers, params=query_params)

        if not response.status_code == 200:
            raise ValueError(
                f"Request failed with status code {response.status_code} and message {response.text}"
            )

        json_response = response.json()
        logger.info(f"Page: {page}, number of issues: {len(json_response)}")

        # If we get an empty response, we've reached the end of the issues
        if len(json_response) == 0:
            break

        new_lines.extend(json_response)

        # If we get fewer than the number of issues per page, we've reached the end of the issues
        if len(json_response) < per_page:
            break

        page += 1
        query_params["page"] = page

    issue_lines_map = {issue["number"]: issue for issue in new_lines}

    with open(input_filename, "r") as f:
        with open("tmp_" + output_filename, "a") as g:
            for line in f:
                issue = json.loads(line)
                number = issue["number"]
                if number in issue_lines_map:
                    g.write(json.dumps(issue_lines_map[number]))
                    g.write("\n")
                else:
                    g.write(line)

    os.rename("tmp_" + output_filename, output_filename)

    with open("updated_issues.json", "w") as f:
        json.dump(issue_lines_map, f, indent=4, sort_keys=True)

    return output_filename


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_filename", type=str, default=JSON_FILE)
    parser.add_argument("--output_filename", type=str, default=JSON_FILE)
    parser.add_argument("--github_api_version", type=str, default=GITHUB_API_VERSION)
    parser.add_argument("--owner", type=str, default=OWNER)
    parser.add_argument("--repo", type=str, default=REPO)
    parser.add_argument("--token", type=str, default=TOKEN)
    parser.add_argument("--n_pages", type=int, default=-1)
    args = parser.parse_args()
    get_issues(**vars(args))
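Taken together with update_embeddings.py, the incremental path is: refresh issues.json and write updated_issues.json here, then re-embed only the changed issues. A minimal sketch driving both scripts through the CLIs defined above:

import subprocess

subprocess.run(["python", "update_stored_issues.py"], check=True)        # rewrites issues.json, writes updated_issues.json
subprocess.run(["python", "update_embeddings.py", "issue"], check=True)  # updates issue_embeddings.npy in place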