Spaces:
Sleeping
Sleeping
Amy Roberts
committed on
Commit
·
12ae336
1
Parent(s):
2c3812c
Add documentation
Browse files
- build_embeddings.py +18 -0
- build_issue_dict.py +4 -0
- defaults.py +1 -0
- fetch.py +19 -12
- retrieval.py +5 -0
- update_stored_issues.py +13 -4
build_embeddings.py
CHANGED
@@ -1,3 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import argparse
|
2 |
import json
|
3 |
import logging
|
|
|
1 |
+
"""
|
2 |
+
Module which builds embeddings for issues and pull requests
|
3 |
+
|
4 |
+
The module is designed to be run from the command line and takes the following arguments:
|
5 |
+
|
6 |
+
--input_filename: The name of the file containing the issues and pull requests
|
7 |
+
--model_id: The name of the sentence transformer model to use
|
8 |
+
--issue_type: The type of issue to embed (either "issue" or "pull")
|
9 |
+
--n_issues: The number of issues to embed
|
10 |
+
--update: Whether to update the existing embeddings
|
11 |
+
|
12 |
+
The module saves the embeddings to a file called <issue_type>_embeddings.npy and the index to a file called
|
13 |
+
embedding_index_to_<issue_type>.json
|
14 |
+
|
15 |
+
The index provides a mapping from the index of the embedding to the issue or pull request number.
|
16 |
+
|
17 |
+
"""
|
18 |
+
|
19 |
import argparse
|
20 |
import json
|
21 |
import logging
|
build_issue_dict.py
CHANGED
@@ -1,3 +1,7 @@
|
|
|
|
|
|
|
|
|
|
1 |
import argparse
|
2 |
import json
|
3 |
|
|
|
1 |
+
"""
|
2 |
+
Module which builds a dictionary keyed by issue number from a json file
|
3 |
+
"""
|
4 |
+
|
5 |
import argparse
|
6 |
import json
|
7 |
|
defaults.py
CHANGED
@@ -4,3 +4,4 @@ OWNER = "huggingface"
|
|
4 |
REPO = "transformers"
|
5 |
TOKEN = os.environ.get("GITHUB_TOKEN")
|
6 |
GITHUB_API_VERSION = "2022-11-28"
|
|
|
|
4 |
REPO = "transformers"
|
5 |
TOKEN = os.environ.get("GITHUB_TOKEN")
|
6 |
GITHUB_API_VERSION = "2022-11-28"
|
7 |
+
ISSUE_JSON_FILE = "issues.json"
|
fetch.py
CHANGED
@@ -1,9 +1,20 @@
|
|
1 |
"""
|
2 |
Script to fetch issues from the transformers repo and save them to a json file
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
"""
|
4 |
|
5 |
import argparse
|
6 |
-
import datetime
|
7 |
import logging
|
8 |
import json
|
9 |
import os
|
@@ -11,17 +22,13 @@ import os
|
|
11 |
import requests
|
12 |
import numpy as np
|
13 |
|
14 |
-
from defaults import OWNER, REPO, GITHUB_API_VERSION, TOKEN
|
15 |
|
16 |
logging.basicConfig(level=logging.INFO)
|
17 |
-
|
18 |
logger = logging.getLogger(__name__)
|
19 |
|
20 |
-
|
21 |
-
|
22 |
-
JSON_FILE = "issues.json"
|
23 |
-
UPDATE_FILE = False
|
24 |
-
OVERWRITE_FILE = True
|
25 |
|
26 |
|
27 |
def get_last_entry(file_path):
|
@@ -41,7 +48,7 @@ def get_last_issue_number(file_path):
|
|
41 |
def get_issues(
|
42 |
overwrite=OVERWRITE_FILE,
|
43 |
update=UPDATE_FILE,
|
44 |
-
output_filename=
|
45 |
github_api_version=GITHUB_API_VERSION,
|
46 |
owner=OWNER,
|
47 |
repo=REPO,
|
@@ -125,9 +132,9 @@ def get_issues(
|
|
125 |
|
126 |
if __name__ == "__main__":
|
127 |
parser = argparse.ArgumentParser()
|
128 |
-
parser.add_argument("--update", action="store_true", default=
|
129 |
-
parser.add_argument("--overwrite", action="store_true", default=
|
130 |
-
parser.add_argument("--output_filename", type=str, default=
|
131 |
parser.add_argument("--github_api_version", type=str, default=GITHUB_API_VERSION)
|
132 |
parser.add_argument("--owner", type=str, default=OWNER)
|
133 |
parser.add_argument("--repo", type=str, default=REPO)
|
|
|
1 |
"""
|
2 |
Script to fetch issues from the transformers repo and save them to a json file
|
3 |
+
|
4 |
+
The script can be run from the command line with the following arguments:
|
5 |
+
--update: Whether to update the existing file. If True the script will fetch
|
6 |
+
the most recent issues and append them to the file
|
7 |
+
--overwrite: Whether to overwrite the existing file
|
8 |
+
--output_filename: The name of the output file
|
9 |
+
--github_api_version: The version of the GitHub API to use
|
10 |
+
--owner: The owner of the repo
|
11 |
+
--repo: The name of the repo
|
12 |
+
--token: The GitHub token to use
|
13 |
+
--n_pages: The number of pages to fetch. Useful for testing
|
14 |
+
|
15 |
"""
|
16 |
|
17 |
import argparse
|
|
|
18 |
import logging
|
19 |
import json
|
20 |
import os
|
|
|
22 |
import requests
|
23 |
import numpy as np
|
24 |
|
25 |
+
from defaults import OWNER, REPO, GITHUB_API_VERSION, TOKEN, ISSUE_JSON_FILE
|
26 |
|
27 |
logging.basicConfig(level=logging.INFO)
|
|
|
28 |
logger = logging.getLogger(__name__)
|
29 |
|
30 |
+
UPDATE_FILE = True
|
31 |
+
OVERWRITE_FILE = False
|
|
|
|
|
|
|
32 |
|
33 |
|
34 |
def get_last_entry(file_path):
|
|
|
48 |
def get_issues(
|
49 |
overwrite=OVERWRITE_FILE,
|
50 |
update=UPDATE_FILE,
|
51 |
+
output_filename=ISSUE_JSON_FILE,
|
52 |
github_api_version=GITHUB_API_VERSION,
|
53 |
owner=OWNER,
|
54 |
repo=REPO,
|
|
|
132 |
|
133 |
if __name__ == "__main__":
|
134 |
parser = argparse.ArgumentParser()
|
135 |
+
parser.add_argument("--update", action="store_true", default=UPDATE_FILE)
|
136 |
+
parser.add_argument("--overwrite", action="store_true", default=OVERWRITE_FILE)
|
137 |
+
parser.add_argument("--output_filename", type=str, default=ISSUE_JSON_FILE)
|
138 |
parser.add_argument("--github_api_version", type=str, default=GITHUB_API_VERSION)
|
139 |
parser.add_argument("--owner", type=str, default=OWNER)
|
140 |
parser.add_argument("--repo", type=str, default=REPO)
|
retrieval.py
CHANGED
@@ -1,3 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import argparse
|
2 |
import json
|
3 |
|
|
|
1 |
+
"""
|
2 |
+
Module which contains functionality to retrieve the most similar issues for a given query
|
3 |
+
"""
|
4 |
+
|
5 |
+
|
6 |
import argparse
|
7 |
import json
|
8 |
|
update_stored_issues.py
CHANGED
@@ -1,5 +1,14 @@
|
|
1 |
"""
|
2 |
-
Module which updates any of the issues to reflect changes in the issue state
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
"""
|
4 |
import argparse
|
5 |
import json
|
@@ -9,7 +18,7 @@ import os
|
|
9 |
import numpy as np
|
10 |
import requests
|
11 |
|
12 |
-
from defaults import TOKEN, OWNER, REPO, GITHUB_API_VERSION
|
13 |
|
14 |
logging.basicConfig(level=logging.INFO)
|
15 |
logger = logging.getLogger(__name__)
|
@@ -18,8 +27,8 @@ JSON_FILE = "issues.json"
|
|
18 |
|
19 |
|
20 |
def update_issues(
|
21 |
-
input_filename=
|
22 |
-
output_filename=
|
23 |
github_api_version=GITHUB_API_VERSION,
|
24 |
owner=OWNER,
|
25 |
repo=REPO,
|
|
|
1 |
"""
|
2 |
+
Module which updates any of the issues to reflect changes in the issue state e.g. new comments
|
3 |
+
|
4 |
+
The module can be run from the command line using the following arguments:
|
5 |
+
--input_filename: The name of the input file containing the issues
|
6 |
+
--output_filename: The name of the output file to save the updated issues
|
7 |
+
--github_api_version: The version of the GitHub API to use
|
8 |
+
--owner: The owner of the repo
|
9 |
+
--repo: The name of the repo
|
10 |
+
--token: The GitHub token to use
|
11 |
+
--n_pages: The number of pages to fetch. Useful for testing
|
12 |
"""
|
13 |
import argparse
|
14 |
import json
|
|
|
18 |
import numpy as np
|
19 |
import requests
|
20 |
|
21 |
+
from defaults import TOKEN, OWNER, REPO, GITHUB_API_VERSION, ISSUE_JSON_FILE
|
22 |
|
23 |
logging.basicConfig(level=logging.INFO)
|
24 |
logger = logging.getLogger(__name__)
|
|
|
27 |
|
28 |
|
29 |
def update_issues(
|
30 |
+
input_filename=ISSUE_JSON_FILE,
|
31 |
+
output_filename=ISSUE_JSON_FILE,
|
32 |
github_api_version=GITHUB_API_VERSION,
|
33 |
owner=OWNER,
|
34 |
repo=REPO,
|