daqc committed on
Commit
52b089b
·
verified ·
1 Parent(s): e728c21

Upload 36 files

Browse files
.env.template ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copy this file to .env and uncomment/modify the variables you need.
2
+
3
+ # NOTES:
4
+ # 1. For Hugging Face usage: only HF_TOKEN is required (or it can be set in the UI)
5
+ # 2. For Ollama usage: Uncomment MODEL_ID, OPENAI_API_BASE, and OPENAI_API_KEY
6
+
7
+
8
+ # HUGGING FACE CONFIGURATION (RECOMMENDED)
9
+ HF_TOKEN=your_huggingface_token_here
10
+ MODEL_ID=Qwen/Qwen2.5-Coder-32B-Instruct
11
+
12
+
13
+ # OLLAMA CONFIGURATION (LOCAL MODELS)
14
+ # To use Ollama with local models, uncomment and configure these variables:
15
+ # This uses OpenAI-compatible variables for Ollama integration
16
+
17
+ # MODEL_ID=qwen2.5-coder:7b # replace with any model (qwen2.5-coder is recommended)
18
+ # OPENAI_API_BASE=http://localhost:11434/v1
19
+ # OPENAI_API_KEY=ollama
20
+
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python virtual environment
2
+ venv/
3
+ env/
4
+ ENV/
5
+
6
+ # Python cache files
7
+ __pycache__/
8
+ *.py[cod]
9
+ *$py.class
10
+
11
+ # Distribution / packaging
12
+ dist/
13
+ build/
14
+ *.egg-info/
15
+
16
+ # Environment variables
17
+ .env
18
+
19
+ # IDE specific files
20
+ .idea/
21
+ .vscode/
22
+ *.swp
23
+ *.swo
24
+
25
+ # Project specific
26
+ downloads_folder/
27
+ *.log
28
+
29
+ # Jupyter Notebook
30
+ .ipynb_checkpoints
31
+
32
+ # Local development settings
33
+ *.local
34
+
35
+ .gradio
LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Open Deep Research Vulnerability Intelligence
2
+
3
+ **Open Deep Research Vulnerability Intelligence** is an AI-powered platform for automated vulnerability and threat intelligence research, built on [Hugging Face's Open Deep Research](https://huggingface.co/blog/open-deep-research) architecture.
4
+
5
+ ## 🎯 What it does
6
+
7
+ This AI agent specializes in automated vulnerability research and analysis. It searches across multiple security databases to provide comprehensive vulnerability intelligence reports with CVSS scores, EPSS predictions, and remediation advice.
8
+
9
+ ## 🎯 Motivation
10
+
11
+ The rapid growth of software systems and the increasing complexity of digital infrastructures have led to an explosion in the number and diversity of software vulnerabilities. Traditional manual approaches to vulnerability intelligence are no longer sufficient to keep pace with the evolving threat landscape. There is a critical need for automated, scalable, and intelligent systems that can:
12
+
13
+ - Aggregate and correlate data from multiple heterogeneous sources (NVD, CVEDB, KEV, EPSS, etc.)
14
+ - Provide timely, actionable insights for security analysts and decision-makers
15
+ - Reduce the cognitive load and manual effort required for vulnerability triage and reporting
16
+ - Enable reproducible, transparent, and explainable research in vulnerability intelligence
17
+
18
+ This project addresses these challenges by leveraging state-of-the-art language models and multi-source data aggregation, providing a research-grade platform for both academic and industry use.
19
+
20
+ ## 🛠️ Available Tools & APIs
21
+
22
+ - **🛡️ [NIST NVD](https://nvd.nist.gov/)** - National Vulnerability Database (free API)
23
+ - **📊 [Shodan CVEDB](https://cvedb.com/)** - Comprehensive vulnerability database (free API)
24
+ - **⚠️ [KEVin](https://kevin.gtfkd.com/)** - Known Exploited Vulnerabilities database (free API)
25
+ - **📈 [EPSS](https://www.first.org/epss/)** - Exploit Prediction Scoring System (free API)
26
+ - **🌐 Web Browser** - Navigate and extract information from web pages
27
+
28
+ ## 🚀 Features
29
+
30
+ - **Multi-Source Intelligence**: Searches NVD, CVEDB, KEV, EPSS, and web sources
31
+ - **Smart Product Detection**: Automatically strips version numbers for accurate searches
32
+ - **Comprehensive Reports**: Generates detailed vulnerability reports with hyperlinks
33
+ - **Session Management**: Secure API key handling with session-based storage
34
+ - **Responsive UI**: Works on desktop and mobile devices
35
+ - **Example Prompts**: Built-in examples to get started quickly
36
+
37
+ ## 📋 Requirements
38
+
39
+ - Python 3.8+
40
+ - Hugging Face API key (free)
41
+ - Internet connection
42
+ - **Optional**: [Ollama](https://ollama.ai/) for local model inference
43
+
44
+ ## 🚀 Quick Start
45
+
46
+ ```bash
47
+ # Clone the repository
48
+ git clone https://github.com/mcdaqc/open-deep-research-vulnerability-intelligence.git
49
+ cd open-deep-research-vulnerability-intelligence
50
+
51
+ # Create virtual environment
52
+ python -m venv venv
53
+ venv\Scripts\activate # Windows
54
+ # source venv/bin/activate # Linux/Mac
55
+
56
+ # Install dependencies
57
+ pip install -r requirements.txt
58
+
59
+ # Run the application
60
+ python app.py
61
+ ```
62
+
63
+
64
+
65
+
66
+
67
+ ## 🏗️ Project Structure
68
+
69
+ ```
70
+ ├── app.py # Main application with Gradio UI
71
+ ├── scripts/ # Tool implementations
72
+ ├── cvedb_tool.py # Shodan CVEDB integration
73
+ ├── nvd_tool.py # NIST NVD integration
74
+ ├── kevin_tool.py # KEVin database integration
75
+ ├── epss_tool.py # EPSS scoring integration
76
+ └── text_web_browser.py # Web browsing capabilities
77
+ ```
78
+
79
+
80
+
81
+ ---
82
+
83
+ **Powered by** <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/smolagents/mascot_smol.png" width="20" height="20" style="vertical-align: middle; margin-right: 8px;"> **[hf/smolagents](https://github.com/huggingface/smolagents)**
app.py ADDED
@@ -0,0 +1,999 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import mimetypes
3
+ import os
4
+ import re
5
+ import shutil
6
+ import threading
7
+ from typing import Optional
8
+ from loguru import logger
9
+ from datetime import datetime
10
+
11
+ import gradio as gr
12
+ from dotenv import load_dotenv
13
+ from huggingface_hub import login, HfApi
14
+ from smolagents import (
15
+ CodeAgent,
16
+ InferenceClientModel,
17
+ Tool,
18
+ DuckDuckGoSearchTool,
19
+ )
20
+ from smolagents.agent_types import (
21
+ AgentAudio,
22
+ AgentImage,
23
+ AgentText,
24
+ handle_agent_output_types,
25
+ )
26
+ from smolagents.gradio_ui import stream_to_gradio
27
+
28
+ from scripts.text_inspector_tool import TextInspectorTool
29
+ from scripts.text_web_browser import (
30
+ ArchiveSearchTool,
31
+ FinderTool,
32
+ FindNextTool,
33
+ PageDownTool,
34
+ PageUpTool,
35
+ SimpleTextBrowser,
36
+ VisitTool,
37
+ )
38
+ from scripts.visual_qa import visualizer
39
+ from scripts.cvedb_tool import CVEDBTool
40
+ from scripts.report_generator import ReportGeneratorTool
41
+ from scripts.epss_tool import EpsTool
42
+ from scripts.nvd_tool import NvdTool
43
+ from scripts.kevin_tool import KevinTool
44
+
45
+ # web_search = GoogleSearchTool(provider="serper")
46
+ web_search = DuckDuckGoSearchTool()
47
+
48
+ AUTHORIZED_IMPORTS = [
49
+ "requests",
50
+ "zipfile",
51
+ "pandas",
52
+ "numpy",
53
+ "sympy",
54
+ "json",
55
+ "bs4",
56
+ "pubchempy",
57
+ "xml",
58
+ "yahoo_finance",
59
+ "Bio",
60
+ "sklearn",
61
+ "scipy",
62
+ "pydub",
63
+ "PIL",
64
+ "chess",
65
+ "PyPDF2",
66
+ "pptx",
67
+ "torch",
68
+ "datetime",
69
+ "fractions",
70
+ "csv",
71
+ "plotly",
72
+ "plotly.express",
73
+ "plotly.graph_objects",
74
+ "jinja2",
75
+ ]
76
+
77
+ load_dotenv(override=True)
78
+
79
+ # Only login if HF_TOKEN is available and valid in environment
80
+ if os.getenv("HF_TOKEN"):
81
+ try:
82
+ login(os.getenv("HF_TOKEN"))
83
+ logger.info("Successfully logged in with HF_TOKEN from environment")
84
+ except Exception as e:
85
+ logger.warning(f"Failed to login with HF_TOKEN from environment: {e}")
86
+ logger.info("You can still use the application by providing a valid API key in the interface")
87
+
88
+ append_answer_lock = threading.Lock()
89
+
90
+ custom_role_conversions = {"tool-call": "assistant", "tool-response": "user"}
91
+
92
+ user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
93
+
94
+ BROWSER_CONFIG = {
95
+ "viewport_size": 1024 * 5,
96
+ "downloads_folder": "downloads_folder",
97
+ "request_kwargs": {
98
+ "headers": {"User-Agent": user_agent},
99
+ "timeout": 300,
100
+ },
101
+ "serpapi_key": os.getenv("SERPAPI_API_KEY"),
102
+ }
103
+
104
+ os.makedirs(f"./{BROWSER_CONFIG['downloads_folder']}", exist_ok=True)
105
+
106
+ # Default Hugging Face model configuration
107
+ model_id = os.getenv("MODEL_ID", "Qwen/Qwen2.5-Coder-32B-Instruct")
108
+ logger.info(f"Default Hugging Face model: {model_id}")
109
+
110
+ # Define text_limit before using it
111
+ text_limit = 20000
112
+
113
+ # Default model (will be overridden by session-specific models)
114
+ # Note: This model may not work without a valid API key
115
+ default_model = None
116
+ ti_tool = None
117
+
118
+ # Only try to create default model if we have a valid token
119
+ if os.getenv("HF_TOKEN"):
120
+ try:
121
+ # Test if the token is valid
122
+ api = HfApi(token=os.getenv("HF_TOKEN"))
123
+ api.whoami() # This will raise an exception if token is invalid
124
+ # If we get here, token is valid
125
+ default_model = InferenceClientModel(
126
+ model_id,
127
+ custom_role_conversions={
128
+ "tool-call": "assistant",
129
+ "tool-response": "user"
130
+ },
131
+ token=os.getenv("HF_TOKEN")
132
+ )
133
+ ti_tool = TextInspectorTool(default_model, text_limit)
134
+ logger.info("Default model created successfully with valid token")
135
+ except Exception as e:
136
+ logger.warning(f"Failed to create default model: {e}")
137
+ default_model = None
138
+ ti_tool = None
139
+ else:
140
+ logger.info("No HF_TOKEN provided, default model will be created when user provides API key")
141
+
142
+ browser = SimpleTextBrowser(**BROWSER_CONFIG)
143
+
144
+ # Tool configuration
145
+ cvedb_tool = CVEDBTool()
146
+ report_generator = ReportGeneratorTool()
147
+ epss_tool = EpsTool()
148
+ nvd_tool = NvdTool()
149
+ kevin_tool = KevinTool()
150
+
151
+ # Default tools (will be updated with session-specific models)
152
+ WEB_TOOLS = [
153
+ web_search, # duckduckgo
154
+ VisitTool(browser),
155
+ PageUpTool(browser),
156
+ PageDownTool(browser),
157
+ FinderTool(browser),
158
+ FindNextTool(browser),
159
+ ArchiveSearchTool(browser),
160
+ ] + ([ti_tool] if ti_tool else []) + [
161
+ cvedb_tool, # CVEDB Tool
162
+ # report_generator, # Report generation tool - COMMENTED: Only works locally
163
+ epss_tool, # EPSS Tool
164
+ nvd_tool, # NVD Tool
165
+ kevin_tool, # KEVin Tool
166
+ ]
167
+
168
def validate_hf_api_key(api_key: str) -> tuple[bool, str]:
    """Check that a Hugging Face API key is well-formed and accepted by the Hub.

    Returns a ``(is_valid, message)`` pair, where *message* is a user-facing
    status string suitable for display in the UI.
    """
    # Reject missing or blank keys before doing anything else.
    if not api_key or not api_key.strip():
        return False, "❌ API key cannot be empty"

    token = api_key.strip()

    # Hugging Face user tokens always carry the "hf_" prefix.
    if not token.startswith("hf_"):
        return False, "❌ Invalid API key format. Hugging Face API keys start with 'hf_'"

    try:
        # Round-trip to the Hub: whoami() raises if the token is rejected.
        hub = HfApi(token=token)
        account = hub.whoami()
        return True, f"✅ API key validated successfully! Welcome, {account.get('name', 'User')}!"
    except Exception as exc:
        return False, f"❌ Invalid API key: {str(exc)}"
187
+
188
def create_model_with_api_key(hf_token: str, model_id: Optional[str] = None) -> InferenceClientModel:
    """Create a new InferenceClientModel instance with the provided HF_TOKEN.

    Args:
        hf_token: Hugging Face access token used for hub login and inference.
        model_id: Model repository id; falls back to the MODEL_ID environment
            variable, then to the default Qwen coder model.

    Returns:
        An InferenceClientModel configured with the role conversions that
        smolagents expects.
    """
    if not model_id:
        model_id = os.getenv("MODEL_ID", "Qwen/Qwen2.5-Coder-32B-Instruct")

    # Log only a short prefix of the token so the full secret never lands in logs.
    logger.info(f"Creating model {model_id} with token: {hf_token[:10]}...")

    # Login first so downstream hub calls pick up the credential as well.
    # A failed login is non-fatal: the explicit token below may still work.
    try:
        login(hf_token)
        logger.info("Successfully logged in with token")
    except Exception as e:
        logger.warning(f"Login failed: {e}")

    # Create the model with an explicit token.
    model = InferenceClientModel(
        model_id,
        custom_role_conversions={
            "tool-call": "assistant",
            "tool-response": "user"
        },
        token=hf_token
    )

    # Verify the token is set correctly on the model object.
    if hasattr(model, 'token'):
        logger.info(f"Model token attribute: {model.token[:10] if model.token else 'None'}...")
    else:
        logger.warning("Model does not have token attribute")

    # Smoke-test the model with a tiny generation; failures are logged but not
    # raised so the UI can surface a more useful error later.
    try:
        logger.info("Testing model with simple request...")
        model.generate("Hello", max_new_tokens=5)
        logger.info("Model test successful")
    except Exception as e:
        logger.error(f"Model test failed: {e}")

    logger.info("Model created successfully with token")
    return model
230
+
231
def create_tools_with_model(model: InferenceClientModel) -> list:
    """Assemble the full tool list for an agent backed by *model*."""
    # Web navigation tools all share the module-level browser instance.
    browsing_tools = [
        web_search,  # duckduckgo
        VisitTool(browser),
        PageUpTool(browser),
        PageDownTool(browser),
        FinderTool(browser),
        FindNextTool(browser),
        ArchiveSearchTool(browser),
    ]
    # Vulnerability-intelligence tools. report_generator is intentionally
    # excluded: it only works locally.
    intel_tools = [
        cvedb_tool,   # CVEDB Tool
        epss_tool,    # EPSS Tool
        nvd_tool,     # NVD Tool
        kevin_tool,   # KEVin Tool
    ]
    return browsing_tools + [TextInspectorTool(model, text_limit)] + intel_tools
248
+
249
# Agent creation in a factory function
def create_agent(hf_token: Optional[str] = None, model_id: Optional[str] = None, max_steps: int = 10):
    """Creates a fresh agent instance for each session.

    Args:
        hf_token: Hugging Face access token (required).
        model_id: Optional model repository id override.
        max_steps: Maximum number of reasoning steps for the agent.

    Returns:
        A configured CodeAgent.

    Raises:
        ValueError: If no Hugging Face token is supplied.
    """
    if not hf_token:
        raise ValueError("A valid Hugging Face API key is required to create an agent.")

    # Log only a short prefix of the token, never the full secret.
    logger.info(f"Creating agent with token: {hf_token[:10]}...")

    # Use session-specific model with HF_TOKEN
    model = create_model_with_api_key(hf_token, model_id)
    tools = create_tools_with_model(model)

    agent = CodeAgent(
        model=model,
        tools=[visualizer] + tools,
        max_steps=max_steps,
        verbosity_level=1,
        additional_authorized_imports=AUTHORIZED_IMPORTS,
        planning_interval=4,
    )

    logger.info("Agent created successfully")
    return agent
272
+
273
# Only create document_inspection_tool if default_model is available.
# Use the shared text_limit (20000) instead of a duplicated literal so the
# inspection limit stays consistent with the tools built elsewhere.
if default_model:
    document_inspection_tool = TextInspectorTool(default_model, text_limit)
else:
    document_inspection_tool = None
278
+
279
+ class GradioUI:
280
+ """A one-line interface to launch your agent in Gradio"""
281
+
282
+ def __init__(self, file_upload_folder: str | None = None):
283
+ self.file_upload_folder = file_upload_folder
284
+ if self.file_upload_folder is not None:
285
+ if not os.path.exists(file_upload_folder):
286
+ os.mkdir(file_upload_folder)
287
+ # Create reports directory
288
+ self.reports_folder = "reports"
289
+ if not os.path.exists(self.reports_folder):
290
+ os.mkdir(self.reports_folder)
291
+
292
+ def save_report(self, html_content: str) -> str:
293
+ """Saves the HTML report and returns the file path."""
294
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
295
+ filename = f"vulnerability_report_{timestamp}.html"
296
+ filepath = os.path.join(self.reports_folder, filename)
297
+
298
+ with open(filepath, "w", encoding="utf-8") as f:
299
+ f.write(html_content)
300
+
301
+ return filepath
302
+
303
+ def validate_api_key(self, api_key: str) -> tuple[str, str]:
304
+ """Validate API key and return status message."""
305
+ is_valid, message = validate_hf_api_key(api_key)
306
+ if is_valid:
307
+ return message, "success"
308
+ else:
309
+ return message, "error"
310
+
311
def interact_with_agent(self, prompt, messages, session_state):
    """Run the agent on *prompt*, streaming chat updates to Gradio.

    Generator used as a streaming event handler: it yields the growing
    ``messages`` list after every update so the chatbot re-renders
    incrementally. Lazily creates a per-session agent on first use,
    resolving the HF token from ``session_state`` or the environment.
    """
    # Get or create session-specific agent
    if "agent" not in session_state:
        # Check if we have a valid HF_TOKEN in session
        hf_token = session_state.get("hf_token")

        # If no token in session, try to get it from .env file
        if not hf_token:
            env_token = os.getenv("HF_TOKEN")
            if env_token:
                hf_token = env_token
                session_state["hf_token"] = env_token
                session_state["max_steps"] = 10  # Default max_steps
                # Only a token prefix is logged, never the full secret.
                logger.info(f"Using HF_TOKEN from .env file: {env_token[:10]}...")
            else:
                # No token anywhere: surface the error in the chat and stop.
                logger.warning("No API key found in session state or .env file")
                error_msg = "❌ No API key provided. Please enter your Hugging Face API key in the API Configuration section above or set HF_TOKEN in your .env file."
                messages.append(gr.ChatMessage(role="assistant", content=error_msg))
                yield messages
                return

        logger.info(f"Agent not in session, checking for token: {hf_token[:10] if hf_token else 'None'}...")

        if hf_token:
            try:
                max_steps = session_state.get("max_steps", 10)
                session_state["agent"] = create_agent(hf_token, max_steps=max_steps)
                logger.info("Agent created successfully in interact_with_agent")
            except Exception as e:
                # Agent construction failed (e.g. bad token): report and stop.
                logger.error(f"Failed to create agent in interact_with_agent: {e}")
                error_msg = f"❌ Failed to create agent with provided API key: {str(e)}"
                messages.append(gr.ChatMessage(role="assistant", content=error_msg))
                yield messages
                return
    else:
        logger.info("Agent already exists in session")

    # Adding monitoring
    try:
        # log the existence of agent memory (diagnostic only)
        has_memory = hasattr(session_state["agent"], "memory")
        print(f"Agent has memory: {has_memory}")
        if has_memory:
            print(f"Memory type: {type(session_state['agent'].memory)}")

        # Prepare the system prompt. NOTE(review): this is prepended to the
        # raw user prompt below; only {session_state.get('max_steps', 10)}
        # is interpolated — the rest is literal instruction text.
        system_prompt = f"""You are a Vulnerability Intelligence Analyst. Complete the user request in {session_state.get('max_steps', 10)} steps maximum.

AVAILABLE TOOLS: nvd_search, web_search, cvedb_search, kevin_search, epss_search

CRITICAL RULES:
1. VERSION ANALYSIS:
   - FIRST: Check current version via web search to understand the latest available version
   - PRIORITY: When user asks for specific version, focus ONLY on vulnerabilities affecting that version and newer
   - EXCLUDE OLDER: Do NOT report vulnerabilities that only affect older versions
   - VERSION LOGIC:
     * "up to X" or "before X" = affects versions UP TO X, NOT newer versions
     * "X+" or "X and later" = affects X and newer versions
     * "X through Y" = affects versions X to Y inclusive
   - CORRECT LOGIC: If CVE affects "up to v22.1" and user asks about v24.0 then v24.0 is NOT vulnerable
   - CORRECT LOGIC: If CVE affects "v25.0+" and user asks about v24.0 then v24.0 is NOT vulnerable
   - CORRECT LOGIC: If CVE affects "below v25.0" and user asks about v24.0 then v24.0 is vulnerable
2. DATES: Use current date for "today" or "current". Use: from datetime import datetime; today = datetime.now().strftime("%Y-%m-%d")
3. PRODUCT SEARCH: ALWAYS use ONLY the base product name, NEVER include versions when using vulnerability tools (nvd_search, cvedb_search, kevin_search, epss_search)
4. SOURCES: Always prioritize vendor/original sources for CVE, CWE, and reference links (official vendor websites, security advisories)
5. SIMPLICITY: Keep code simple and logical. Avoid unnecessary library imports. Use only basic Python functions when needed.
6. TOOL USAGE: Use ONLY the available tools. Do not complicate tool calls with unnecessary code.
7. STRING OPERATIONS: Use simple Python methods (in, find, startswith, endswith, etc.). NEVER use .contains() - it doesn't exist in Python. Avoid complex string parsing of tool results.
8. VULNERABILITY ANALYSIS: When analyzing tool results:
   - READ CAREFULLY: Pay attention to version ranges in vulnerability descriptions
   - "up to X" means versions UP TO X, NOT including newer versions
   - "below X" means versions BELOW X, NOT including X or newer
   - "X+" means X and newer versions
   - ONLY include vulnerabilities that actually affect the requested version
   - If unsure about version compatibility, exclude the vulnerability
   - DO NOT REASON: Don't create complex logic about version compatibility
   - DO NOT ASSUME: If a CVE affects "up to 22.1", it does NOT affect 24.0
   - SIMPLE RULE: Only include CVEs where the version range explicitly includes the requested version
9. REPORT GENERATION: Do NOT create complex functions, or loops for the final answer. Use the information collected and format it directly following the REPORT FORMAT.

REPORT FORMAT:
# Vulnerability Report
### [Software and Version]

#### CVE-ID: [CVE-YYYY-NNNNN]
**NIST NVD Link:** https://nvd.nist.gov/vuln/detail/CVE-YYYY-NNNNN
- **Attack Type:** [Type]
- **Published:** [Date]
- **CVSS:** [Score]
- **EPSS:** [Score]
- **KEV:** [Yes/No]
- **Affected Versions:** [Specific range]
- **Description:** [Description]
- **CWE:** [CWE-XXX] - https://cwe.mitre.org/data/definitions/XXX.html
- **Recommendations:** [Remediation advice]
- **Sources:**
  - https://oficial-vendor-website.com/security-advisory
  - https://official-security-source.com
  - https://additional-reference-source.com

INSTRUCTIONS:
- Follow the exact format above
- Use official vendor sources when available
- Complete the task efficiently within the step limit

Now it is your turn, remember to keep code simple.
User Query: """

        # Combine system prompt with user message
        full_prompt = system_prompt + prompt

        messages.append(gr.ChatMessage(role="user", content=prompt))
        yield messages

        logger.info("Starting agent interaction...")
        for msg in stream_to_gradio(
            session_state["agent"], task=full_prompt, reset_agent_memory=False
        ):
            # If the message contains an HTML report, we save it and update the message
            if isinstance(msg.content, str) and msg.content.startswith("<!DOCTYPE html>"):
                report_path = self.save_report(msg.content)
                msg.content = f"Report generated and saved at: {report_path}\n\nYou can open the file in your browser to view the complete report."

            messages.append(msg)
            yield messages
        # Final yield so the UI reflects the completed message list.
        yield messages
    except Exception as e:
        logger.error(f"Error in interaction: {str(e)}")
        print(f"Error in interaction: {str(e)}")
        error_msg = f"❌ Error during interaction: {str(e)}"
        messages.append(gr.ChatMessage(role="assistant", content=error_msg))
        yield messages
443
+
444
def setup_api_key(self, api_key: str, max_steps: int, session_state) -> str:
    """Setup API key for the session.

    Resolves the token (UI field first, then the HF_TOKEN environment
    variable), validates it, stores it in ``session_state`` together with
    ``max_steps``, exports it via ``os.environ`` and builds the agent.

    Returns:
        A human-readable status string (prefixed ✅ / ❌) for the UI.
    """
    # Check if API key is provided from interface
    if api_key and api_key.strip():
        # Use the API key from interface
        token_to_use = api_key.strip()
        source = "interface"
    else:
        # Try to use token from .env file
        env_token = os.getenv("HF_TOKEN")
        if env_token:
            token_to_use = env_token
            source = ".env file"
        else:
            return "❌ No API key provided. Please enter your Hugging Face API key or set HF_TOKEN in your .env file."

    # Validate the token
    is_valid, message = validate_hf_api_key(token_to_use)

    if is_valid:
        # Store HF_TOKEN in session state
        session_state["hf_token"] = token_to_use
        session_state["max_steps"] = max_steps
        # Only a short prefix of the secret is ever logged.
        logger.info(f"API key stored in session from {source}: {token_to_use[:10]}...")
        logger.info(f"Max steps set to: {max_steps}")

        # Also set the environment variable for smolagents
        os.environ["HF_TOKEN"] = token_to_use
        logger.info("HF_TOKEN environment variable set")

        # Create new agent with the HF_TOKEN and max_steps
        try:
            session_state["agent"] = create_agent(token_to_use, max_steps=max_steps)
            logger.info("Agent created successfully in setup_api_key")
            # NOTE(review): assumes the validator message has the form
            # "...! <details>" — the split extracts the trailing details.
            return f"✅ API key from {source} validated and agent created successfully! {message.split('!')[1] if '!' in message else ''}"
        except Exception as e:
            logger.error(f"Failed to create agent in setup_api_key: {e}")
            return f"❌ Failed to create agent with API key from {source}: {str(e)}"
    else:
        logger.warning(f"Invalid API key from {source}: {token_to_use[:10] if token_to_use else 'None'}...")
        return f"❌ Invalid API key from {source}: {message}"
485
+
486
def upload_file(
    self,
    file,
    file_uploads_log,
    allowed_file_types=(
        "application/pdf",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "text/plain",
    ),
):
    """Handle a file upload from the Gradio ``gr.File`` widget.

    Default allowed types are .pdf, .docx, and .txt.

    Args:
        file: Gradio file object (``file.name`` is the temp path) or None.
        file_uploads_log: Running list of saved upload paths.
        allowed_file_types: MIME types accepted for upload. (Immutable
            tuple default — a shared mutable list default is a footgun.)

    Returns:
        ``(status_textbox, updated_file_uploads_log)``.
    """
    if file is None:
        return gr.Textbox("No file uploaded", visible=True), file_uploads_log

    try:
        mime_type, _ = mimetypes.guess_type(file.name)
    except Exception as e:
        return gr.Textbox(f"Error: {e}", visible=True), file_uploads_log

    if mime_type not in allowed_file_types:
        return gr.Textbox("File type disallowed", visible=True), file_uploads_log

    # Sanitize file name: replace any non-alphanumeric, non-dash, or
    # non-dot characters with underscores.
    original_name = os.path.basename(file.name)
    sanitized_name = re.sub(r"[^\w\-.]", "_", original_name)

    # Force the extension to match the detected MIME type.
    # BUG FIX: the previous split(".")[:-1] logic discarded the entire
    # base name for files without an extension (producing a dot-file
    # like ".pdf"); splitext + guess_extension keeps the name intact.
    stem, current_ext = os.path.splitext(sanitized_name)
    extension = mimetypes.guess_extension(mime_type) or current_ext
    sanitized_name = stem + extension

    # Save the uploaded file to the specified folder.
    file_path = os.path.join(
        self.file_upload_folder, os.path.basename(sanitized_name)
    )
    shutil.copy(file.name, file_path)

    return gr.Textbox(
        f"File uploaded: {file_path}", visible=True
    ), file_uploads_log + [file_path]
535
+
536
def log_user_message(self, text_input, file_uploads_log):
    """Prepare the user message and lock the input widgets.

    Appends a note listing any uploaded files to the message, then
    returns replacement components that disable the textbox and Run
    button while the agent is working.
    """
    message = text_input
    if file_uploads_log:
        message += (
            "\nYou have been provided with these files, which might be helpful or not: "
            f"{file_uploads_log}"
        )
    locked_input = gr.Textbox(
        value="",
        interactive=False,
        placeholder="Please wait while Steps are getting populated",
    )
    return message, locked_input, gr.Button(interactive=False)
551
+
552
def detect_device(self, request: gr.Request):
    """Classify the requesting client as "Mobile" or "Desktop".

    Checks, in order: the sec-ch-ua-mobile client hint, user-agent
    keywords, and the sec-ch-ua-platform hint. Returns "Unknown device"
    when no request is available, and defaults to "Desktop" otherwise.
    """
    if not request:
        return "Unknown device"

    headers = request.headers

    # 1) sec-ch-ua-mobile client hint: "?1" marks a mobile browser.
    mobile_hint = headers.get("sec-ch-ua-mobile")
    if mobile_hint:
        return "Mobile" if "?1" in mobile_hint else "Desktop"

    # 2) Classic user-agent keyword sniffing.
    user_agent = headers.get("user-agent", "").lower()
    if any(kw in user_agent for kw in ("android", "iphone", "ipad", "mobile", "phone")):
        return "Mobile"

    # 3) sec-ch-ua-platform hint (values arrive quoted).
    platform = headers.get("sec-ch-ua-platform", "").lower()
    if platform in ('"android"', '"ios"'):
        return "Mobile"
    if platform in ('"windows"', '"macos"', '"linux"'):
        return "Desktop"

    # No clear indicator: assume a desktop client.
    return "Desktop"
579
+
580
def launch(self, **kwargs):
    """Build the Gradio application and launch it (blocking).

    Renders a sidebar layout for desktop clients and a simplified
    single-column layout for mobile clients, chosen per request via
    ``detect_device``.

    Args:
        **kwargs: Forwarded to ``gr.Blocks.launch`` (e.g. ``share=True``,
            ``mcp_server=True``).
    """

    def render_header():
        # Project title and repository link shown at the top of both layouts.
        gr.Markdown("""# Open Deep Research Vulnerability Intelligence""")
        gr.Markdown("""<img src="https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png" width="20" height="20" style="display: inline-block; vertical-align: middle; margin-right: 8px;"> <a href="https://github.com/mcdaqc/open-deep-research-vulnerability-intelligence" target="_blank">Github Repository</a>""")

    def render_about():
        # Shared "About" accordion (identical text on both layouts).
        with gr.Accordion("ℹ️ About", open=False):
            gr.Markdown("""**What it does:**
This AI agent specializes in automated vulnerability research and analysis, built on <a href="https://huggingface.co/blog/open-deep-research" target="_blank">Hugging Face's Open Deep Research</a> architecture. It can search across multiple security databases to provide comprehensive vulnerability intelligence reports.

**Available Tools & APIs:**
- <a href="https://nvd.nist.gov/developers/vulnerabilities" target="_blank">🛡️ NIST NVD</a> - National Vulnerability Database (free API)
- <a href="https://cvedb.com/" target="_blank">📊 Shodan CVEDB</a> - Comprehensive vulnerability database (free API)
- <a href="https://kevin.gtfkd.com/" target="_blank">⚠️ KEVin</a> - Known Exploited Vulnerabilities database (free API)
- <a href="https://www.first.org/epss/" target="_blank">📈 EPSS</a> - Exploit Prediction Scoring System (free API)
- 🌐 **Web Browser** - Navigate and extract information from web pages

**Model Configuration:**
- **Default Model**: Qwen/Qwen2.5-Coder-32B-Instruct (recommended)
- **Alternative**: You can also use Ollama with local models for privacy

**How to use:**
1. Enter your Hugging Face API key below
2. Ask about specific software versions, CVEs, or security vulnerabilities
3. The agent will automatically search all available databases
4. Receive comprehensive vulnerability reports with CVSS scores, EPSS predictions, and remediation advice""")

    def render_powered_by():
        # smolagents attribution footer.
        with gr.Row():
            gr.HTML("""<div style="display: flex; align-items: center; gap: 8px; font-family: system-ui, -apple-system, sans-serif;">Powered by
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/smolagents/mascot_smol.png" style="width: 32px; height: 32px; object-fit: contain;" alt="logo">
<a target="_blank" href="https://github.com/huggingface/smolagents"><b>hf/smolagents</b></a>
</div>""")

    def render_example_buttons(text_input, examples):
        # One small button per (label, prompt); clicking fills the request box.
        # Default-arg binding (p=prompt) avoids the late-binding lambda trap.
        gr.Markdown("**Click any example below to populate your request field:**")
        for label, prompt in examples:
            btn = gr.Button(label, size="sm", variant="secondary")
            btn.click(lambda p=prompt: p, None, [text_input])

    def wire_run_events(text_input, run_btn, file_uploads_log, stored_messages, chatbot, session_state):
        # Enter-submit and the Run button share one pipeline: lock inputs,
        # stream the agent run into the chatbot, then unlock inputs.
        def unlock_inputs():
            return (
                gr.Textbox(
                    interactive=True,
                    placeholder="Enter your prompt here and press the button",
                ),
                gr.Button(interactive=True),
            )

        for trigger in (text_input.submit, run_btn.click):
            trigger(
                self.log_user_message,
                [text_input, file_uploads_log],
                [stored_messages, text_input, run_btn],
            ).then(
                self.interact_with_agent,
                [stored_messages, chatbot, session_state],
                [chatbot],
            ).then(unlock_inputs, None, [text_input, run_btn])

    with gr.Blocks(theme="ocean", fill_height=True) as demo:
        # Different layouts for mobile and computer devices.
        @gr.render()
        def layout(request: gr.Request):
            device = self.detect_device(request)
            print(f"device - {device}")
            if device == "Desktop":
                # Desktop: full layout with sidebar.
                with gr.Blocks(fill_height=True):
                    file_uploads_log = gr.State([])
                    with gr.Sidebar():
                        render_header()
                        render_about()

                        with gr.Group():
                            gr.Markdown("**Your request**", container=True)
                            text_input = gr.Textbox(
                                lines=3,
                                label="Your request",
                                container=False,
                                placeholder="Enter your prompt here and press Shift+Enter or press the button",
                            )
                            launch_research_btn = gr.Button("Run", variant="primary")

                        with gr.Accordion("💡 Example Prompts", open=False):
                            render_example_buttons(text_input, [
                                ("🔍 MobaXterm 24.0 vulnerabilities", "Analyze MobaXterm 24.0 for vulnerabilities as of today"),
                                ("🔍 Chrome 120.0.6099.109 security issues", "Check Chrome 120.0.6099.109 for security vulnerabilities"),
                                ("🔍 Apache Tomcat 9.0.65 KEV check", "Is Apache Tomcat 9.0.65 in KEV database as of today?"),
                                ("🔍 Windows 11 recent vulnerabilities", "Check Windows 11 for recent vulnerabilities as of today"),
                                ("🔍 CVE-2024-0001 analysis", "Analyze CVE-2024-0001 in detail"),
                                ("🔍 Nginx 1.24.0 security status", "Check Nginx 1.24.0 for security vulnerabilities"),
                            ])

                        with gr.Accordion("🔑 API Configuration", open=False):
                            gr.Markdown("**Configure your Hugging Face API Key**")
                            gr.Markdown("All API keys are stored only in session memory, not persisted.")
                            gr.Markdown("Get your API key from: https://huggingface.co/settings/tokens")

                            api_key_input = gr.Textbox(
                                label="Hugging Face API Key",
                                placeholder="hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
                                type="password",
                                lines=1,
                            )
                            api_key_status = gr.Textbox(
                                label="Status",
                                value="✅ HF_TOKEN found in .env file. To use a different key, enter it above and click 'Setup API Key'." if os.getenv("HF_TOKEN") else "⚠️ Please enter your Hugging Face API key or set HF_TOKEN in your .env file.",
                                interactive=False,
                            )

                            gr.Markdown("**Agent Configuration**")
                            max_steps_slider = gr.Slider(
                                minimum=5,
                                maximum=30,
                                value=10,
                                step=1,
                                label="Maximum Steps",
                                info="Number of steps the agent can take per session (higher = more detailed but slower)",
                            )

                            setup_api_btn = gr.Button("Setup API Key", variant="secondary")

                        # If an upload folder is provided, enable the upload feature.
                        if self.file_upload_folder is not None:
                            upload_file = gr.File(label="Upload a file")
                            upload_status = gr.Textbox(
                                label="Upload Status",
                                interactive=False,
                                visible=False,
                            )
                            upload_file.change(
                                self.upload_file,
                                [upload_file, file_uploads_log],
                                [upload_status, file_uploads_log],
                            )

                        render_powered_by()

                    # Session-scoped state (token, agent, max_steps, history).
                    session_state = gr.State({})
                    stored_messages = gr.State([])
                    chatbot = gr.Chatbot(
                        label="open-Deep-Research",
                        type="messages",
                        avatar_images=(
                            None,
                            "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/smolagents/mascot_smol.png",
                        ),
                        resizeable=False,
                        scale=1,
                        elem_id="my-chatbot",
                    )

                    # Hidden placeholder for rendered HTML reports.
                    report_viewer = gr.HTML(label="Vulnerability Report", visible=False)

                    setup_api_btn.click(
                        self.setup_api_key,
                        [api_key_input, max_steps_slider, session_state],
                        [api_key_status],
                    )

                    wire_run_events(
                        text_input, launch_research_btn,
                        file_uploads_log, stored_messages, chatbot, session_state,
                    )
            else:
                # Mobile: simple single-column layout.
                with gr.Blocks(fill_height=True):
                    render_header()
                    render_about()

                    with gr.Accordion("🔑 API Configuration", open=False):
                        gr.Markdown("**Configure your Hugging Face API Key**")
                        gr.Markdown("Due to recent API changes, you need to provide your own Hugging Face API key to use this application.")
                        gr.Markdown("Get your API key from: https://huggingface.co/settings/tokens")

                        mobile_api_key_input = gr.Textbox(
                            label="Hugging Face API Key",
                            placeholder="hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
                            type="password",
                            lines=1,
                        )
                        mobile_api_key_status = gr.Textbox(
                            label="Status",
                            value="✅ HF_TOKEN found in .env file. To use a different key, enter it above and click 'Setup API Key'." if os.getenv("HF_TOKEN") else "⚠️ Please enter your Hugging Face API key or set HF_TOKEN in your .env file.",
                            interactive=False,
                        )

                        gr.Markdown("**Agent Configuration**")
                        mobile_max_steps_slider = gr.Slider(
                            minimum=5,
                            maximum=30,
                            value=10,
                            step=1,
                            label="Maximum Steps",
                            info="Number of steps the agent can take per session (higher = more detailed but slower)",
                        )

                        mobile_setup_api_btn = gr.Button("Setup API Key", variant="secondary")

                    session_state = gr.State({})
                    stored_messages = gr.State([])
                    file_uploads_log = gr.State([])
                    chatbot = gr.Chatbot(
                        label="open-Deep-Research",
                        type="messages",
                        avatar_images=(
                            None,
                            "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/smolagents/mascot_smol.png",
                        ),
                        resizeable=True,
                        scale=1,
                    )

                    mobile_setup_api_btn.click(
                        self.setup_api_key,
                        [mobile_api_key_input, mobile_max_steps_slider, session_state],
                        [mobile_api_key_status],
                    )

                    # BUG FIX: the original wired a first set of example
                    # buttons here, referencing text_input before it was
                    # defined (NameError on mobile render), and duplicated
                    # the footer. Only the post-definition set is kept.

                    text_input = gr.Textbox(
                        lines=1,
                        label="Your request",
                        placeholder="Enter your prompt here and press the button",
                    )
                    launch_research_btn = gr.Button("Run", variant="primary")

                    with gr.Accordion("💡 Example Prompts", open=False):
                        render_example_buttons(text_input, [
                            ("🔍 MobaXterm 24.0 vulnerabilities", "Analyze MobaXterm 24.0 for vulnerabilities as of today"),
                            ("🔍 Chrome 120.0.6099.109 security analysis", "Check Chrome 120.0.6099.109 for current security issues"),
                            ("🔍 Apache Tomcat 9.0.65 KEV check", "Is Apache Tomcat 9.0.65 in KEV database as of today?"),
                        ])

                    render_powered_by()

                    wire_run_events(
                        text_input, launch_research_btn,
                        file_uploads_log, stored_messages, chatbot, session_state,
                    )

    demo.launch(debug=True, **kwargs)
994
+
995
# Entry point: launch the UI with MCP server support.
# (This guard does not fix the unresponsive Ctrl-C issue; it only keeps
# the interrupt traceback out of the console.)
try:
    GradioUI().launch(mcp_server=True)
except KeyboardInterrupt:
    pass
requirements.txt ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core dependencies
2
+ smolagents==1.18.0
3
+ anthropic==0.54.0
4
+ beautifulsoup4==4.13.4
5
+ datasets==3.6.0
6
+ google_search_results==2.4.2
7
+ huggingface_hub==0.33.0
8
+ mammoth==1.9.1
9
+ markdownify==1.1.0
10
+ numexpr==2.11.0
11
+ numpy==2.2.6
12
+ openai==1.87.0
13
+ openpyxl==3.1.5
14
+ pandas==2.3.0
15
+ pathvalidate==3.3.1
16
+ pdfminer==20191125
17
+ pdfminer.six==20250506
18
+ Pillow==11.2.1
19
+ puremagic==1.29
20
+ pypdf==5.6.0
21
+ PyPDF2==3.0.1
22
+ python-dotenv==1.1.0
23
+ python-pptx==1.0.2
24
+ requests==2.32.4
25
+ serpapi==0.1.5
26
+ tqdm==4.67.1
27
+ torch==2.7.1
28
+ torchvision==0.22.1
29
+ transformers==4.52.4
30
+
31
+ # Additional libraries
32
+ chess==1.11.2
33
+ sympy==1.14.0
34
+ PubChemPy==1.0.4
35
+ scikit-learn==1.7.0
36
+ scipy==1.15.3
37
+ pydub==0.25.1
38
+ SpeechRecognition==3.14.3
39
+ xlrd==2.0.2
40
+
41
+ # Web search and utilities
42
+ duckduckgo_search==8.0.4
43
+ loguru==0.7.3
44
+
45
+ # UI and visualization
46
+ gradio==5.34.0
47
+ gradio_client==1.10.3
48
+ plotly==6.1.2
49
+ Jinja2==3.1.6
50
+
51
+ # MCP support
52
+ mcp==1.9.3
scripts/__pycache__/cookies.cpython-310.pyc ADDED
Binary file (10.1 kB). View file
 
scripts/__pycache__/cvedb_tool.cpython-310.pyc ADDED
Binary file (7.27 kB). View file
 
scripts/__pycache__/epss_tool.cpython-310.pyc ADDED
Binary file (4 kB). View file
 
scripts/__pycache__/kevin_tool.cpython-310.pyc ADDED
Binary file (8.43 kB). View file
 
scripts/__pycache__/markdown_report_tool.cpython-310.pyc ADDED
Binary file (7.54 kB). View file
 
scripts/__pycache__/mdconvert.cpython-310.pyc ADDED
Binary file (24.8 kB). View file
 
scripts/__pycache__/nvd_tool.cpython-310.pyc ADDED
Binary file (11.3 kB). View file
 
scripts/__pycache__/report_generator.cpython-310.pyc ADDED
Binary file (9.46 kB). View file
 
scripts/__pycache__/text_inspector_tool.cpython-310.pyc ADDED
Binary file (3.4 kB). View file
 
scripts/__pycache__/text_web_browser.cpython-310.pyc ADDED
Binary file (17.6 kB). View file
 
scripts/__pycache__/visual_qa.cpython-310.pyc ADDED
Binary file (3.42 kB). View file
 
scripts/cookies.py ADDED
@@ -0,0 +1,715 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from requests.cookies import RequestsCookieJar
2
+
3
+
4
+ COOKIES_LIST = [
5
+ {
6
+ "domain": ".youtube.com",
7
+ "expirationDate": 1718884961,
8
+ "hostOnly": False,
9
+ "httpOnly": False,
10
+ "name": "ST-xuwub9",
11
+ "path": "/",
12
+ "sameSite": None,
13
+ "secure": False,
14
+ "session": False,
15
+ "storeId": None,
16
+ "value": "session_logininfo=AFmmF2swRAIgf4gadACOuWOcipI1anW-dakEjtidNLkufnOC8uml7EECIDh2YisqWELDBJPTGUysCucJ3I0wjXxYjVHro1LHrdW0%3AQUQ3MjNmd2Jiajl3OWZYRnpFNnZlWWV5ZGJWZ0hpcmp4LVVPU280bk4zOS03Z0ozZG9fOFhWZ0dXaVo3NG1wTEg1b3hGaG10TFBlaFBnTlJfbER5bEp0aFhoNS1OLVhYNFRZT2F6ajgzOFpDbGhlUjZpMWRETlFFRjFfTTRiM0RnNTROSkdmMTFMVjFic1VuZ2trbGp4aktDa0JJUC1BWDh3",
17
+ },
18
+ {
19
+ "domain": ".youtube.com",
20
+ "expirationDate": 1753004444.745411,
21
+ "hostOnly": False,
22
+ "httpOnly": True,
23
+ "name": "__Secure-YEC",
24
+ "path": "/",
25
+ "sameSite": "lax",
26
+ "secure": True,
27
+ "session": False,
28
+ "storeId": None,
29
+ "value": "CgtRVnI5LW1zRHlQVSjbtNCzBjIhCgJGUhIbEhcSFRMLFBUWFwwYGRobHB0eHw4PIBAREiAk",
30
+ },
31
+ {
32
+ "domain": ".youtube.com",
33
+ "expirationDate": 1753434620.050824,
34
+ "hostOnly": False,
35
+ "httpOnly": True,
36
+ "name": "__Secure-3PSID",
37
+ "path": "/",
38
+ "sameSite": "no_restriction",
39
+ "secure": True,
40
+ "session": False,
41
+ "storeId": None,
42
+ "value": "g.a000kwibeLUu8Ea9Y-vLun7u3kU5VNJVuMAZl_jdfJaNm50JyDBB4ezJ_bdWu46a7YwObVn44wACgYKAakSARQSFQHGX2MicJcTzecTKH6bHzqU6TMbTxoVAUF8yKqQYK-MoI6Ql3vI2oYTB3E-0076",
43
+ },
44
+ {
45
+ "domain": ".youtube.com",
46
+ "expirationDate": 1750420959.974642,
47
+ "hostOnly": False,
48
+ "httpOnly": False,
49
+ "name": "SIDCC",
50
+ "path": "/",
51
+ "sameSite": None,
52
+ "secure": False,
53
+ "session": False,
54
+ "storeId": None,
55
+ "value": "AKEyXzWQZauHKOo8t87zoEcjaVNIYUX54ohoWXT-tX4aAhEuZzIIptxZAcNkHuG2oDXYL6t-lw",
56
+ },
57
+ {
58
+ "domain": ".youtube.com",
59
+ "expirationDate": 1753434620.050652,
60
+ "hostOnly": False,
61
+ "httpOnly": False,
62
+ "name": "SID",
63
+ "path": "/",
64
+ "sameSite": None,
65
+ "secure": False,
66
+ "session": False,
67
+ "storeId": None,
68
+ "value": "g.a000kwibeLUu8Ea9Y-vLun7u3kU5VNJVuMAZl_jdfJaNm50JyDBB6VHrZcC3gBAsFPbCQ0gF5AACgYKAYkSARQSFQHGX2Mi9kt0gHg5CxCYSkLQGHWaeBoVAUF8yKre_V6r3jZVak6JV4o2Q0FL0076",
69
+ },
70
+ {
71
+ "domain": ".youtube.com",
72
+ "expirationDate": 1750420958.397534,
73
+ "hostOnly": False,
74
+ "httpOnly": True,
75
+ "name": "__Secure-1PSIDTS",
76
+ "path": "/",
77
+ "sameSite": None,
78
+ "secure": True,
79
+ "session": False,
80
+ "storeId": None,
81
+ "value": "sidts-CjIB3EgAEkYL2L-GfrEzW5Dfy62S9oefGNLgst78S_986htCnGcfkxECch_9oz-qytSsZBAA",
82
+ },
83
+ {
84
+ "domain": ".youtube.com",
85
+ "expirationDate": 1753433494.44729,
86
+ "hostOnly": False,
87
+ "httpOnly": False,
88
+ "name": "_ga_M0180HEFCY",
89
+ "path": "/",
90
+ "sameSite": None,
91
+ "secure": False,
92
+ "session": False,
93
+ "storeId": None,
94
+ "value": "GS1.1.1718871908.1.0.1718873494.0.0.0",
95
+ },
96
+ {
97
+ "domain": ".youtube.com",
98
+ "expirationDate": 1753434620.050933,
99
+ "hostOnly": False,
100
+ "httpOnly": False,
101
+ "name": "SAPISID",
102
+ "path": "/",
103
+ "sameSite": None,
104
+ "secure": True,
105
+ "session": False,
106
+ "storeId": None,
107
+ "value": "mfeuiC-HraNJ-A03/ASXvCPNJSw7yTFgd6",
108
+ },
109
+ {
110
+ "domain": ".youtube.com",
111
+ "expirationDate": 1750420959.974764,
112
+ "hostOnly": False,
113
+ "httpOnly": True,
114
+ "name": "__Secure-1PSIDCC",
115
+ "path": "/",
116
+ "sameSite": None,
117
+ "secure": True,
118
+ "session": False,
119
+ "storeId": None,
120
+ "value": "AKEyXzWHDSoXGCZpZhPxRrnC7B1s8zGIUjeMVyvgtQfsm1fs92lXPtFEI_td9LBUyqVUe0xK",
121
+ },
122
+ {
123
+ "domain": ".youtube.com",
124
+ "expirationDate": 1753434620.050881,
125
+ "hostOnly": False,
126
+ "httpOnly": True,
127
+ "name": "SSID",
128
+ "path": "/",
129
+ "sameSite": None,
130
+ "secure": True,
131
+ "session": False,
132
+ "storeId": None,
133
+ "value": "AmlwXHnQvOQ10LVd-",
134
+ },
135
+ {
136
+ "domain": ".youtube.com",
137
+ "expirationDate": 1753434620.050959,
138
+ "hostOnly": False,
139
+ "httpOnly": False,
140
+ "name": "__Secure-1PAPISID",
141
+ "path": "/",
142
+ "sameSite": None,
143
+ "secure": True,
144
+ "session": False,
145
+ "storeId": None,
146
+ "value": "mfeuiC-HraNJ-A03/ASXvCPNJSw7yTFgd6",
147
+ },
148
+ {
149
+ "domain": ".youtube.com",
150
+ "expirationDate": 1753434620.050795,
151
+ "hostOnly": False,
152
+ "httpOnly": True,
153
+ "name": "__Secure-1PSID",
154
+ "path": "/",
155
+ "sameSite": None,
156
+ "secure": True,
157
+ "session": False,
158
+ "storeId": None,
159
+ "value": "g.a000kwibeLUu8Ea9Y-vLun7u3kU5VNJVuMAZl_jdfJaNm50JyDBBrlk7lRpKQGywAHEon7WGQAACgYKAQsSARQSFQHGX2MirAmnSRdZl6GPG6KLd4hOihoVAUF8yKoV17Tcj1a_OenIOkf2wBjO0076",
160
+ },
161
+ {
162
+ "domain": ".youtube.com",
163
+ "expirationDate": 1753434620.050993,
164
+ "hostOnly": False,
165
+ "httpOnly": False,
166
+ "name": "__Secure-3PAPISID",
167
+ "path": "/",
168
+ "sameSite": "no_restriction",
169
+ "secure": True,
170
+ "session": False,
171
+ "storeId": None,
172
+ "value": "mfeuiC-HraNJ-A03/ASXvCPNJSw7yTFgd6",
173
+ },
174
+ {
175
+ "domain": ".youtube.com",
176
+ "expirationDate": 1750420959.974815,
177
+ "hostOnly": False,
178
+ "httpOnly": True,
179
+ "name": "__Secure-3PSIDCC",
180
+ "path": "/",
181
+ "sameSite": "no_restriction",
182
+ "secure": True,
183
+ "session": False,
184
+ "storeId": None,
185
+ "value": "AKEyXzXM5UjKUEXwSHVmRAIo6hGHA4G63adj3EE1VdNriD0f38jZQbsUKiD4LQbA3BValmTFDg",
186
+ },
187
+ {
188
+ "domain": ".youtube.com",
189
+ "expirationDate": 1750420958.397647,
190
+ "hostOnly": False,
191
+ "httpOnly": True,
192
+ "name": "__Secure-3PSIDTS",
193
+ "path": "/",
194
+ "sameSite": "no_restriction",
195
+ "secure": True,
196
+ "session": False,
197
+ "storeId": None,
198
+ "value": "sidts-CjIB3EgAEkYL2L-GfrEzW5Dfy62S9oefGNLgst78S_986htCnGcfkxECch_9oz-qytSsZBAA",
199
+ },
200
+ {
201
+ "domain": ".youtube.com",
202
+ "expirationDate": 1753434620.050908,
203
+ "hostOnly": False,
204
+ "httpOnly": False,
205
+ "name": "APISID",
206
+ "path": "/",
207
+ "sameSite": None,
208
+ "secure": False,
209
+ "session": False,
210
+ "storeId": None,
211
+ "value": "IlQWLPjdNqziwCrV/ANG7Z4x5FF-IBxbZk",
212
+ },
213
+ {
214
+ "domain": ".youtube.com",
215
+ "expirationDate": 1753434620.050855,
216
+ "hostOnly": False,
217
+ "httpOnly": True,
218
+ "name": "HSID",
219
+ "path": "/",
220
+ "sameSite": None,
221
+ "secure": False,
222
+ "session": False,
223
+ "storeId": None,
224
+ "value": "AasA7hmRuTFv7vjoq",
225
+ },
226
+ {
227
+ "domain": ".youtube.com",
228
+ "expirationDate": 1753435873.577793,
229
+ "hostOnly": False,
230
+ "httpOnly": True,
231
+ "name": "LOGIN_INFO",
232
+ "path": "/",
233
+ "sameSite": "no_restriction",
234
+ "secure": True,
235
+ "session": False,
236
+ "storeId": None,
237
+ "value": "AFmmF2swRAIgf4gadACOuWOcipI1anW-dakEjtidNLkufnOC8uml7EECIDh2YisqWELDBJPTGUysCucJ3I0wjXxYjVHro1LHrdW0:QUQ3MjNmd2Jiajl3OWZYRnpFNnZlWWV5ZGJWZ0hpcmp4LVVPU280bk4zOS03Z0ozZG9fOFhWZ0dXaVo3NG1wTEg1b3hGaG10TFBlaFBnTlJfbER5bEp0aFhoNS1OLVhYNFRZT2F6ajgzOFpDbGhlUjZpMWRETlFFRjFfTTRiM0RnNTROSkdmMTFMVjFic1VuZ2trbGp4aktDa0JJUC1BWDh3",
238
+ },
239
+ {
240
+ "domain": ".youtube.com",
241
+ "expirationDate": 1753444956.555608,
242
+ "hostOnly": False,
243
+ "httpOnly": False,
244
+ "name": "PREF",
245
+ "path": "/",
246
+ "sameSite": None,
247
+ "secure": True,
248
+ "session": False,
249
+ "storeId": None,
250
+ "value": "f4=4000000&f6=40000000&tz=Europe.Paris&f5=30000&f7=100",
251
+ },
252
+ ]
253
+
254
+ COOKIES_LIST += [
255
+ {
256
+ "domain": ".www.researchgate.net",
257
+ "hostOnly": False,
258
+ "httpOnly": True,
259
+ "name": "isInstIp",
260
+ "path": "/",
261
+ "sameSite": None,
262
+ "secure": True,
263
+ "session": True,
264
+ "storeId": None,
265
+ "value": "False",
266
+ },
267
+ {
268
+ "domain": ".researchgate.net",
269
+ "expirationDate": 1734423981,
270
+ "hostOnly": False,
271
+ "httpOnly": False,
272
+ "name": "__eoi",
273
+ "path": "/",
274
+ "sameSite": None,
275
+ "secure": False,
276
+ "session": False,
277
+ "storeId": None,
278
+ "value": "ID=c26f752377373146:T=1718871981:RT=1718884914:S=AA-AfjZw-T_OOX2kW2LLaFzXImgc",
279
+ },
280
+ {
281
+ "domain": ".www.researchgate.net",
282
+ "expirationDate": 1753444909.646103,
283
+ "hostOnly": False,
284
+ "httpOnly": True,
285
+ "name": "ptc",
286
+ "path": "/",
287
+ "sameSite": None,
288
+ "secure": True,
289
+ "session": False,
290
+ "storeId": None,
291
+ "value": "RG1.8947708639250500550.1718872043",
292
+ },
293
+ {
294
+ "domain": ".researchgate.net",
295
+ "expirationDate": 1750507578,
296
+ "hostOnly": False,
297
+ "httpOnly": False,
298
+ "name": "euconsent-v2-didomi",
299
+ "path": "/",
300
+ "sameSite": "lax",
301
+ "secure": True,
302
+ "session": False,
303
+ "storeId": None,
304
+ "value": "CQAgmoAQAgmoAAHABBENA5EsAP_gAEPgAAYgJ2pB5G5UTWlBIG53YMskIAUFhFBoQEAgAACAAwIBSBIAIIwEAGAAIAgAICACAAIAIBIAIABAGAAAAAAAYIAAIAAIAAAQIAAKIAAAAAAAAgBQAAgIAgggEAAAgEBEABAAgAAAEIIAQNgACgAAACCAAAAAAAABAAAAAAAAQAAAAAAAYCQAAAJIAAAAACAIABAIAAAAAAAAAAAAAAAABBAAIJ2wPIAFAAXABQAFQALgAcAA8ACAAEgALwAZAA0ACIAEcAJgAUgAqgBcADEAGgAPQAfgBEACOAE4AMMAZYA0QBsgDkAHOAO4AfsBBwEIAItARwBHQC6gHUAO2Ae0A_4CHQEXgJ2AUOAo8BT4CpQFqALYAXmAwQBkgDLAGXANjAhCBG8CbAE3gJ1gTtAA.f_wACHwAAAAA",
305
+ },
306
+ {
307
+ "domain": ".researchgate.net",
308
+ "expirationDate": 1718885236,
309
+ "hostOnly": False,
310
+ "httpOnly": False,
311
+ "name": "_gat",
312
+ "path": "/",
313
+ "sameSite": None,
314
+ "secure": False,
315
+ "session": False,
316
+ "storeId": None,
317
+ "value": "1",
318
+ },
319
+ {
320
+ "domain": "www.researchgate.net",
321
+ "expirationDate": 1721477183,
322
+ "hostOnly": True,
323
+ "httpOnly": False,
324
+ "name": "_pbjs_userid_consent_data",
325
+ "path": "/",
326
+ "sameSite": "lax",
327
+ "secure": False,
328
+ "session": False,
329
+ "storeId": None,
330
+ "value": "3524755945110770",
331
+ },
332
+ {
333
+ "domain": ".researchgate.net",
334
+ "expirationDate": 1752567981,
335
+ "hostOnly": False,
336
+ "httpOnly": False,
337
+ "name": "__gads",
338
+ "path": "/",
339
+ "sameSite": None,
340
+ "secure": False,
341
+ "session": False,
342
+ "storeId": None,
343
+ "value": "ID=eca2adb88969c830:T=1718871981:RT=1718884914:S=ALNI_MY2qZchynrhWX6hWMlaI87Pcj9riQ",
344
+ },
345
+ {
346
+ "domain": ".researchgate.net",
347
+ "expirationDate": 1718886709.646173,
348
+ "hostOnly": False,
349
+ "httpOnly": True,
350
+ "name": "__cf_bm",
351
+ "path": "/",
352
+ "sameSite": "no_restriction",
353
+ "secure": True,
354
+ "session": False,
355
+ "storeId": None,
356
+ "value": "IkQ_J4ciBzKQduRvjqsfSmQu8UygDWbHeROO5JVccfo-1718884909-1.0.1.1-qvNGEdbfI0HfhFP6kwe7R7mkTqODNhFuKhs72lLly6K2BOPMG3kbahpQFGvPK0U8FUfkznkq65gngd1sWj7sDA",
357
+ },
358
+ {
359
+ "domain": ".researchgate.net",
360
+ "expirationDate": 1752567981,
361
+ "hostOnly": False,
362
+ "httpOnly": False,
363
+ "name": "__gpi",
364
+ "path": "/",
365
+ "sameSite": None,
366
+ "secure": False,
367
+ "session": False,
368
+ "storeId": None,
369
+ "value": "UID=00000e4e9aa2e6f2:T=1718871981:RT=1718884914:S=ALNI_MYFNrgzkKn7K6Bd2y8hC6GJCvDiSg",
370
+ },
371
+ {
372
+ "domain": ".researchgate.net",
373
+ "hostOnly": False,
374
+ "httpOnly": True,
375
+ "name": "_cfuvid",
376
+ "path": "/",
377
+ "sameSite": "no_restriction",
378
+ "secure": True,
379
+ "session": True,
380
+ "storeId": None,
381
+ "value": "_GPmGZkBymiH3UiqTqzakEpi98br3nfFUWC2_u_wqkc-1718884909785-0.0.1.1-604800000",
382
+ },
383
+ {
384
+ "domain": ".researchgate.net",
385
+ "expirationDate": 1753445177.271667,
386
+ "hostOnly": False,
387
+ "httpOnly": False,
388
+ "name": "_ga",
389
+ "path": "/",
390
+ "sameSite": None,
391
+ "secure": False,
392
+ "session": False,
393
+ "storeId": None,
394
+ "value": "GA1.1.1525244793.1718885177",
395
+ },
396
+ {
397
+ "domain": ".researchgate.net",
398
+ "expirationDate": 1753445177.271482,
399
+ "hostOnly": False,
400
+ "httpOnly": False,
401
+ "name": "_ga_4P31SJ70EJ",
402
+ "path": "/",
403
+ "sameSite": None,
404
+ "secure": False,
405
+ "session": False,
406
+ "storeId": None,
407
+ "value": "GS1.1.1718885177.1.0.1718885177.0.0.0",
408
+ },
409
+ {
410
+ "domain": ".researchgate.net",
411
+ "expirationDate": 1718971576,
412
+ "hostOnly": False,
413
+ "httpOnly": False,
414
+ "name": "_gid",
415
+ "path": "/",
416
+ "sameSite": None,
417
+ "secure": False,
418
+ "session": False,
419
+ "storeId": None,
420
+ "value": "GA1.2.854907463.1718885177",
421
+ },
422
+ {
423
+ "domain": ".www.researchgate.net",
424
+ "expirationDate": 1750407982.506505,
425
+ "hostOnly": False,
426
+ "httpOnly": True,
427
+ "name": "did",
428
+ "path": "/",
429
+ "sameSite": None,
430
+ "secure": True,
431
+ "session": False,
432
+ "storeId": None,
433
+ "value": "1dWLO3C6am8l667Q4VUlBo0O1LI49Qi2Vw21SJEXHavBDYT56DI9007W5rYGVFVH",
434
+ },
435
+ {
436
+ "domain": ".researchgate.net",
437
+ "expirationDate": 1750507578,
438
+ "hostOnly": False,
439
+ "httpOnly": False,
440
+ "name": "didomi_token",
441
+ "path": "/",
442
+ "sameSite": "lax",
443
+ "secure": True,
444
+ "session": False,
445
+ "storeId": None,
446
+ "value": "eyJ1c2VyX2lkIjoiMTkwMzU4YTUtNWU2My02Y2UzLWJlNzAtZGFjNzVmYjdiY2ExIiwiY3JlYXRlZCI6IjIwMjQtMDYtMjBUMTI6MDY6MTYuODA2WiIsInVwZGF0ZWQiOiIyMDI0LTA2LTIwVDEyOjA2OjE4Ljc4MVoiLCJ2ZW5kb3JzIjp7ImVuYWJsZWQiOlsidHdpdHRlciIsImdvb2dsZSIsImM6bGlua2VkaW4tbWFya2V0aW5nLXNvbHV0aW9ucyIsImM6b3duZXJpcSIsImM6b21uaXR1cmUtYWRvYmUtYW5hbHl0aWNzIiwiYzp0ZWNobm9yYXRpLW1lZGlhIiwiYzppbnRlcmNvbSIsImM6aW50ZW50LWlxIiwiYzppcHJvbSIsImM6bGlua2VkaW4iLCJjOmFtYXpvbmFkdi16Y1hGTEI2WCIsImM6bWVkaWFuZXQtY1V3YUtFNnoiLCJjOmluZGV4ZXhjaC1OWkNRTTY4UCIsImM6emVvdGFwZ21iLWQ3YndtdGp3IiwiYzp0cmlwbGVsaWYtZGRKSDM0clkiLCJjOnJ0YmhvdXNlLWI4Y2RIOHRNIiwiYzptZHByaW1pcy1lYU4yOVdjUCIsImM6bG9vcG1lbGktVGRhWXRCUHEiLCJjOm1hZ25pdGVpbi05d1RZTHFSRCIsImM6Ymlkc3dpdGNoLWQ2N0V3N1c5IiwiYzpvcmFjbGVhZHYtcUhlREptQUwiLCJjOmdvb2dsZWFuYS00VFhuSmlnUiIsImM6bG90YW1lc29sLURIaTdMUmpNIiwiYzpuZXh0bWlsbGUtR0pyZlg4VWMiLCJjOm5yaWNodGVjLXFVVlEyUlFxIiwiYzpicml0ZXBvb2wtQldWeVdHeVUiLCJjOnRhcGFkaW5jLXFxY2tVN1BXIiwiYzppZDV0ZWNobi16Tk1KNGR3ZiIsImM6bWljcm9zb2Z0IiwiYzpwZXJtdXRpdmUtSjdpaHJlTWsiLCJjOm9wZXJhc29mdC1CY1hjRFZKTSIsImM6cG9zdGhvZy1Cakp4RmRGOSJdfSwicHVycG9zZXMiOnsiZW5hYmxlZCI6WyJnZW9sb2NhdGlvbl9kYXRhIiwiZGV2aWNlX2NoYXJhY3RlcmlzdGljcyJdfSwidmVuZG9yc19saSI6eyJlbmFibGVkIjpbImdvb2dsZSIsImM6b3BlcmFzb2Z0LUJjWGNEVkpNIl19LCJ2ZXJzaW9uIjoyLCJhYyI6IkRIU0FvQUZrQWNnQTVnSHFnUUhBeGdCNndEMTRJR0FRTkFqMEJJd0NTY0VyQUtCd1YtZ3MxQmgwREc0R09nQUEuREhTQW9BRmtBY2dBNWdIcWdRSEF4Z0I2d0QxNElHQVFOQWowQkl3Q1NjRXJBS0J3Vi1nczFCaDBERzRHT2dBQSJ9",
447
+ },
448
+ {
449
+ "domain": ".www.researchgate.net",
450
+ "hostOnly": False,
451
+ "httpOnly": True,
452
+ "name": "hasPdpNext",
453
+ "path": "/",
454
+ "sameSite": None,
455
+ "secure": True,
456
+ "session": True,
457
+ "storeId": None,
458
+ "value": "False",
459
+ },
460
+ {
461
+ "domain": ".researchgate.net",
462
+ "expirationDate": 1750421183,
463
+ "hostOnly": False,
464
+ "httpOnly": False,
465
+ "name": "ph_phc_ma1XTQyee96N1GML6qUTgLQRiDifnRcE9STiHTZ0CfZ_posthog",
466
+ "path": "/",
467
+ "sameSite": "lax",
468
+ "secure": True,
469
+ "session": False,
470
+ "storeId": None,
471
+ "value": "%7B%22distinct_id%22%3A%220190358a-56a1-7313-83b0-d13dddeac787%22%2C%22%24sesid%22%3A%5B1718885183223%2C%220190358a-56a1-7313-83b0-d13b2b87778d%22%2C1718885176993%5D%2C%22%24session_is_sampled%22%3Atrue%7D",
472
+ },
473
+ {
474
+ "domain": ".www.researchgate.net",
475
+ "hostOnly": False,
476
+ "httpOnly": True,
477
+ "name": "sid",
478
+ "path": "/",
479
+ "sameSite": None,
480
+ "secure": True,
481
+ "session": True,
482
+ "storeId": None,
483
+ "value": "qmH5Lc4f0CUJ3zeaxORcV0S8I8V1MuCFZtcIQqPYtv1XPejrbSLAQRbT50PL40TqeKQ1XsQDWt9gtYVzuL80bRmPjw6jn3cQ0ikNqW40maHcQ3JL2Vfa8ZZf0j7p35eJ",
484
+ },
485
+ ]
486
+
487
+ COOKIES_LIST += [
488
+ {
489
+ "domain": "github.com",
490
+ "hostOnly": True,
491
+ "httpOnly": True,
492
+ "name": "_gh_sess",
493
+ "path": "/",
494
+ "sameSite": "lax",
495
+ "secure": True,
496
+ "session": True,
497
+ "storeId": None,
498
+ "value": "P%2Fmof1avuqwHaUQUIJR%2FZYn7jqbT7lgGuTGjp1BGAFIG5UpNDusEE3b8dRjz0eATE5xPdPjLYFqMs%2FI9AOalKX4YuYfSEEnxCMawU01099b4o9Xzzcv%2BmecrmO0Q8q%2Bdq1h8SIv6nvPP7HzlFesl8ysafb9b%2F0q6dTArKdSOurasza8UgLSYD08ofA50Pcm0IG7CTzF8ZCizrGgGTMi%2F%2B7L3E17jav5PM1Sf2vQKg15Gbg1QIOppJJHzlufgQoZigqFv%2BWznaws0Tt7Y2lSFCw%3D%3D--CJRhqMXJnwOaJgk4--DhUErlL4GdROikEjKD4O9g%3D%3D",
499
+ },
500
+ {
501
+ "domain": ".github.com",
502
+ "expirationDate": 1750408875.763785,
503
+ "hostOnly": False,
504
+ "httpOnly": False,
505
+ "name": "_octo",
506
+ "path": "/",
507
+ "sameSite": "lax",
508
+ "secure": True,
509
+ "session": False,
510
+ "storeId": None,
511
+ "value": "GH1.1.728652011.1718872875",
512
+ },
513
+ {
514
+ "domain": ".github.com",
515
+ "expirationDate": 1750408875.763926,
516
+ "hostOnly": False,
517
+ "httpOnly": True,
518
+ "name": "logged_in",
519
+ "path": "/",
520
+ "sameSite": "lax",
521
+ "secure": True,
522
+ "session": False,
523
+ "storeId": None,
524
+ "value": "no",
525
+ },
526
+ {
527
+ "domain": ".github.com",
528
+ "hostOnly": False,
529
+ "httpOnly": False,
530
+ "name": "preferred_color_mode",
531
+ "path": "/",
532
+ "sameSite": "lax",
533
+ "secure": True,
534
+ "session": True,
535
+ "storeId": None,
536
+ "value": "dark",
537
+ },
538
+ {
539
+ "domain": ".github.com",
540
+ "hostOnly": False,
541
+ "httpOnly": False,
542
+ "name": "tz",
543
+ "path": "/",
544
+ "sameSite": "lax",
545
+ "secure": True,
546
+ "session": True,
547
+ "storeId": None,
548
+ "value": "Europe%2FParis",
549
+ },
550
+ ]
551
+
552
+ COOKIES_LIST += [
553
+ {
554
+ "domain": ".web.archive.org",
555
+ "expirationDate": 1718886430,
556
+ "hostOnly": False,
557
+ "httpOnly": False,
558
+ "name": "_gat",
559
+ "path": "/web/20201123221659/http://orcid.org/",
560
+ "sameSite": None,
561
+ "secure": False,
562
+ "session": False,
563
+ "storeId": None,
564
+ "value": "1",
565
+ },
566
+ {
567
+ "domain": ".web.archive.org",
568
+ "expirationDate": 1718972770,
569
+ "hostOnly": False,
570
+ "httpOnly": False,
571
+ "name": "_gid",
572
+ "path": "/web/20201123221659/http://orcid.org/",
573
+ "sameSite": None,
574
+ "secure": False,
575
+ "session": False,
576
+ "storeId": None,
577
+ "value": "GA1.2.402246368.1606169825",
578
+ },
579
+ {
580
+ "domain": ".web.archive.org",
581
+ "expirationDate": 1753446370.315621,
582
+ "hostOnly": False,
583
+ "httpOnly": False,
584
+ "name": "_ga",
585
+ "path": "/web/20201123221659/http://orcid.org/",
586
+ "sameSite": None,
587
+ "secure": False,
588
+ "session": False,
589
+ "storeId": None,
590
+ "value": "GA1.2.1301409987.1606169825",
591
+ },
592
+ {
593
+ "domain": ".web.archive.org",
594
+ "expirationDate": 1750422367,
595
+ "hostOnly": False,
596
+ "httpOnly": False,
597
+ "name": "_hjid",
598
+ "path": "/web/20201123221659/http://orcid.org/",
599
+ "sameSite": "lax",
600
+ "secure": False,
601
+ "session": False,
602
+ "storeId": None,
603
+ "value": "07f80263-a631-4bf4-8ffd-8fc8912085e2",
604
+ },
605
+ {
606
+ "domain": ".web.archive.org",
607
+ "expirationDate": 1718888167,
608
+ "hostOnly": False,
609
+ "httpOnly": False,
610
+ "name": "_hjFirstSeen",
611
+ "path": "/web/20201123221659/http://orcid.org/",
612
+ "sameSite": "lax",
613
+ "secure": False,
614
+ "session": False,
615
+ "storeId": None,
616
+ "value": "1",
617
+ },
618
+ ]
619
+ COOKIES_LIST += [
620
+ {
621
+ "domain": "orcid.org",
622
+ "hostOnly": True,
623
+ "httpOnly": False,
624
+ "name": "AWSELBCORS",
625
+ "path": "/",
626
+ "sameSite": "no_restriction",
627
+ "secure": True,
628
+ "session": True,
629
+ "storeId": None,
630
+ "value": "CBD1D7FF1216388FA48838CBCA4774FD22800B8FB548A40EF92BB0994D5B77A8410307CDEAA69C52236663F2BF89B252C17BC0FCDF790FD59771BDDF6EA8CA4CFD29D8733F",
631
+ },
632
+ {
633
+ "domain": ".orcid.org",
634
+ "expirationDate": 1753452454.637671,
635
+ "hostOnly": False,
636
+ "httpOnly": False,
637
+ "name": "_ga_9R61FWK9H5",
638
+ "path": "/",
639
+ "sameSite": None,
640
+ "secure": False,
641
+ "session": False,
642
+ "storeId": None,
643
+ "value": "GS1.1.1718892454.1.0.1718892454.0.0.0",
644
+ },
645
+ {
646
+ "domain": ".orcid.org",
647
+ "expirationDate": 1753452454.63421,
648
+ "hostOnly": False,
649
+ "httpOnly": False,
650
+ "name": "_ga",
651
+ "path": "/",
652
+ "sameSite": None,
653
+ "secure": False,
654
+ "session": False,
655
+ "storeId": None,
656
+ "value": "GA1.1.2021310691.1718892455",
657
+ },
658
+ {
659
+ "domain": "orcid.org",
660
+ "hostOnly": True,
661
+ "httpOnly": False,
662
+ "name": "AWSELB",
663
+ "path": "/",
664
+ "sameSite": None,
665
+ "secure": False,
666
+ "session": True,
667
+ "storeId": None,
668
+ "value": "CBD1D7FF1216388FA48838CBCA4774FD22800B8FB548A40EF92BB0994D5B77A8410307CDEAA69C52236663F2BF89B252C17BC0FCDF790FD59771BDDF6EA8CA4CFD29D8733F",
669
+ },
670
+ {
671
+ "domain": ".orcid.org",
672
+ "expirationDate": 1750428454,
673
+ "hostOnly": False,
674
+ "httpOnly": False,
675
+ "name": "OptanonAlertBoxClosed",
676
+ "path": "/",
677
+ "sameSite": "lax",
678
+ "secure": False,
679
+ "session": False,
680
+ "storeId": None,
681
+ "value": "2024-06-20T14:07:34.583Z",
682
+ },
683
+ {
684
+ "domain": ".orcid.org",
685
+ "expirationDate": 1750428454,
686
+ "hostOnly": False,
687
+ "httpOnly": False,
688
+ "name": "OptanonConsent",
689
+ "path": "/",
690
+ "sameSite": "lax",
691
+ "secure": False,
692
+ "session": False,
693
+ "storeId": None,
694
+ "value": "isGpcEnabled=0&datestamp=Thu+Jun+20+2024+16%3A07%3A34+GMT%2B0200+(heure+d%E2%80%99%C3%A9t%C3%A9+d%E2%80%99Europe+centrale)&version=202310.2.0&browserGpcFlag=0&isIABGlobal=False&hosts=&landingPath=NotLandingPage&groups=C0001%3A1%2CC0003%3A1%2CC0002%3A1%2CC0004%3A1",
695
+ },
696
+ {
697
+ "domain": "orcid.org",
698
+ "hostOnly": True,
699
+ "httpOnly": False,
700
+ "name": "XSRF-TOKEN",
701
+ "path": "/",
702
+ "sameSite": None,
703
+ "secure": True,
704
+ "session": True,
705
+ "storeId": None,
706
+ "value": "6957be7a-bcb4-4d59-a522-ea9b6b210ed9",
707
+ },
708
+ ]
709
+
710
# Shared cookie jar used by the browsing tools, populated from COOKIES_LIST.
COOKIES = RequestsCookieJar()

# Only name/value/domain/path are transferred; the remaining attributes
# (secure, expiry, ...) are metadata from the browser export and are not
# needed by requests for cookie matching here.
for cookie in COOKIES_LIST:
    COOKIES.set(
        cookie["name"],
        cookie["value"],
        domain=cookie["domain"],
        path=cookie["path"],
    )
scripts/cvedb_tool.py ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from typing import Optional
3
+ from smolagents import Tool
4
+
5
class CVEDBTool(Tool):
    """
    Tool for searching vulnerabilities using Shodan's CVEDB API (free, no API key required).
    """

    name = "cvedb_search"
    description = """Tool for searching vulnerabilities using Shodan's CVEDB API (free, no API key required).

    REQUIRED PARAMETERS:
    - search_type: Must be 'cve' for specific CVE ID or 'product' for product name search
    - identifier: The CVE ID or product name to search for

    This tool allows searching for vulnerabilities in two ways:
    1. For a specific CVE: cvedb_search(search_type="cve", identifier="CVE-2021-44228")
    2. For a product: cvedb_search(search_type="product", identifier="microsoft")

    CRITICAL - Product name format for product search:
    - ALWAYS use ONLY the base product name, NEVER include versions
    - CORRECT examples: "log4j", "apache", "microsoft", "chrome", "tomcat", "nginx"
    - INCORRECT examples: "log4j 2.14.1", "apache http server", "microsoft windows", "chrome 120.0.6099.109"
    - When searching, strip version numbers and descriptive words
    - Example: "Apache Tomcat 9.0.65" → search for "tomcat"
    - Example: "Google Chrome 120.0.6099.109" → search for "chrome"
    - Example: "Log4j 2.14.1" → search for "log4j"

    CORRECT Usage examples:
    - cvedb_search(search_type="cve", identifier="CVE-2021-44228")
    - cvedb_search(search_type="product", identifier="microsoft")
    - cvedb_search(search_type="product", identifier="mobaxterm")

    INCORRECT Usage (will cause errors):
    - cvedb_search(identifier="microsoft") # Missing search_type
    - cvedb_search(search_type="product") # Missing identifier"""

    inputs = {
        "search_type": {
            "description": "Type of search to perform. Must be 'cve' for specific CVE ID or 'product' for product name search.",
            "type": "string",
        },
        "identifier": {
            "description": "The CVE ID (e.g., 'CVE-2021-44228') or product name (e.g., 'microsoft'). For products, use ONLY base product names without versions (e.g., 'microsoft' not 'microsoft windows').",
            "type": "string",
        },
    }

    output_type = "string"

    # Base endpoint of Shodan's free CVEDB service.
    API_BASE_URL = "https://cvedb.shodan.io"
    # Seconds before an API call is aborted; without a timeout a dead
    # endpoint would hang the agent run indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        super().__init__()

    def forward(self, search_type: str, identifier: str) -> str:
        """Search for vulnerabilities in CVEDB.

        Args:
            search_type: Either ``"cve"`` (exact CVE lookup) or ``"product"``
                (search by base product name).
            identifier: The CVE ID or base product name to look up.

        Returns:
            A human-readable report, or an error message string on failure
            (this tool never raises to the caller).
        """
        try:
            if search_type == "cve":
                return self._search_by_cve(identifier)
            elif search_type == "product":
                return self._search_by_product(identifier)
            else:
                return "Error: search_type must be 'cve' or 'product'"

        except Exception as e:
            return f"Error searching CVEDB: {str(e)}"

    def _search_by_cve(self, cve_id: str) -> str:
        """Fetch and format the CVEDB record for a single CVE ID.

        Args:
            cve_id: CVE identifier, e.g. ``"CVE-2021-44228"``.

        Returns:
            Formatted vulnerability details, or an error message string.
        """
        url = f"{self.API_BASE_URL}/cve/{cve_id}"

        try:
            # Bounded timeout so a slow/unreachable API cannot stall the agent.
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            data = response.json()

            if not data:
                return f"No vulnerabilities found for CVE {cve_id}"

            result = f"Vulnerability found for {cve_id}:\n\n"
            result += f"- CVE ID: {data.get('cve_id', 'Not available')}\n"
            result += f"- Summary: {data.get('summary', 'Not available')}\n"
            result += f"- CVSS Score: {data.get('cvss', 'Not available')}\n"
            result += f"- CVSS Version: {data.get('cvss_version', 'Not available')}\n"
            result += f"- CVSS V2: {data.get('cvss_v2', 'Not available')}\n"
            result += f"- CVSS V3: {data.get('cvss_v3', 'Not available')}\n"
            result += f"- EPSS Score: {data.get('epss', 'Not available')}\n"
            result += f"- EPSS Ranking: {data.get('ranking_epss', 'Not available')}\n"
            result += f"- Known as exploitable (KEV): {'Yes' if data.get('kev') else 'No'}\n"
            result += f"- Propose Action: {data.get('propose_action', 'Not available')}\n"
            result += f"- Ransomware Campaign: {data.get('ransomware_campaign', 'Not available')}\n"
            result += f"- Publication Date: {data.get('published_time', 'Not available')}\n"

            # Show references (capped at 10 to keep the report readable).
            references = data.get('references', [])
            if references:
                result += f"- References ({len(references)} total):\n"
                for i, ref in enumerate(references[:10], 1):  # Show first 10 references
                    result += f"  {i}. {ref}\n"
                if len(references) > 10:
                    result += f"  ... and {len(references) - 10} more references\n"
            else:
                result += "- References: None\n"

            # Show CPE count but not the full list
            cpes = data.get('cpes', [])
            if cpes:
                result += f"- Affected CPEs: {len(cpes)} CPEs found (list omitted due to length)\n"
            else:
                result += "- Affected CPEs: None\n"

            return result

        except requests.exceptions.RequestException as e:
            return f"Error accessing CVEDB API: {str(e)}"

    def _search_by_product(self, product: str) -> str:
        """Fetch and format all CVEDB records matching a product name.

        Args:
            product: Base product name (no version numbers), e.g. ``"log4j"``.

        Returns:
            Formatted list of vulnerabilities, or an error message string.
        """
        # Convert product name to lowercase for consistent search
        product = product.lower()

        url = f"{self.API_BASE_URL}/cves"
        params = {"product": product}

        try:
            # Bounded timeout so a slow/unreachable API cannot stall the agent.
            response = requests.get(url, params=params, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            data = response.json()

            if not data or not data.get('cves'):
                return f"No vulnerabilities found for the product {product}"

            result = f"Vulnerabilities found for {product}:\n\n"

            # Show all vulnerabilities
            for i, vuln in enumerate(data['cves']):
                result += f"**Vulnerability {i+1}:**\n"
                result += f"- CVE ID: {vuln.get('cve_id', vuln.get('id', 'Not available'))}\n"
                result += f"- Summary: {vuln.get('summary', 'Not available')}\n"
                result += f"- CVSS Score: {vuln.get('cvss', 'Not available')}\n"
                result += f"- CVSS Version: {vuln.get('cvss_version', 'Not available')}\n"
                result += f"- CVSS V2: {vuln.get('cvss_v2', 'Not available')}\n"
                result += f"- CVSS V3: {vuln.get('cvss_v3', 'Not available')}\n"
                result += f"- EPSS Score: {vuln.get('epss', 'Not available')}\n"
                result += f"- EPSS Ranking: {vuln.get('ranking_epss', 'Not available')}\n"
                result += f"- Known as exploitable (KEV): {'Yes' if vuln.get('kev') else 'No'}\n"
                result += f"- Propose Action: {vuln.get('propose_action', 'Not available')}\n"
                result += f"- Ransomware Campaign: {vuln.get('ransomware_campaign', 'Not available')}\n"
                result += f"- Publication Date: {vuln.get('published_time', 'Not available')}\n"

                # Show references count
                references = vuln.get('references', [])
                if references:
                    result += f"- References: {len(references)} references available\n"
                else:
                    result += "- References: None\n"

                # Show CPE count but not the full list
                cpes = vuln.get('cpes', [])
                if cpes:
                    result += f"- Affected CPEs: {len(cpes)} CPEs found (list omitted due to length)\n"
                else:
                    result += "- Affected CPEs: None\n"

                result += "\n"

            return result

        except requests.exceptions.RequestException as e:
            return f"Error accessing CVEDB API: {str(e)}"

    def _format_context(self, data: dict, search_type: str, identifier: str) -> str:
        """Format raw API data into a compact context summary.

        Args:
            data: Parsed JSON payload from the CVEDB API.
            search_type: ``"cve"`` or ``"product"`` (selects the layout).
            identifier: The CVE ID or product name that was queried.

        Returns:
            A short multi-line summary string.
        """
        context = []

        if search_type == "cve":
            if not data:
                return f"No vulnerabilities found for CVE {identifier}"

            # Use .get() everywhere: a record missing 'cvss'/'epss' must not
            # raise KeyError (matches the style of the report methods above).
            context.append(f"CVSS Score: {data.get('cvss', 'Not available')}")
            context.append(f"EPSS Score: {data.get('epss', 'Not available')}")
            context.append(f"Known as exploitable (KEV): {'Yes' if data.get('kev') else 'No'}")

            # Important dates
            if data.get('published_time'):
                context.append(f"Publication Date: {data['published_time']}")

            context.append(f"Summary: {data.get('summary', 'Not available')}")

            # Important references
            if data.get('references'):
                context.append("\nImportant references:")
                for ref in data['references'][:3]:  # Show first 3 references
                    context.append(f"- {ref}")

        elif search_type == "product":
            if not data or not data.get('cves'):
                return "No vulnerabilities found for this product."

            context.append(f"Found {len(data['cves'])} vulnerabilities for this product.")
            context.append("\nMost relevant vulnerabilities:")

            # Every entry is listed, so no "... and N more" trailer is needed
            # (the previous trailer contradicted the full listing above it).
            for i, vuln in enumerate(data['cves']):
                context.append(f"{i+1}. {vuln.get('id', 'Unknown CVE')} - CVSS: {vuln.get('cvss', 'N/A')}")

        return "\n".join(context)
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tool for querying the Exploit Prediction Scoring System (EPSS)."""
2
+
3
+ from smolagents import Tool
4
+ import requests
5
+ import json
6
+
7
class EpsTool(Tool):
    """
    Tool for querying the Exploit Prediction Scoring System (EPSS).
    """

    name = "epss_search"
    description = """Tool for querying the Exploit Prediction Scoring System (EPSS).

    This tool allows obtaining EPSS scores for specific CVEs.
    Usage: epss_search(cve_id="CVE-2022-26332", date="2022-03-05")

    Example: epss_search(cve_id="CVE-2022-26332", date="2022-03-05")

    The EPSS score indicates the probability that a vulnerability will be exploited in the next 30 days.
    - Scores range from 0.0 to 1.0
    - Higher scores indicate higher probability of exploitation
    - Interpretation:
      * 0.7+ (70%+): High probability of exploitation
      * 0.4-0.7 (40-70%): Medium probability of exploitation
      * 0.0-0.4 (0-40%): Low probability of exploitation"""

    inputs = {
        "cve_id": {
            "description": "The CVE ID to search for EPSS score (e.g., 'CVE-2022-26332').",
            "type": "string",
        },
        "date": {
            "description": "Optional date for the EPSS score (format: YYYY-MM-DD).",
            "type": "string",
            "nullable": True,
        },
    }

    output_type = "string"

    def __init__(self):
        super().__init__()
        # Public FIRST.org EPSS endpoint; no API key is required.
        self.base_url = "https://api.first.org/data/v1/epss"

    def forward(self, cve_id: str, date: str = None) -> str:
        """Search for EPSS score for a specific CVE.

        Args:
            cve_id: The CVE identifier, e.g. 'CVE-2022-26332'.
            date: Optional YYYY-MM-DD date for a historical score.

        Returns:
            A human-readable report, or an error message string on failure.
        """
        try:
            # Build the query parameters.
            params = {"cve": cve_id}
            if date:
                params["date"] = date

            # Let requests assemble and percent-encode the query string.
            # The original concatenated "k=v" pairs into the URL by hand,
            # performing no URL encoding at all.
            response = requests.get(self.base_url, params=params)
            response.raise_for_status()
            data = response.json()

            if not data.get('data'):
                return f"No EPSS data found for {cve_id}"

            epss_data = data['data'][0]

            result = f"EPSS Score for {cve_id}:\n\n"
            result += f"- CVE ID: {epss_data.get('cve', 'Not available')}\n"
            result += f"- EPSS Score: {epss_data.get('epss', 'Not available')}\n"
            result += f"- Percentile: {epss_data.get('percentile', 'Not available')}\n"
            result += f"- Date: {epss_data.get('date', 'Not available')}\n"

            # Add interpretation (thresholds mirror the tool description).
            epss_score = epss_data.get('epss')
            if epss_score is not None:
                try:
                    score = float(epss_score)
                    if score >= 0.7:
                        result += f"- Interpretation: High probability of exploitation ({score:.1%})\n"
                    elif score >= 0.4:
                        result += f"- Interpretation: Medium probability of exploitation ({score:.1%})\n"
                    else:
                        result += f"- Interpretation: Low probability of exploitation ({score:.1%})\n"
                except ValueError:
                    # Score was not numeric; skip the interpretation line.
                    pass

            return result

        except requests.exceptions.RequestException as e:
            return f"Error accessing EPSS API: {str(e)}"
        except Exception as e:
            return f"Error processing EPSS data: {str(e)}"

    def _format_context(self, data: dict, cve_id: str) -> str:
        """Format the EPSS data for context.

        Args:
            data: Raw JSON payload from the EPSS API.
            cve_id: The CVE identifier that was queried.

        Returns:
            A newline-joined summary, or a "not found" message.
        """
        if not data or not data.get('data'):
            return f"No EPSS data found for {cve_id}"

        epss_data = data['data'][0]
        context = []

        context.append(f"EPSS Score: {epss_data.get('epss', 'Not available')}")
        context.append(f"Percentile: {epss_data.get('percentile', 'Not available')}")
        context.append(f"Date: {epss_data.get('date', 'Not available')}")

        # Add interpretation
        epss_score = epss_data.get('epss')
        if epss_score is not None:
            try:
                score = float(epss_score)
                if score >= 0.7:
                    context.append(f"Risk Level: High ({score:.1%} probability of exploitation)")
                elif score >= 0.4:
                    context.append(f"Risk Level: Medium ({score:.1%} probability of exploitation)")
                else:
                    context.append(f"Risk Level: Low ({score:.1%} probability of exploitation)")
            except ValueError:
                pass

        return "\n".join(context)
scripts/gaia_scorer.py ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import string
3
+ import warnings
4
+
5
+
6
def normalize_number_str(number_str: str) -> float:
    """Strip common units and separators ($, %, ,) and parse as a float.

    Returns float('inf') as a sentinel when parsing fails, so a failed
    normalization can never accidentally equal a real ground-truth number.
    """
    cleaned = number_str.replace("$", "").replace("%", "").replace(",", "")
    try:
        return float(cleaned)
    except ValueError:
        # Matches the original diagnostic, which printed the cleaned string.
        print(f"String {cleaned} cannot be normalized to number str.")
        return float("inf")
16
+
17
+
18
def split_string(
    s: str,
    char_list: list[str] = None,
) -> list[str]:
    """Split *s* on any of the characters in *char_list* (default: ',' and ';').

    Fixes two issues in the original:
    - the default was a shared mutable list argument;
    - separator characters were interpolated unescaped into a regex
      character class, so metacharacters like ']' or '^' would corrupt it.
    """
    if char_list is None:
        char_list = [",", ";"]
    pattern = f"[{''.join(re.escape(c) for c in char_list)}]"
    return re.split(pattern, s)
24
+
25
+
26
def is_float(element: object) -> bool:
    """Return True if *element* can be converted to float, False otherwise.

    Also catches TypeError: the original only caught ValueError, so
    is_float(None) (or any non-string, non-numeric object) raised instead
    of returning False. The annotation `any` (the builtin function) is
    replaced with `object`.
    """
    try:
        float(element)
        return True
    except (ValueError, TypeError):
        return False
32
+
33
+
34
def question_scorer(
    model_answer: str,
    ground_truth: str,
) -> bool:
    """Score a model answer against the ground truth (GAIA-style).

    The comparison mode is chosen from the *ground truth*'s shape:
    - numeric: compare after numeric normalization;
    - comma/semicolon-separated list: element-wise comparison (numeric or
      normalized-string per element), requiring equal lengths;
    - plain string: compare after whitespace/punctuation/case normalization.
    """
    # Numeric ground truth: normalize the answer and compare as floats.
    if is_float(ground_truth):
        return normalize_number_str(str(model_answer)) == float(ground_truth)

    # List-like ground truth: split both sides on ',' / ';'.
    if any(sep in ground_truth for sep in [",", ";"]):
        gt_elems = split_string(ground_truth)
        ma_elems = split_string(model_answer)

        # A length mismatch is an automatic miss.
        if len(gt_elems) != len(ma_elems):
            warnings.warn("Answer lists have different lengths, returning False.", UserWarning)
            return False

        def _elems_match(ma_elem: str, gt_elem: str) -> bool:
            if is_float(gt_elem):
                return normalize_number_str(ma_elem) == float(gt_elem)
            # Keep punctuation: list elements may legitimately contain it.
            return normalize_str(ma_elem, remove_punct=False) == normalize_str(gt_elem, remove_punct=False)

        # Evaluate every pair (no short-circuit), as the original did.
        return all([_elems_match(ma, gt) for ma, gt in zip(ma_elems, gt_elems)])

    # Plain-string ground truth.
    return normalize_str(model_answer) == normalize_str(ground_truth)
71
+
72
+
73
def check_prediction_contains_answer_letters_in_order(prediction, true_answer):
    """Return True if every letter of *true_answer* appears in *prediction*
    in the same order (case-insensitive), allowing extra letters in between.

    Predictions more than 3x longer than the answer are rejected outright.

    Bug fix: the original advanced the scan position only TO the matched
    index (`i += prediction[i:].index(letter)`), never past it, so a single
    character could satisfy repeated answer letters — e.g. prediction "a"
    matched answer "aa". The cursor now moves one past each match.
    """
    prediction = prediction.lower()
    true_answer = true_answer.lower()
    if len(prediction) > len(true_answer) * 3:
        return False
    i = 0
    for letter in true_answer:
        idx = prediction.find(letter, i)
        if idx == -1:
            return False
        # Advance PAST the match so each answer letter consumes a
        # distinct prediction character.
        i = idx + 1
    return True
85
+
86
+
87
def check_close_call(prediction, true_answer, is_correct):
    """Return True when the answer is already correct, or when a string
    answer is a near-miss: its letters contain the truth's letters in
    order and its length is within [0.5x, 2x] of the truth's length.

    Numeric ground truths never receive close-call credit.
    """
    if is_correct:
        return True
    # Numbers must match exactly; no fuzzy credit.
    if is_float(true_answer):
        return is_correct
    pred_s = str(prediction)
    truth_s = str(true_answer)
    letters_ok = check_prediction_contains_answer_letters_in_order(pred_s, truth_s)
    length_ok = len(truth_s) * 0.5 <= len(pred_s) <= len(truth_s) * 2
    if letters_ok and length_ok:
        print(f"Close call: {prediction} vs {true_answer}")
        return True
    return False
102
+
103
+
104
def normalize_str(input_str, remove_punct=True) -> str:
    """
    Normalize a string by:
    - Removing all white spaces
    - Optionally removing punctuation (if remove_punct is True)
    - Converting to lowercase
    Parameters:
    - input_str: str, the string to normalize
    - remove_punct: bool, whether to remove punctuation (default: True)
    Returns:
    - str, the normalized string
    """
    # Collapse every whitespace character (e.g. "sea gull" -> "seagull"),
    # then lowercase.
    collapsed = re.sub(r"\s", "", input_str).lower()
    if not remove_punct:
        return collapsed
    # Strip ASCII punctuation in a single C-level pass.
    return collapsed.translate(str.maketrans("", "", string.punctuation))
scripts/kevin_tool.py ADDED
@@ -0,0 +1,228 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tool for searching known exploited vulnerabilities (KEV) using the KEVin API."""
2
+
3
+ from smolagents import Tool
4
+ import requests
5
+ import json
6
+
7
class KevinTool(Tool):
    """
    Tool for searching known exploited vulnerabilities (KEV) using the KEVin API.
    """

    name = "kevin_search"
    description = """Tool for searching known exploited vulnerabilities (KEV) using the KEVin API.

    This tool allows searching for known exploited vulnerabilities in multiple ways:
    - By CVE ID: kevin_search(search_type="cve", identifier="CVE-2021-44228")
    - By keyword: kevin_search(search_type="keyword", identifier="log4j")
    - Check if CVE is in KEV: kevin_search(search_type="exists", identifier="CVE-2023-22527")
    - Filter by ransomware: kevin_search(search_type="ransomware")

    CRITICAL - Product name format for keyword search:
    - ALWAYS use ONLY the base product name, NEVER include versions
    - CORRECT examples: "log4j", "apache", "microsoft", "chrome", "tomcat", "nginx"
    - INCORRECT examples: "log4j 2.14.1", "apache http server", "microsoft windows", "chrome 120.0.6099.109"
    - When searching, strip version numbers and descriptive words
    - Example: "Apache Tomcat 9.0.65" → search for "tomcat"
    - Example: "Google Chrome 120.0.6099.109" → search for "chrome"
    - Example: "Log4j 2.14.1" → search for "log4j"

    IMPORTANT: The "exists" search type checks if a CVE is listed in the Known Exploited Vulnerabilities (KEV) database.
    A CVE can exist in NVD or other databases but not be in KEV if it hasn't been actively exploited.

    Usage examples:
    - kevin_search(search_type="cve", identifier="CVE-2021-44228")
    - kevin_search(search_type="keyword", identifier="log4j")
    - kevin_search(search_type="exists", identifier="CVE-2023-22527")
    - kevin_search(search_type="ransomware")"""

    inputs = {
        "search_type": {
            "description": "Type of search to perform. Must be 'cve' for specific CVE ID, 'keyword' for keyword search, 'exists' to check if CVE is in KEV database, or 'ransomware' to filter ransomware vulnerabilities.",
            "type": "string",
        },
        "identifier": {
            "description": "The CVE ID (e.g., 'CVE-2021-44228') or keyword (e.g., 'log4j'). For keywords, use ONLY base product names without versions (e.g., 'log4j' not 'log4j 2.14.1'). Not required for 'ransomware' search type.",
            "type": "string",
            "nullable": True,
        },
    }

    output_type = "string"

    def __init__(self):
        super().__init__()
        # Community KEVin API endpoint that mirrors the CISA KEV catalog.
        self.base_url = "https://kevin.gtfkd.com"

    def forward(self, search_type: str, identifier: str = None) -> str:
        """Search for known exploited vulnerabilities.

        Dispatches to one of the private helpers based on search_type
        ('cve', 'keyword', 'exists', 'ransomware'). All modes except
        'ransomware' require an identifier; an error string is returned
        otherwise (the tool never raises to the caller).
        """
        try:
            if search_type == "cve":
                if not identifier:
                    return "Error: identifier is required for CVE search"
                return self._search_by_cve(identifier)
            elif search_type == "keyword":
                if not identifier:
                    return "Error: identifier is required for keyword search"
                return self._search_by_keyword(identifier)
            elif search_type == "exists":
                if not identifier:
                    return "Error: identifier is required for exists check"
                return self._check_exists(identifier)
            elif search_type == "ransomware":
                return self._search_ransomware()
            else:
                return "Error: search_type must be 'cve', 'keyword', 'exists', or 'ransomware'"

        except Exception as e:
            # Catch-all so the agent always receives a string, never a traceback.
            return f"Error searching KEVin: {str(e)}"

    def _check_exists(self, cve_id: str) -> str:
        """Check if a CVE is listed in the Known Exploited Vulnerabilities (KEV) database.

        Returns a sentence (with a ✅/❌ marker) stating whether the CVE is in KEV.
        The JSON response's truthiness is used as the in/out signal.
        """
        url = f"{self.base_url}/kev/exists"
        params = {"cve": cve_id}

        try:
            response = requests.get(url, params=params)
            response.raise_for_status()
            data = response.json()

            if data:
                return f"✅ CVE {cve_id} IS listed in the Known Exploited Vulnerabilities (KEV) database - this means it has been actively exploited in the wild."
            else:
                return f"❌ CVE {cve_id} is NOT listed in the Known Exploited Vulnerabilities (KEV) database - this means it has not been reported as actively exploited (though it may exist in other vulnerability databases)."

        except requests.exceptions.RequestException as e:
            return f"Error checking KEVin existence: {str(e)}"

    def _search_ransomware(self) -> str:
        """Search for vulnerabilities known to be used in ransomware.

        Queries /kev with filter=ransomware and renders every returned entry
        as a multi-line block separated by dashes.
        """
        url = f"{self.base_url}/kev"
        params = {"filter": "ransomware"}

        try:
            response = requests.get(url, params=params)
            response.raise_for_status()
            data = response.json()

            if not data:
                return "No ransomware-related vulnerabilities found in KEV database."

            context = []
            context.append(f"Found {len(data)} ransomware-related vulnerabilities in KEV database.")
            context.append("\nRansomware vulnerabilities:")

            # Show all found vulnerabilities
            for i, vuln in enumerate(data):
                context.append(f"\n--- Vulnerability {i+1} ---")
                context.append(f"CVE ID: {vuln.get('cveID', 'Not available')}")
                context.append(f"Vendor: {vuln.get('vendorProject', 'Not available')}")
                context.append(f"Product: {vuln.get('product', 'Not available')}")
                context.append(f"Vulnerability Name: {vuln.get('vulnerabilityName', 'Not available')}")

                # Important dates
                context.append(f"Date Added: {vuln.get('dateAdded', 'Not available')}")
                if vuln.get('dueDate'):
                    context.append(f"Due Date: {vuln['dueDate']}")

                context.append(f"Short Description: {vuln.get('shortDescription', 'Not available')}")
                context.append(f"Required Action: {vuln.get('requiredAction', 'Not available')}")

                if vuln.get('notes'):
                    context.append(f"Notes: {vuln['notes']}")

                context.append("-" * 50)  # Separator between vulnerabilities

            return "\n".join(context)

        except requests.exceptions.RequestException as e:
            return f"Error accessing KEVin API: {str(e)}"

    def _search_by_cve(self, cve_id: str) -> str:
        """Search for a specific CVE in KEVin.

        Fetches /kev/<cve_id> and renders the single KEV record as a
        bullet list; returns a "not found" message for empty payloads.
        """
        url = f"{self.base_url}/kev/{cve_id}"

        try:
            response = requests.get(url)
            response.raise_for_status()
            data = response.json()

            if not data:
                return f"No vulnerabilities found for CVE {cve_id}"

            result = f"Known exploited vulnerability found for {cve_id}:\n\n"
            result += f"- CVE ID: {data.get('cveID', 'Not available')}\n"
            result += f"- Vendor: {data.get('vendorProject', 'Not available')}\n"
            result += f"- Product: {data.get('product', 'Not available')}\n"
            result += f"- Vulnerability Name: {data.get('vulnerabilityName', 'Not available')}\n"
            result += f"- Date Added: {data.get('dateAdded', 'Not available')}\n"
            result += f"- Short Description: {data.get('shortDescription', 'Not available')}\n"
            result += f"- Required Action: {data.get('requiredAction', 'Not available')}\n"
            result += f"- Due Date: {data.get('dueDate', 'Not available')}\n"
            result += f"- Notes: {data.get('notes', 'Not available')}\n"

            return result

        except requests.exceptions.RequestException as e:
            return f"Error accessing KEVin API: {str(e)}"

    def _search_by_keyword(self, keyword: str) -> str:
        """Search for vulnerabilities by keyword.

        Queries /kev?search=<keyword> and lists every hit as
        "N. <cveID> - <product>" (one line per vulnerability).
        """
        url = f"{self.base_url}/kev"
        params = {"search": keyword}

        try:
            response = requests.get(url, params=params)
            response.raise_for_status()
            data = response.json()

            if not data:
                return f"No vulnerabilities found for keyword '{keyword}'."

            context = []
            context.append(f"Found {len(data)} vulnerabilities for keyword '{keyword}'.")
            context.append("\nVulnerabilities found:")

            # Show all vulnerabilities
            for i, vuln in enumerate(data):
                context.append(f"{i+1}. {vuln.get('cveID', 'Unknown CVE')} - {vuln.get('product', 'Unknown Product')}")

            return "\n".join(context)

        except requests.exceptions.RequestException as e:
            return f"Error accessing KEVin API: {str(e)}"

    def _format_context(self, data: dict, search_type: str, identifier: str) -> str:
        """Format the data for context.

        Renders a KEV record ('cve' mode) or a hit list ('keyword' mode,
        capped at 5 entries). NOTE(review): this helper is not called from
        any method visible in this file — possibly unused; verify callers.
        """
        context = []

        if search_type == "cve":
            if not data:
                return f"No vulnerabilities found for CVE {identifier}"

            context.append(f"CVE ID: {data.get('cveID', 'Not available')}")
            context.append(f"Vendor: {data.get('vendorProject', 'Not available')}")
            context.append(f"Product: {data.get('product', 'Not available')}")
            context.append(f"Vulnerability Name: {data.get('vulnerabilityName', 'Not available')}")
            context.append(f"Date Added: {data.get('dateAdded', 'Not available')}")
            context.append(f"Short Description: {data.get('shortDescription', 'Not available')}")
            context.append(f"Required Action: {data.get('requiredAction', 'Not available')}")

            if data.get('dueDate'):
                context.append(f"Due Date: {data['dueDate']}")

            if data.get('notes'):
                context.append(f"Notes: {data['notes']}")

        elif search_type == "keyword":
            if not data:
                return "No vulnerabilities found for this search."

            context.append(f"Found {len(data)} vulnerabilities.")
            context.append("\nMost relevant vulnerabilities:")

            # Show the 5 most relevant vulnerabilities
            for i, vuln in enumerate(data[:5]):
                context.append(f"{i+1}. {vuln.get('cveID', 'Unknown CVE')} - {vuln.get('product', 'Unknown Product')}")

        return "\n".join(context)
scripts/mdconvert.py ADDED
@@ -0,0 +1,949 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This is copied from Magentic-one's great repo: https://github.com/microsoft/autogen/blob/v0.4.4/python/packages/autogen-magentic-one/src/autogen_magentic_one/markdown_browser/mdconvert.py
2
+ # Thanks to Microsoft researchers for open-sourcing this!
3
+ # type: ignore
4
+ import base64
5
+ import copy
6
+ import html
7
+ import json
8
+ import mimetypes
9
+ import os
10
+ import re
11
+ import shutil
12
+ import subprocess
13
+ import sys
14
+ import tempfile
15
+ import traceback
16
+ from typing import Any, Dict, List, Optional, Union
17
+ from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
18
+
19
+ import mammoth
20
+ import markdownify
21
+ import pandas as pd
22
+ import pdfminer
23
+ import pdfminer.high_level
24
+ import pptx
25
+
26
+ # File-format detection
27
+ import puremagic
28
+ import pydub
29
+ import requests
30
+ import speech_recognition as sr
31
+ from bs4 import BeautifulSoup
32
+ from youtube_transcript_api import YouTubeTranscriptApi
33
+ from youtube_transcript_api.formatters import SRTFormatter
34
+
35
+
36
class _CustomMarkdownify(markdownify.MarkdownConverter):
    """
    A custom version of markdownify's MarkdownConverter. Changes include:

    - Altering the default heading style to use '#', '##', etc.
    - Removing javascript hyperlinks.
    - Truncating images with large data:uri sources.
    - Ensuring URIs are properly escaped, and do not conflict with Markdown syntax
    """

    def __init__(self, **options: Any):
        # Default to ATX headings ('#', '##', ...) unless the caller overrides.
        options["heading_style"] = options.get("heading_style", markdownify.ATX)
        # Explicitly cast options to the expected type if necessary
        super().__init__(**options)

    def convert_hn(self, n: int, el: Any, text: str, convert_as_inline: bool) -> str:
        """Same as usual, but be sure to start with a new line"""
        if not convert_as_inline:
            # Prepend a newline only when the heading does not already begin with one.
            if not re.search(r"^\n", text):
                return "\n" + super().convert_hn(n, el, text, convert_as_inline)  # type: ignore

        return super().convert_hn(n, el, text, convert_as_inline)  # type: ignore

    def convert_a(self, el: Any, text: str, convert_as_inline: bool):
        """Same as usual converter, but removes Javascript links and escapes URIs."""
        prefix, suffix, text = markdownify.chomp(text)  # type: ignore
        if not text:
            return ""
        href = el.get("href")
        title = el.get("title")

        # Escape URIs and skip non-http or file schemes
        if href:
            try:
                parsed_url = urlparse(href)  # type: ignore
                # Drop the link (keep just the text) for schemes like javascript:.
                if parsed_url.scheme and parsed_url.scheme.lower() not in ["http", "https", "file"]:  # type: ignore
                    return "%s%s%s" % (prefix, text, suffix)
                # Percent-encode the path so it cannot break Markdown link syntax.
                href = urlunparse(parsed_url._replace(path=quote(unquote(parsed_url.path))))  # type: ignore
            except ValueError:  # It's not clear if this ever gets thrown
                return "%s%s%s" % (prefix, text, suffix)

        # For the replacement see #29: text nodes underscores are escaped
        if (
            self.options["autolinks"]
            and text.replace(r"\_", "_") == href
            and not title
            and not self.options["default_title"]
        ):
            # Shortcut syntax
            return "<%s>" % href
        if self.options["default_title"] and not title:
            title = href
        title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
        return "%s[%s](%s%s)%s" % (prefix, text, href, title_part, suffix) if href else text

    def convert_img(self, el: Any, text: str, convert_as_inline: bool) -> str:
        """Same as usual converter, but removes data URIs"""

        alt = el.attrs.get("alt", None) or ""
        src = el.attrs.get("src", None) or ""
        title = el.attrs.get("title", None) or ""
        title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
        if convert_as_inline and el.parent.name not in self.options["keep_inline_images_in"]:
            return alt

        # Remove dataURIs: keep only the "data:<mime>" prefix plus an ellipsis.
        if src.startswith("data:"):
            src = src.split(",")[0] + "..."

        return "![%s](%s%s)" % (alt, src, title_part)

    def convert_soup(self, soup: Any) -> str:
        # Thin passthrough; exists so callers can invoke conversion on a soup object.
        return super().convert_soup(soup)  # type: ignore
109
+
110
+
111
class DocumentConverterResult:
    """Container for a document-to-text conversion result.

    Holds an optional document title and the extracted text content.
    """

    def __init__(self, title: Union[str, None] = None, text_content: str = ""):
        # Title may legitimately be absent (None); text defaults to empty.
        self.title: Union[str, None] = title
        self.text_content: str = text_content
117
+
118
+
119
class DocumentConverter:
    """Abstract superclass of all DocumentConverters.

    Concrete subclasses override convert(); by convention in this module
    they return None when the file is not their format, and a
    DocumentConverterResult on success.
    """

    def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]:
        raise NotImplementedError()
124
+
125
+
126
class PlainTextConverter(DocumentConverter):
    """Anything with content type text/plain"""

    def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]:
        # Guess a MIME type from whatever file extension was supplied;
        # the placeholder name only exists to carry the extension.
        extension = kwargs.get("file_extension", "")
        content_type, _ = mimetypes.guess_type("__placeholder" + extension)

        # Bail only when no type could be guessed at all. (A stricter
        # "text/ only" check exists upstream but is deliberately disabled.)
        if content_type is None:
            return None

        with open(local_path, "rt", encoding="utf-8") as fh:
            text_content = fh.read()

        return DocumentConverterResult(
            title=None,
            text_content=text_content,
        )
146
+
147
+
148
class HtmlConverter(DocumentConverter):
    """Anything with content type text/html"""

    def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]:
        # Bail if not html
        extension = kwargs.get("file_extension", "")
        if extension.lower() not in [".html", ".htm"]:
            return None

        result = None
        with open(local_path, "rt", encoding="utf-8") as fh:
            result = self._convert(fh.read())

        return result

    def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]:
        """Helper function that converts an HTML string."""

        # Parse the string
        soup = BeautifulSoup(html_content, "html.parser")

        # Remove javascript and style blocks
        for script in soup(["script", "style"]):
            script.extract()

        # Print only the main content
        body_elm = soup.find("body")
        webpage_text = ""
        if body_elm:
            webpage_text = _CustomMarkdownify().convert_soup(body_elm)
        else:
            # No <body> element: fall back to converting the whole document.
            webpage_text = _CustomMarkdownify().convert_soup(soup)

        assert isinstance(webpage_text, str)

        return DocumentConverterResult(
            title=None if soup.title is None else soup.title.string, text_content=webpage_text
        )
186
+
187
+
188
class WikipediaConverter(DocumentConverter):
    """Handle Wikipedia pages separately, focusing only on the main document content."""

    def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]:
        # Bail if not Wikipedia
        extension = kwargs.get("file_extension", "")
        if extension.lower() not in [".html", ".htm"]:
            return None
        url = kwargs.get("url", "")
        # Only handle URLs of the form https://<lang>.wikipedia.org/...
        if not re.search(r"^https?:\/\/[a-zA-Z]{2,3}\.wikipedia.org\/", url):
            return None

        # Parse the file
        soup = None
        with open(local_path, "rt", encoding="utf-8") as fh:
            soup = BeautifulSoup(fh.read(), "html.parser")

        # Remove javascript and style blocks
        for script in soup(["script", "style"]):
            script.extract()

        # Print only the main content
        body_elm = soup.find("div", {"id": "mw-content-text"})
        title_elm = soup.find("span", {"class": "mw-page-title-main"})

        webpage_text = ""
        main_title = None if soup.title is None else soup.title.string

        if body_elm:
            # What's the title
            if title_elm and len(title_elm) > 0:
                main_title = title_elm.string  # type: ignore
                assert isinstance(main_title, str)

            # Convert the page
            webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify().convert_soup(body_elm)
        else:
            # Main-content div not found: fall back to converting the whole page.
            webpage_text = _CustomMarkdownify().convert_soup(soup)

        return DocumentConverterResult(
            title=main_title,
            text_content=webpage_text,
        )
231
+
232
+
233
class YouTubeConverter(DocumentConverter):
    """Handle YouTube specially, focusing on the video title, description, and transcript."""

    def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]:
        # Bail if not YouTube
        extension = kwargs.get("file_extension", "")
        if extension.lower() not in [".html", ".htm"]:
            return None
        url = kwargs.get("url", "")
        if not url.startswith("https://www.youtube.com/watch?"):
            return None

        # Parse the file
        soup = None
        with open(local_path, "rt", encoding="utf-8") as fh:
            soup = BeautifulSoup(fh.read(), "html.parser")

        # Read the meta tags
        assert soup.title is not None and soup.title.string is not None
        metadata: Dict[str, str] = {"title": soup.title.string}
        for meta in soup(["meta"]):
            for a in meta.attrs:
                if a in ["itemprop", "property", "name"]:
                    metadata[meta[a]] = meta.get("content", "")
                    break

        # We can also try to read the full description. This is more prone to breaking, since it reaches into the page implementation
        try:
            for script in soup(["script"]):
                content = script.text
                if "ytInitialData" in content:
                    # The JSON object is expected on the first line of the script.
                    lines = re.split(r"\r?\n", content)
                    obj_start = lines[0].find("{")
                    obj_end = lines[0].rfind("}")
                    if obj_start >= 0 and obj_end >= 0:
                        data = json.loads(lines[0][obj_start : obj_end + 1])
                        attrdesc = self._findKey(data, "attributedDescriptionBodyText")  # type: ignore
                        if attrdesc:
                            metadata["description"] = str(attrdesc["content"])
                    break
        except Exception:
            # Best-effort only; the page layout changes often.
            pass

        # Start preparing the page
        webpage_text = "# YouTube\n"

        title = self._get(metadata, ["title", "og:title", "name"])  # type: ignore
        assert isinstance(title, str)

        if title:
            webpage_text += f"\n## {title}\n"

        stats = ""
        views = self._get(metadata, ["interactionCount"])  # type: ignore
        if views:
            stats += f"- **Views:** {views}\n"

        keywords = self._get(metadata, ["keywords"])  # type: ignore
        if keywords:
            stats += f"- **Keywords:** {keywords}\n"

        runtime = self._get(metadata, ["duration"])  # type: ignore
        if runtime:
            stats += f"- **Runtime:** {runtime}\n"

        if len(stats) > 0:
            webpage_text += f"\n### Video Metadata\n{stats}\n"

        description = self._get(metadata, ["description", "og:description"])  # type: ignore
        if description:
            webpage_text += f"\n### Description\n{description}\n"

        transcript_text = ""
        parsed_url = urlparse(url)  # type: ignore
        params = parse_qs(parsed_url.query)  # type: ignore
        if "v" in params:
            assert isinstance(params["v"][0], str)
            video_id = str(params["v"][0])
            try:
                # Must be a single transcript.
                transcript = YouTubeTranscriptApi.get_transcript(video_id)  # type: ignore
                # transcript_text = " ".join([part["text"] for part in transcript])  # type: ignore
                # Alternative formatting:
                transcript_text = SRTFormatter().format_transcript(transcript)
            except Exception:
                # Transcript may be disabled or unavailable; skip silently.
                pass
        if transcript_text:
            webpage_text += f"\n### Transcript\n{transcript_text}\n"

        title = title if title else soup.title.string
        assert isinstance(title, str)

        return DocumentConverterResult(
            title=title,
            text_content=webpage_text,
        )

    def _get(self, metadata: Dict[str, str], keys: List[str], default: Union[str, None] = None) -> Union[str, None]:
        """Return the first metadata value whose key appears in *keys*, else *default*."""
        for k in keys:
            if k in metadata:
                return metadata[k]
        return default

    def _findKey(self, json: Any, key: str) -> Union[str, None]:  # TODO: Fix json type
        """Depth-first search of nested lists/dicts for *key*; returns its value or None.

        NOTE(review): the parameter name `json` shadows the stdlib json module
        inside this method (the module is not used here, so behavior is unaffected).
        """
        if isinstance(json, list):
            for elm in json:
                ret = self._findKey(elm, key)
                if ret is not None:
                    return ret
        elif isinstance(json, dict):
            for k in json:
                if k == key:
                    return json[k]
                else:
                    ret = self._findKey(json[k], key)
                    if ret is not None:
                        return ret
        return None
351
+
352
+
353
class PdfConverter(DocumentConverter):
    """
    Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
    """

    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
        """Extract plain text from a PDF; returns None for non-PDF extensions."""
        # Only handle files explicitly tagged as PDF
        if kwargs.get("file_extension", "").lower() != ".pdf":
            return None

        extracted_text = pdfminer.high_level.extract_text(local_path)
        return DocumentConverterResult(title=None, text_content=extracted_text)
368
+
369
+
370
class DocxConverter(HtmlConverter):
    """
    Converts DOCX files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
    """

    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
        """Convert a DOCX file to Markdown via mammoth's HTML output; None for other extensions."""
        # Only handle files explicitly tagged as DOCX
        if kwargs.get("file_extension", "").lower() != ".docx":
            return None

        # mammoth produces HTML, which the inherited HTML converter turns into Markdown
        with open(local_path, "rb") as docx_file:
            html_content = mammoth.convert_to_html(docx_file).value
            converted = self._convert(html_content)

        return converted
388
+
389
+
390
class XlsxConverter(HtmlConverter):
    """
    Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
    """

    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
        """Render every sheet of a workbook as a '## <sheet>' section with a Markdown table."""
        # Only handle spreadsheet extensions
        if kwargs.get("file_extension", "").lower() not in [".xlsx", ".xls"]:
            return None

        # sheet_name=None loads all sheets as an ordered {name: DataFrame} mapping
        sheets = pd.read_excel(local_path, sheet_name=None)
        sections = []
        for sheet_name, frame in sheets.items():
            table_md = self._convert(frame.to_html(index=False)).text_content.strip()
            sections.append(f"## {sheet_name}\n{table_md}\n\n")

        return DocumentConverterResult(
            title=None,
            text_content="".join(sections).strip(),
        )
412
+
413
+
414
class PptxConverter(HtmlConverter):
    """
    Converts PPTX files to Markdown. Supports heading, tables and images with alt text.

    Each slide is emitted under an HTML comment marker (`<!-- Slide number: N -->`).
    The slide title becomes a level-1 heading, tables are rendered via the inherited
    HTML-to-Markdown pass, and pictures become Markdown image links that reference a
    placeholder filename (the image bytes themselves are not extracted).
    """

    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
        """Convert a .pptx presentation to Markdown; returns None for other extensions."""
        # Bail if not a PPTX
        extension = kwargs.get("file_extension", "")
        if extension.lower() != ".pptx":
            return None

        md_content = ""

        presentation = pptx.Presentation(local_path)
        slide_num = 0
        for slide in presentation.slides:
            slide_num += 1

            # Slide boundary marker: an HTML comment survives Markdown rendering unseen
            md_content += f"\n\n<!-- Slide number: {slide_num} -->\n"

            title = slide.shapes.title
            for shape in slide.shapes:
                # Pictures
                if self._is_picture(shape):
                    # Reaching into private python-pptx internals to get alt text; see
                    # https://github.com/scanny/python-pptx/pull/512#issuecomment-1713100069
                    alt_text = ""
                    try:
                        alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")
                    except Exception:
                        pass

                    # A placeholder name (non-word chars stripped; image is referenced, not extracted)
                    filename = re.sub(r"\W", "", shape.name) + ".jpg"
                    md_content += "\n![" + (alt_text if alt_text else shape.name) + "](" + filename + ")\n"

                # Tables: build an HTML table (first row as header) and reuse the HTML converter
                if self._is_table(shape):
                    html_table = "<html><body><table>"
                    first_row = True
                    for row in shape.table.rows:
                        html_table += "<tr>"
                        for cell in row.cells:
                            if first_row:
                                html_table += "<th>" + html.escape(cell.text) + "</th>"
                            else:
                                html_table += "<td>" + html.escape(cell.text) + "</td>"
                        html_table += "</tr>"
                        first_row = False
                    html_table += "</table></body></html>"
                    md_content += "\n" + self._convert(html_table).text_content.strip() + "\n"

                # Text areas; NOTE the elif chains to the table check, not the picture check,
                # so a picture shape with a text frame contributes no text
                elif shape.has_text_frame:
                    if shape == title:
                        md_content += "# " + shape.text.lstrip() + "\n"
                    else:
                        md_content += shape.text + "\n"

            md_content = md_content.strip()

            # Speaker notes are appended after the slide body under a fixed heading
            if slide.has_notes_slide:
                md_content += "\n\n### Notes:\n"
                notes_frame = slide.notes_slide.notes_text_frame
                if notes_frame is not None:
                    md_content += notes_frame.text
                md_content = md_content.strip()

        return DocumentConverterResult(
            title=None,
            text_content=md_content.strip(),
        )

    def _is_picture(self, shape):
        """True for picture shapes, and for placeholder shapes that carry an image."""
        if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE:
            return True
        if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PLACEHOLDER:
            if hasattr(shape, "image"):
                return True
        return False

    def _is_table(self, shape):
        """True when the shape is a PowerPoint table."""
        if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.TABLE:
            return True
        return False
498
+
499
+
500
class MediaConverter(DocumentConverter):
    """
    Abstract class for multi-modal media (e.g., images and audio)
    """

    def _get_metadata(self, local_path):
        """Return a metadata dict extracted via `exiftool -json`, or None when unavailable."""
        exiftool_path = shutil.which("exiftool")
        if exiftool_path is None:
            # Best-effort: no exiftool on PATH means no metadata
            return None
        try:
            completed = subprocess.run([exiftool_path, "-json", local_path], capture_output=True, text=True)
            return json.loads(completed.stdout)[0]
        except Exception:
            # Any parse or subprocess failure degrades to "no metadata"
            return None
515
+
516
+
517
class WavConverter(MediaConverter):
    """
    Converts WAV files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
    """

    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
        """Convert a .wav file to Markdown (metadata fields + transcript); None for other extensions."""
        # Bail if not a WAV
        if kwargs.get("file_extension", "").lower() != ".wav":
            return None

        pieces = []

        # Add metadata
        metadata = self._get_metadata(local_path)
        if metadata:
            interesting_fields = (
                "Title",
                "Artist",
                "Author",
                "Band",
                "Album",
                "Genre",
                "Track",
                "DateTimeOriginal",
                "CreateDate",
                "Duration",
            )
            pieces.extend(f"{field}: {metadata[field]}\n" for field in interesting_fields if field in metadata)

        # Transcribe; transcription failure degrades to an inline error note
        try:
            transcript = self._transcribe_audio(local_path)
            pieces.append("\n\n### Audio Transcript:\n" + (transcript if transcript else "[No speech detected]"))
        except Exception:
            pieces.append("\n\n### Audio Transcript:\nError. Could not transcribe this audio.")

        return DocumentConverterResult(
            title=None,
            text_content="".join(pieces).strip(),
        )

    def _transcribe_audio(self, local_path) -> str:
        """Transcribe speech in a WAV file using Google's recognizer."""
        recognizer = sr.Recognizer()
        with sr.AudioFile(local_path) as audio_source:
            recording = recognizer.record(audio_source)
        return recognizer.recognize_google(recording).strip()
565
+
566
+
567
class Mp3Converter(WavConverter):
    """
    Converts MP3 files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` AND `pydub` are installed).
    """

    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
        """Convert an .mp3 file to Markdown: metadata fields plus a speech transcript.

        Returns None when the file is not tagged as .mp3. Transcription is best-effort:
        the MP3 is re-encoded to a temporary WAV and fed to WavConverter's transcriber;
        any failure is reported inline instead of raising.
        """
        # Bail if not a MP3
        extension = kwargs.get("file_extension", "")
        if extension.lower() != ".mp3":
            return None

        md_content = ""

        # Add metadata
        metadata = self._get_metadata(local_path)
        if metadata:
            for f in [
                "Title",
                "Artist",
                "Author",
                "Band",
                "Album",
                "Genre",
                "Track",
                "DateTimeOriginal",
                "CreateDate",
                "Duration",
            ]:
                if f in metadata:
                    md_content += f"{f}: {metadata[f]}\n"

        # Transcribe: re-encode to WAV in a temp file, then reuse the WAV transcriber.
        # (Removed a dead `_args` dict that was built from kwargs but never used.)
        handle, temp_path = tempfile.mkstemp(suffix=".wav")
        os.close(handle)  # pydub reopens the path itself; keep only the filename
        try:
            sound = pydub.AudioSegment.from_mp3(local_path)
            sound.export(temp_path, format="wav")

            try:
                transcript = super()._transcribe_audio(temp_path).strip()
                md_content += "\n\n### Audio Transcript:\n" + (
                    "[No speech detected]" if transcript == "" else transcript
                )
            except Exception:
                md_content += "\n\n### Audio Transcript:\nError. Could not transcribe this audio."

        finally:
            # Always remove the temporary WAV, even when conversion/transcription fails
            os.unlink(temp_path)

        # Return the result
        return DocumentConverterResult(
            title=None,
            text_content=md_content.strip(),
        )
625
+
626
+
627
class ImageConverter(MediaConverter):
    """
    Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an mlm_client is configured).
    """

    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
        """Convert a JPG/JPEG/PNG image to Markdown; None for other extensions."""
        # Bail if not a supported image type
        extension = kwargs.get("file_extension", "")
        if extension.lower() not in [".jpg", ".jpeg", ".png"]:
            return None

        md_content = ""

        # Add metadata
        metadata = self._get_metadata(local_path)
        if metadata:
            for field in (
                "ImageSize",
                "Title",
                "Caption",
                "Description",
                "Keywords",
                "Artist",
                "Author",
                "DateTimeOriginal",
                "CreateDate",
                "GPSPosition",
            ):
                if field in metadata:
                    md_content += f"{field}: {metadata[field]}\n"

        # Try describing the image with a multimodal LLM, when one is configured
        mlm_client = kwargs.get("mlm_client")
        mlm_model = kwargs.get("mlm_model")
        if mlm_client is not None and mlm_model is not None:
            description = self._get_mlm_description(
                local_path, extension, mlm_client, mlm_model, prompt=kwargs.get("mlm_prompt")
            ).strip()
            md_content += "\n# Description:\n" + description + "\n"

        return DocumentConverterResult(
            title=None,
            text_content=md_content,
        )

    def _get_mlm_description(self, local_path, extension, client, model, prompt=None):
        """Ask the multimodal LLM for a caption of the image, sent as a base64 data URI."""
        if prompt is None or prompt.strip() == "":
            prompt = "Write a detailed caption for this image."

        sys.stderr.write(f"MLM Prompt:\n{prompt}\n")

        with open(local_path, "rb") as image_file:
            # Guess the MIME type from the extension; fall back to JPEG
            content_type, _ = mimetypes.guess_type("_dummy" + extension)
            if content_type is None:
                content_type = "image/jpeg"
            encoded = base64.b64encode(image_file.read()).decode("utf-8")
        data_uri = f"data:{content_type};base64,{encoded}"

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": data_uri,
                        },
                    },
                ],
            }
        ]

        response = client.chat.completions.create(model=model, messages=messages)
        return response.choices[0].message.content
706
+
707
+
708
class FileConversionException(Exception):
    """Raised when a recognized file type could not be converted to Markdown.

    Derives from Exception rather than BaseException so that generic
    `except Exception` boundaries can catch it; subclassing BaseException
    lets the error escape ordinary handlers (PEP 8 guidance).
    """

    pass
710
+
711
+
712
class UnsupportedFormatException(Exception):
    """Raised when no registered converter supports the file's format.

    Derives from Exception rather than BaseException so that generic
    `except Exception` boundaries can catch it; subclassing BaseException
    lets the error escape ordinary handlers (PEP 8 guidance).
    """

    pass
714
+
715
+
716
class MarkdownConverter:
    """(In preview) An extremely simple text-based document reader, suitable for LLM use.
    This reader will convert common file-types or webpages to Markdown."""

    def __init__(
        self,
        requests_session: Optional[requests.Session] = None,
        mlm_client: Optional[Any] = None,
        mlm_model: Optional[Any] = None,
    ):
        """
        Args:
            requests_session: Session used for URL fetches; a fresh one is created if None.
            mlm_client: Optional multimodal LLM client, forwarded to image-captioning converters.
            mlm_model: Model identifier to use with mlm_client.
        """
        if requests_session is None:
            self._requests_session = requests.Session()
        else:
            self._requests_session = requests_session

        self._mlm_client = mlm_client
        self._mlm_model = mlm_model

        self._page_converters: List[DocumentConverter] = []

        # Register converters for successful browsing operations
        # Later registrations are tried first / take higher priority than earlier registrations
        # To this end, the most specific converters should appear below the most generic converters
        self.register_page_converter(PlainTextConverter())
        self.register_page_converter(HtmlConverter())
        self.register_page_converter(WikipediaConverter())
        self.register_page_converter(YouTubeConverter())
        self.register_page_converter(DocxConverter())
        self.register_page_converter(XlsxConverter())
        self.register_page_converter(PptxConverter())
        self.register_page_converter(WavConverter())
        self.register_page_converter(Mp3Converter())
        self.register_page_converter(ImageConverter())
        self.register_page_converter(PdfConverter())

    def convert(
        self, source: Union[str, requests.Response], **kwargs: Any
    ) -> DocumentConverterResult:  # TODO: deal with kwargs
        """
        Args:
            - source: can be a string representing a path or url, or a requests.response object
            - extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.)
        """

        # Local path or url
        if isinstance(source, str):
            if source.startswith("http://") or source.startswith("https://") or source.startswith("file://"):
                return self.convert_url(source, **kwargs)
            else:
                return self.convert_local(source, **kwargs)
        # Request response
        elif isinstance(source, requests.Response):
            return self.convert_response(source, **kwargs)
        # NOTE(review): any other source type falls through and implicitly returns None,
        # despite the declared return type — confirm callers handle this.

    def convert_local(self, path: str, **kwargs: Any) -> DocumentConverterResult:  # TODO: deal with kwargs
        """Convert a local file, inferring candidate extensions from the path and file magic."""
        # Prepare a list of extensions to try (in order of priority)
        ext = kwargs.get("file_extension")
        extensions = [ext] if ext is not None else []

        # Get extension alternatives from the path and puremagic
        base, ext = os.path.splitext(path)
        self._append_ext(extensions, ext)
        self._append_ext(extensions, self._guess_ext_magic(path))

        # Convert
        return self._convert(path, extensions, **kwargs)

    # TODO what should stream's type be?
    def convert_stream(self, stream: Any, **kwargs: Any) -> DocumentConverterResult:  # TODO: deal with kwargs
        """Convert a readable stream by spooling it to a temporary file first."""
        # Prepare a list of extensions to try (in order of priority)
        ext = kwargs.get("file_extension")
        extensions = [ext] if ext is not None else []

        # Save the file locally to a temporary file. It will be deleted before this method exits
        handle, temp_path = tempfile.mkstemp()
        fh = os.fdopen(handle, "wb")
        result = None
        try:
            # Write to the temporary file; text streams are encoded as UTF-8
            content = stream.read()
            if isinstance(content, str):
                fh.write(content.encode("utf-8"))
            else:
                fh.write(content)
            fh.close()

            # Use puremagic to check for more extension options
            self._append_ext(extensions, self._guess_ext_magic(temp_path))

            # Convert
            result = self._convert(temp_path, extensions, **kwargs)
        # Clean up
        finally:
            try:
                fh.close()
            except Exception:
                pass
            os.unlink(temp_path)

        return result

    def convert_url(self, url: str, **kwargs: Any) -> DocumentConverterResult:  # TODO: fix kwargs type
        """Fetch a URL (with a browser-like User-Agent) and convert the response."""
        # Send a HTTP request to the URL
        user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
        # NOTE(review): no timeout is set here; an unresponsive server can block
        # indefinitely — confirm whether callers rely on unbounded waits before adding one.
        response = self._requests_session.get(url, stream=True, headers={"User-Agent": user_agent})
        response.raise_for_status()
        return self.convert_response(response, **kwargs)

    def convert_response(
        self, response: requests.Response, **kwargs: Any
    ) -> DocumentConverterResult:  # TODO fix kwargs type
        """Download a response body to a temp file and convert it, guessing extensions
        from the MIME type, Content-Disposition filename, URL path, and file magic."""
        # Prepare a list of extensions to try (in order of priority)
        ext = kwargs.get("file_extension")
        extensions = [ext] if ext is not None else []

        # Guess from the mimetype
        content_type = response.headers.get("content-type", "").split(";")[0]
        self._append_ext(extensions, mimetypes.guess_extension(content_type))

        # Read the content disposition if there is one
        content_disposition = response.headers.get("content-disposition", "")
        m = re.search(r"filename=([^;]+)", content_disposition)
        if m:
            base, ext = os.path.splitext(m.group(1).strip("\"'"))
            self._append_ext(extensions, ext)

        # Read from the extension from the path
        base, ext = os.path.splitext(urlparse(response.url).path)
        self._append_ext(extensions, ext)

        # Save the file locally to a temporary file. It will be deleted before this method exits
        handle, temp_path = tempfile.mkstemp()
        fh = os.fdopen(handle, "wb")
        result = None
        try:
            # Download the file
            for chunk in response.iter_content(chunk_size=512):
                fh.write(chunk)
            fh.close()

            # Use puremagic to check for more extension options
            self._append_ext(extensions, self._guess_ext_magic(temp_path))

            # Convert
            result = self._convert(temp_path, extensions, url=response.url)
        except Exception as e:
            # NOTE(review): conversion errors are swallowed here and None is returned,
            # despite the declared return type — confirm callers handle a None result.
            print(f"Error in converting: {e}")

        # Clean up
        finally:
            try:
                fh.close()
            except Exception:
                pass
            os.unlink(temp_path)

        return result

    def _convert(self, local_path: str, extensions: List[Union[str, None]], **kwargs) -> DocumentConverterResult:
        """Try every registered converter against every candidate extension; return the
        first successful result, or raise FileConversionException / UnsupportedFormatException."""
        error_trace = ""
        for ext in extensions + [None]:  # Try last with no extension
            for converter in self._page_converters:
                _kwargs = copy.deepcopy(kwargs)

                # Overwrite file_extension appropriately
                if ext is None:
                    if "file_extension" in _kwargs:
                        del _kwargs["file_extension"]
                else:
                    _kwargs.update({"file_extension": ext})

                # Copy any additional global options
                if "mlm_client" not in _kwargs and self._mlm_client is not None:
                    _kwargs["mlm_client"] = self._mlm_client

                if "mlm_model" not in _kwargs and self._mlm_model is not None:
                    _kwargs["mlm_model"] = self._mlm_model

                # If we hit an error log it and keep trying.
                # BUGFIX: `res` must be reset every iteration — previously it was unbound
                # (NameError) if the very first converter raised, and held a stale value
                # from an earlier converter when a later one raised.
                res = None
                try:
                    res = converter.convert(local_path, **_kwargs)
                except Exception:
                    error_trace = ("\n\n" + traceback.format_exc()).strip()

                if res is not None:
                    # Normalize the content: strip trailing whitespace, collapse blank runs
                    res.text_content = "\n".join([line.rstrip() for line in re.split(r"\r?\n", res.text_content)])
                    res.text_content = re.sub(r"\n{3,}", "\n\n", res.text_content)

                    return res

        # If we got this far without success, report any exceptions
        if len(error_trace) > 0:
            raise FileConversionException(
                f"Could not convert '{local_path}' to Markdown. File type was recognized as {extensions}. While converting the file, the following error was encountered:\n\n{error_trace}"
            )

        # Nothing can handle it!
        raise UnsupportedFormatException(
            f"Could not convert '{local_path}' to Markdown. The formats {extensions} are not supported."
        )

    def _append_ext(self, extensions, ext):
        """Append a non-None, non-empty extension to a list of extensions.

        Duplicates are intentionally allowed (the uniqueness check was disabled upstream).
        """
        if ext is None:
            return
        ext = ext.strip()
        if ext == "":
            return
        extensions.append(ext)

    def _guess_ext_magic(self, path):
        """Use puremagic (a Python implementation of libmagic) to guess a file's extension based on the first few bytes."""
        # Use puremagic to guess; file-access errors degrade to "no guess"
        try:
            guesses = puremagic.magic_file(path)
            if len(guesses) > 0:
                ext = guesses[0].extension.strip()
                if len(ext) > 0:
                    return ext
        except FileNotFoundError:
            pass
        except IsADirectoryError:
            pass
        except PermissionError:
            pass
        return None

    def register_page_converter(self, converter: DocumentConverter) -> None:
        """Register a page text converter (prepended, so it takes priority over earlier ones)."""
        self._page_converters.insert(0, converter)
scripts/nvd_tool.py ADDED
@@ -0,0 +1,418 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tool for querying the National Vulnerability Database (NVD) of NIST.
2
+ This tool allows obtaining detailed information about specific vulnerabilities.
3
+ """
4
+
5
+ from smolagents import Tool
6
+ import requests
7
+ import json
8
+ from urllib.parse import quote
9
+
10
+ class NvdTool(Tool):
11
+ """
12
+ Tool for querying the National Vulnerability Database (NVD) of NIST.
13
+ """
14
+
15
+ name = "nvd_search"
16
+ description = """Tool for querying the National Vulnerability Database (NVD) of NIST.
17
+ This tool allows searching for vulnerabilities in three ways:
18
+ 1. By CVE ID: nvd_search(search_type="cve", identifier="CVE-2021-44228")
19
+ 2. By keyword: nvd_search(search_type="keyword", identifier="log4j")
20
+ 3. By keyword with exact match: nvd_search(search_type="keyword", identifier="log4j", exact_match=True)
21
+
22
+ CRITICAL - Product name format for keyword search:
23
+ - ALWAYS use ONLY the base product name, NEVER include versions
24
+ - CORRECT examples: "log4j", "apache", "microsoft", "chrome", "tomcat", "nginx"
25
+ - INCORRECT examples: "log4j 2.14.1", "apache http server", "microsoft windows", "chrome 120.0.6099.109"
26
+ - When searching, strip version numbers and descriptive words
27
+ - Example: "Apache Tomcat 9.0.65" → search for "tomcat"
28
+ - Example: "Google Chrome 120.0.6099.109" → search for "chrome"
29
+ - Example: "Log4j 2.14.1" → search for "log4j"
30
+
31
+ IMPORTANT: search_type must be EXACTLY 'cve' or 'keyword', not 'product' or any other value.
32
+ CORRECT: nvd_search(search_type="keyword", identifier="mobaxterm")
33
+ WRONG: nvd_search(search_type="product", identifier="mobaxterm")
34
+
35
+ The exact_match parameter:
36
+ - When True: Searches for CVEs that contain the EXACT phrase in the description
37
+ - When False: Searches for CVEs that contain ANY of the words in the description
38
+ """
39
+
40
+ inputs = {
41
+ "search_type": {
42
+ "description": "Type of search to perform. Must be 'cve' for specific CVE ID or 'keyword' for keyword search.",
43
+ "type": "string",
44
+ },
45
+ "identifier": {
46
+ "description": "The CVE ID (e.g., 'CVE-2021-44228') or keyword (e.g., 'log4j'). For keywords, use ONLY base product names without versions (e.g., 'log4j' not 'log4j 2.14.1').",
47
+ "type": "string",
48
+ },
49
+ "exact_match": {
50
+ "description": "For keyword searches, whether to match the exact phrase (True) or any word (False).",
51
+ "type": "boolean",
52
+ "default": False,
53
+ "nullable": True,
54
+ },
55
+ }
56
+
57
+ output_type = "string"
58
+
59
    def __init__(self):
        """Initialize the tool and set the NVD CVE API 2.0 base endpoint."""
        super().__init__()
        # NVD REST API v2.0 endpoint for CVE queries; query parameters are appended per search
        self.base_url = "https://services.nvd.nist.gov/rest/json/cves/2.0"
62
+
63
+ def forward(self, search_type: str, identifier: str, exact_match: bool = False) -> str:
64
+ """Search for vulnerabilities in NVD."""
65
+ try:
66
+ if search_type == "cve":
67
+ return self._search_by_cve(identifier)
68
+ elif search_type == "keyword":
69
+ return self._search_by_keyword(identifier, exact_match)
70
+ else:
71
+ return "Error: search_type must be 'cve' or 'keyword'"
72
+
73
+ except Exception as e:
74
+ return f"Error searching NVD: {str(e)}"
75
+
76
+ def _search_by_cve(self, cve_id: str) -> str:
77
+ """Search for a specific CVE."""
78
+ url = f"{self.base_url}?cveId={cve_id}"
79
+
80
+ try:
81
+ response = requests.get(url)
82
+ response.raise_for_status()
83
+ data = response.json()
84
+
85
+ if not data.get('vulnerabilities'):
86
+ return f"No vulnerabilities found for {cve_id}"
87
+
88
+ vuln_data = data['vulnerabilities'][0]['cve']
89
+ return self._format_cve_data(vuln_data)
90
+
91
+ except requests.exceptions.RequestException as e:
92
+ return f"Error accessing NVD API: {str(e)}"
93
+
94
+ def _search_by_keyword(self, keyword: str, exact_match: bool = False) -> str:
95
+ """Search for vulnerabilities by keyword."""
96
+ # Convert keyword to lowercase and encode spaces properly
97
+ keyword = keyword.lower()
98
+ encoded_keyword = quote(keyword)
99
+
100
+ # Build the URL based on exact_match parameter
101
+ if exact_match:
102
+ # For exact match, use keywordExactMatch parameter
103
+ url = f"{self.base_url}?keywordSearch={encoded_keyword}&keywordExactMatch"
104
+ else:
105
+ # For partial match, just use keywordSearch
106
+ url = f"{self.base_url}?keywordSearch={encoded_keyword}"
107
+
108
+ try:
109
+ response = requests.get(url)
110
+ response.raise_for_status()
111
+ data = response.json()
112
+
113
+ if not data.get('vulnerabilities'):
114
+ return f"No vulnerabilities found for '{keyword}'"
115
+
116
+ vulnerabilities = data['vulnerabilities']
117
+ context = []
118
+ context.append(f"Found {len(vulnerabilities)} vulnerabilities for search '{keyword}'")
119
+ context.append("\nVulnerabilities found:")
120
+
121
+ # Show all vulnerabilities
122
+ for i, vuln in enumerate(vulnerabilities):
123
+ vuln_data = vuln['cve']
124
+ context.append(f"\n--- Vulnerability {i+1} ---")
125
+ context.append(f"CVE ID: {vuln_data.get('id', 'Not available')}")
126
+ context.append(f"Source Identifier: {vuln_data.get('sourceIdentifier', 'Not available')}")
127
+ context.append(f"Vulnerability Status: {vuln_data.get('vulnStatus', 'Not available')}")
128
+
129
+ # CVE Tags
130
+ cve_tags = vuln_data.get('cveTags', [])
131
+ if cve_tags:
132
+ context.append(f"CVE Tags: {', '.join(cve_tags)}")
133
+ else:
134
+ context.append("CVE Tags: None")
135
+
136
+ # Description - show all languages
137
+ descriptions = vuln_data.get('descriptions', [])
138
+ if descriptions:
139
+ for desc in descriptions:
140
+ lang = desc.get('lang', 'unknown')
141
+ value = desc.get('value', 'Not available')
142
+ context.append(f"Description ({lang}): {value}")
143
+ else:
144
+ context.append("Description: Not available")
145
+
146
+ # CVSS Metrics - show all available versions
147
+ metrics = vuln_data.get('metrics', {})
148
+ if metrics:
149
+ context.append("\nCVSS Metrics:")
150
+
151
+ # CVSS V3.1
152
+ if 'cvssMetricV31' in metrics:
153
+ cvss_data = metrics['cvssMetricV31'][0]
154
+ cvss_info = cvss_data['cvssData']
155
+ context.append(f"CVSS V3.1:")
156
+ context.append(f" Base Score: {cvss_info['baseScore']}")
157
+ context.append(f" Base Severity: {cvss_info['baseSeverity']}")
158
+ context.append(f" Vector String: {cvss_info['vectorString']}")
159
+ context.append(f" Attack Vector: {cvss_info['attackVector']}")
160
+ context.append(f" Attack Complexity: {cvss_info['attackComplexity']}")
161
+ context.append(f" Privileges Required: {cvss_info['privilegesRequired']}")
162
+ context.append(f" User Interaction: {cvss_info['userInteraction']}")
163
+ context.append(f" Scope: {cvss_info['scope']}")
164
+ context.append(f" Confidentiality Impact: {cvss_info['confidentialityImpact']}")
165
+ context.append(f" Integrity Impact: {cvss_info['integrityImpact']}")
166
+ context.append(f" Availability Impact: {cvss_info['availabilityImpact']}")
167
+ context.append(f" Exploitability Score: {cvss_data['exploitabilityScore']}")
168
+ context.append(f" Impact Score: {cvss_data['impactScore']}")
169
+
170
+ # CVSS V3.0
171
+ if 'cvssMetricV30' in metrics:
172
+ cvss_data = metrics['cvssMetricV30'][0]
173
+ cvss_info = cvss_data['cvssData']
174
+ context.append(f"CVSS V3.0:")
175
+ context.append(f" Base Score: {cvss_info['baseScore']}")
176
+ context.append(f" Base Severity: {cvss_info['baseSeverity']}")
177
+ context.append(f" Vector String: {cvss_info['vectorString']}")
178
+ context.append(f" Attack Vector: {cvss_info['attackVector']}")
179
+ context.append(f" Attack Complexity: {cvss_info['attackComplexity']}")
180
+ context.append(f" Privileges Required: {cvss_info['privilegesRequired']}")
181
+ context.append(f" User Interaction: {cvss_info['userInteraction']}")
182
+ context.append(f" Scope: {cvss_info['scope']}")
183
+ context.append(f" Confidentiality Impact: {cvss_info['confidentialityImpact']}")
184
+ context.append(f" Integrity Impact: {cvss_info['integrityImpact']}")
185
+ context.append(f" Availability Impact: {cvss_info['availabilityImpact']}")
186
+ context.append(f" Exploitability Score: {cvss_data['exploitabilityScore']}")
187
+ context.append(f" Impact Score: {cvss_data['impactScore']}")
188
+
189
+ # CVSS V2
190
+ if 'cvssMetricV2' in metrics:
191
+ cvss_data = metrics['cvssMetricV2'][0]
192
+ cvss_info = cvss_data['cvssData']
193
+ context.append(f"CVSS V2:")
194
+ context.append(f" Base Score: {cvss_info['baseScore']}")
195
+ context.append(f" Base Severity: {cvss_data['baseSeverity']}")
196
+ context.append(f" Vector String: {cvss_info['vectorString']}")
197
+ context.append(f" Access Vector: {cvss_info['accessVector']}")
198
+ context.append(f" Access Complexity: {cvss_info['accessComplexity']}")
199
+ context.append(f" Authentication: {cvss_info['authentication']}")
200
+ context.append(f" Confidentiality Impact: {cvss_info['confidentialityImpact']}")
201
+ context.append(f" Integrity Impact: {cvss_info['integrityImpact']}")
202
+ context.append(f" Availability Impact: {cvss_info['availabilityImpact']}")
203
+ context.append(f" Exploitability Score: {cvss_data['exploitabilityScore']}")
204
+ context.append(f" Impact Score: {cvss_data['impactScore']}")
205
+ else:
206
+ context.append("CVSS Metrics: Not available")
207
+
208
+ # Weaknesses - show all CWEs
209
+ weaknesses = vuln_data.get('weaknesses', [])
210
+ if weaknesses:
211
+ context.append("\nWeaknesses:")
212
+ for weakness in weaknesses:
213
+ source = weakness.get('source', 'Unknown')
214
+ w_type = weakness.get('type', 'Unknown')
215
+ context.append(f" Source: {source}, Type: {w_type}")
216
+ descriptions = weakness.get('description', [])
217
+ for desc in descriptions:
218
+ lang = desc.get('lang', 'unknown')
219
+ value = desc.get('value', 'Not available')
220
+ context.append(f" CWE ({lang}): {value}")
221
+ else:
222
+ context.append("\nWeaknesses: Not available")
223
+
224
+ # Configurations
225
+ configurations = vuln_data.get('configurations', [])
226
+ if configurations:
227
+ context.append("\nAffected Configurations:")
228
+ for config in configurations:
229
+ nodes = config.get('nodes', [])
230
+ for node in nodes:
231
+ operator = node.get('operator', 'Unknown')
232
+ negate = node.get('negate', False)
233
+ context.append(f" Operator: {operator}, Negate: {negate}")
234
+ cpe_matches = node.get('cpeMatch', [])
235
+ for match in cpe_matches:
236
+ vulnerable = match.get('vulnerable', False)
237
+ criteria = match.get('criteria', 'Not available')
238
+ version_end = match.get('versionEndIncluding', 'Not specified')
239
+ context.append(f" CPE: {criteria}")
240
+ context.append(f" Vulnerable: {vulnerable}")
241
+ if version_end != 'Not specified':
242
+ context.append(f" Version End Including: {version_end}")
243
+ else:
244
+ context.append("\nAffected Configurations: Not available")
245
+
246
+ # Dates
247
+ context.append(f"\nPublished: {vuln_data.get('published', 'Not available')}")
248
+ context.append(f"Last Modified: {vuln_data.get('lastModified', 'Not available')}")
249
+
250
+ # References - show all with tags
251
+ references = vuln_data.get('references', [])
252
+ if references:
253
+ context.append("\nReferences:")
254
+ for ref in references:
255
+ url = ref.get('url', 'Not available')
256
+ source = ref.get('source', 'Unknown')
257
+ tags = ref.get('tags', [])
258
+ context.append(f" URL: {url}")
259
+ context.append(f" Source: {source}")
260
+ if tags:
261
+ context.append(f" Tags: {', '.join(tags)}")
262
+ context.append("")
263
+ else:
264
+ context.append("\nReferences: Not available")
265
+
266
+ context.append("-" * 50) # Separator between vulnerabilities
267
+
268
+ return "\n".join(context)
269
+
270
+ except requests.exceptions.RequestException as e:
271
+ return f"Error accessing NVD API: {str(e)}"
272
+
273
+ def _format_cve_data(self, vuln_data: dict) -> str:
274
+ """Format CVE data for display."""
275
+ context = []
276
+
277
+ context.append(f"CVE ID: {vuln_data.get('id', 'Not available')}")
278
+ context.append(f"Source Identifier: {vuln_data.get('sourceIdentifier', 'Not available')}")
279
+ context.append(f"Vulnerability Status: {vuln_data.get('vulnStatus', 'Not available')}")
280
+
281
+ # CVE Tags
282
+ cve_tags = vuln_data.get('cveTags', [])
283
+ if cve_tags:
284
+ context.append(f"CVE Tags: {', '.join(cve_tags)}")
285
+ else:
286
+ context.append("CVE Tags: None")
287
+
288
+ # Description - show all languages
289
+ descriptions = vuln_data.get('descriptions', [])
290
+ if descriptions:
291
+ for desc in descriptions:
292
+ lang = desc.get('lang', 'unknown')
293
+ value = desc.get('value', 'Not available')
294
+ context.append(f"Description ({lang}): {value}")
295
+ else:
296
+ context.append("Description: Not available")
297
+
298
+ # CVSS Metrics - show all available versions
299
+ metrics = vuln_data.get('metrics', {})
300
+ if metrics:
301
+ context.append("\nCVSS Metrics:")
302
+
303
+ # CVSS V3.1
304
+ if 'cvssMetricV31' in metrics:
305
+ cvss_data = metrics['cvssMetricV31'][0]
306
+ cvss_info = cvss_data['cvssData']
307
+ context.append(f"CVSS V3.1:")
308
+ context.append(f" Base Score: {cvss_info['baseScore']}")
309
+ context.append(f" Base Severity: {cvss_info['baseSeverity']}")
310
+ context.append(f" Vector String: {cvss_info['vectorString']}")
311
+ context.append(f" Attack Vector: {cvss_info['attackVector']}")
312
+ context.append(f" Attack Complexity: {cvss_info['attackComplexity']}")
313
+ context.append(f" Privileges Required: {cvss_info['privilegesRequired']}")
314
+ context.append(f" User Interaction: {cvss_info['userInteraction']}")
315
+ context.append(f" Scope: {cvss_info['scope']}")
316
+ context.append(f" Confidentiality Impact: {cvss_info['confidentialityImpact']}")
317
+ context.append(f" Integrity Impact: {cvss_info['integrityImpact']}")
318
+ context.append(f" Availability Impact: {cvss_info['availabilityImpact']}")
319
+ context.append(f" Exploitability Score: {cvss_data['exploitabilityScore']}")
320
+ context.append(f" Impact Score: {cvss_data['impactScore']}")
321
+
322
+ # CVSS V3.0
323
+ if 'cvssMetricV30' in metrics:
324
+ cvss_data = metrics['cvssMetricV30'][0]
325
+ cvss_info = cvss_data['cvssData']
326
+ context.append(f"CVSS V3.0:")
327
+ context.append(f" Base Score: {cvss_info['baseScore']}")
328
+ context.append(f" Base Severity: {cvss_info['baseSeverity']}")
329
+ context.append(f" Vector String: {cvss_info['vectorString']}")
330
+ context.append(f" Attack Vector: {cvss_info['attackVector']}")
331
+ context.append(f" Attack Complexity: {cvss_info['attackComplexity']}")
332
+ context.append(f" Privileges Required: {cvss_info['privilegesRequired']}")
333
+ context.append(f" User Interaction: {cvss_info['userInteraction']}")
334
+ context.append(f" Scope: {cvss_info['scope']}")
335
+ context.append(f" Confidentiality Impact: {cvss_info['confidentialityImpact']}")
336
+ context.append(f" Integrity Impact: {cvss_info['integrityImpact']}")
337
+ context.append(f" Availability Impact: {cvss_info['availabilityImpact']}")
338
+ context.append(f" Exploitability Score: {cvss_data['exploitabilityScore']}")
339
+ context.append(f" Impact Score: {cvss_data['impactScore']}")
340
+
341
+ # CVSS V2
342
+ if 'cvssMetricV2' in metrics:
343
+ cvss_data = metrics['cvssMetricV2'][0]
344
+ cvss_info = cvss_data['cvssData']
345
+ context.append(f"CVSS V2:")
346
+ context.append(f" Base Score: {cvss_info['baseScore']}")
347
+ context.append(f" Base Severity: {cvss_data['baseSeverity']}")
348
+ context.append(f" Vector String: {cvss_info['vectorString']}")
349
+ context.append(f" Access Vector: {cvss_info['accessVector']}")
350
+ context.append(f" Access Complexity: {cvss_info['accessComplexity']}")
351
+ context.append(f" Authentication: {cvss_info['authentication']}")
352
+ context.append(f" Confidentiality Impact: {cvss_info['confidentialityImpact']}")
353
+ context.append(f" Integrity Impact: {cvss_info['integrityImpact']}")
354
+ context.append(f" Availability Impact: {cvss_info['availabilityImpact']}")
355
+ context.append(f" Exploitability Score: {cvss_data['exploitabilityScore']}")
356
+ context.append(f" Impact Score: {cvss_data['impactScore']}")
357
+ else:
358
+ context.append("CVSS Metrics: Not available")
359
+
360
+ # Weaknesses - show all CWEs
361
+ weaknesses = vuln_data.get('weaknesses', [])
362
+ if weaknesses:
363
+ context.append("\nWeaknesses:")
364
+ for weakness in weaknesses:
365
+ source = weakness.get('source', 'Unknown')
366
+ w_type = weakness.get('type', 'Unknown')
367
+ context.append(f" Source: {source}, Type: {w_type}")
368
+ descriptions = weakness.get('description', [])
369
+ for desc in descriptions:
370
+ lang = desc.get('lang', 'unknown')
371
+ value = desc.get('value', 'Not available')
372
+ context.append(f" CWE ({lang}): {value}")
373
+ else:
374
+ context.append("\nWeaknesses: Not available")
375
+
376
+ # Configurations
377
+ configurations = vuln_data.get('configurations', [])
378
+ if configurations:
379
+ context.append("\nAffected Configurations:")
380
+ for config in configurations:
381
+ nodes = config.get('nodes', [])
382
+ for node in nodes:
383
+ operator = node.get('operator', 'Unknown')
384
+ negate = node.get('negate', False)
385
+ context.append(f" Operator: {operator}, Negate: {negate}")
386
+ cpe_matches = node.get('cpeMatch', [])
387
+ for match in cpe_matches:
388
+ vulnerable = match.get('vulnerable', False)
389
+ criteria = match.get('criteria', 'Not available')
390
+ version_end = match.get('versionEndIncluding', 'Not specified')
391
+ context.append(f" CPE: {criteria}")
392
+ context.append(f" Vulnerable: {vulnerable}")
393
+ if version_end != 'Not specified':
394
+ context.append(f" Version End Including: {version_end}")
395
+ else:
396
+ context.append("\nAffected Configurations: Not available")
397
+
398
+ # Dates
399
+ context.append(f"\nPublished: {vuln_data.get('published', 'Not available')}")
400
+ context.append(f"Last Modified: {vuln_data.get('lastModified', 'Not available')}")
401
+
402
+ # References - show all with tags
403
+ references = vuln_data.get('references', [])
404
+ if references:
405
+ context.append("\nReferences:")
406
+ for ref in references:
407
+ url = ref.get('url', 'Not available')
408
+ source = ref.get('source', 'Unknown')
409
+ tags = ref.get('tags', [])
410
+ context.append(f" URL: {url}")
411
+ context.append(f" Source: {source}")
412
+ if tags:
413
+ context.append(f" Tags: {', '.join(tags)}")
414
+ context.append("")
415
+ else:
416
+ context.append("\nReferences: Not available")
417
+
418
+ return "\n".join(context)
scripts/reformulator.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Shamelessly stolen from Microsoft Autogen team: thanks to them for this great resource!
2
+ # https://github.com/microsoft/autogen/blob/gaia_multiagent_v01_march_1st/autogen/browser_utils.py
3
+ import copy
4
+
5
+ from smolagents.models import MessageRole, Model
6
+
7
+
8
def prepare_response(original_task: str, inner_messages, reformulation_model: Model) -> str:
    """Distill an agent transcript into a single FINAL ANSWER string.

    Builds a prompt containing the original task plus every inner message
    replayed as USER turns, asks *reformulation_model* to reformulate, and
    returns whatever follows the last "FINAL ANSWER: " marker.
    """
    msgs = [
        {
            "role": MessageRole.SYSTEM,
            "content": [
                {
                    "type": "text",
                    "text": f"""Earlier you were asked the following:

{original_task}

Your team then worked diligently to address that request. Read below a transcript of that conversation:""",
                }
            ],
        }
    ]

    # Replay the inner conversation; if the transcript is not a list of
    # well-formed message dicts, fall back to dumping it as one raw string.
    try:
        for entry in inner_messages:
            if not entry.get("content"):
                continue
            cloned = copy.deepcopy(entry)
            cloned["role"] = MessageRole.USER
            msgs.append(cloned)
    except Exception:
        msgs.append({"role": MessageRole.ASSISTANT, "content": str(inner_messages)})

    # Ask for the final answer in the strict output format.
    msgs.append(
        {
            "role": MessageRole.USER,
            "content": [
                {
                    "type": "text",
                    "text": f"""
Read the above conversation and output a FINAL ANSWER to the question. The question is repeated here for convenience:

{original_task}

To output the final answer, use the following template: FINAL ANSWER: [YOUR FINAL ANSWER]
Your FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
ADDITIONALLY, your FINAL ANSWER MUST adhere to any formatting instructions specified in the original question (e.g., alphabetization, sequencing, units, rounding, decimal places, etc.)
If you are asked for a number, express it numerically (i.e., with digits rather than words), don't use commas, and DO NOT INCLUDE UNITS such as $ or USD or percent signs unless specified otherwise.
If you are asked for a string, don't use articles or abbreviations (e.g. for cities), unless specified otherwise. Don't output any final sentence punctuation such as '.', '!', or '?'.
If you are asked for a comma separated list, apply the above rules depending on whether the elements are numbers or strings.
If you are unable to determine the final answer, output 'FINAL ANSWER: Unable to determine'
""",
                }
            ],
        }
    )

    response = reformulation_model(msgs).content

    final_answer = response.split("FINAL ANSWER: ")[-1].strip()
    print("> Reformulated answer: ", final_answer)
    return final_answer
scripts/report_generator.py ADDED
@@ -0,0 +1,276 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from datetime import datetime
4
+ from typing import Dict, List, Optional
5
+ from smolagents import Tool
6
+ import plotly.graph_objects as go
7
+ import plotly.express as px
8
+ from jinja2 import Template
9
+
10
class ReportGeneratorTool(Tool):
    """Tool for generating interactive HTML vulnerability reports with charts.

    Accepts CVEDB-style vulnerability records (a single dict or a list of
    dicts) serialized as JSON, renders Plotly charts plus an HTML detail
    table, and writes the result under ``reports/`` when the disk is
    writable.
    """

    name = "generate_vulnerability_report"
    description = "Generates an interactive HTML report with charts and vulnerability analysis. The report is generated from CVEDB search results."
    inputs = {
        "vulnerability_data": {
            "type": "string",
            "description": "Vulnerability data in JSON format",
        },
        "report_type": {
            "type": "string",
            "description": "Report type: 'cve' for a specific CVE or 'product' for a product",
        }
    }
    output_type = "string"

    def __init__(self):
        super().__init__()

        # Base HTML template; chart payloads are injected via {{ plotly_js }}.
        self.html_template = """
<!DOCTYPE html>
<html>
<head>
    <title>Vulnerability Report</title>
    <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
    <style>
        body { font-family: Arial, sans-serif; margin: 20px; }
        .container { max-width: 1200px; margin: 0 auto; }
        .header { text-align: center; margin-bottom: 30px; }
        .section { margin-bottom: 40px; }
        .chart { margin: 20px 0; }
        .summary { background-color: #f5f5f5; padding: 20px; border-radius: 5px; }
        .critical { color: #dc3545; }
        .high { color: #fd7e14; }
        .medium { color: #ffc107; }
        .low { color: #28a745; }
    </style>
</head>
<body>
    <div class="container">
        <div class="header">
            <h1>Vulnerability Report</h1>
            <p>Generated on {{ generation_date }}</p>
        </div>

        <div class="section">
            <h2>Summary</h2>
            <div class="summary">
                {{ summary }}
            </div>
        </div>

        <div class="section">
            <h2>Severity Distribution (CVSS)</h2>
            <div id="cvss_chart" class="chart"></div>
        </div>

        <div class="section">
            <h2>Temporal Trend</h2>
            <div id="timeline_chart" class="chart"></div>
        </div>

        <div class="section">
            <h2>Vulnerability Details</h2>
            {{ vulnerabilities_table }}
        </div>
    </div>

    <script>
        {{ plotly_js }}
    </script>
</body>
</html>
"""

    def forward(self, vulnerability_data: str, report_type: str) -> str:
        """Generate an HTML report with interactive charts from vulnerability data.

        Returns the saved file path followed by the full HTML, or an error
        string if anything fails (callers rely on a string, never an exception).
        """
        try:
            data = json.loads(vulnerability_data)

            # Chart JSON payloads for Plotly.newPlot (strings, not dicts).
            cvss_chart = self._generate_cvss_chart(data)
            timeline_chart = self._generate_timeline_chart(data)

            vulnerabilities_table = self._generate_vulnerabilities_table(data)
            summary = self._generate_summary(data, report_type)

            template = Template(self.html_template)
            html = template.render(
                generation_date=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                summary=summary,
                vulnerabilities_table=vulnerabilities_table,
                plotly_js=f"""
                var cvssData = {cvss_chart};
                var timelineData = {timeline_chart};
                Plotly.newPlot('cvss_chart', cvssData.data, cvssData.layout);
                Plotly.newPlot('timeline_chart', timelineData.data, timelineData.layout);
                """
            )

            # Save the report to the reports folder.
            # NOTE: Only persists when running locally; on a read-only
            # deployment (e.g. a Hugging Face Space) the write may fail and
            # the error path below reports it.
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"vulnerability_report_{report_type}_{timestamp}.html"
            reports_dir = "reports"
            os.makedirs(reports_dir, exist_ok=True)
            filepath = os.path.join(reports_dir, filename)
            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(html)

            return f"Report generated and saved as: {filepath}\n\n{html}"

        except Exception as e:
            return f"Error generating report: {str(e)}"

    def _generate_cvss_chart(self, data) -> str:
        """Build a histogram of CVSS scores.

        Returns Plotly figure JSON as a *string* (the original annotation
        claimed Dict, but ``fig.to_json()`` returns str).
        """
        if isinstance(data, list):
            cvss_scores = [v.get('cvss', 0) for v in data if 'cvss' in v]
        else:
            cvss_scores = [data.get('cvss', 0)] if 'cvss' in data else []

        fig = go.Figure()
        fig.add_trace(go.Histogram(
            x=cvss_scores,
            nbinsx=10,
            name='CVSS Scores'
        ))
        fig.update_layout(
            title='CVSS Score Distribution',
            xaxis_title='CVSS Score',
            yaxis_title='Number of Vulnerabilities',
            showlegend=False
        )
        return fig.to_json()

    def _generate_timeline_chart(self, data) -> str:
        """Build a per-month vulnerability count line chart; returns Plotly JSON string."""
        if isinstance(data, list):
            dates = [v.get('published_time', '') for v in data if 'published_time' in v]
        else:
            dates = [data.get('published_time', '')] if 'published_time' in data else []

        from collections import Counter

        date_counts = Counter()
        for date_str in dates:
            try:
                # Timestamps are assumed ISO-like "%Y-%m-%dT%H:%M:%S" — TODO confirm against CVEDB output.
                parsed = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S")
            except (ValueError, TypeError):
                # Skip missing/malformed timestamps; was a bare `except:` before.
                continue
            date_counts[parsed.strftime("%Y-%m")] += 1

        months = sorted(date_counts)
        counts = [date_counts[m] for m in months]

        fig = go.Figure()
        fig.add_trace(go.Scatter(
            x=months,
            y=counts,
            mode='lines+markers',
            name='Vulnerabilities'
        ))
        fig.update_layout(
            title='Vulnerability Timeline Trend',
            xaxis_title='Month',
            yaxis_title='Number of Vulnerabilities',
            showlegend=False
        )
        return fig.to_json()

    def _generate_vulnerabilities_table(self, data) -> str:
        """Render vulnerability records as an HTML table."""
        vulnerabilities = data if isinstance(data, list) else [data]

        if not vulnerabilities:
            return "<p>No vulnerability data available.</p>"

        table_html = """
        <table border="1" style="width: 100%; border-collapse: collapse;">
            <thead>
                <tr style="background-color: #f2f2f2;">
                    <th style="padding: 8px; text-align: left;">CVE ID</th>
                    <th style="padding: 8px; text-align: left;">CVSS Score</th>
                    <th style="padding: 8px; text-align: left;">EPSS Score</th>
                    <th style="padding: 8px; text-align: left;">Known Exploitable</th>
                    <th style="padding: 8px; text-align: left;">Publication Date</th>
                    <th style="padding: 8px; text-align: left;">Summary</th>
                </tr>
            </thead>
            <tbody>
        """

        for vuln in vulnerabilities:
            cvss = vuln.get('cvss', 'Not available')
            epss = vuln.get('epss', 'Not available')
            kev = vuln.get('kev', False)

            # Map the score onto the CSS severity classes declared in the
            # template, using standard CVSS bands. The original skipped
            # 'medium' entirely even though the stylesheet defines it.
            risk_class = ""
            if isinstance(cvss, (int, float)):
                if cvss >= 9.0:
                    risk_class = "critical"
                elif cvss >= 7.0:
                    risk_class = "high"
                elif cvss >= 4.0:
                    risk_class = "medium"
                else:
                    risk_class = "low"

            table_html += f"""
                <tr class="{risk_class}">
                    <td style="padding: 8px;">{vuln.get('id', 'Not available')}</td>
                    <td style="padding: 8px;">{cvss}</td>
                    <td style="padding: 8px;">{epss}</td>
                    <td style="padding: 8px;">{'Yes' if kev else 'No'}</td>
                    <td style="padding: 8px;">{vuln.get('published_time', 'Not available')}</td>
                    <td style="padding: 8px;">{vuln.get('summary', 'Not available')[:100]}...</td>
                </tr>
            """

        table_html += """
            </tbody>
        </table>
        """

        return table_html

    def _generate_summary(self, data, report_type: str) -> str:
        """Summarize totals, exploited count and average CVSS as an HTML snippet.

        Bug fix: the original unconditionally called
        ``data.get('published_time')`` for the publication-date line, which
        raised AttributeError whenever *data* was a list. The date line is
        now emitted only for single-record reports.
        """
        if isinstance(data, list):
            total_vulns = len(data)
            exploited = sum(1 for v in data if v.get('kev', False))
            scored = [v['cvss'] for v in data if 'cvss' in v]
            avg_cvss = sum(scored) / len(scored) if scored else 0
            published = None
        else:
            total_vulns = 1
            exploited = 1 if data.get('kev', False) else 0
            avg_cvss = data.get('cvss', 0)
            published = data.get('published_time', 'N/A')

        summary_html = f"""
        <p>Found <strong>{total_vulns}</strong> vulnerabilities.</p>
        <p>Exploited vulnerabilities: <strong>{exploited}</strong></p>
        <p>Average CVSS Score: <strong>{avg_cvss:.2f}</strong></p>
        """
        if published is not None:
            summary_html += f"<p>Publication date: <strong>{published}</strong></p>\n"

        return summary_html
scripts/run_agents.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import shutil
4
+ import textwrap
5
+ from pathlib import Path
6
+
7
+ # import tqdm.asyncio
8
+ from smolagents.utils import AgentError
9
+
10
+
11
def serialize_agent_error(obj):
    """json.dumps default hook: render AgentError as a dict, anything else as str."""
    if not isinstance(obj, AgentError):
        return str(obj)
    return {"error_type": obj.__class__.__name__, "message": obj.message}
16
+
17
+
18
def get_image_description(file_name: str, question: str, visual_inspection_tool) -> str:
    """Caption an image with the visual inspection tool, biased toward *question*."""
    caption_request = f"""Write a caption of 5 sentences for this image. Pay special attention to any details that might be useful for someone answering the following question:
{question}. But do not try to answer the question directly!
Do not add any information that is not present in the image."""
    return visual_inspection_tool(image_path=file_name, question=caption_request)
23
+
24
+
25
def get_document_description(file_path: str, question: str, document_inspection_tool) -> str:
    """Caption a document via the inspector's initial-exam mode, biased toward *question*."""
    caption_request = f"""Write a caption of 5 sentences for this document. Pay special attention to any details that might be useful for someone answering the following question:
{question}. But do not try to answer the question directly!
Do not add any information that is not present in the document."""
    return document_inspection_tool.forward_initial_exam_mode(file_path=file_path, question=caption_request)
30
+
31
+
32
def get_single_file_description(file_path: str, question: str, visual_inspection_tool, document_inspection_tool):
    """Build a one-file attachment note, adding an auto-generated caption where possible."""
    extension = file_path.split(".")[-1]

    if extension in ["png", "jpg", "jpeg"]:
        image_caption = get_image_description(file_path, question, visual_inspection_tool)
        return f" - Attached image: {file_path}" + f"\n -> Image description: {image_caption}"

    if extension in ["pdf", "xls", "xlsx", "docx", "doc", "xml"]:
        # Prefer a pre-rendered page image of the document when one exists on disk.
        rendered_page = file_path.split(".")[0] + ".png"
        if os.path.exists(rendered_page):
            caption = get_image_description(rendered_page, question, visual_inspection_tool)
        else:
            caption = get_document_description(file_path, question, document_inspection_tool)
        return f" - Attached document: {file_path}" + f"\n -> File description: {caption}"

    if extension in ["mp3", "m4a", "wav"]:
        return f" - Attached audio: {file_path}"

    return f" - Attached file: {file_path}"
53
+
54
+
55
def get_zip_description(file_path: str, question: str, visual_inspection_tool, document_inspection_tool):
    """Unpack a zip next to itself and concatenate a description of every contained file."""
    extraction_dir = file_path.replace(".zip", "")
    os.makedirs(extraction_dir, exist_ok=True)
    shutil.unpack_archive(file_path, extraction_dir)

    descriptions = ""
    for root, _dirs, files in os.walk(extraction_dir):
        for name in files:
            inner_path = os.path.join(root, name)
            single = get_single_file_description(
                inner_path, question, visual_inspection_tool, document_inspection_tool
            )
            descriptions += "\n" + textwrap.indent(single, prefix=" ")
    return descriptions
69
+
70
+
71
def get_tasks_to_run(data, total: int, base_filename: Path, tasks_ids: list[int]):
    """Return the tasks from *data* that still need to be run.

    Tasks whose ids already appear in the sibling ``<stem>_answers.jsonl``
    file are skipped; when *tasks_ids* is provided, only those ids are kept.
    """
    answers_file = base_filename.parent / f"{base_filename.stem}_answers.jsonl"
    completed = set()
    if answers_file.exists():
        with open(answers_file, encoding="utf-8") as handle:
            completed = {json.loads(row)["task_id"] for row in handle if row.strip()}

    remaining = []
    for index in range(total):
        record = data[index]
        task_id = int(record["task_id"])
        if task_id in completed:
            continue
        if tasks_ids is None or task_id in tasks_ids:
            remaining.append(record)
    return remaining
scripts/run_mcp_chatbot.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """MCP chatbot runner with vulnerability search tools."""
2
+
3
+ from gradio_mcp import GradioMCP
4
+ from scripts.tools_mcp import (
5
+ search_cvedb,
6
+ search_nvd,
7
+ search_kevin,
8
+ search_epss,
9
+ generate_vulnerability_report
10
+ )
11
+
12
def main():
    """Wire up the vulnerability-search tools and launch the MCP chatbot UI."""
    chatbot = GradioMCP()

    # Register every vulnerability tool with the MCP interface.
    for tool in (
        search_cvedb,
        search_nvd,
        search_kevin,
        search_epss,
        generate_vulnerability_report,
    ):
        chatbot.register_tool(tool)

    chatbot.launch()


if __name__ == "__main__":
    main()
scripts/text_inspector_tool.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional
2
+
3
+ from smolagents import Tool
4
+ from smolagents.models import MessageRole, Model
5
+
6
+ from .mdconvert import MarkdownConverter
7
+
8
+
9
class TextInspectorTool(Tool):
    """Convert a non-image file to markdown text and optionally answer a question about it."""

    name = "inspect_file_as_text"
    description = """
You cannot load files yourself: instead call this tool to read a file as markdown text and ask questions about it.
This tool handles the following file extensions: [".html", ".htm", ".xlsx", ".pptx", ".wav", ".mp3", ".flac", ".pdf", ".docx"], and all other types of text files. IT DOES NOT HANDLE IMAGES."""

    inputs = {
        "file_path": {
            "description": "The path to the file you want to read as text. Must be a '.something' file, like '.pdf'. If it is an image, use the visualizer tool instead! DO NOT use this tool for an HTML webpage: use the web_search tool instead!",
            "type": "string",
        },
        "question": {
            "description": "[Optional]: Your question, as a natural language sentence. Provide as much context as possible. Do not pass this parameter if you just want to directly return the content of the file.",
            "type": "string",
            "nullable": True,
        },
    }
    output_type = "string"
    # Class-level converter shared by all instances.
    md_converter = MarkdownConverter()

    def __init__(self, model: Model, text_limit: int):
        """Args:
            model: Chat model used to caption the document / answer questions.
            text_limit: Maximum number of document characters forwarded to the model.
        """
        super().__init__()
        self.model = model
        self.text_limit = text_limit

    @staticmethod
    def _reject_images(file_path: str) -> None:
        """Raise before any conversion work if *file_path* looks like an image.

        Fixes two defects in the original guard: it ran *after* the (wasted,
        possibly failing) markdown conversion, and the `file_path[-4:]` check
        could never match '.jpeg'.
        """
        if file_path.lower().endswith((".png", ".jpg", ".jpeg")):
            raise Exception("Cannot use inspect_file_as_text tool with images: use visualizer instead!")

    def forward_initial_exam_mode(self, file_path, question):
        """Return raw text for zip/short/no-question cases, else a 5-sentence caption."""
        self._reject_images(file_path)
        result = self.md_converter.convert(file_path)

        if ".zip" in file_path:
            return result.text_content

        if not question:
            return result.text_content

        # Short documents are returned verbatim instead of being summarized.
        if len(result.text_content) < 4000:
            return "Document content: " + result.text_content

        messages = [
            {
                "role": MessageRole.SYSTEM,
                "content": [
                    {
                        "type": "text",
                        "text": "Here is a file:\n### "
                        + str(result.title)
                        + "\n\n"
                        + result.text_content[: self.text_limit],
                    }
                ],
            },
            {
                "role": MessageRole.USER,
                "content": [
                    {
                        "type": "text",
                        "text": "Now please write a short, 5 sentence caption for this document, that could help someone asking this question: "
                        + question
                        + "\n\nDon't answer the question yourself! Just provide useful notes on the document",
                    }
                ],
            },
        ]
        return self.model(messages).content

    def forward(self, file_path, question: Optional[str] = None) -> str:
        """Answer *question* about the file, or return its full text when no question is given."""
        self._reject_images(file_path)
        result = self.md_converter.convert(file_path)

        if ".zip" in file_path:
            return result.text_content

        if not question:
            return result.text_content

        messages = [
            {
                "role": MessageRole.SYSTEM,
                "content": [
                    {
                        "type": "text",
                        "text": "You will have to write a short caption for this file, then answer this question:"
                        + question,
                    }
                ],
            },
            {
                "role": MessageRole.USER,
                "content": [
                    {
                        "type": "text",
                        "text": "Here is the complete file:\n### "
                        + str(result.title)
                        + "\n\n"
                        + result.text_content[: self.text_limit],
                    }
                ],
            },
            {
                "role": MessageRole.USER,
                "content": [
                    {
                        "type": "text",
                        # Fix: the question was fused directly onto the sentence's
                        # final period; separate it with a newline.
                        "text": "Now answer the question below. Use these three headings: '1. Short answer', '2. Extremely detailed answer', '3. Additional Context on the document and question asked'."
                        + "\n"
                        + question,
                    }
                ],
            },
        ]
        return self.model(messages).content
scripts/text_web_browser.py ADDED
@@ -0,0 +1,564 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Shamelessly stolen from Microsoft Autogen team: thanks to them for this great resource!
2
+ # https://github.com/microsoft/autogen/blob/gaia_multiagent_v01_march_1st/autogen/browser_utils.py
3
+ import mimetypes
4
+ import os
5
+ import pathlib
6
+ import re
7
+ import time
8
+ import uuid
9
+ from typing import Any, Dict, List, Optional, Tuple, Union
10
+ from urllib.parse import unquote, urljoin, urlparse
11
+
12
+ import pathvalidate
13
+ import requests
14
+ from serpapi import GoogleSearch
15
+ # from serpapi.google_search import GoogleSearch
16
+
17
+ from smolagents import Tool
18
+
19
+ from .cookies import COOKIES
20
+ from .mdconvert import FileConversionException, MarkdownConverter, UnsupportedFormatException
21
+
22
+
23
class SimpleTextBrowser:
    """(In preview) An extremely simple text-based web browser comparable to Lynx. Suitable for Agentic use."""

    def __init__(
        self,
        start_page: Optional[str] = None,
        viewport_size: Optional[int] = 1024 * 8,
        downloads_folder: Optional[Union[str, None]] = None,
        serpapi_key: Optional[Union[str, None]] = None,
        request_kwargs: Optional[Union[Dict[str, Any], None]] = None,
    ):
        self.start_page: str = start_page if start_page else "about:blank"
        self.viewport_size = viewport_size  # Applies only to the standard uri types
        self.downloads_folder = downloads_folder
        self.history: List[Tuple[str, float]] = list()
        self.page_title: Optional[str] = None
        self.viewport_current_page = 0
        self.viewport_pages: List[Tuple[int, int]] = list()
        self.serpapi_key = serpapi_key
        # Guard against request_kwargs=None (the default) before attaching
        # cookies; the original assignment raised TypeError in that case.
        self.request_kwargs = request_kwargs if request_kwargs is not None else {}
        self.request_kwargs["cookies"] = COOKIES
        self._mdconvert = MarkdownConverter()
        self._page_content: str = ""

        self._find_on_page_query: Union[str, None] = None
        self._find_on_page_last_result: Union[int, None] = None  # Location of the last result

        # Visit the start page only after every attribute is initialized:
        # set_address() may fetch or search, which reads serpapi_key and
        # request_kwargs (the original called it before they were assigned).
        self.set_address(self.start_page)

    @property
    def address(self) -> str:
        """Return the address of the current page."""
        return self.history[-1][0]

    def set_address(self, uri_or_path: str, filter_year: Optional[int] = None) -> None:
        """Navigate to a URI, 'about:blank', or a 'google:' search query."""
        # TODO: Handle anchors
        self.history.append((uri_or_path, time.time()))

        # Handle special URIs
        if uri_or_path == "about:blank":
            self._set_page_content("")
        elif uri_or_path.startswith("google:"):
            self._serpapi_search(uri_or_path[len("google:") :].strip(), filter_year=filter_year)
        else:
            if (
                not uri_or_path.startswith("http:")
                and not uri_or_path.startswith("https:")
                and not uri_or_path.startswith("file:")
            ):
                if len(self.history) > 1:
                    prior_address = self.history[-2][0]
                    uri_or_path = urljoin(prior_address, uri_or_path)
                    # Update the address with the fully-qualified path
                    self.history[-1] = (uri_or_path, self.history[-1][1])
            self._fetch_page(uri_or_path)

        self.viewport_current_page = 0
        # Reset the find-on-page state (the original assigned misnamed
        # attributes `find_on_page_query`/`find_on_page_viewport`, so the
        # state was never actually cleared on navigation).
        self._find_on_page_query = None
        self._find_on_page_last_result = None

    @property
    def viewport(self) -> str:
        """Return the content of the current viewport."""
        bounds = self.viewport_pages[self.viewport_current_page]
        return self.page_content[bounds[0] : bounds[1]]

    @property
    def page_content(self) -> str:
        """Return the full contents of the current page."""
        return self._page_content

    def _set_page_content(self, content: str) -> None:
        """Sets the text content of the current page."""
        self._page_content = content
        self._split_pages()
        if self.viewport_current_page >= len(self.viewport_pages):
            self.viewport_current_page = len(self.viewport_pages) - 1

    def page_down(self) -> None:
        """Scroll one viewport down, clamped to the last page."""
        self.viewport_current_page = min(self.viewport_current_page + 1, len(self.viewport_pages) - 1)

    def page_up(self) -> None:
        """Scroll one viewport up, clamped to the first page."""
        self.viewport_current_page = max(self.viewport_current_page - 1, 0)

    def find_on_page(self, query: str) -> Union[str, None]:
        """Searches for the query from the current viewport forward, looping back to the start if necessary."""

        # Did we get here via a previous find_on_page search with the same query?
        # If so, map to find_next
        if query == self._find_on_page_query and self.viewport_current_page == self._find_on_page_last_result:
            return self.find_next()

        # Ok it's a new search start from the current viewport
        self._find_on_page_query = query
        viewport_match = self._find_next_viewport(query, self.viewport_current_page)
        if viewport_match is None:
            self._find_on_page_last_result = None
            return None
        else:
            self.viewport_current_page = viewport_match
            self._find_on_page_last_result = viewport_match
            return self.viewport

    def find_next(self) -> Union[str, None]:
        """Scroll to the next viewport that matches the query"""

        if self._find_on_page_query is None:
            return None

        starting_viewport = self._find_on_page_last_result
        if starting_viewport is None:
            starting_viewport = 0
        else:
            starting_viewport += 1
            if starting_viewport >= len(self.viewport_pages):
                starting_viewport = 0

        viewport_match = self._find_next_viewport(self._find_on_page_query, starting_viewport)
        if viewport_match is None:
            self._find_on_page_last_result = None
            return None
        else:
            self.viewport_current_page = viewport_match
            self._find_on_page_last_result = viewport_match
            return self.viewport

    def _find_next_viewport(self, query: str, starting_viewport: int) -> Union[int, None]:
        """Search for matches between the starting viewport looping when reaching the end."""

        if query is None:
            return None

        # Normalize the query, and convert to a regular expression
        nquery = re.sub(r"\*", "__STAR__", query)
        nquery = " " + (" ".join(re.split(r"\W+", nquery))).strip() + " "
        nquery = nquery.replace(" __STAR__ ", "__STAR__ ")  # Merge isolated stars with prior word
        nquery = nquery.replace("__STAR__", ".*").lower()

        if nquery.strip() == "":
            return None

        # Scan from starting_viewport to the end, then wrap around to the start.
        idxs = list()
        idxs.extend(range(starting_viewport, len(self.viewport_pages)))
        idxs.extend(range(0, starting_viewport))

        for i in idxs:
            bounds = self.viewport_pages[i]
            content = self.page_content[bounds[0] : bounds[1]]

            # TODO: Remove markdown links and images
            ncontent = " " + (" ".join(re.split(r"\W+", content))).strip().lower() + " "
            if re.search(nquery, ncontent):
                return i

        return None

    def visit_page(self, path_or_uri: str, filter_year: Optional[int] = None) -> str:
        """Update the address, visit the page, and return the content of the viewport."""
        self.set_address(path_or_uri, filter_year=filter_year)
        return self.viewport

    def _split_pages(self) -> None:
        """Break the current page content into viewport-sized pages."""
        # Do not split search results
        if self.address.startswith("google:"):
            self.viewport_pages = [(0, len(self._page_content))]
            return

        # Handle empty pages
        if len(self._page_content) == 0:
            self.viewport_pages = [(0, 0)]
            return

        # Break the viewport into pages
        self.viewport_pages = []
        start_idx = 0
        while start_idx < len(self._page_content):
            end_idx = min(start_idx + self.viewport_size, len(self._page_content))  # type: ignore[operator]
            # Adjust to end on a space
            while end_idx < len(self._page_content) and self._page_content[end_idx - 1] not in [" ", "\t", "\r", "\n"]:
                end_idx += 1
            self.viewport_pages.append((start_idx, end_idx))
            start_idx = end_idx

    def _serpapi_search(self, query: str, filter_year: Optional[int] = None) -> None:
        """Run a Google search through SerpAPI and render results into the page."""
        if self.serpapi_key is None:
            raise ValueError("Missing SerpAPI key.")

        params = {
            "engine": "google",
            "q": query,
            "api_key": self.serpapi_key,
        }
        # Restrict results to a single calendar year when requested.
        if filter_year is not None:
            params["tbs"] = f"cdr:1,cd_min:01/01/{filter_year},cd_max:12/31/{filter_year}"

        search = GoogleSearch(params)
        results = search.get_dict()
        self.page_title = f"{query} - Search"
        if "organic_results" not in results.keys():
            raise Exception(f"No results found for query: '{query}'. Use a less specific query.")
        if len(results["organic_results"]) == 0:
            year_filter_message = f" with filter year={filter_year}" if filter_year is not None else ""
            self._set_page_content(
                f"No results found for '{query}'{year_filter_message}. Try with a more general query, or remove the year filter."
            )
            return

        def _prev_visit(url):
            # Annotate results the user already visited, using the history log.
            for i in range(len(self.history) - 1, -1, -1):
                if self.history[i][0] == url:
                    return f"You previously visited this page {round(time.time() - self.history[i][1])} seconds ago.\n"
            return ""

        web_snippets: List[str] = list()
        idx = 0
        if "organic_results" in results:
            for page in results["organic_results"]:
                idx += 1
                date_published = ""
                if "date" in page:
                    date_published = "\nDate published: " + page["date"]

                source = ""
                if "source" in page:
                    source = "\nSource: " + page["source"]

                snippet = ""
                if "snippet" in page:
                    snippet = "\n" + page["snippet"]

                redacted_version = f"{idx}. [{page['title']}]({page['link']}){date_published}{source}\n{_prev_visit(page['link'])}{snippet}"

                redacted_version = redacted_version.replace("Your browser can't play this video.", "")
                web_snippets.append(redacted_version)

        content = (
            f"A Google search for '{query}' found {len(web_snippets)} results:\n\n## Web Results\n"
            + "\n\n".join(web_snippets)
        )

        self._set_page_content(content)

    def _fetch_page(self, url: str) -> None:
        """Fetch a URL (or local file:// path) and render it, downloading non-text content."""
        download_path = ""
        try:
            if url.startswith("file://"):
                download_path = os.path.normcase(os.path.normpath(unquote(url[7:])))
                res = self._mdconvert.convert_local(download_path)
                self.page_title = res.title
                self._set_page_content(res.text_content)
            else:
                # Prepare the request parameters
                request_kwargs = self.request_kwargs.copy() if self.request_kwargs is not None else {}
                request_kwargs["stream"] = True

                # Send a HTTP request to the URL
                response = requests.get(url, **request_kwargs)
                response.raise_for_status()

                # If the HTTP request was successful
                content_type = response.headers.get("content-type", "")

                # Text or HTML
                if "text/" in content_type.lower():
                    res = self._mdconvert.convert_response(response)
                    self.page_title = res.title
                    self._set_page_content(res.text_content)
                # A download
                else:
                    # Try producing a safe filename
                    fname = None
                    download_path = None
                    try:
                        fname = pathvalidate.sanitize_filename(os.path.basename(urlparse(url).path)).strip()
                        download_path = os.path.abspath(os.path.join(self.downloads_folder, fname))

                        # Deduplicate by suffixing up to 1000 times.
                        suffix = 0
                        while os.path.exists(download_path) and suffix < 1000:
                            suffix += 1
                            base, ext = os.path.splitext(fname)
                            new_fname = f"{base}__{suffix}{ext}"
                            download_path = os.path.abspath(os.path.join(self.downloads_folder, new_fname))

                    except NameError:
                        pass

                    # No suitable name, so make one
                    if fname is None:
                        extension = mimetypes.guess_extension(content_type)
                        if extension is None:
                            extension = ".download"
                        fname = str(uuid.uuid4()) + extension
                        download_path = os.path.abspath(os.path.join(self.downloads_folder, fname))

                    # Open a file for writing
                    with open(download_path, "wb") as fh:
                        for chunk in response.iter_content(chunk_size=512):
                            fh.write(chunk)

                    # Render it
                    local_uri = pathlib.Path(download_path).as_uri()
                    self.set_address(local_uri)

        except UnsupportedFormatException as e:
            print(e)
            # Fixed: the original assigned a 1-tuple here (stray trailing comma).
            self.page_title = "Download complete."
            self._set_page_content(f"# Download complete\n\nSaved file to '{download_path}'")
        except FileConversionException as e:
            print(e)
            # Fixed: same stray-trailing-comma tuple as above.
            self.page_title = "Download complete."
            self._set_page_content(f"# Download complete\n\nSaved file to '{download_path}'")
        except FileNotFoundError:
            self.page_title = "Error 404"
            self._set_page_content(f"## Error 404\n\nFile not found: {download_path}")
        except requests.exceptions.RequestException as request_exception:
            try:
                self.page_title = f"Error {response.status_code}"

                # If the error was rendered in HTML we might as well render it
                content_type = response.headers.get("content-type", "")
                if content_type is not None and "text/html" in content_type.lower():
                    # Use convert_response for consistency with the success path
                    # (the original passed the raw response to convert()).
                    res = self._mdconvert.convert_response(response)
                    self.page_title = f"Error {response.status_code}"
                    self._set_page_content(f"## Error {response.status_code}\n\n{res.text_content}")
                else:
                    text = ""
                    for chunk in response.iter_content(chunk_size=512, decode_unicode=True):
                        text += chunk
                    self.page_title = f"Error {response.status_code}"
                    self._set_page_content(f"## Error {response.status_code}\n\n{text}")
            except NameError:
                # `response` never got bound (e.g. connection error before the request).
                self.page_title = "Error"
                self._set_page_content(f"## Error\n\n{str(request_exception)}")

    def _state(self) -> Tuple[str, str]:
        """Return a (header, viewport) pair summarizing the current browser state."""
        header = f"Address: {self.address}\n"
        if self.page_title is not None:
            header += f"Title: {self.page_title}\n"

        current_page = self.viewport_current_page
        total_pages = len(self.viewport_pages)

        address = self.address
        for i in range(len(self.history) - 2, -1, -1):  # Start from the second last
            if self.history[i][0] == address:
                header += f"You previously visited this page {round(time.time() - self.history[i][1])} seconds ago.\n"
                break

        header += f"Viewport position: Showing page {current_page + 1} of {total_pages}.\n"
        return (header, self.viewport)
372
+
373
+
374
class SearchInformationTool(Tool):
    name = "web_search"
    description = "Perform a web search query (think a google search) and returns the search results."
    inputs = {"query": {"type": "string", "description": "The web search query to perform."}}
    inputs["filter_year"] = {
        "type": "string",
        "description": "[Optional parameter]: filter the search results to only include pages from a specific year. For example, '2020' will only include pages from 2020. Make sure to use this parameter if you're trying to search for articles from a specific date!",
        "nullable": True,
    }
    output_type = "string"

    def __init__(self, browser):
        """Wrap a SimpleTextBrowser instance for web searching."""
        super().__init__()
        self.browser = browser

    def forward(self, query: str, filter_year: Optional[int] = None) -> str:
        """Run the search through the browser and return header plus results."""
        self.browser.visit_page(f"google: {query}", filter_year=filter_year)
        state_header, state_content = self.browser._state()
        return f"{state_header.strip()}\n=======================\n{state_content}"
393
+
394
+
395
class VisitTool(Tool):
    name = "visit_page"
    description = "Visit a webpage at a given URL and return its text. Given a url to a YouTube video, this returns the transcript."
    # Fixed user-facing typo: "webapge" -> "webpage".
    inputs = {"url": {"type": "string", "description": "The relative or absolute url of the webpage to visit."}}
    output_type = "string"

    def __init__(self, browser):
        """Wrap a SimpleTextBrowser instance for page visits."""
        super().__init__()
        self.browser = browser

    def forward(self, url: str) -> str:
        """Visit `url` in the shared browser and return its state header and viewport."""
        self.browser.visit_page(url)
        header, content = self.browser._state()
        return header.strip() + "\n=======================\n" + content
409
+
410
+
411
class DownloadTool(Tool):
    name = "download_file"
    description = """
Download a file at a given URL. The file should be of this format: [".xlsx", ".pptx", ".wav", ".mp3", ".png", ".docx"]
After using this tool, for further inspection of this page you should return the download path to your manager via final_answer, and they will be able to inspect it.
DO NOT use this tool for .pdf or .txt or .htm files: for these types of files use visit_page with the file url instead."""
    inputs = {"url": {"type": "string", "description": "The relative or absolute url of the file to be downloaded."}}
    output_type = "string"

    def __init__(self, browser):
        """Wrap a SimpleTextBrowser instance (kept for interface parity with sibling tools)."""
        super().__init__()
        self.browser = browser

    def forward(self, url: str) -> str:
        """Download `url` into ./downloads and return the saved path.

        Raises:
            Exception: For pdf/txt/html content, which visit_page handles better.
        """
        if "arxiv" in url:
            url = url.replace("abs", "pdf")
        response = requests.get(url)
        content_type = response.headers.get("content-type", "")
        extension = mimetypes.guess_extension(content_type)
        if extension and isinstance(extension, str):
            new_path = f"./downloads/file{extension}"
        else:
            # Fixed: keep `extension` a string so the membership checks below
            # cannot raise TypeError when guess_extension() returns None.
            extension = ".object"
            new_path = "./downloads/file.object"

        # Refuse unsupported formats BEFORE writing anything to disk (the
        # original wrote the file first and raised afterwards).
        if "pdf" in extension or "txt" in extension or "htm" in extension:
            raise Exception("Do not use this tool for pdf or txt or html files: use visit_page instead.")

        # Ensure the target directory exists on first use.
        os.makedirs("./downloads", exist_ok=True)
        with open(new_path, "wb") as f:
            f.write(response.content)

        return f"File was downloaded and saved under path {new_path}."
442
+
443
+
444
class ArchiveSearchTool(Tool):
    name = "find_archived_url"
    description = "Given a url, searches the Wayback Machine and returns the archived version of the url that's closest in time to the desired date."
    inputs = {
        "url": {"type": "string", "description": "The url you need the archive for."},
        "date": {
            "type": "string",
            "description": "The date that you want to find the archive for. Give this date in the format 'YYYYMMDD', for instance '27 June 2008' is written as '20080627'.",
        },
    }
    output_type = "string"

    def __init__(self, browser):
        """Wrap a SimpleTextBrowser instance used to render the archived page."""
        super().__init__()
        self.browser = browser

    def forward(self, url, date) -> str:
        """Find the Wayback Machine snapshot of `url` closest to `date`, visit it, and return the page.

        Raises:
            Exception: If the url was never archived on the Wayback Machine.
        """
        no_timestamp_url = f"https://archive.org/wayback/available?url={url}"
        archive_url = no_timestamp_url + f"&timestamp={date}"
        response = requests.get(archive_url).json()
        if "archived_snapshots" in response and "closest" in response["archived_snapshots"]:
            closest = response["archived_snapshots"]["closest"]
            print("Archive found!", closest)
        else:
            # Only issue the undated fallback request when the dated lookup
            # failed (the original hit the API twice unconditionally).
            response_notimestamp = requests.get(no_timestamp_url).json()
            if "archived_snapshots" in response_notimestamp and "closest" in response_notimestamp["archived_snapshots"]:
                closest = response_notimestamp["archived_snapshots"]["closest"]
                print("Archive found!", closest)
            else:
                raise Exception(f"Your {url=} was not archived on Wayback Machine, try a different url.")
        target_url = closest["url"]
        self.browser.visit_page(target_url)
        header, content = self.browser._state()
        return (
            f"Web archive for url {url}, snapshot taken at date {closest['timestamp'][:8]}:\n"
            + header.strip()
            + "\n=======================\n"
            + content
        )
483
+
484
+
485
class PageUpTool(Tool):
    name = "page_up"
    description = "Scroll the viewport UP one page-length in the current webpage and return the new viewport content."
    inputs = {}
    output_type = "string"

    def __init__(self, browser):
        """Keep a reference to the shared SimpleTextBrowser."""
        super().__init__()
        self.browser = browser

    def forward(self) -> str:
        """Scroll up one viewport and report the refreshed browser state."""
        self.browser.page_up()
        state_header, state_content = self.browser._state()
        return f"{state_header.strip()}\n=======================\n{state_content}"
499
+
500
+
501
class PageDownTool(Tool):
    name = "page_down"
    description = (
        "Scroll the viewport DOWN one page-length in the current webpage and return the new viewport content."
    )
    inputs = {}
    output_type = "string"

    def __init__(self, browser):
        """Keep a reference to the shared SimpleTextBrowser."""
        super().__init__()
        self.browser = browser

    def forward(self) -> str:
        """Scroll down one viewport and report the refreshed browser state."""
        self.browser.page_down()
        state_header, state_content = self.browser._state()
        return f"{state_header.strip()}\n=======================\n{state_content}"
517
+
518
+
519
class FinderTool(Tool):
    name = "find_on_page_ctrl_f"
    description = "Scroll the viewport to the first occurrence of the search string. This is equivalent to Ctrl+F."
    inputs = {
        "search_string": {
            "type": "string",
            "description": "The string to search for on the page. This search string supports wildcards like '*'",
        }
    }
    output_type = "string"

    def __init__(self, browser):
        """Keep a reference to the shared SimpleTextBrowser."""
        super().__init__()
        self.browser = browser

    def forward(self, search_string: str) -> str:
        """Jump to the first viewport matching `search_string`, or report a miss."""
        match = self.browser.find_on_page(search_string)
        state_header, state_content = self.browser._state()
        stripped_header = state_header.strip()

        # A None result means no viewport matched the query.
        if match is None:
            return (
                f"{stripped_header}"
                f"\n=======================\nThe search string '{search_string}' was not found on this page."
            )
        return f"{stripped_header}\n=======================\n{state_content}"
545
+
546
+
547
class FindNextTool(Tool):
    name = "find_next"
    description = "Scroll the viewport to next occurrence of the search string. This is equivalent to finding the next match in a Ctrl+F search."
    inputs = {}
    output_type = "string"

    def __init__(self, browser):
        """Keep a reference to the shared SimpleTextBrowser."""
        super().__init__()
        self.browser = browser

    def forward(self) -> str:
        """Advance to the next match of the active find-on-page query, or report a miss."""
        match = self.browser.find_next()
        state_header, state_content = self.browser._state()
        stripped_header = state_header.strip()

        # A None result means the query matched no further viewport.
        if match is None:
            return f"{stripped_header}\n=======================\nThe search string was not found on this page."
        return f"{stripped_header}\n=======================\n{state_content}"
scripts/tools_mcp.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """MCP tools for vulnerability search and report generation."""
2
+
3
+ from smolagents import Tool
4
+ import json
5
+ import os
6
+ from datetime import datetime
7
+ from gradio_mcp import tool
8
+ from scripts.cvedb_tool import CVEDBTool
9
+ from scripts.nvd_tool import NvdTool
10
+ from scripts.kevin_tool import KevinTool
11
+ from scripts.epss_tool import EpsTool
12
+ from scripts.report_generator import ReportGeneratorTool
13
+
14
# Instances of the existing tools, shared by the MCP wrapper functions below.
cvedb_tool = CVEDBTool()
nvd_tool = NvdTool()
kevin_tool = KevinTool()
epss_tool = EpsTool()
report_tool = ReportGeneratorTool()

# Each wrapper below exposes one tool's forward() method as an MCP tool,
# delegating unchanged arguments and returning the tool's string output.

@tool
def search_cvedb(search_type: str, identifier: str) -> str:
    """Search for vulnerabilities in CVEDB by CVE or product."""
    return cvedb_tool.forward(search_type, identifier)

@tool
def search_nvd(search_type: str, identifier: str, exact_match: bool = False) -> str:
    """Search for vulnerabilities in NVD by CVE or keyword."""
    return nvd_tool.forward(search_type, identifier, exact_match)

@tool
def search_kevin(search_type: str, identifier: str) -> str:
    """Search for known exploited vulnerabilities (KEV) by CVE or keyword."""
    return kevin_tool.forward(search_type, identifier)

@tool
def search_epss(cve_id: str, date: str) -> str:
    """Search for EPSS score for a specific CVE."""
    return epss_tool.forward(cve_id, date)

@tool
def generate_vulnerability_report(vulnerability_data: str, report_type: str) -> str:
    """Generate an interactive HTML vulnerability report from JSON data."""
    return report_tool.forward(vulnerability_data, report_type)
scripts/visual_qa.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import json
3
+ import mimetypes
4
+ import os
5
+ import uuid
6
+ from io import BytesIO
7
+ from typing import Optional
8
+
9
+ import requests
10
+ from dotenv import load_dotenv
11
+ from PIL import Image
12
+
13
+ from smolagents import Tool, tool
14
+
15
+
16
+ load_dotenv(override=True)
17
+
18
+
19
def encode_image(image_path):
    """Return the base64-encoded bytes of an image.

    Args:
        image_path: A local file path, or an http(s) URL; URLs are downloaded
            into ./downloads first and encoded from disk.

    Returns:
        The image bytes encoded as a base64 ASCII string.
    """
    if image_path.startswith("http"):
        user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
        request_kwargs = {
            "headers": {"User-Agent": user_agent},
            "stream": True,
        }

        # Send a HTTP request to the URL
        response = requests.get(image_path, **request_kwargs)
        response.raise_for_status()
        content_type = response.headers.get("content-type", "")

        # Pick a file extension from the content type; fall back to a generic one.
        extension = mimetypes.guess_extension(content_type)
        if extension is None:
            extension = ".download"

        fname = str(uuid.uuid4()) + extension
        # Ensure the downloads directory exists before writing (the original
        # raised FileNotFoundError when ./downloads had not been created yet).
        os.makedirs("downloads", exist_ok=True)
        download_path = os.path.abspath(os.path.join("downloads", fname))

        with open(download_path, "wb") as fh:
            for chunk in response.iter_content(chunk_size=512):
                fh.write(chunk)

        image_path = download_path

    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")
47
+
48
+
49
def resize_image(image_path):
    """Save a half-resolution copy of an image and return the copy's path.

    The copy is written next to the original as 'resized_<basename>'. The
    original code prefixed the WHOLE path ('resized_/abs/dir/x.png'), which
    produced an invalid location for absolute paths or paths with directories.

    Args:
        image_path: Path of the image to shrink.

    Returns:
        Path of the newly written half-size image.
    """
    img = Image.open(image_path)
    width, height = img.size
    img = img.resize((int(width / 2), int(height / 2)))
    directory, basename = os.path.split(image_path)
    new_image_path = os.path.join(directory, f"resized_{basename}")
    img.save(new_image_path)
    return new_image_path
56
+
57
+
58
@tool
def visualizer(image_path: str, question: Optional[str] = None) -> str:
    """A tool that can answer questions about attached images.

    Sends the image (base64-inlined) to an OpenAI-compatible chat endpoint
    (Ollama by default) and returns the model's answer. If no question is
    given, a detailed caption is requested instead and the result is flagged.

    Args:
        image_path: The path to the image on which to answer the question. This should be a local path to downloaded image.
        question: The question to answer.
    """
    if not isinstance(image_path, str):
        raise Exception("You should provide at least `image_path` string argument to this tool!")

    # No question provided: fall back to captioning and mark it in the output.
    add_note = False
    if not question:
        add_note = True
        question = "Please write a detailed caption for this image."

    mime_type, _ = mimetypes.guess_type(image_path)
    base64_image = encode_image(image_path)

    # Ollama configuration (OpenAI-compatible endpoint); values come from the
    # environment with local-Ollama defaults.
    model_id = os.getenv("MODEL_ID", "qwen2.5-coder:3b")
    api_base = os.getenv("OPENAI_API_BASE", "http://localhost:11434/v1")
    api_key = os.getenv("OPENAI_API_KEY", "ollama")

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    # OpenAI-style chat payload with the image inlined as a base64 data URL.
    payload = {
        "model": model_id,
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": question},
                    {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{base64_image}"}},
                ],
            }
        ],
        "max_tokens": 1000,
    }

    try:
        response = requests.post(f"{api_base}/chat/completions", headers=headers, json=payload)
        response.raise_for_status()
        output = response.json()["choices"][0]["message"]["content"]
    except Exception as e:
        print(f"Error processing image: {str(e)}")
        # If the server rejected the request for size, retry once with the
        # image downscaled to half resolution.
        if "Payload Too Large" in str(e):
            new_image_path = resize_image(image_path)
            base64_image = encode_image(new_image_path)
            payload["messages"][0]["content"][1]["image_url"]["url"] = f"data:{mime_type};base64,{base64_image}"
            response = requests.post(f"{api_base}/chat/completions", headers=headers, json=payload)
            response.raise_for_status()
            output = response.json()["choices"][0]["message"]["content"]
        else:
            raise Exception(f"Error processing image: {str(e)}")

    if add_note:
        output = f"You did not provide a particular question, so here is a detailed caption for the image: {output}"

    return output
set-env.bat ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
REM Configure the OpenAI-compatible endpoint used by the deep-research scripts.

REM siliconflow rate limited, wont work for deep-research
REM set OPENAI_API_BASE=https://api.siliconflow.cn/v1
REM set OPENAI_API_KEY=%SILICONFLOW_API_KEY%
REM set MODEL_ID=openai/deepseek-ai/DeepSeek-V3

set OPENAI_API_BASE=https://litellm.dattw.eu.org/v1
set OPENAI_API_KEY=%LITELLM_API_KEY%
set MODEL_ID=Qwen/Qwen2.5-Coder-32B-Instruct

REM Optional extra credentials (web search via SerpAPI, Hugging Face access):
REM set SERPAPI_API_KEY=b84...
REM set HF_TOKEN=hf_yC...
try-litellm.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Smoke test: build a LiteLLMModel from environment variables and send one
# prompt to verify the endpoint and credentials work.
import os

from smolagents import (
    # MANAGED_AGENT_PROMPT,
    CodeAgent,
    HfApiModel,
    LiteLLMModel,
    Model,
    ToolCallingAgent,
)

# Map smolagents tool roles onto roles the OpenAI-compatible API accepts.
custom_role_conversions = {"tool-call": "assistant", "tool-response": "user"}

model = LiteLLMModel(
    # "gpt-4o",
    # os.getenv("MODEL_ID", "gpt-4o-mini"),
    os.getenv("MODEL_ID", "deepseek-ai/DeepSeek-V3"),
    custom_role_conversions=custom_role_conversions,
    api_base=os.getenv("OPENAI_API_BASE"),
    api_key=os.getenv("OPENAI_API_KEY"),
)

print(model)

# NOTE(review): `invoke` presumably issues a single completion — confirm it
# exists on the installed smolagents version (some versions use `__call__`).
print(model.invoke("Say this is a test"))
uploads/winget_list.txt ADDED
Binary file (2.13 kB). View file