Spaces:
Sleeping
Sleeping
Commit
·
600ab03
1
Parent(s):
5a0e586
start web data port
Browse files
style.css
CHANGED
|
@@ -264,3 +264,7 @@ d-contents nav > div > a:hover,
|
|
| 264 |
d-contents nav > ul > li > a:hover {
|
| 265 |
text-decoration: none;
|
| 266 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 264 |
d-contents nav > ul > li > a:hover {
|
| 265 |
text-decoration: none;
|
| 266 |
}
|
| 267 |
+
|
| 268 |
+
.hljs {
|
| 269 |
+
background: rgb(255, 255, 255) !important;
|
| 270 |
+
}
|
web.py
CHANGED
|
@@ -3,5 +3,36 @@ from fasthtml.components import *
|
|
| 3 |
|
| 4 |
|
| 5 |
def web_data():
|
| 6 |
-
return Div(
|
| 7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
|
| 4 |
|
| 5 |
def web_data():
|
| 6 |
+
return Div(
|
| 7 |
+
Div(
|
| 8 |
+
Ul(
|
| 9 |
+
Li(
|
| 10 |
+
A(
|
| 11 |
+
"Raw Documentation",
|
| 12 |
+
href="https://drive.google.com/drive/folders/1mIJ-Zx8tRhohFdj4ByMToNz1u_9Saa8W?usp=drive_link",
|
| 13 |
+
)
|
| 14 |
+
),
|
| 15 |
+
Li(
|
| 16 |
+
A(
|
| 17 |
+
"Github link of Web Data Pipeline",
|
| 18 |
+
href="https://github.com/CIAI-LLM/WebDataProcessing.git",
|
| 19 |
+
)
|
| 20 |
+
),
|
| 21 |
+
),
|
| 22 |
+
style="""
|
| 23 |
+
background-color: #d4edda; /* Light green background */
|
| 24 |
+
padding: 15px;
|
| 25 |
+
border: 1px solid #c3e6cb; /* Green border */
|
| 26 |
+
border-radius: 5px;
|
| 27 |
+
margin-bottom: 20px;
|
| 28 |
+
""",
|
| 29 |
+
),
|
| 30 |
+
Div(
|
| 31 |
+
P(
|
| 32 |
+
"To generate a high-quality dataset from large-scale webpages, we have investigated the processing steps used by the community and made our choices based on careful manual inspection. Starting from ",
|
| 33 |
+
A("Common Crawl", href="https://commoncrawl.org/"),
|
| 34 |
+
", our process can be summarized as five main steps: document preparation, line-level removal, document-level filtering, deduplication and PII removal.",
|
| 35 |
+
),
|
| 36 |
+
style="margin-top: 20px;",
|
| 37 |
+
),
|
| 38 |
+
)
|