Spaces:
Sleeping
Sleeping
| from fasthtml.common import * | |
| from fasthtml.components import * | |
def web_data():
    """Build the introductory section of the web-data page.

    The component tree is assembled from the ``Div``, ``Ul``, ``Li``, ``A``
    and ``P`` callables that this file pulls in through its ``fasthtml``
    star imports.

    Returns:
        The outer ``Div``, which has two children:

        * a light-green call-out box listing two links (the raw
          documentation folder and the Web Data Pipeline GitHub repository);
        * a paragraph that names Common Crawl as the starting point and
          lists the five processing steps.
    """
    return Div(
        # Resource links, rendered as a green call-out box.
        Div(
            Ul(
                Li(
                    A(
                        "Raw Documentation",
                        href="https://drive.google.com/drive/folders/1mIJ-Zx8tRhohFdj4ByMToNz1u_9Saa8W?usp=drive_link",
                    )
                ),
                Li(
                    A(
                        "Github link of Web Data Pipeline",
                        href="https://github.com/CIAI-LLM/WebDataProcessing.git",
                    )
                ),
            ),
            # Inline CSS; the line breaks and indentation inside the string
            # are only for readability and carry no meaning in CSS.
            style="""
                background-color: #d4edda; /* Light green background */
                padding: 15px;
                border: 1px solid #c3e6cb; /* Green border */
                border-radius: 5px;
                margin-bottom: 20px;
            """,
        ),
        # Overview paragraph; the "Common Crawl" link is embedded mid-sentence,
        # so the text is passed as three consecutive children of P.
        Div(
            P(
                "To generate a high-quality dataset from large-scale webpages, we have investigated the processing steps used by the community and made our choices based on careful manual inspection. Starting from ",
                A("Common Crawl", href="https://commoncrawl.org/"),
                ", our process can be summarized as five main steps: document preparation, line-level removal, document-level filtering, deduplication and PII removal.",
            ),
            style="margin-top: 20px;",
        ),
    )