from fasthtml.common import *
from fasthtml.components import *
from plotly import graph_objects as go
from fh_plotly import plotly2fasthtml
import pandas as pd
import json
from data_viewer import view_data, gen_random_id
from rich import print
import uuid

# Curated (non-web) data sources selectable in the sample viewer below.
data_sources = [
    "Freelaw",
    "Wikipedia",
    "PhilPapers",
    "Arxiv",
    "S2ORC",
    "S2ORC Abstract",
    "Pubmed",
    "USPTO",
    "Hackernews",
    "Ubuntu IRC",
    "StackExchange",
    "DM Maths",
    "PG19",
    "Europarl",
]

def get_data(data_source: str = "Freelaw", doc_id: int = 3, target: str = "foo"):
    """Load the raw and extracted sample documents for `data_source` and render
    them with `view_data`, keyed by `target` so partial updates replace the
    right block on the page."""
    # The viewer assumes up to ten sample documents per source, so clamp the index to 0-9.
    doc_id = max(0, min(int(doc_id), 9))

    if data_source == "Freelaw":
        raw_sample_doc = json.load(open("data/curated_samples/freelaw_raw.json"))
        extracted_sample_doc = json.load(
            open("data/curated_samples/freelaw_extract.json")
        )
    elif data_source == "Wikipedia":
        # Sources without a separate extraction file show the same documents in
        # both the raw and extracted views.
        raw_sample_doc = extracted_sample_doc = json.load(
            open("data/curated_samples/wiki.json")
        )
    elif data_source == "StackExchange":
        raw_sample_doc = json.load(open("data/curated_samples/stackexchange_raw.json"))
        extracted_sample_doc = json.load(
            open("data/curated_samples/stackexchange_extract.json")
        )
    elif data_source == "PhilPapers":
        raw_sample_doc = extracted_sample_doc = json.load(
            open("data/curated_samples/philpapers_raw.json")
        )
    elif data_source == "Arxiv":
        raw_sample_doc = json.load(open("data/curated_samples/arxiv_raw.json"))
        extracted_sample_doc = json.load(
            open("data/curated_samples/arxiv_extract.json")
        )
    elif data_source == "S2ORC":
        raw_sample_doc = extracted_sample_doc = json.load(
            open("data/curated_samples/s2orc_raw.json")
        )
    elif data_source == "S2ORC Abstract":
        raw_sample_doc = extracted_sample_doc = json.load(
            open("data/curated_samples/s2orc_abstract_raw.json")
        )
    elif data_source == "Pubmed":
        raw_sample_doc = json.load(open("data/curated_samples/pubmed_raw.json"))
        extracted_sample_doc = json.load(
            open("data/curated_samples/pubmed_extract.json")
        )
    elif data_source == "DM Maths":
        raw_sample_doc = json.load(open("data/curated_samples/dm_maths_raw.json"))
        extracted_sample_doc = json.load(
            open("data/curated_samples/dm_maths_extract.json")
        )
    elif data_source == "PG19":
        raw_sample_doc = extracted_sample_doc = json.load(
            open("data/curated_samples/pg19_raw.json")
        )
    elif data_source == "Europarl":
        raw_sample_doc = extracted_sample_doc = json.load(
            open("data/curated_samples/europarl_raw.json")
        )
    else:
        # Unknown source: fall back to empty documents so the viewer still renders.
        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]

    raw_json = raw_sample_doc[doc_id]
    extracted_json = extracted_sample_doc[doc_id]
    return view_data(
        raw_json,
        extracted_json,
        doc_id=doc_id,
        data_source=data_source,
        data_sources=data_sources,
        target=target,
    )
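

# Illustrative helper, not part of the original module: a hedged sketch that
# checks whether the curated sample files referenced by get_data() above are
# present on disk. The helper name is an assumption for illustration; the file
# list simply mirrors the open() calls in get_data().
def missing_sample_files(base_dir: str = "data/curated_samples"):
    """Return the expected curated-sample files that are not present on disk."""
    from pathlib import Path

    expected = [
        "freelaw_raw.json", "freelaw_extract.json",
        "wiki.json",
        "stackexchange_raw.json", "stackexchange_extract.json",
        "philpapers_raw.json",
        "arxiv_raw.json", "arxiv_extract.json",
        "s2orc_raw.json", "s2orc_abstract_raw.json",
        "pubmed_raw.json", "pubmed_extract.json",
        "dm_maths_raw.json", "dm_maths_extract.json",
        "pg19_raw.json", "europarl_raw.json",
    ]
    base = Path(base_dir)
    return [name for name in expected if not (base / name).exists()]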

def get_chart_28168342():
    """Build a horizontal funnel chart showing, per curated source, the document
    volume remaining after each successive filtering step (values as hard-coded
    below)."""
    fig = go.Figure()
    filter_names = [
        "Download",
        "Language",
        "Min word count",
        "Title Abstract",
        "Majority language",
        "Paragraph count",
        "Frequency",
        "Unigram log probability",
        "Local dedup",
    ]
    data_sources = [
        ("Wikipedia", [100, 90, 80, 70, 60, 50, 40, 30, 20]),
        ("Freelaw", [100, 90, 80, 70, 60, 50, 40, 20, 20]),
        ("DM Maths", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("USPTO", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("PG19", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("Hackernews", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("Ubuntu IRC", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("Europarl", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("StackExchange", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("Arxiv", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("S2ORC", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("S2ORC Abstract", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("PubMed Central", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("PubMed Central Abstract", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
        ("PhilPapers", [100, 90, 80, 70, 60, 40, 40, 30, 20]),
    ]
    # One funnel trace per source; each bar is labelled with its value and its
    # share of the total.
    for name, x_values in data_sources:
        fig.add_trace(
            go.Funnel(
                name=name,
                orientation="h",
                y=filter_names,
                x=x_values,
                textinfo="value+percent total",
                textposition="inside",
            )
        )
    fig.update_layout(height=500, plot_bgcolor="rgba(0,0,0,0)")
    return fig

def curated(request):
    # Partial updates: the sample viewer sends a `target` id plus per-target
    # `data_source_<target>` / `doc_id_<target>` query params, and only the
    # matching viewer block is re-rendered.
    params = dict(request.query_params)
    if target := params.get("target"):
        if data_source := params.get(f"data_source_{target}"):
            return get_data(
                data_source, params.get(f"doc_id_{target}", 3), target
            )
        if doc_id := params.get(f"doc_id_{target}"):
            return get_data(
                params.get(f"data_source_{target}"), doc_id, target
            )

    # Full page render: overview table of data acquisition methods per curated source.
    data_preparation_steps = pd.DataFrame(
        {
            "Method": [
                "HTTP/FTP dumps",
                "Web crawling",
                "Archive snapshot",
                "Generated",
                "Curated",
            ],
            "Description": [
                "Acquiring data from HTTP/FTP dumps",
                "Crawling websites to extract data",
                "Working with archive dumps",
                "Generating synthetic data",
                "High quality curated data",
            ],
            "Source": [
                "Freelaw | Wikipedia | PhilPapers | Arxiv | S2ORC | Pubmed",
                "USPTO | Hackernews | Ubuntu IRC",
                "StackExchange",
                "DM Maths",
                "PG19 | Europarl",
            ],
        }
    )
    table_html = data_preparation_steps.to_html(index=False, border=0)
    table_div = Div(NotStr(table_html), style="margin: 40px;")
| text = P("""This initial stage serves as the foundation for the entire | |
| process. Here, we focus on acquiring and extracting the raw data, which can | |
| come from various sources such as crawling websites, using HTTP/FTP dumps, | |
| or working with archive dumps. For instance, to download and prepare a | |
| dataset, we can specific downloaders based on the data source. Each dataset | |
| might have its own downloader script which can be updated in real time to | |
| handle changes in the data source. Here is a general outline of the data | |
| preparation process: It's worth noting that some pipelines might require | |
| invoking additional functions or scripts to handle specific data sources or | |
| formats. These helper scripts can be located within specific directories | |
| or modules dedicated to the dataset.""") | |
| data_preparation_div = Div( | |
| H3("Data Preparation"), | |
| text, | |
| table_div, | |
| Div( | |
| get_data(target=gen_random_id()), | |
| style="border: 1px solid #ccc; padding: 20px;", | |
| ), | |
| ) | |
| text = P("""Data preprocessing is a crucial step in the data science | |
| pipeline. It involves cleaning and transforming raw data into a format that | |
| is suitable for analysis. This process includes handling missing values, | |
| normalizing data, encoding categorical variables, and more.""") | |
| preprocessing_steps = pd.DataFrame( | |
| { | |
| "Step": [ | |
| "Language Filter", | |
| "Min Word Count", | |
| "Title Abstract", | |
| "Majority Language", | |
| "Paragraph Count", | |
| "Frequency", | |
| "Unigram Log Probability", | |
| ], | |
| "Description": [ | |
| "Filtering data based on language", | |
| "Setting a minimum word count threshold", | |
| "Extracting information from the title and abstract", | |
| "Identifying the majority language in the dataset", | |
| "Counting the number of paragraphs in each document", | |
| "Calculating the frequency of each word in the dataset", | |
| "Calculating the log probability of each unigram", | |
| ], | |
| "Need": [ | |
| "To remove documents in unwanted languages", | |
| "To filter out documents with very few words", | |
| "To extract relevant information for analysis", | |
| "To understand the distribution of languages in the dataset", | |
| "To analyze the structure and length of documents", | |
| "To identify important words in the dataset", | |
| "To measure the significance of individual words", | |
| ], | |
| "Pros": [ | |
| "Improves data quality by removing irrelevant documents", | |
| "Filters out low-quality or incomplete documents", | |
| "Provides additional information for analysis", | |
| "Enables language-specific analysis and insights", | |
| "Helps understand the complexity and content of documents", | |
| "Identifies important terms and topics in the dataset", | |
| "Quantifies the importance of individual words", | |
| ], | |
| "Cons": [ | |
| "May exclude documents in less common languages", | |
| "May remove documents with valuable information", | |
| "May introduce bias in the analysis", | |
| "May not accurately represent the language distribution", | |
| "May not capture the complexity of document structure", | |
| "May be sensitive to noise and outliers", | |
| "May not capture the semantic meaning of words", | |
| ], | |
| } | |
| ) | |
| table_html = preprocessing_steps.to_html(index=False, border=0) | |
| table_div = Div(NotStr(table_html), style="margin: 40px;") | |
| data_preprocessing_div = Div(H3("Data Preprocessing"), text, table_div) | |

    return Div(
        Section(
            H2("Curated Sources"),
            plotly2fasthtml(get_chart_28168342()),
            data_preparation_div,
            data_preprocessing_div,
            id="inner-text",
        )
    )
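

# --- Illustrative wiring, not part of the original module --------------------
# A minimal sketch of how the `curated` view above could be mounted in a
# FastHTML app. The route path ("/curated") and the bare fast_app() setup are
# assumptions for illustration, not necessarily how the Space serves this page.
from fasthtml.common import fast_app, serve

app, rt = fast_app()


@rt("/curated")
def get(request):
    # Delegate to the view defined above; partial updates arrive as query
    # params (target, data_source_<target>, doc_id_<target>) and return only
    # the affected fragment.
    return curated(request)


if __name__ == "__main__":
    serve()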