croissant-checker / validation.py
JoaquinVanschoren's picture
timeout warning
8ed167c
import mlcroissant._src.operation_graph.operations.download as dl_mod
import requests
import os
# Read the Hugging Face token from the environment (None when it is not set).
HF_TOKEN = os.environ.get("HF_TOKEN")
# Export the basic-auth variables Croissant expects; an absent token becomes
# an empty password.
os.environ.update(
    {
        "CROISSANT_BASIC_AUTH_USERNAME": "hf_user",
        "CROISSANT_BASIC_AUTH_PASSWORD": HF_TOKEN if HF_TOKEN else "",
    }
)
print("[DEBUG] HF_TOKEN is", "set" if HF_TOKEN else "missing")
print("[DEBUG] Basic auth env set for Croissant")
import mlcroissant as mlc
import func_timeout
import json
import traceback
WAIT_TIME = 10 * 60 # seconds
def validate_json(file_path):
    """Validate that the file is proper JSON.

    The file is decoded explicitly as UTF-8 instead of with the platform's
    default encoding, so the outcome does not depend on the host locale.

    Args:
        file_path: Path of the file to parse.

    Returns:
        A ``(passed, message, json_data)`` tuple. ``json_data`` is the parsed
        document on success and ``None`` on any failure.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            json_data = json.load(f)
    except json.JSONDecodeError as e:
        return False, f"Invalid JSON format: {e!s}", None
    except Exception as e:
        # Deliberately broad: any other failure (missing file, permission
        # problem, undecodable bytes, ...) is reported to the caller as a
        # result tuple instead of being raised.
        return False, f"Error reading file: {e!s}", None
    return True, "The file is valid JSON.", json_data
def validate_croissant(json_data):
    """Check whether mlcroissant accepts the parsed JSON-LD document.

    An ``mlc.Dataset`` is constructed from ``json_data`` only to see whether
    construction raises; the resulting object is discarded.

    Returns:
        A ``(passed, message)`` tuple. On failure the message contains the
        exception text followed by the formatted traceback.
    """
    try:
        mlc.Dataset(jsonld=json_data)
    except mlc.ValidationError as err:
        trace = traceback.format_exc()
        return False, f"Validation failed: {err!s}\n\n{trace}"
    except Exception as err:
        trace = traceback.format_exc()
        return False, f"Unexpected error during validation: {err!s}\n\n{trace}"
    else:
        return True, "The dataset passes Croissant validation."
def try_generate_record(record_collection):
    """Try to pull the first record out of ``record_collection``.

    Returns:
        The string ``"success"`` when the first record (if there is one) is
        produced without an error; otherwise the exception instance that was
        raised. The exception is returned rather than propagated so the
        caller can inspect it.
    """
    try:
        # Only the first record is requested; an empty collection also
        # counts as success.
        next(iter(record_collection), None)
    except Exception as err:
        return err
    return "success"
def validate_records(json_data):
    """Validate that records can be generated within the time limit.

    For every record set declared in the dataset metadata, the first record
    is requested under a ``WAIT_TIME``-second timeout. Validation stops at
    the first record set that times out or fails.

    Args:
        json_data: Parsed Croissant JSON-LD document.

    Returns:
        A ``(passed, message, status)`` tuple where ``status`` is one of
        ``"pass"``, ``"warning"`` (a record set timed out or failed to
        generate) or ``"error"`` (an unexpected failure outside the
        per-record-set checks, e.g. while building the dataset).
    """
    try:
        dataset = mlc.Dataset(jsonld=json_data)
        record_sets = dataset.metadata.record_sets
        if not record_sets:
            return True, "No record sets found to validate.", "pass"

        results = []
        for record_set in record_sets:
            try:
                result = func_timeout.func_timeout(
                    WAIT_TIME,
                    # Bind the current record set as a default argument so
                    # the callable never observes a later loop value.
                    lambda record_set=record_set: try_generate_record(
                        dataset.records(record_set=record_set.uuid)
                    ),
                )
                # try_generate_record hands back the exception instead of
                # raising it; re-raise here, outside the timeout wrapper.
                if isinstance(result, Exception):
                    raise result
                results.append(f"Record set '{record_set.uuid}' passed validation.")
            except func_timeout.exceptions.FunctionTimedOut:
                # Derive the limit from WAIT_TIME so the message cannot
                # drift from the timeout that is actually enforced.
                error_message = (
                    f"Record set '{record_set.uuid}' generation took too long "
                    f"(>{WAIT_TIME / 60:g} minutes)."
                )
                return False, error_message, "warning"
            except Exception as e:
                error_details = traceback.format_exc()
                error_message = (
                    f"Record set '{record_set.uuid}' failed due to generation error:\n\n"
                    f"```text\n{str(e)}\n\n{error_details}```"
                )
                return False, error_message, "warning"

        return True, "\n".join(results), "pass"
    except Exception as e:
        error_details = traceback.format_exc()
        error_message = (
            f"Unexpected error during records validation: {str(e)}\n\n{error_details}"
        )
        return False, error_message, "error"
def generate_validation_report(filename, json_data, results):
    """Render the validation outcome as a markdown report string.

    Args:
        filename: Name shown in the "Starting validation" line.
        json_data: Parsed JSON-LD, dumped with 2-space indentation in the
            reference section.
        results: Iterable of ``(test_name, passed, message)`` or
            ``(test_name, passed, message, status)`` tuples. When no status
            is given, ``passed`` maps to ``"pass"`` or ``"error"``.

    Returns:
        The whole report as a single newline-joined string.
    """
    heavy_rule = "=" * 80
    lines = [
        "# CROISSANT VALIDATION REPORT",
        heavy_rule,
        "## VALIDATION RESULTS",
        "-" * 80,
        f"Starting validation for file: {filename}",
    ]

    # One heading, one status marker and the message per test.
    for entry in results:
        if len(entry) == 4:
            title, _, text, state = entry
        else:
            title, ok, text = entry
            state = "pass" if ok else "error"

        if state == "pass":
            marker = "✓"
        elif state == "warning":
            marker = "?"  # question mark stands for a warning
        else:
            marker = "✗"

        # strip() drops surrounding whitespace, including trailing newlines.
        lines += [f"### {title}", marker, text.strip()]

    # Append the JSON-LD document itself for reference.
    lines += [
        "## JSON-LD REFERENCE",
        heavy_rule,
        "```json",
        json.dumps(json_data, indent=2),
        "```",
    ]
    return "\n".join(lines)