croissant-checker

Running

App Files Files Community

croissant-checker / validation.py

JoaquinVanschoren

timeout warning

8ed167c 19 days ago

raw

history blame contribute delete

4.72 kB

	# NOTE(review): dl_mod and requests are not referenced in the visible part of
	# this file -- confirm whether these imports are still needed.
	import mlcroissant._src.operation_graph.operations.download as dl_mod
	import requests
	import os

	# Read the Hugging Face token from the environment (None when it is not set)
	HF_TOKEN = os.environ.get("HF_TOKEN")

	# Set the environment variables Croissant expects.
	# The token is passed as the basic-auth password; an empty string is used
	# when no token is available so the variable is always defined.
	os.environ["CROISSANT_BASIC_AUTH_USERNAME"] = "hf_user"
	os.environ["CROISSANT_BASIC_AUTH_PASSWORD"] = HF_TOKEN or ""

	# Only report whether the token is present; the token value itself is never printed.
	print("[DEBUG] HF_TOKEN is", "set" if HF_TOKEN else "missing")
	print("[DEBUG] Basic auth env set for Croissant")

	import mlcroissant as mlc
	import func_timeout
	import json
	import traceback

	# Timeout handed to func_timeout for generating the first record of a record set.
	WAIT_TIME = 10 * 60 # seconds

	def validate_json(file_path):
	    """Validate that the file is proper JSON.

	    The file is always decoded as UTF-8 (the standard JSON interchange
	    encoding) rather than the platform's locale encoding, so the result does
	    not depend on the machine the check runs on.

	    Args:
	        file_path: Path of the file to read.

	    Returns:
	        A ``(ok, message, json_data)`` tuple. ``ok`` is True and ``json_data``
	        holds the parsed document when parsing succeeds; otherwise ``ok`` is
	        False, ``message`` describes the problem and ``json_data`` is None.
	    """
	    try:
	        with open(file_path, "r", encoding="utf-8") as f:
	            json_data = json.load(f)
	    except json.JSONDecodeError as e:
	        # The file was readable but its content is not well-formed JSON.
	        return False, f"Invalid JSON format: {e}", None
	    except Exception as e:
	        # Anything else: missing file, permission problem, undecodable bytes, ...
	        return False, f"Error reading file: {e}", None
	    return True, "The file is valid JSON.", json_data

	def validate_croissant(json_data):
	    """Check whether parsed JSON-LD can be loaded as a Croissant dataset.

	    Args:
	        json_data: The already-parsed JSON-LD document.

	    Returns:
	        A ``(ok, message)`` tuple. On failure the message contains the
	        exception text followed by the formatted traceback.
	    """
	    try:
	        # Constructing the dataset is the check itself; the object is not needed.
	        mlc.Dataset(jsonld=json_data)
	    except mlc.ValidationError as exc:
	        trace = traceback.format_exc()
	        return False, f"Validation failed: {exc}\n\n{trace}"
	    except Exception as exc:
	        trace = traceback.format_exc()
	        return False, f"Unexpected error during validation: {exc}\n\n{trace}"
	    return True, "The dataset passes Croissant validation."

	def try_generate_record(record_collection):
	    """Attempt to produce the first record of *record_collection*.

	    Returns the string ``"success"`` when the first record could be fetched
	    (or the collection is empty). Any ``Exception`` raised while iterating is
	    returned -- not raised -- so the caller can re-raise it itself.
	    """
	    try:
	        # Only the first record is pulled; the None default covers an empty collection.
	        next(iter(record_collection), None)
	    except Exception as err:
	        return err
	    return "success"

	def validate_records(json_data):
	    """Validate that records can be generated within the time limit.

	    For every record set of the dataset, the first record is generated under
	    a ``WAIT_TIME``-second timeout. Checking stops at the first record set
	    that times out or fails.

	    Args:
	        json_data: The already-parsed JSON-LD document.

	    Returns:
	        A ``(ok, message, status)`` tuple where ``status`` is ``"pass"``,
	        ``"warning"`` (a record set timed out or raised while generating) or
	        ``"error"`` (the dataset itself could not be loaded or inspected).
	    """
	    try:
	        dataset = mlc.Dataset(jsonld=json_data)
	        record_sets = dataset.metadata.record_sets

	        if not record_sets:
	            return True, "No record sets found to validate.", "pass"

	        results = []

	        for record_set in record_sets:
	            try:
	                result = func_timeout.func_timeout(
	                    WAIT_TIME,
	                    lambda: try_generate_record(dataset.records(record_set=record_set.uuid)),
	                )

	                # try_generate_record hands errors back as values; re-raise
	                # the actual error here, outside the timeout wrapper.
	                if isinstance(result, Exception):
	                    raise result

	                results.append(f"Record set '{record_set.uuid}' passed validation.")

	            except func_timeout.exceptions.FunctionTimedOut:
	                # Derive the reported limit from WAIT_TIME so the message
	                # cannot drift from the timeout that is actually enforced.
	                error_message = (
	                    f"Record set '{record_set.uuid}' generation took too long "
	                    f"(>{WAIT_TIME / 60:g} minutes)."
	                )
	                return False, error_message, "warning"

	            except Exception as e:
	                error_details = traceback.format_exc()
	                error_message = (
	                    f"Record set '{record_set.uuid}' failed due to generation error:\n\n"
	                    f"```text\n{str(e)}\n\n{error_details}```"
	                )
	                return False, error_message, "warning"

	        return True, "\n".join(results), "pass"
	    except Exception as e:
	        error_details = traceback.format_exc()
	        error_message = f"Unexpected error during records validation: {str(e)}\n\n{error_details}"
	        return False, error_message, "error"

	def generate_validation_report(filename, json_data, results):
	    """Build the markdown validation report as a single string.

	    Args:
	        filename: Name of the validated file, shown in the report header.
	        json_data: Parsed JSON-LD, dumped in the reference section.
	        results: Iterable of ``(test_name, passed, message)`` or
	            ``(test_name, passed, message, status)`` entries. When no status
	            is given it is ``"pass"`` if ``passed`` is truthy, else ``"error"``.

	    Returns:
	        The report lines joined with newlines.
	    """
	    lines = [
	        "# CROISSANT VALIDATION REPORT",
	        "=" * 80,
	        "## VALIDATION RESULTS",
	        "-" * 80,
	        f"Starting validation for file: {filename}",
	    ]

	    # One heading, one status mark and the trimmed message per test.
	    for entry in results:
	        if len(entry) == 4:
	            test_name, passed, message, status = entry
	        else:
	            test_name, passed, message = entry
	            status = "pass" if passed else "error"

	        # "?" marks a warning; any status other than pass/warning is a failure.
	        mark = "✓" if status == "pass" else ("?" if status == "warning" else "✗")
	        lines += [f"### {test_name}", mark, message.strip()]

	    # Append the validated JSON-LD for reference.
	    lines += [
	        "## JSON-LD REFERENCE",
	        "=" * 80,
	        "```json",
	        json.dumps(json_data, indent=2),
	        "```",
	    ]

	    return "\n".join(lines)