"""Validation helpers for Croissant (JSON-LD) dataset metadata files.

The module offers three checks, each returning a ``(passed, message, ...)``
tuple instead of raising, plus a markdown report generator:

* ``validate_json``      - the file parses as JSON.
* ``validate_croissant`` - the JSON loads as an ``mlc.Dataset``.
* ``validate_records``   - a first record can be produced for every record
  set within ``WAIT_TIME`` seconds.

Importing this module has side effects: it copies ``HF_TOKEN`` into the
``CROISSANT_BASIC_AUTH_*`` environment variables and prints two debug lines.
"""

import mlcroissant._src.operation_graph.operations.download as dl_mod
import requests
import os

# Make sure the HF token is loaded
HF_TOKEN = os.environ.get("HF_TOKEN")

# Set the environment variables Croissant expects
# NOTE(review): when HF_TOKEN is missing the password becomes "" rather than
# being left unset; presumably Croissant then still attempts basic auth with
# an empty password - confirm this is intended for public datasets.
os.environ["CROISSANT_BASIC_AUTH_USERNAME"] = "hf_user"
os.environ["CROISSANT_BASIC_AUTH_PASSWORD"] = HF_TOKEN or ""

print("[DEBUG] HF_TOKEN is", "set" if HF_TOKEN else "missing")
print("[DEBUG] Basic auth env set for Croissant")

# These imports deliberately stay below the environment setup above to keep
# the original statement order of the module.
import mlcroissant as mlc
import func_timeout
import json
import traceback

WAIT_TIME = 10 * 60  # seconds

# Marker printed under each test heading in the report; any status that is
# not listed here is rendered as a failure.
_STATUS_SYMBOLS = {
    "pass": "✓",
    "warning": "?",  # Question mark for warning
}
_FAILURE_SYMBOL = "✗"


def validate_json(file_path):
    """Validate that the file is proper JSON.

    Args:
        file_path: Path of the file to read.

    Returns:
        A ``(passed, message, json_data)`` tuple. ``json_data`` is the parsed
        document on success and ``None`` on any failure.
    """
    try:
        # JSON text is UTF-8 by specification; do not rely on the platform's
        # locale encoding, which differs between operating systems.
        with open(file_path, "r", encoding="utf-8") as f:
            json_data = json.load(f)
    except json.JSONDecodeError as e:
        return False, f"Invalid JSON format: {str(e)}", None
    except Exception as e:
        # Anything else (missing file, permissions, undecodable bytes, ...)
        # is reported to the caller rather than raised.
        return False, f"Error reading file: {str(e)}", None
    return True, "The file is valid JSON.", json_data


def validate_croissant(json_data):
    """Validate that the JSON follows Croissant schema.

    Args:
        json_data: Parsed JSON-LD document handed to ``mlc.Dataset``.

    Returns:
        A ``(passed, message)`` tuple. On failure the message contains the
        exception text followed by the formatted traceback.
    """
    try:
        # Constructing the dataset is the validation step; the instance
        # itself is not needed afterwards.
        mlc.Dataset(jsonld=json_data)
    except mlc.ValidationError as e:
        error_details = traceback.format_exc()
        return False, f"Validation failed: {str(e)}\n\n{error_details}"
    except Exception as e:
        error_details = traceback.format_exc()
        return (
            False,
            f"Unexpected error during validation: {str(e)}\n\n{error_details}",
        )
    return True, "The dataset passes Croissant validation."


def try_generate_record(record_collection):
    """Try to produce the first record of an iterable.

    Args:
        record_collection: Iterable of records; at most one item is consumed.

    Returns:
        The string ``"success"`` when iteration starts without error (an
        empty iterable also counts as success), otherwise the caught
        ``Exception`` instance. The exception is returned, not raised, so the
        caller can re-raise it outside of the timeout wrapper.
    """
    try:
        # Only the first record matters; the default avoids StopIteration
        # on an empty collection.
        next(iter(record_collection), None)
    except Exception as e:
        return e
    return "success"


def validate_records(json_data):
    """Validate that records can be generated within the time limit.

    Every record set of the dataset is asked for its first record, with a
    limit of ``WAIT_TIME`` seconds per record set. Checking stops at the
    first record set that times out or fails.

    Args:
        json_data: Parsed JSON-LD document handed to ``mlc.Dataset``.

    Returns:
        A ``(passed, message, status)`` tuple where ``status`` is ``"pass"``,
        ``"warning"`` (a record set timed out or failed to generate) or
        ``"error"`` (unexpected failure outside record generation, e.g. while
        loading the dataset).
    """
    try:
        dataset = mlc.Dataset(jsonld=json_data)
        record_sets = dataset.metadata.record_sets

        if not record_sets:
            return True, "No record sets found to validate.", "pass"

        results = []
        for record_set in record_sets:
            try:
                # Bind the uuid as a default argument so the callable run by
                # func_timeout does not depend on the loop variable.
                result = func_timeout.func_timeout(
                    WAIT_TIME,
                    lambda uuid=record_set.uuid: try_generate_record(
                        dataset.records(record_set=uuid)
                    ),
                )
                if isinstance(result, Exception):
                    raise result  # re-raise actual error outside timeout
                results.append(
                    f"Record set '{record_set.uuid}' passed validation."
                )
            except func_timeout.exceptions.FunctionTimedOut:
                # Report the limit from WAIT_TIME so the message cannot drift
                # from the configured value.
                error_message = (
                    f"Record set '{record_set.uuid}' generation took too long "
                    f"(>{WAIT_TIME / 60:g} minutes)."
                )
                return False, error_message, "warning"
            except Exception as e:
                error_details = traceback.format_exc()
                error_message = (
                    f"Record set '{record_set.uuid}' failed due to generation error:\n\n"
                    f"```text\n{str(e)}\n\n{error_details}```"
                )
                return False, error_message, "warning"

        return True, "\n".join(results), "pass"
    except Exception as e:
        error_details = traceback.format_exc()
        error_message = (
            f"Unexpected error during records validation: {str(e)}\n\n{error_details}"
        )
        return False, error_message, "error"


def generate_validation_report(filename, json_data, results):
    """Generate a detailed validation report in markdown format.

    Args:
        filename: Name of the validated file, shown in the report header.
        json_data: Parsed document, dumped as indented JSON at the end of
            the report.
        results: Iterable of ``(test_name, passed, message)`` or
            ``(test_name, passed, message, status)`` tuples. For 3-tuples the
            status is derived from ``passed`` (``"pass"`` or ``"error"``).

    Returns:
        The report as a single newline-joined string.
    """
    report = [
        "# CROISSANT VALIDATION REPORT",
        "=" * 80,
        "## VALIDATION RESULTS",
        "-" * 80,
        f"Starting validation for file: {filename}",
    ]

    # Add validation results
    for result in results:
        if len(result) == 4:
            test_name, passed, message, status = result
        else:
            test_name, passed, message = result
            status = "pass" if passed else "error"

        report.append(f"### {test_name}")
        report.append(_STATUS_SYMBOLS.get(status, _FAILURE_SYMBOL))
        report.append(message.strip())  # Remove any trailing newlines

    # Add JSON-LD reference
    report.extend(
        [
            "## JSON-LD REFERENCE",
            "=" * 80,
            "```json",
            json.dumps(json_data, indent=2),
            "```",
        ]
    )

    return "\n".join(report)