Upload compile_complete_training_data.py with huggingface_hub
Browse files
compile_complete_training_data.py
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Script to compile the complete BSG CyLLama training data
|
| 4 |
+
Combines the cluster-aligned data with the remaining records from the full dataset
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import pandas as pd
|
| 8 |
+
import numpy as np
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
|
| 11 |
+
def compile_complete_training_data(
        cluster_aligned_file='bsg_training_data_cluster_aligned.tsv',
        full_file='bsg_training_data_full.tsv',
        pubmed_file='pubmed_clustered_data_sciner.tsv',
        output_file='bsg_training_data_complete_aligned.tsv'):
    """
    Compile the complete training data by combining cluster-aligned data
    with remaining records from the full dataset.

    Rows of ``full_file`` whose ``SourceRow`` is not already present in
    ``cluster_aligned_file`` are given placeholder clustering columns and
    appended. When the PubMed file matches the full dataset row-for-row and
    carries cluster columns, an additional "enhanced" TSV is also written.

    Parameters
    ----------
    cluster_aligned_file : str
        TSV of records already aligned to clusters (defines the output schema).
    full_file : str
        TSV with the full set of training records.
    pubmed_file : str
        TSV with optional PubMed clustering information.
    output_file : str
        Destination TSV for the combined dataset.

    Returns
    -------
    pandas.DataFrame
        The combined dataset, or the cluster-aligned data unchanged when no
        records were missing (in that case nothing is written).
    """
    print("Loading training data files...")

    # Load the three input datasets (tab-separated).
    cluster_aligned_df = pd.read_csv(cluster_aligned_file, sep='\t')
    full_df = pd.read_csv(full_file, sep='\t')
    pubmed_clustered_df = pd.read_csv(pubmed_file, sep='\t')

    print(f"Cluster aligned data: {len(cluster_aligned_df)} records")
    print(f"Full data: {len(full_df)} records")
    print(f"PubMed clustered data: {len(pubmed_clustered_df)} records")

    # The aligned file's column order is the target schema for the output.
    cluster_columns = cluster_aligned_df.columns.tolist()
    base_columns = ['OriginalIndex', 'SourceRow', 'AbstractSummary', 'ShortSummary',
                    'Title', 'OriginalKeywords', 'OriginalText']

    print(f"Cluster aligned columns: {cluster_columns}")
    print(f"Base columns: {base_columns}")

    # SourceRow is the join key: rows already present in the aligned data.
    aligned_source_rows = set(cluster_aligned_df['SourceRow'].values)
    print(f"Already aligned source rows: {len(aligned_source_rows)}")

    # Records in the full dataset that the aligned data does not cover yet.
    missing_records = full_df[~full_df['SourceRow'].isin(aligned_source_rows)].copy()
    print(f"Missing records to be added: {len(missing_records)}")

    if len(missing_records) > 0:
        # Synthesize placeholder clustering columns so the missing rows
        # conform to the aligned schema. These are stand-ins, not real
        # cluster assignments.
        missing_records['cluster_num_x'] = 'cluster_' + missing_records['SourceRow'].astype(str)
        missing_records['Index'] = missing_records['SourceRow']
        missing_records['ConcatenatedAbstracts'] = missing_records['AbstractSummary']
        missing_records['TopKeywords'] = missing_records['OriginalKeywords']
        missing_records['cluster_num_y'] = missing_records['SourceRow']

        # reindex (rather than missing_records[cluster_columns]) tolerates
        # aligned-only columns not synthesized above: they become NaN
        # instead of raising a KeyError.
        missing_records = missing_records.reindex(columns=cluster_columns)

        # Append the newly-conformed rows to the aligned data.
        complete_training_data = pd.concat([cluster_aligned_df, missing_records], ignore_index=True)

        print(f"Complete training data: {len(complete_training_data)} records")

        complete_training_data.to_csv(output_file, sep='\t', index=False)
        print(f"Complete training data saved to: {output_file}")

        # Optionally enrich with real clustering info from the PubMed file;
        # only attempted when it matches the full dataset row-for-row.
        if len(pubmed_clustered_df) == len(full_df):
            print("PubMed clustered data has same length as full data - checking for additional clustering info...")

            pubmed_columns = pubmed_clustered_df.columns.tolist()
            print(f"PubMed columns: {pubmed_columns}")

            if 'cluster_num' in pubmed_columns or any('cluster' in col.lower() for col in pubmed_columns):
                print("Found clustering information in PubMed data, will create enhanced version...")

                enhanced_data = complete_training_data.copy()

                # Merge PubMed cluster columns in by SourceRow; left join so
                # every training record survives even without a match.
                if 'SourceRow' in pubmed_columns:
                    pubmed_subset = pubmed_clustered_df[['SourceRow'] + [col for col in pubmed_columns if 'cluster' in col.lower()]]
                    enhanced_data = enhanced_data.merge(pubmed_subset, on='SourceRow', how='left', suffixes=('', '_pubmed'))

                enhanced_output_file = 'bsg_training_data_complete_enhanced.tsv'
                enhanced_data.to_csv(enhanced_output_file, sep='\t', index=False)
                print(f"Enhanced training data saved to: {enhanced_output_file}")

        return complete_training_data
    else:
        # Nothing to add: return the aligned data as-is (no file written).
        print("No missing records found - cluster aligned data is already complete!")
        return cluster_aligned_df
|
| 95 |
+
|
| 96 |
+
if __name__ == "__main__":
    import os

    # Original hard-coded working directory. Only switch into it when it
    # actually exists so the script no longer crashes with FileNotFoundError
    # on any other machine; elsewhere the data files are read from the
    # caller's current directory.
    target_dir = '/home/joneill/bsg_cyllama'
    if os.path.isdir(target_dir):
        os.chdir(target_dir)

    complete_data = compile_complete_training_data()
    print(f"Compilation complete! Final dataset has {len(complete_data)} records.")
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
|