#!/usr/bin/env python3
"""
Working Whisper Transcription for Apple M3 Ultra (CPU Version)
Fixes MPS compatibility issues by using CPU
"""
import whisper
import torch
import time
from pathlib import Path
import logging
import os
import sys
import subprocess
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
def check_environment():
    """Check the current environment and suggest fixes"""
    logger.info("Checking environment...")
    # Check Python, PyTorch, and Whisper versions
    logger.info(f"Python version: {sys.version}")
    logger.info(f"PyTorch version: {torch.__version__}")
    logger.info(f"Whisper version: {whisper.__version__}")
    # Check MPS availability
    mps_available = hasattr(torch.backends, 'mps') and torch.backends.mps.is_available()
    logger.info(f"MPS available: {mps_available}")
    # Check CUDA availability
    cuda_available = torch.cuda.is_available()
    logger.info(f"CUDA available: {cuda_available}")
    # Recommend CPU for stability
    logger.info("Using CPU for stable transcription (MPS has compatibility issues)")
    return "cpu"
def transcribe_with_cpu(audio_file, model_size="medium"):
    """Transcribe using CPU for maximum compatibility"""
    logger.info(f"Transcribing: {audio_file}")
    logger.info(f"Using model: {model_size}")
    logger.info("Device: CPU (stable mode)")
    try:
        start_time = time.time()
        # Load model with CPU device
        model = whisper.load_model(
            model_size,
            device="cpu",
            download_root="./whisper_models"
        )
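        # On the first run, load_model() downloads the checkpoint into
        # ./whisper_models (the medium model is roughly 1.5 GB), so the
        # initial invocation takes noticeably longer than later ones.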
        # Transcribe with CPU - explicitly set language to English
        logger.info("Starting transcription...")
        result = model.transcribe(
            audio_file,
            verbose=True,
            fp16=False,  # Disable FP16 for CPU stability
            temperature=0.0,
            best_of=1,
            language="en"  # Explicitly set language to English
        )
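        # Note: best_of only affects sampling with a non-zero temperature;
        # with temperature=0.0 decoding is greedy, so best_of=1 is effectively
        # a no-op kept here for explicitness.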
        return result, time.time() - start_time
    except Exception as e:
        logger.error(f"Transcription failed: {e}")
        return None, 0
def estimate_time(audio_file, model_size):
    """Estimate transcription time based on file size"""
    file_size_mb = os.path.getsize(audio_file) / (1024 * 1024)
    # Rough time estimates per MB for CPU
    time_per_mb = {
        'tiny': 1.5,
        'base': 2.0,
        'small': 3.0,
        'medium': 4.5,
        'large': 6.0
    }
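    # These seconds-per-MB figures are rough ballpark values; actual speed
    # depends on audio bitrate and CPU, so treat the result as an
    # order-of-magnitude hint rather than a guarantee.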
    estimated_seconds = file_size_mb * time_per_mb.get(model_size, 4.5)
    minutes = int(estimated_seconds // 60)
    seconds = int(estimated_seconds % 60)
    return f"{minutes}:{seconds:02d}"
def main():
    audio_file = "yuval_harari_lecture.mp3"
    if not Path(audio_file).exists():
        logger.error(f"Audio file not found: {audio_file}")
        logger.info("Run: python comprehensive_yt_dl.py to download the lecture")
        return
    # Check file size
    file_size_mb = os.path.getsize(audio_file) / (1024 * 1024)
    logger.info(f"File size: {file_size_mb:.1f} MB")
    # Estimate time
    estimated_time = estimate_time(audio_file, "medium")
    logger.info(f"Estimated time: ~{estimated_time}")
    # Check environment
    device = check_environment()
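    # check_environment() currently always returns "cpu" and
    # transcribe_with_cpu() hardcodes the CPU device as well, so this value
    # is informational rather than functional.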
logger.info("๐ Starting transcription...")
logger.info("โ ๏ธ This may take a while on CPU - be patient!")
# Transcribe
result, duration = transcribe_with_cpu(audio_file, "medium")
if result:
# Save results
output_file = f"{Path(audio_file).stem}_transcript.txt"
with open(output_file, 'w', encoding='utf-8') as f:
f.write(result['text'])
# Performance metrics
minutes = int(duration // 60)
seconds = int(duration % 60)
logger.info(f"โ
Transcription completed in {minutes}:{seconds:02d}")
logger.info(f"๐ Saved to: {output_file}")
logger.info(f"๐ Word count: {len(result['text'].split()):,}")
# Show preview
preview = result['text'][:500] + "..." if len(result['text']) > 500 else result['text']
logger.info(f"๐ Preview:\n{preview}")
        # Save additional formats
        save_additional_formats(Path(audio_file).stem, result)
    else:
        logger.error("Transcription failed completely")
def save_additional_formats(base_name, result):
    """Save transcript in additional formats"""
    # Save as JSON with timestamps
    json_path = f"{base_name}_timestamps.json"
    try:
        import json
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=2, ensure_ascii=False)
        logger.info(f"Timestamps saved to: {json_path}")
    except Exception as e:
        logger.warning(f"Could not save JSON: {e}")
    # Save as SRT using CLI if available
    try:
        subprocess.run([
            "whisper", f"{base_name}.mp3",
            "--model", "medium",
            "--output_format", "srt",
            "--device", "cpu",
            "--language", "en"  # Also set language for SRT generation
        ], timeout=300)
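        # Note: the whisper CLI re-transcribes the audio from scratch rather
        # than reusing the in-memory result, so this roughly doubles the work;
        # the 300-second timeout may also cut it short for long recordings.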
if Path(f"{base_name}.srt").exists():
logger.info(f"๐ฌ Subtitles saved to: {base_name}.srt")
    except Exception as e:
        logger.warning(f"Could not generate SRT via the whisper CLI: {e}")
if __name__ == "__main__":
    main()