File size: 1,462 Bytes
48c079f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import argparse
import json
from pathlib import Path

import librosa
import torch
from tqdm import tqdm


def main():
    """Score every audio file under a directory with UTMOS and save the results.

    Command-line arguments:
        --audio_dir: Directory that is searched recursively for audio files
            (required).
        --ext: Audio file extension, with or without a leading dot
            (default ``wav``).

    Each matching file is loaded with librosa at its native sampling rate,
    down-mixed to mono, and scored by the ``utmos22_strong`` predictor loaded
    from ``tarepan/SpeechMOS:v1.2.0`` via ``torch.hub``. The mean score is
    printed and the per-file scores are written to
    ``<audio_dir>/utmos_results.json``.

    Per-file scores are keyed by file stem. Because the search is recursive,
    files in different sub-directories may share a stem; any later file whose
    stem is already taken is keyed by its path relative to ``--audio_dir``
    instead, so no score is silently overwritten.
    """
    parser = argparse.ArgumentParser(description="UTMOS Evaluation")
    parser.add_argument("--audio_dir", type=str, required=True, help="Audio file path.")
    parser.add_argument("--ext", type=str, default="wav", help="Audio extension.")
    args = parser.parse_args()

    audio_dir = Path(args.audio_dir)
    # Accept both "wav" and ".wav"; the glob pattern below adds the dot itself.
    ext = args.ext.lstrip(".")
    # Sort so that processing order (and therefore JSON key order and the
    # collision fallback below) is reproducible across runs and filesystems.
    # Directories that happen to match the pattern are skipped.
    audio_paths = sorted(
        path for path in audio_dir.rglob(f"*.{ext}") if path.is_file()
    )

    device = "cuda" if torch.cuda.is_available() else "cpu"

    predictor = torch.hub.load("tarepan/SpeechMOS:v1.2.0", "utmos22_strong", trust_repo=True)
    # eval() is a no-op if the hub entry point already returns the model in
    # evaluation mode; calling it here makes that explicit.
    predictor = predictor.to(device).eval()

    utmos_results = {}
    utmos_score = 0.0

    # Scoring is inference only, so skip autograd bookkeeping.
    with torch.no_grad():
        for audio_path in tqdm(audio_paths, desc="Processing"):
            # sr=None keeps the file's native sampling rate; it is passed on
            # to the predictor together with the waveform.
            wav, sr = librosa.load(audio_path, sr=None, mono=True)
            # unsqueeze(0) adds a leading dimension of size 1
            # (presumably the batch axis the predictor expects).
            wav_tensor = torch.from_numpy(wav).to(device).unsqueeze(0)
            # Convert to a Python float once; each .item() call forces a
            # device-to-host transfer.
            score = predictor(wav_tensor, sr).item()

            key = audio_path.stem
            if key in utmos_results:
                # Same stem already seen in another sub-directory: fall back
                # to the relative path so both scores are kept.
                key = audio_path.relative_to(audio_dir).as_posix()
            utmos_results[key] = score
            utmos_score += score

    avg_score = utmos_score / len(audio_paths) if audio_paths else 0
    print(f"UTMOS: {avg_score}")

    utmos_result_path = audio_dir / "utmos_results.json"
    with open(utmos_result_path, "w", encoding="utf-8") as f:
        json.dump(utmos_results, f, ensure_ascii=False, indent=4)

    print(f"Results have been saved to {utmos_result_path}")


# Run the evaluation only when this file is executed as a script; importing
# the module does not call main().
if __name__ == "__main__":
    main()