"""Compute KenLM perplexities for the documents of a JSON Lines file.

Each input line is a JSON document; the script scores its ``text`` with one
or more KenLM models (optionally after SentencePiece tokenization) and writes
the document back out enriched with the resulting perplexity values.
"""
import argparse
import json
import re
import os
from functools import cache
from itertools import islice
from pathlib import Path
from typing import Iterator, List, NoReturn, Optional, Tuple, Union

import kenlm
import msgspec
import sentencepiece
from numpy.random import default_rng
from scipy.stats import norm
from tqdm import tqdm

from normalization import normalize_text


RNG = default_rng()
LANGS = ("no", "nn", "nob", "nno", "da", "sv", "is", "en")
DEFAULT_LANG = "no"
BASEPATH = Path(os.environ.get("PERPLEXITY_BASEPATH", "/nfsmounts/datastore/mimir/perplexity"))

# Model registry. "harmful" and "wikipedia" are keyed by language; the other
# entries hold the keyword arguments of ``get_perplexity`` directly.
CONFIG = {
    "harmful": {
        "no": {"model": BASEPATH / "kenlm" / "harmful" / "no.bin", "normalize": True},
        "nn": {"model": BASEPATH / "kenlm" / "harmful" / "no.bin", "normalize": True},
        "nob": {"model": BASEPATH / "kenlm" / "harmful" / "no.bin", "normalize": True},
        "nno": {"model": BASEPATH / "kenlm" / "harmful" / "no.bin", "normalize": True},
        "da": {"model": BASEPATH / "kenlm" / "harmful" / "da.bin", "normalize": True},
        "sv": {"model": BASEPATH / "kenlm" / "harmful" / "sv.bin", "normalize": True},
        "is": {"model": BASEPATH / "kenlm" / "harmful" / "is.bin", "normalize": True},
        "en": {"model": BASEPATH / "kenlm" / "harmful" / "en.bin", "normalize": True},
    },
    "wikipedia": {
        "no": {
            "model": BASEPATH / "kenlm" / "wikipedia" / "no.arpa.bin",
            "tokenizer": BASEPATH / "spm" / "wikipedia" / "no.sp.model",
            "normalize": True
        },
        "nn": {
            "model": BASEPATH / "kenlm" / "wikipedia" / "nn.arpa.bin",
            "tokenizer": BASEPATH / "spm" / "wikipedia" / "nn.sp.model",
            "normalize": True
        },
        "nob": {
            "model": BASEPATH / "kenlm" / "wikipedia" / "no.arpa.bin",
            "tokenizer": BASEPATH / "spm" / "wikipedia" / "no.sp.model",
            "normalize": True
        },
        "nno": {
            "model": BASEPATH / "kenlm" / "wikipedia" / "nn.arpa.bin",
            "tokenizer": BASEPATH / "spm" / "wikipedia" / "nn.sp.model",
            "normalize": True
        },
        "da": {
            "model": BASEPATH / "kenlm" / "wikipedia" / "da.arpa.bin",
            "tokenizer": BASEPATH / "spm" / "wikipedia" / "da.sp.model",
            "normalize": True
        },
        "en": {
            "model": BASEPATH / "kenlm" / "wikipedia" / "en.arpa.bin",
            "tokenizer": BASEPATH / "spm" / "wikipedia" / "en.sp.model",
            "normalize": True
        },
        "is": {
            "model": BASEPATH / "kenlm" / "wikipedia" / "is.arpa.bin",
            "tokenizer": BASEPATH / "spm" / "wikipedia" / "is.sp.model",
            "normalize": True
        },
        "sv": {
            "model": BASEPATH / "kenlm" / "wikipedia" / "sv.arpa.bin",
            "tokenizer": BASEPATH / "spm" / "wikipedia" / "sv.sp.model",
            "normalize": True
        },
    },
    "books": {
        "model": BASEPATH / "kenlm" / "books.norm.sp.arpa.bin",
        "tokenizer": BASEPATH / "spm" / "books.norm.sp.model",
        "normalize": True
    },
    "newspapers": {
        "model": BASEPATH / "kenlm" / "newspapers.norm.sp.arpa.bin",
        "tokenizer": BASEPATH / "spm" / "newspapers.norm.sp.model",
        "normalize": True
    },
    "maalfrid": {
        "model": BASEPATH / "kenlm" / "maalfrid.norm.sp.arpa.bin",
        "tokenizer": BASEPATH / "spm" / "maalfrid.norm.sp.model",
        "normalize": True
    }
}


# Not used anymore, speed is almost same as naive algorithm
# class PerplexityDoc(msgspec.Struct):
#     id: str
#     doc_type: str
#     publish_year: int
#     lang_fasttext: str
#     lang_fasttext_conf: Union[str, float]
#     text: str
#     perplexity: float | None = -1.0
#     perplexity_model: str | None = None
#     harmful_pp: float | None = None
#     # wikipedia_pp: float | None = None
#     # books_pp: float | None = None
#     # newspapers_pp: float | None = None
#     # maalfrid_pp: float | None = None


def should_keep(
    perp: float, dist_norm: float, dist_mean: float, dist_std: float
) -> bool:
    """Randomly decide whether a document is retained, given its perplexity.

    The keep probability is the normal density of ``perp`` under
    ``N(dist_mean, dist_std)`` divided by ``dist_norm``; a uniform draw from
    the module-level ``RNG`` is compared against it.

    Args:
        perp: Perplexity of the document.
        dist_norm: Normalizing constant the density is divided by.
        dist_mean: Mean of the reference normal distribution.
        dist_std: Standard deviation of the reference normal distribution.

    Returns:
        True when the uniform draw falls below the keep probability.
    """
    keep_probability = norm.pdf(perp, loc=dist_mean, scale=dist_std) / dist_norm
    return RNG.uniform() < keep_probability


def fix_language(language: str) -> str:
    """Return ``language`` if it is in ``LANGS``, otherwise ``DEFAULT_LANG``."""
    return language if language in LANGS else DEFAULT_LANG


def pp(log_score: float, length: int) -> float:
    """Convert a summed log10 score over ``length`` tokens into a perplexity."""
    return 10.0 ** (-log_score / length)


@cache
def load_kenlm(model: Union[str, Path]) -> kenlm.Model:
    """Load (and memoize per path) the KenLM model stored at ``model``."""
    lm_config = kenlm.Config()
    # NOTE(review): 2 presumably selects kenlm's POPULATE_OR_READ load
    # method -- confirm against the kenlm Python bindings.
    lm_config.load_method = 2
    return kenlm.Model(str(model), lm_config)


@cache
def load_sentencepiece(model: Union[str, Path]) -> sentencepiece.SentencePieceProcessor:
    """Load (and memoize per path) the SentencePiece model stored at ``model``."""
    sp = sentencepiece.SentencePieceProcessor()
    sp.load(str(model))
    return sp


def get_perplexity(
    document: str,
    model: Union[str, Path],
    tokenizer: Union[str, Path, None] = None,
    normalize: bool = False,
) -> float:
    """Compute the perplexity of ``document`` under a KenLM model.

    The document is scored line by line; empty lines are skipped. Each line
    contributes its whitespace token count plus one to the document length.

    Args:
        document: Text to score; lines are separated by ``"\\n"``.
        model: Path of the KenLM model, loaded through ``load_kenlm``.
        tokenizer: Optional path of a SentencePiece model; when given, each
            line is replaced by its space-joined pieces before scoring.
        normalize: When True, ``normalize_text`` is applied to each line.

    Returns:
        The perplexity rounded to one decimal, or 0.0 when there is nothing
        to score (no model, or no non-empty line).
    """
    lines = document.split("\n")
    lm = load_kenlm(model)
    if not lines or not lm:
        return 0.0
    sp = load_sentencepiece(tokenizer) if tokenizer else None

    doc_log_score, doc_length = 0, 0
    for line in lines:
        if not line:
            continue
        if normalize:
            line = normalize_text(line)
        if sp is not None:
            line = " ".join(sp.encode_as_pieces(line))
        doc_log_score += lm.score(line)
        doc_length += len(line.split()) + 1

    # A document made only of empty lines has no tokens to score; without
    # this guard pp() would divide by zero.
    if doc_length == 0:
        return 0.0
    return round(pp(doc_log_score, doc_length), 1)


def get_perplexity_local(
    document: str,
    model: kenlm.Model,
    tokenizer: Optional[sentencepiece.SentencePieceProcessor] = None,
    normalize: bool = False,
) -> float:
    """Compute the perplexity of ``document`` with already-loaded objects.

    Unlike ``get_perplexity`` this takes the loaded model and tokenizer
    instead of their paths, and it does not skip empty lines: every line is
    scored and counts for at least one token.

    Args:
        document: Text to score; lines are separated by ``"\\n"``.
        model: Loaded KenLM model.
        tokenizer: Optional loaded SentencePiece processor.
        normalize: When True, ``normalize_text`` is applied to each line.

    Returns:
        The perplexity rounded to one decimal, or 0.0 when ``model`` is falsy.
    """
    lines = document.split("\n")
    if not lines or not model:
        return 0.0

    doc_log_score, doc_length = 0, 0
    for line in lines:
        if normalize:
            line = normalize_text(line)
        if tokenizer is not None:
            line = " ".join(tokenizer.encode_as_pieces(line))
        doc_log_score += model.score(line)
        doc_length += len(line.split()) + 1
    return round(pp(doc_log_score, doc_length), 1)


def harmful_perplexity(document: str, language: str) -> float:
    """Perplexity of ``document`` under the "harmful" model for ``language``."""
    params = CONFIG["harmful"][fix_language(language)]
    return get_perplexity(document=document, **params)


def wikipedia_perplexity(document: str, language: str) -> float:
    """Perplexity of ``document`` under the Wikipedia model for ``language``."""
    params = CONFIG["wikipedia"][fix_language(language)]
    return get_perplexity(document=document, **params)


def books_perplexity(document: str) -> float:
    """Perplexity of ``document`` under the books model."""
    return get_perplexity(document=document, **CONFIG["books"])


def newspapers_perplexity(document: str) -> float:
    """Perplexity of ``document`` under the newspapers model."""
    return get_perplexity(document=document, **CONFIG["newspapers"])


def maalfrid_perplexity(document: str) -> float:
    """Perplexity of ``document`` under the maalfrid model."""
    return get_perplexity(document=document, **CONFIG["maalfrid"])


def source_perplexities(
    document: str,
    language: str,
    model: str | None = None,
    include_harmful: bool = True,
) -> dict[str, float]:
    """Compute the perplexities of ``document`` under one or all models.

    The document is normalized once up front, so every model is then scored
    with ``normalize=False``.

    Args:
        document: Raw text to score.
        language: Language code; mapped through ``fix_language``.
        model: Key of ``CONFIG`` to use. When None, the wikipedia, books,
            newspapers and maalfrid models are all scored.
        include_harmful: When True, also add the "harmful" model score.

    Returns:
        A dict mapping ``"<model>_pp"`` to the perplexity under that model.

    Raises:
        KeyError: If ``model`` is not a key of ``CONFIG``.
    """
    normalized_document = "\n".join(
        normalize_text(line) for line in document.split("\n")
    )
    language = fix_language(language)

    def score(params: dict) -> float:
        # Override "normalize" on a copy: updating ``params`` in place would
        # rewrite the shared CONFIG entry for every later caller.
        return get_perplexity(
            document=normalized_document, **{**params, "normalize": False}
        )

    if model is not None:
        params = CONFIG[model]
        # Language-keyed entries ("wikipedia", "harmful") hold one parameter
        # dict per language rather than the parameters themselves.
        if "model" not in params:
            params = params[language]
        perplexities = {f"{model}_pp": score(params)}
    else:
        perplexities = {
            "wikipedia_pp": score(CONFIG["wikipedia"][language]),
            "books_pp": score(CONFIG["books"]),
            "newspapers_pp": score(CONFIG["newspapers"]),
            "maalfrid_pp": score(CONFIG["maalfrid"]),
        }

    if include_harmful:
        perplexities["harmful_pp"] = score(CONFIG["harmful"][language])
    return perplexities


def get_model_for(doc_type: str) -> Tuple[str, bool]:
    """Returns model type and if it needs a language variant"""
    # Keep what precedes the first "_", then what follows the first "-".
    doc_type = doc_type.split("_", 1)[0]
    if "-" in doc_type:
        doc_type = doc_type.split("-", 1)[-1]

    if doc_type in ("book", "books"):
        return "books", False
    if doc_type in ("newspaper", "newspapers"):
        return "newspapers", False
    if doc_type in ("evalueringsrapport", "lovdata", "maalfrid", "parlamint"):
        return "maalfrid", False
    # culturax, slimpajama, wikipedia, digimanus, pg19, hplt, starcoder and
    # any unknown type are all scored with the per-language Wikipedia model.
    return "wikipedia", True


def preload_models_tokenizers() -> dict[str, tuple]:
    """Eagerly load every model and tokenizer.

    Returns:
        A dict mapping a model key ("books", "newspapers", "maalfrid",
        "harmful-<lang>", "wikipedia-<lang>") to a ``(model, tokenizer)``
        pair; the tokenizer is None for the harmful models.
    """
    print("Preloading models...", end=" ")
    # NOTE(review): these kenlm file names ("*.norm.arpa.bin") differ from the
    # ones in CONFIG ("*.norm.sp.arpa.bin") -- confirm which is intended.
    models = {
        "books": (
            load_kenlm(BASEPATH / "kenlm" / "books.norm.arpa.bin"),
            load_sentencepiece(BASEPATH / "spm" / "books.norm.sp.model")
        ),
        "newspapers": (
            load_kenlm(BASEPATH / "kenlm" / "newspapers.norm.arpa.bin"),
            load_sentencepiece(BASEPATH / "spm" / "newspapers.norm.sp.model")
        ),
        "maalfrid": (
            load_kenlm(BASEPATH / "kenlm" / "maalfrid.norm.arpa.bin"),
            load_sentencepiece(BASEPATH / "spm" / "maalfrid.norm.sp.model")
        ),
    }
    for lang, params in CONFIG["harmful"].items():
        models[f"harmful-{lang}"] = load_kenlm(params["model"]), None
    for lang, params in CONFIG["wikipedia"].items():
        model = load_kenlm(params["model"])
        tokenizer = load_sentencepiece(params["tokenizer"])
        models[f"wikipedia-{lang}"] = model, tokenizer
    print("Done")
    return models


# Not used anymore, speed is almost same as naive algorithm
# def process_file_binary(input_file, output_path, cutoff=None, overwrite_output=True):
#     input_file = Path(input_file)
#     output_file = Path(output_path) / input_file.name
#     if not overwrite_output and output_file.exists():
#         print(f"Skipping {output_file} as it already exists")
#         return
#     models = preload_models_tokenizers()
#     encoder = msgspec.json.Encoder()
#     decoder = msgspec.json.Decoder(PerplexityDoc)
#     buffer = bytearray(64)
#     with (open(output_file, 'wb') as f,
#           open(input_file, 'r', encoding='utf-8') as lines):
#         for line_count, line in tqdm(enumerate(lines), desc=f"Processing {input_file.name}"):
#             doc = decoder.decode(line)
#             if "code" not in doc.doc_type:
#                 # Perplexity
#                 model_type, needs_lang = get_model_for(doc.doc_type)
#                 if needs_lang:
#                     model_key = f"{model_type}-{fix_language(doc.lang_fasttext)}"
#                 else:
#                     model_key = model_type
#                 model, tokenizer = models[model_key]
#                 text = "\n".join(normalize_text(line) for line in doc.text.split("\n"))
#                 score = get_perplexity_local(
#                     text, model=model, tokenizer=tokenizer, normalize=False
#                 )
#                 doc.perplexity = score
#                 doc.perplexity_model = model_type
#                 # Harmfulness
#                 harmful_key = f"harmful-{fix_language(doc.lang_fasttext)}"
#                 harmful_model, harmful_tokenizer = models[harmful_key]
#                 harmful_pp = get_perplexity_local(
#                     text, model=harmful_model, tokenizer=harmful_tokenizer, normalize=False
#                 )
#                 doc.harmful_pp = harmful_pp
#             encoder.encode_into(doc, buffer)
#             buffer.extend(b"\n")
#             f.write(buffer)
#             if cutoff is not None and line_count >= cutoff:
#                 break


def process_file(input_file, output_path, cutoff=None, model=None, overwrite_output=True):
    """
    Reads a JSON Lines file, computes perplexities for every document and
    writes the enriched documents to a file of the same name in output_path.

    Steps:
    1. Determines the output file path and returns early when it already
       exists and overwriting is not desired.
    2. Reads the input file line by line, parsing each non-blank line as a
       separate JSON document.
    3. Takes the document language from its "lang_fasttext" field. If the
       document type is "starcoder", the language is forced to English.
    4. If model is 'single', scores the text with the model selected by
       get_model_for for the document type and stores the result under
       "perplexity" and "perplexity_model". Otherwise scores it with the
       given model, or with all models when model is None.
    5. Updates the document with the computed perplexities and writes it to
       the output file as one JSON line.

    Parameters:
    - input_file (str or Path): Path to the input file to be processed.
    - output_path (str or Path): Directory where the output file is saved.
      The output file has the same name as the input file.
    - cutoff (int, optional): Maximum number of documents to process.
      Defaults to None (no limit).
    - model (str, optional): 'single', a key of CONFIG, or None for all
      models. Defaults to None.
    - overwrite_output (bool): If True, overwrites an existing output file.
      If False, skips processing when the output file exists. Defaults to True.

    Returns:
    None. Writes processed documents to the output file.
    """
    input_file = Path(input_file)
    output_file = Path(output_path) / input_file.name
    if not overwrite_output and output_file.exists():
        print(f"Skipping {output_file} as it already exists")
        return

    # A negative cutoff is treated as zero; islice rejects negative stops.
    limit = None if cutoff is None else max(cutoff, 0)

    # Open the input first so that a missing input file does not leave a
    # freshly truncated, empty output file behind.
    with (open(input_file, 'r', encoding='utf-8') as lines,
          open(output_file, 'w', encoding='utf-8') as f):
        # Blank lines are not JSON documents; drop them before counting.
        records = (line for line in lines if line.strip())
        for line in tqdm(islice(records, limit), desc=f"Processing {input_file.name}"):
            doc = json.loads(line)
            language = doc["lang_fasttext"]
            if doc["doc_type"] == "starcoder":
                language = "en"
            if model == "single":
                doc_type_model, _ = get_model_for(doc["doc_type"])
                perplexities = source_perplexities(doc["text"], language, model=doc_type_model)
                perplexities["perplexity"] = perplexities.pop(f"{doc_type_model}_pp")
                perplexities["perplexity_model"] = doc_type_model
            else:
                perplexities = source_perplexities(doc["text"], language, model=model)
            doc.update(perplexities)
            f.write(json.dumps(doc) + "\n")


def main() -> None:
    """Parse the command line and process the requested file."""
    parser = argparse.ArgumentParser(
        description='Calculate perplexity values for a given JSON Lines file and output the result to a new file.'
    )
    parser.add_argument('-i', '--input_file', type=str, required=True,
                        help='Input file path')
    parser.add_argument('-o', '--output_path', type=str, required=True,
                        help='Output path to write enriched file')
    parser.add_argument('-c', '--cutoff', required=False, type=int,
                        help='Max number of lines to process')
    parser.add_argument('-m', '--model', required=False, type=str,
                        help='Run "single" model per doc type, "all" the models, '
                             'or a specific model to choose from '
                             '"books", "wikipedia", "newspapers" or "maalfrid". '
                             'Defaults to "all"')
    parser.add_argument('--overwrite_output', action=argparse.BooleanOptionalAction, default=True,
                        help="Whether to overwrite the output file if exists.")
    args = parser.parse_args()

    # Any other value (including "all" or no value) runs all the models.
    selectable = ("single", "books", "wikipedia", "newspapers", "maalfrid")
    model = args.model if args.model in selectable else None
    process_file(
        args.input_file,
        args.output_path,
        args.cutoff,
        model=model,
        overwrite_output=args.overwrite_output,
    )


if __name__ == "__main__":
    main()