mimir-perplexity / perplexity.py
versae's picture
Models and code
import argparse
import json
import re
import os
from functools import cache
from pathlib import Path
from typing import Iterator, List, NoReturn, Optional, Tuple, Union
import kenlm
import msgspec
import sentencepiece
from numpy.random import default_rng
from scipy.stats import norm
from tqdm import tqdm
from normalization import normalize_text
RNG = default_rng()
# Language codes that have an entry in both language-keyed CONFIG families.
LANGS = ("no", "nn", "nob", "nno", "da", "sv", "is", "en")
BASEPATH = Path(os.environ.get("PERPLEXITY_BASEPATH", "/nfsmounts/datastore/mimir/perplexity"))

# Keyword arguments passed to get_perplexity(), grouped by model family.
# "harmful" and "wikipedia" are keyed by language code; the remaining
# families hold a single, language-independent entry.
CONFIG = {
    # All four Norwegian codes share the same "harmful" model file (no.bin).
    "harmful": {
        "no": {"model": BASEPATH / "kenlm" / "harmful" / "no.bin", "normalize": True},
        "nn": {"model": BASEPATH / "kenlm" / "harmful" / "no.bin", "normalize": True},
        "nob": {"model": BASEPATH / "kenlm" / "harmful" / "no.bin", "normalize": True},
        "nno": {"model": BASEPATH / "kenlm" / "harmful" / "no.bin", "normalize": True},
        "da": {"model": BASEPATH / "kenlm" / "harmful" / "da.bin", "normalize": True},
        "sv": {"model": BASEPATH / "kenlm" / "harmful" / "sv.bin", "normalize": True},
        "is": {"model": BASEPATH / "kenlm" / "harmful" / "is.bin", "normalize": True},
        "en": {"model": BASEPATH / "kenlm" / "harmful" / "en.bin", "normalize": True},
    },
    # "nob" reuses the "no" files and "nno" reuses the "nn" files.
    "wikipedia": {
        "no": {
            "model": BASEPATH / "kenlm" / "wikipedia" / "no.arpa.bin",
            "tokenizer": BASEPATH / "spm" / "wikipedia" / "no.sp.model",
            "normalize": True,
        },
        "nn": {
            "model": BASEPATH / "kenlm" / "wikipedia" / "nn.arpa.bin",
            "tokenizer": BASEPATH / "spm" / "wikipedia" / "nn.sp.model",
            "normalize": True,
        },
        "nob": {
            "model": BASEPATH / "kenlm" / "wikipedia" / "no.arpa.bin",
            "tokenizer": BASEPATH / "spm" / "wikipedia" / "no.sp.model",
            "normalize": True,
        },
        "nno": {
            "model": BASEPATH / "kenlm" / "wikipedia" / "nn.arpa.bin",
            "tokenizer": BASEPATH / "spm" / "wikipedia" / "nn.sp.model",
            "normalize": True,
        },
        "da": {
            "model": BASEPATH / "kenlm" / "wikipedia" / "da.arpa.bin",
            "tokenizer": BASEPATH / "spm" / "wikipedia" / "da.sp.model",
            "normalize": True,
        },
        "en": {
            "model": BASEPATH / "kenlm" / "wikipedia" / "en.arpa.bin",
            "tokenizer": BASEPATH / "spm" / "wikipedia" / "en.sp.model",
            "normalize": True,
        },
        "is": {
            "model": BASEPATH / "kenlm" / "wikipedia" / "is.arpa.bin",
            "tokenizer": BASEPATH / "spm" / "wikipedia" / "is.sp.model",
            "normalize": True,
        },
        "sv": {
            "model": BASEPATH / "kenlm" / "wikipedia" / "sv.arpa.bin",
            "tokenizer": BASEPATH / "spm" / "wikipedia" / "sv.sp.model",
            "normalize": True,
        },
    },
    "books": {
        "model": BASEPATH / "kenlm" / "books.norm.sp.arpa.bin",
        "tokenizer": BASEPATH / "spm" / "books.norm.sp.model",
        "normalize": True,
    },
    "newspapers": {
        "model": BASEPATH / "kenlm" / "newspapers.norm.sp.arpa.bin",
        "tokenizer": BASEPATH / "spm" / "newspapers.norm.sp.model",
        "normalize": True,
    },
    "maalfrid": {
        "model": BASEPATH / "kenlm" / "maalfrid.norm.sp.arpa.bin",
        "tokenizer": BASEPATH / "spm" / "maalfrid.norm.sp.model",
        "normalize": True,
    },
}
# Not used anymore, speed is almost same as naive algorithm
# class PerplexityDoc(msgspec.Struct):
# id: str
# doc_type: str
# publish_year: int
# lang_fasttext: str
# lang_fasttext_conf: Union[str, float]
# text: str
# perplexity: float | None = -1.0
# perplexity_model: str | None = None
# harmful_pp: float | None = None
# # wikipedia_pp: float | None = None
# # books_pp: float | None = None
# # newspapers_pp: float | None = None
# # maalfrid_pp: float | None = None
def should_keep(
    perp: float, dist_norm: float, dist_mean: float, dist_std: float
) -> bool:
    """Decide if a doc is to be retained based on its perplexity value.

    The document is kept when a uniform random draw from the module-level
    ``RNG`` falls below the normal density at ``perp`` (mean ``dist_mean``,
    standard deviation ``dist_std``) divided by ``dist_norm``.

    Note (carried over from the original docstring): set() must have been
    called previously.

    Args:
        perp: Perplexity of the document.
        dist_norm: Divisor applied to the density value.
        dist_mean: Mean of the normal distribution.
        dist_std: Standard deviation of the normal distribution.

    Returns:
        True if the document should be kept, False otherwise.
    """
    p = norm.pdf(perp, loc=dist_mean, scale=dist_std) / dist_norm
    # bool() so callers receive a plain Python bool instead of a NumPy scalar.
    return bool(RNG.uniform() < p)
def fix_language(language: str, default: str = "en") -> str:
    """Map a language code onto one that has models available.

    Args:
        language: Language code to check against ``LANGS``.
        default: Code returned when ``language`` is not in ``LANGS``.
            NOTE(review): the fallback value is not visible in the source
            this was restored from; "en" is an assumption, confirm.

    Returns:
        ``language`` if it is listed in ``LANGS``, otherwise ``default``.
    """
    if language not in LANGS:
        return default
    return language
def pp(log_score, length):
    """Turn a summed log10 score over ``length`` tokens into a perplexity."""
    mean_log_score = log_score / length
    return 10.0 ** -mean_log_score
@cache
def load_kenlm(model: str) -> kenlm.Model:
    """Load a KenLM model from ``model``, memoized per path.

    get_perplexity() calls this once per document, so the result is cached
    to avoid reloading the same model file over and over.

    Args:
        model: Path to the KenLM binary model (str or Path; it is passed
            through ``str()`` before being handed to KenLM).

    Returns:
        The loaded ``kenlm.Model``.
    """
    lm_config = kenlm.Config()
    # NOTE(review): 2 presumably corresponds to kenlm's POPULATE_OR_READ
    # load method; confirm against the installed kenlm version.
    lm_config.load_method = 2
    return kenlm.Model(str(model), lm_config)
@cache
def load_sentencepiece(model: str) -> sentencepiece.SentencePieceProcessor:
    """Load a SentencePiece tokenizer from ``model``, memoized per path.

    Args:
        model: Path to the SentencePiece model file (str or Path; it is
            passed through ``str()`` before loading).

    Returns:
        A ``SentencePieceProcessor`` with the model loaded.
    """
    sp = sentencepiece.SentencePieceProcessor()
    # Without this call the processor has no model and cannot encode text.
    sp.load(str(model))
    return sp
def get_perplexity(
    document: str,
    model: Union[str, Path],
    tokenizer: Optional[Union[str, Path]] = None,
    normalize: bool = False
) -> float:
    """Compute the perplexity of ``document`` with a KenLM model loaded from disk.

    The document is split on newlines and empty lines are skipped. Each
    remaining line is optionally normalized with ``normalize_text``,
    optionally split into SentencePiece pieces (joined by spaces), and scored
    with the KenLM model. Scores and lengths are summed over all lines and
    converted with ``pp``.

    Args:
        document: Text to score.
        model: Path to the KenLM model, loaded via ``load_kenlm``.
        tokenizer: Optional path to a SentencePiece model, loaded via
            ``load_sentencepiece``. When falsy, lines are scored as they are.
        normalize: Whether to run ``normalize_text`` on every line.

    Returns:
        The perplexity rounded to one decimal, or 0.0 when the document has
        no non-empty lines (or the loaded model is falsy).
    """
    if not document:
        # Nothing to score; skip loading the model altogether.
        return 0.0
    lm = load_kenlm(model)
    if not lm:
        return 0.0
    sp = load_sentencepiece(tokenizer) if tokenizer else None
    doc_log_score, doc_length = 0, 0
    for line in document.split("\n"):
        if not line:
            continue
        if normalize:
            line = normalize_text(line)
        if sp is not None:
            line = " ".join(sp.encode_as_pieces(line))
        doc_log_score += lm.score(line)
        # +1 per line, presumably for the end-of-sentence token KenLM scores.
        doc_length += len(line.split()) + 1
    if doc_length == 0:
        # Only empty lines were present; avoid dividing by zero in pp().
        return 0.0
    return round(pp(doc_log_score, doc_length), 1)
def get_perplexity_local(
    document: str,
    model: kenlm.Model,
    tokenizer: sentencepiece.SentencePieceProcessor=None,
    normalize: bool=False
) -> float:
    """Compute the perplexity of ``document`` with already-loaded objects.

    Every newline-separated line (empty ones included) is optionally passed
    through ``normalize_text``, optionally encoded into SentencePiece pieces
    joined by spaces, and scored by ``model``. The summed score and summed
    length (whitespace tokens + 1 per line) are converted with ``pp`` and
    rounded to one decimal. Returns 0.0 when ``model`` is falsy.
    """
    rows = document.split("\n")
    if not (rows and model):
        return 0.0
    total_score = 0
    total_length = 0
    for row in rows:
        text = normalize_text(row) if normalize else row
        if tokenizer is not None:
            text = " ".join(tokenizer.encode_as_pieces(text))
        total_score += model.score(text)
        total_length += len(text.split()) + 1
    perplexity = pp(total_score, total_length)
    return round(perplexity, 1)
def harmful_perplexity(document: str, language: str) -> float:
    """Return the perplexity of ``document`` under the "harmful" model.

    ``language`` is passed through ``fix_language`` before being used to
    select the entry from ``CONFIG["harmful"]``.
    """
    # The helper is named fix_language; the previous call to fix_lang
    # referenced a name that does not exist.
    params = CONFIG["harmful"][fix_language(language)]
    return get_perplexity(document=document, **params)
def wikipedia_perplexity(document: str, language: str) -> float:
    """Return the perplexity of ``document`` under the "wikipedia" model.

    ``language`` is passed through ``fix_language`` before being used to
    select the entry from ``CONFIG["wikipedia"]``.
    """
    # The helper is named fix_language; the previous call to fix_lang
    # referenced a name that does not exist.
    params = CONFIG["wikipedia"][fix_language(language)]
    return get_perplexity(document=document, **params)
def books_perplexity(document: str) -> float:
    """Return the perplexity of ``document`` using the ``CONFIG["books"]`` settings."""
    return get_perplexity(document=document, **CONFIG["books"])
def newspapers_perplexity(document: str) -> float:
    """Return the perplexity of ``document`` using the ``CONFIG["newspapers"]`` settings."""
    return get_perplexity(document=document, **CONFIG["newspapers"])
def maalfrid_perplexity(document: str) -> float:
    """Return the perplexity of ``document`` using the ``CONFIG["maalfrid"]`` settings."""
    return get_perplexity(document=document, **CONFIG["maalfrid"])
def source_perplexities(
    document: str,
    language: str,
    model: str | None = None,
    include_harmful: bool=True) -> dict[str, float]:
    """Calculates all models perplexities at once.

    The document is normalized once up front and every model is then run
    with ``normalize=False``.

    Args:
        document: Text to score.
        language: Language code; passed through ``fix_language`` first.
        model: Name of a single ``CONFIG`` family to run. When None, the
            wikipedia, books, newspapers and maalfrid models are all run.
        include_harmful: Whether to add the "harmful" model score as well.

    Returns:
        A dict mapping ``"<model>_pp"`` keys to perplexity values.
    """
    # Since normalization is applied to all, we normalize first and set it to False
    normalized_document = "\n".join(normalize_text(line) for line in document.split("\n"))
    language = fix_language(language)

    def score(params: dict) -> float:
        # Build a fresh dict instead of params.update(...): updating in place
        # would flip "normalize" to False inside the shared CONFIG and
        # silently disable normalization for every later caller.
        return get_perplexity(
            document=normalized_document, **{**params, "normalize": False}
        )

    if model is not None:
        params = CONFIG[model]
        if model == "wikipedia":
            # Only this family is keyed by language in the single-model path.
            params = params[language]
        perplexities = {
            f"{model}_pp": score(params),
        }
    else:
        perplexities = {
            "wikipedia_pp": score(CONFIG["wikipedia"][language]),
            "books_pp": score(CONFIG["books"]),
            "newspapers_pp": score(CONFIG["newspapers"]),
            "maalfrid_pp": score(CONFIG["maalfrid"]),
        }
    if include_harmful:
        perplexities["harmful_pp"] = score(CONFIG["harmful"][language])
    return perplexities
# Known doc-type prefixes -> (model family, needs a language variant).
# Prefixes missing from this table fall back to ("wikipedia", True).
_DOC_TYPE_MODELS = {
    "book": ("books", False),
    "books": ("books", False),
    "culturax": ("wikipedia", True),
    "slimpajama": ("wikipedia", True),
    "wikipedia": ("wikipedia", True),
    "digimanus": ("wikipedia", True),
    "pg19": ("wikipedia", True),
    "hplt": ("wikipedia", True),
    "starcoder": ("wikipedia", True),
    "newspaper": ("newspapers", False),
    "newspapers": ("newspapers", False),
    "evalueringsrapport": ("maalfrid", False),
    "lovdata": ("maalfrid", False),
    "maalfrid": ("maalfrid", False),
    "parlamint": ("maalfrid", False),
}


def get_model_for(doc_type: str) -> Tuple[str, bool]:
    """Returns model type and if it needs a language variant.

    Only the part of ``doc_type`` before the first underscore is considered;
    if that part contains a hyphen, everything up to and including the first
    hyphen is dropped as well.

    Args:
        doc_type: Document type label.

    Returns:
        A ``(model_name, needs_language)`` tuple. Unknown document types map
        to ``("wikipedia", True)``.
    """
    prefix = doc_type.split("_", 1)[0]
    if "-" in prefix:
        prefix = prefix.split("-", 1)[-1]
    return _DOC_TYPE_MODELS.get(prefix, ("wikipedia", True))
def preload_models_tokenizers() -> dict:
    """Load every KenLM model and SentencePiece tokenizer up front.

    Returns:
        A dict mapping a model key to a ``(model, tokenizer)`` tuple. Keys
        are "books", "newspapers", "maalfrid", plus "harmful-<lang>" and
        "wikipedia-<lang>" for every language in the corresponding
        ``CONFIG`` family. The tokenizer is None for "harmful-<lang>" keys.
    """
    print("Preloading models...", end=" ")
    # NOTE(review): these three model filenames (*.norm.arpa.bin) differ from
    # the ones listed in CONFIG (*.norm.sp.arpa.bin); confirm which is right.
    models = {
        "books": (
            load_kenlm(BASEPATH / "kenlm" / "books.norm.arpa.bin"),
            load_sentencepiece(BASEPATH / "spm" / "books.norm.sp.model"),
        ),
        "newspapers": (
            load_kenlm(BASEPATH / "kenlm" / "newspapers.norm.arpa.bin"),
            load_sentencepiece(BASEPATH / "spm" / "newspapers.norm.sp.model"),
        ),
        "maalfrid": (
            load_kenlm(BASEPATH / "kenlm" / "maalfrid.norm.arpa.bin"),
            load_sentencepiece(BASEPATH / "spm" / "maalfrid.norm.sp.model"),
        ),
    }
    for lang, params in CONFIG["harmful"].items():
        # The "harmful" entries define no tokenizer.
        models[f"harmful-{lang}"] = load_kenlm(params["model"]), None
    for lang, params in CONFIG["wikipedia"].items():
        model = load_kenlm(params["model"])
        tokenizer = load_sentencepiece(params["tokenizer"])
        models[f"wikipedia-{lang}"] = model, tokenizer
    return models
# Not used anymore, speed is almost same as naive algorithm
# def process_file_binary(input_file, output_path, cutoff=None, overwrite_output=True):
# input_file = Path(input_file)
# output_file = Path(output_path) / input_file.name
# if not overwrite_output and output_file.exists():
# print(f"Skipping {output_file} as it already exists")
# return
# models = preload_models_tokenizers()
# encoder = msgspec.json.Encoder()
# decoder = msgspec.json.Decoder(PerplexityDoc)
# buffer = bytearray(64)
# with (open(output_file, 'wb') as f,
# open(input_file, 'r', encoding='utf-8') as lines):
# for line_count, line in tqdm(enumerate(lines), desc=f"Processing {input_file.name}"):
# doc = decoder.decode(line)
# if "code" not in doc.doc_type:
# # Perplexity
# model_type, needs_lang = get_model_for(doc.doc_type)
# if needs_lang:
# model_key = f"{model_type}-{fix_language(doc.lang_fasttext)}"
# else:
# model_key = model_type
# model, tokenizer = models[model_key]
# text = "\n".join(normalize_text(line) for line in doc.text.split("\n"))
# score = get_perplexity_local(
# text, model=model, tokenizer=tokenizer, normalize=False
# )
# doc.perplexity = score
# doc.perplexity_model = model_type
# # Harmfulness
# harmful_key = f"harmful-{fix_language(doc.lang_fasttext)}"
# harmful_model, harmful_tokenizer = models[harmful_key]
# harmful_pp = get_perplexity_local(
# text, model=harmful_model, tokenizer=harmful_tokenizer, normalize=False
# )
# doc.harmful_pp = harmful_pp
# encoder.encode_into(doc, buffer)
# buffer.extend(b"\n")
# f.write(buffer)
# if cutoff is not None and line_count >= cutoff:
# break
def process_file(input_file, output_path, cutoff=None, model=None, overwrite_output=True):
    """
    Processes a JSON Lines file by computing perplexities for each document
    and writing the enriched documents to a new file.

    This function performs several steps:
    1. Determines the output file path and checks for its existence if overwrite is not desired.
    2. Reads the input file line by line, processing each line as a separate JSON document.
    3. For each document, takes its language from the "lang_fasttext" field. If the document
       type is "starcoder", it defaults the language to English.
    4. Depending on the model parameter, computes perplexities for the document text either using a
       single document type model or a specified general model.
    5. Updates the document with computed perplexities and writes it to the output file in JSON format.
    6. Optionally stops processing after a specified number of lines determined by the cutoff parameter.

    Parameters:
    - input_file (str or Path): Path to the input file to be processed.
    - output_path (str or Path): Directory path where the output file will be saved. The output file
      will have the same name as the input file.
    - cutoff (int, optional): If provided, processing will stop after this number of lines. Defaults to None.
    - model (str, optional): Specifies the model to use for computing perplexities. If 'single', uses a
      model specific to the document's type. Otherwise, uses the model specified; None runs
      every model. Defaults to None.
    - overwrite_output (bool): If True, will overwrite the output file if it already exists. If False,
      will skip processing if the output file exists. Defaults to True.

    Returns:
    None. Writes processed documents to an output file in the specified output path.
    """
    input_file = Path(input_file)
    output_file = Path(output_path) / input_file.name
    if not overwrite_output and output_file.exists():
        print(f"Skipping {output_file} as it already exists")
        # Return before the file is opened for writing, which would truncate it.
        return
    with (open(output_file, 'w', encoding='utf-8') as f,
          open(input_file, 'r', encoding='utf-8') as lines):
        for line_count, line in tqdm(enumerate(lines), desc=f"Processing {input_file.name}"):
            # line_count is 0-based, so checking before processing handles
            # exactly `cutoff` lines (checking afterwards handled one extra).
            if cutoff is not None and line_count >= cutoff:
                break
            if not line.strip():
                # Blank lines are not valid JSON documents; skip them.
                continue
            doc = json.loads(line)
            language = doc["lang_fasttext"]
            if doc["doc_type"] == "starcoder":
                language = "en"
            if model == "single":
                doc_type_model, _ = get_model_for(doc["doc_type"])
                perplexities = source_perplexities(doc["text"], language, model=doc_type_model)
                # Expose the per-type score under a uniform key and record
                # which model produced it.
                perplexities["perplexity"] = perplexities.pop(f"{doc_type_model}_pp")
                perplexities["perplexity_model"] = doc_type_model
            else:
                perplexities = source_perplexities(doc["text"], language, model=model)
            doc.update(perplexities)
            f.write(json.dumps(doc) + "\n")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Calculate perplexity values for a given JSON Lines file and output the result to a new file.')
parser.add_argument('-i', '--input_file', type=str,
help='Input file path')
parser.add_argument('-o', '--output_path', type=str,
help='Output path to write enriched file')
parser.add_argument('-c', '--cutoff', required=False, type=int,
help='Max number of lines to process')
parser.add_argument('-m', '--model', required=False, type=str,
help='Run "single" model per doc type, "all" the models, '
'or a specific model to choose from '
'"books", "wikipedia", "newspapers" or "maalfrid". '
'Defaults to "single"')
action=argparse.BooleanOptionalAction, default=True,
help="Whether to overwrite the output file if exists.")
args = parser.parse_args()
if args.model == "single":
args.input_file, args.output_path, args.cutoff,
model="single", overwrite_output=args.overwrite_output,
elif args.model in ("books", "wikipedia", "newspapers", "maalfrid"):
args.input_file, args.output_path, args.cutoff,
model=args.model, overwrite_output=args.overwrite_output,
args.input_file, args.output_path, args.cutoff,