Spaces:

mantisnlp
/

LocationFinder

Runtime error

File size: 1,309 Bytes

b8d16b2

import csv

import spacy
import srsly
import tqdm

# spaCy pipeline used by process_documents; loaded once, when the module is
# imported.
nlp = spacy.load("en_core_web_trf")

# Default input CSV and output JSONL paths passed to process_documents when
# this file is run as a script.
INPUT_FILE = "data/processed/wellcome_grant_descriptions.csv"
OUTPUT_FILE = "data/processed/entities.jsonl"
# Entity labels kept by process_documents; entities with any other label are
# dropped.
ENTITY_SUBSET = ["GPE", "LOC"]


def process_documents(input_file: str, output_file: str) -> None:
    """Extract selected entities from a CSV of texts and write them as JSONL.

    Reads the first column of every data row in ``input_file`` (the first
    row is treated as a header and skipped), runs each text through the
    module-level ``nlp`` pipeline and keeps the entities whose label is in
    ``ENTITY_SUBSET``. Every text with at least one such entity is written
    to ``output_file`` as one JSON object per line::

        {"text": ..., "ents": [{"text", "label", "start", "end"}, ...]}

    ``start`` and ``end`` are the entity's character offsets in the text.

    Args:
        input_file: Path to the input CSV file.
        output_file: Path of the JSONL file to write.
    """
    print(f"Reading data from {input_file}...")

    # newline="" lets the csv module handle line breaks itself, so quoted
    # fields that span several lines are parsed correctly. The explicit
    # encoding keeps decoding independent of the platform default.
    # NOTE(review): assumes the CSV is UTF-8 encoded - confirm.
    with open(input_file, "r", encoding="utf-8", newline="") as f:
        reader = csv.reader(f)
        # Skip the header row; the default avoids StopIteration when the
        # file is empty.
        next(reader, None)
        # csv.reader yields an empty list for a blank line; skip those
        # rather than failing on row[0].
        data = [row[0] for row in reader if row]

    print(f"Processing {len(data)} documents...")

    entities = []

    for text in tqdm.tqdm(data):
        doc = nlp(text)

        # Keep only the entities whose label is in ENTITY_SUBSET.
        ents = [
            {
                "text": ent.text,
                "label": ent.label_,
                "start": ent.start_char,
                "end": ent.end_char,
            }
            for ent in doc.ents
            if ent.label_ in ENTITY_SUBSET
        ]

        # Documents without any matching entity are left out of the output.
        if ents:
            entities.append(
                {
                    "text": doc.text,
                    "ents": ents,
                }
            )

    print(f"Writing {len(entities)} documents to {output_file}...")

    srsly.write_jsonl(output_file, entities)


# Script entry point: process the default input file into the default output
# file when this module is run directly (not when it is imported).
if __name__ == "__main__":
    process_documents(INPUT_FILE, OUTPUT_FILE)