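"""Judge-entity extraction over court case JSON files.

Header and party judge fields are parsed with regex heuristics
(judge_detection), while free-text docket entries are run through a custom
spaCy NER model whose JUDGE entities are collected with surrounding context.
main_run writes the results to parties.csv, counsels.csv, headers.csv, and
entries.csv.
"""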
import argparse
import glob
import json
import re
import warnings
from multiprocessing.dummy import Pool as ThreadPool
from pathlib import Path

import pandas as pd
import spacy
import tqdm
from spacy.tokenizer import Tokenizer
from spacy.util import registry

warnings.simplefilter(action='ignore', category=FutureWarning)
|
|
|
# Paths to the packaged spaCy model and its serialized custom tokenizer.
MODEL_PATH = Path(__file__).resolve().parents[1] / 'model'
TOK_PATH = MODEL_PATH / 'tokenizer'


@registry.callbacks("custom_tok")
def get_custom():
    """Registered callback returning a loader for the custom tokenizer."""
    def load_it(nlp):
        tokenizer = Tokenizer(nlp.vocab)
        tokenizer.from_disk(TOK_PATH)
        return tokenizer
    return load_it


# Register the callback before loading the model (its config presumably
# refers to "custom_tok" by name).
jnlp = spacy.load(MODEL_PATH)
|
|
|
# Strings that indicate a placeholder or non-judge assignment; these are
# dropped outright (comparison is case-insensitive).
droppers = ['ebc', 'executive committee', 'unassigned', 'magistrate judge',
            'magistrate judge unassigned magistrate', 'judge unassigned judge',
            'judge unassigned', 'cvb judge', 'debt-magistrate',
            'mia duty magistrate', 'cvrecovery case crstatistics unassigned',
            'cvrecovery case crstatistics', 'civ', 'mdl']

# Judge-title prefixes, expressed as regex fragments (note the escaped dots).
base_prefixes = ['magistrate judge judge', 'us district judge', 'district judge',
                 r'mag\. judge', 'magistrate judge', 'bankruptcy judge',
                 'honorable judge', 'mag judge', 'dist judge',
                 'magistrate', 'honorable', r'hon\.', 'judge', 'mj', 'justice']
base_prefixes_snr = ['senior ' + b for b in base_prefixes]
base_prefixes_chief = ['chief ' + b for b in base_prefixes]

# Longer (multi-word) prefixes are sorted first so the alternation prefers
# the most specific title.
all_prefixes = base_prefixes + base_prefixes_chief + base_prefixes_snr
all_prefixes = sorted(all_prefixes, key=lambda x: len(x.split()), reverse=True)
|
|
|
# Patterns for trimming header judge strings.
header_cause_pattern = re.compile(r'\s*cause:\s*', flags=re.I)
header_referred_pattern = re.compile(r'\s*referred\s*(to)?\s*', flags=re.I)
header_designated_pattern = re.compile(r'^designated\s*', flags=re.I)
dash_mj_pattern = re.compile(r'-mj$', flags=re.I)
# Alternation over every known judge-title prefix, optionally dot-terminated.
header_prefixes = re.compile(rf'\b({"|".join(all_prefixes)})[\.]*($|(?=\s))', flags=re.I)
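# For orientation: header_prefixes matches the leading title in strings such
# as "District Judge John Smith" or "Chief Magistrate Judge Jane Doe"
# (hypothetical examples; matching is case-insensitive).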
|
|
|
|
|
def judge_detection(judge_str):
    """Split a header judge string into its title pretext and the judge's name.

    Returns a dict of extracted pieces with character spans into the original
    string, or None if the string is empty or a known non-judge placeholder.
    """
    if not judge_str or judge_str.lower() in droppers:
        return None

    # Diagnostic: strings naming multiple judges are printed, not split.
    if ' and ' in judge_str.lower():
        print(judge_str)

    # Trim trailing "cause: ..." annotations.
    cause = header_cause_pattern.search(judge_str)
    if cause:
        judge_str = judge_str[0:cause.start()]

    # Trim trailing "referred to ..." annotations.
    referred = header_referred_pattern.search(judge_str)
    if referred:
        judge_str = judge_str[0:referred.start()]

    # Strip a leading "designated" marker, remembering the offset so reported
    # spans still index into the original string.
    designated = header_designated_pattern.search(judge_str)
    if designated:
        new_start = designated.end()
        judge_str = judge_str[new_start:]
    else:
        new_start = 0

    # Trim a trailing "-mj" suffix.
    dash_mj = dash_mj_pattern.search(judge_str)
    if dash_mj:
        judge_str = judge_str[0:dash_mj.start()]

    # Separate the judge-title prefix (if any) from the name that follows it.
    m = header_prefixes.search(judge_str)
    if m:
        pref_span_start = m.start()
        pref_span_end = m.end()

        pretext = judge_str[0:m.end()]
        name = judge_str[m.end() + 1:]
        if name:
            name_span_start = m.end() + 1 + new_start
            name_span_end = name_span_start + len(name)
        else:
            name_span_start = None
            name_span_end = None
    else:
        name = judge_str
        pretext = None
        name_span_start = new_start
        name_span_end = len(judge_str) + new_start
        pref_span_start = None
        pref_span_end = None

    return {
        'extracted_pretext': pretext,
        'extracted_entity': name,
        'Ent_span_start': name_span_start, 'Ent_span_end': name_span_end,
        'Pre_span_start': pref_span_start, 'Pre_span_end': pref_span_end}
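
# A hypothetical input/output sketch of judge_detection, for orientation:
#   judge_detection("Honorable Jane Doe")
#   -> {'extracted_pretext': 'Honorable', 'extracted_entity': 'Jane Doe',
#       'Ent_span_start': 10, 'Ent_span_end': 18,
#       'Pre_span_start': 0, 'Pre_span_end': 9}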
|
|
|
|
|
def extract_header_entities(header_judges):
    """Run judge_detection over the referred and assigned judges in a case header."""
    out_dicts = []

    if 'referred_judges' in header_judges:
        for RJ_enum, judge_string in enumerate(header_judges['referred_judges'], start=1):
            detected_judge = judge_detection(judge_string)
            if detected_judge:
                out_dicts.append(
                    {
                        'original_text': judge_string,
                        **detected_judge,
                        'docket_source': 'case_header',
                        'Entity_Extraction_Method': 'referred_judges',
                        'judge_enum': RJ_enum,
                        'party_enum': None,
                        'pacer_id': None,
                        'docket_index': None
                    })

    if 'judge' in header_judges:
        detected_judge = judge_detection(header_judges['judge'])
        if detected_judge:
            out_dicts.append(
                {
                    'original_text': header_judges['judge'],
                    **detected_judge,
                    'docket_source': 'case_header',
                    'Entity_Extraction_Method': 'assigned_judge',
                    'judge_enum': 0,
                    'party_enum': None,
                    'pacer_id': None,
                    'docket_index': None
                })

    return out_dicts
|
|
|
|
|
def extract_party_judge_entities(party_list):
    """Run judge_detection over per-party assigned/referred judges (criminal cases)."""
    out_dicts = []
    for assigned, referred, pacer_id, p_enum in party_list:
        if referred:
            for r_enum, ref in enumerate(referred, start=1):
                detected_judge = judge_detection(ref)
                if detected_judge:
                    out_dicts.append(
                        {
                            'original_text': ref,
                            **detected_judge,
                            'docket_source': 'case_parties',
                            'Entity_Extraction_Method': 'referred_judges',
                            'judge_enum': r_enum,
                            'party_enum': p_enum,
                            'pacer_id': pacer_id,
                            'docket_index': None
                        })

        if assigned:
            detected_judge = judge_detection(assigned)
            if detected_judge:
                out_dicts.append(
                    {
                        'original_text': assigned,
                        **detected_judge,
                        'docket_source': 'case_parties',
                        'Entity_Extraction_Method': 'assigned_judge',
                        'judge_enum': 0,
                        'party_enum': p_enum,
                        'pacer_id': pacer_id,
                        'docket_index': None
                    })

    return out_dicts
|
|
|
|
|
def extract_entities_sub_pipe(doc):
    """Collect JUDGE entities from a spaCy doc along with surrounding context."""
    out_dicts = []

    # Context window, in tokens, on either side of the entity. (The output
    # columns keep their legacy '_5' names even though the window is 7.)
    n = 7

    for jj in [ent for ent in doc.ents if ent.label_ == 'JUDGE']:

        pretext = None
        postext = None
        pre_ss = None
        pre_se = None
        post_ss = None
        post_se = None

        # Up to n tokens preceding the entity.
        pre_n = doc[max(jj.start - n, 0):jj.start]
        star = max(jj.start - n, 0)
        if pre_n:
            pretext = pre_n.text
            # Character offsets of the context window (currently not emitted).
            pre_ss = pre_n.start_char
            pre_se = pre_n.end_char

        # Up to n tokens following the entity (spaCy clamps the slice).
        post_n = doc[jj.end:jj.end + n]
        fin = min(len(doc), jj.end + n)
        if post_n:
            postext = post_n.text
            post_ss = post_n.start_char
            post_se = post_n.end_char

        out_dicts.append(
            {
                'original_text': doc[star:fin].text,
                'full_span_start': doc[star:fin].start_char,
                'full_span_end': doc[star:fin].end_char,
                'extracted_pre_5': pretext,
                'extracted_entity': jj.text,
                'extracted_post_5': postext,
                'Ent_span_start': jj.start_char,
                'Ent_span_end': jj.end_char,
                'docket_source': 'line_entry',
                'Entity_Extraction_Method': 'SPACY JNLP2'
            }
        )

    return out_dicts
|
|
|
|
|
def counsels_and_parties(case_parties, meta):
    """Flatten a case's party list into party rows and counsel rows."""
    counsels = []
    parties = []
    for party in case_parties:

        parties.append(
            {
                **meta,
                "Role": party['role'],
                "Entity": party['name']
            }
        )

        # A party may carry zero or more counsel records.
        for each in party.get('counsel') or []:
            counsels.append(
                {
                    **meta,
                    "Role": party['role'],
                    "Entity": each['name']
                }
            )

    return parties, counsels
|
|
|
|
|
def load_data(fpath, pbar):
    """Parse one case JSON file into (parties, counsels, header_judges, text_entries)."""
    with open(fpath) as f:
        case = json.load(f)

    year = int(case['filing_date'].split('/')[-1])
    if 'court' not in case:
        # Fall back to the court abbreviation encoded in the ucid ("court;;case").
        court = case['ucid'].split(';;')[0]
    else:
        court = case['court']

    meta = {
        'ucid': case['ucid'],
        'court': court,
        'cid': case['case_id'],
        'year': year,
        'filing_date': case['filing_date']
    }

    if 'parties' in case and case['parties']:
        parties, counsels = counsels_and_parties(case['parties'], meta)
    else:
        parties, counsels = [], []

    case_type = case['case_type']

    # Judge fields that can appear at the header level.
    header_keys_interested = ['judge', 'referred_judges']
    header_data = {key: case[key] for key in header_keys_interested if case.get(key)}

    header_judges = []
    # Criminal cases can carry per-party judge assignments.
    if case_type == 'cr' and case.get('parties'):
        party_judges = [(party['judge'], party['referred_judges'], party['pacer_id'], enum)
                        for enum, party in enumerate(case['parties'])
                        if party['judge'] or party['referred_judges']]
        if party_judges:
            party_info = extract_party_judge_entities(party_judges)
            padded_with_meta = [{**meta, **each} for each in party_info]
            header_judges += padded_with_meta

    if header_data:
        header_info = extract_header_entities(header_data)
        padded_with_meta = [{**meta, **each} for each in header_info]
        header_judges += padded_with_meta

    # Pair each docket entry's text with a context tuple, in the shape that
    # nlp.pipe(..., as_tuples=True) expects.
    text_entries = []
    if 'docket' in case and case['docket']:
        meta = {
            'ucid': case['ucid'],
            'court': court,
            'year': year,
            'cid': case['case_id']
        }
        entries = [(d['docket_text'], d['date_filed']) for d in case['docket']]

        threeples = [(*e, i, meta) for i, e in enumerate(entries)]
        text_entries += [(tup[0], tup[1:]) for tup in threeples]

    pbar.update()
    return (parties, counsels, header_judges, text_entries)
|
|
|
|
|
def run_thru_entries(texts, cores=8):
    """Stream (text, context) tuples through the model; keep docs with JUDGE ents."""
    processed_data = []
    for doc, context_tuple in tqdm.tqdm(
            jnlp.pipe(texts, n_process=cores, batch_size=1000, as_tuples=True,
                      disable=['lemmatizer', 'tagger', 'parser',
                               'attribute_ruler', 'morphologizer', 'textcat']),
            total=len(texts), desc='[--Extraction--]', leave=True):
        if any(ent.label_ == 'JUDGE' for ent in doc.ents):
            entry_date, enum_index, meta = context_tuple
            rows = extract_entities_sub_pipe(doc)

            rows = [
                {
                    **meta,
                    'docket_index': enum_index,
                    'entry_date': entry_date,
                    **row
                } for row in rows]
            if rows:
                processed_data += rows
    return processed_data
|
|
|
|
|
def main_run(indir, outdir, cores=10):
    """Parse every case JSON in indir and write the four output CSVs to outdir."""
    fpaths = glob.glob(str(Path(indir).resolve() / '*.json'))
    pbar = tqdm.tqdm(total=len(fpaths))
    # Threads rather than processes: load_data is I/O-bound file parsing.
    with ThreadPool(cores) as pool:
        returned_tuples = pool.starmap(load_data, ((fpath, pbar) for fpath in fpaths))
    pbar.close()

    pdf_rows = []
    cdf_rows = []
    hj_rows = []
    entry_rows = []
    for p, c, h, t in returned_tuples:
        if p:
            pdf_rows += p
        if c:
            cdf_rows += c
        if h:
            hj_rows += h
        if t:
            entry_rows += t

    PDF = pd.DataFrame(pdf_rows)
    CDF = pd.DataFrame(cdf_rows)
    Headers = pd.DataFrame(hj_rows)

    outdir = Path(outdir).resolve()
    PDF.to_csv(outdir / 'parties.csv', index=False)
    CDF.to_csv(outdir / 'counsels.csv', index=False)
    Headers.to_csv(outdir / 'headers.csv', index=False)

    # Free the frames before the memory-heavy model pass.
    del PDF, CDF, Headers

    print("Now performing model extraction")
    extractions = run_thru_entries(entry_rows, cores)
    EDF = pd.DataFrame(extractions)
    EDF.to_csv(outdir / 'entries.csv', index=False)
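

# Minimal CLI entry point. This is a sketch: the flag names (--indir,
# --outdir, --cores) are assumed, not an interface defined elsewhere.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Extract judge entities from court case JSON files.')
    parser.add_argument('--indir', required=True,
                        help='directory containing case JSON files')
    parser.add_argument('--outdir', required=True,
                        help='directory to write the output CSVs')
    parser.add_argument('--cores', type=int, default=10,
                        help='number of workers for parsing and extraction')
    args = parser.parse_args()
    main_run(args.indir, args.outdir, cores=args.cores)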