emanuelaboros committed on
Commit
b867d57
·
verified ·
1 Parent(s): ddb90c9

Update generic_ner.py

Browse files
Files changed (1) hide show
  1. generic_ner.py +32 -43
generic_ner.py CHANGED
@@ -528,7 +528,7 @@ def remove_trailing_stopwords(entities):
528
  and repairs the lOffset and rOffset accordingly.
529
  """
530
  if DEBUG:
531
- print(f"Initial entities: {len(entities)}")
532
  new_entities = []
533
  for entity in entities:
534
  if "comp" not in entity["type"]:
@@ -540,7 +540,6 @@ def remove_trailing_stopwords(entities):
540
  rOffset = entity.get("rOffset", original_len)
541
 
542
  # Remove stopwords and punctuation from the beginning
543
- i = 0
544
  while entity_text and (
545
  entity_text.split()[0].lower() in stop_words
546
  or entity_text[0] in punctuation
@@ -562,48 +561,36 @@ def remove_trailing_stopwords(entities):
562
  print(
563
  f"Removed leading punctuation from entity: {entity['surface']} --> {entity_text} ({entity['type']}"
564
  )
565
- i += 1
566
 
567
- i = 0
568
  # Remove stopwords and punctuation from the end
569
- iteration = 0
570
- max_iterations = len(entity_text) # Prevent infinite loops
571
-
572
- while entity_text and iteration < max_iterations:
573
- # Check if the last word is a stopword or the last character is punctuation
574
- last_word = entity_text.split()[-1] if entity_text.split() else ""
575
- last_char = entity_text[-1]
576
-
577
- if last_word.lower() in stop_words:
578
- # Remove trailing stopword and adjust rOffset
579
- stopword_len = len(last_word) + 1 # Include space before stopword
580
- entity_text = entity_text[:-stopword_len].rstrip()
581
- rOffset -= stopword_len
582
- if DEBUG:
583
- print(
584
- f"Removed trailing stopword from entity: {entity_text} (rOffset={rOffset})"
585
- )
586
-
587
- elif last_char in punctuation:
588
- # Remove trailing punctuation and adjust rOffset
589
- entity_text = entity_text[:-1].rstrip()
590
- rOffset -= 1
591
- if DEBUG:
592
- print(
593
- f"Removed trailing punctuation from entity: {entity_text} (rOffset={rOffset})"
594
- )
595
- else:
596
- # Exit loop if neither stopwords nor punctuation are found
597
- break
598
-
599
- iteration += 1
600
- # print(f"ITERATION: {iteration} [{entity['surface']}] for {entity_text}")
601
 
602
- if len(entity_text.strip()) == 1:
603
- entities.remove(entity)
604
- if DEBUG:
605
- print(f"Skipping entity: {entity_text}")
606
- continue
607
  # Skip certain entities based on rules
608
  if entity_text in string.punctuation:
609
  if DEBUG:
@@ -682,10 +669,12 @@ def remove_trailing_stopwords(entities):
682
  entities.remove(entity)
683
  else:
684
  new_entities.append(entity)
685
-
 
686
  if DEBUG:
687
- print(f"Remained entities: {len(new_entities)}")
688
  return new_entities
 
689
 
690
  class MultitaskTokenClassificationPipeline(Pipeline):
691
 
 
528
  and repairs the lOffset and rOffset accordingly.
529
  """
530
  if DEBUG:
531
+ print(f"Initial entities in remove_trailing_stopwords: {len(entities)}")
532
  new_entities = []
533
  for entity in entities:
534
  if "comp" not in entity["type"]:
 
540
  rOffset = entity.get("rOffset", original_len)
541
 
542
  # Remove stopwords and punctuation from the beginning
 
543
  while entity_text and (
544
  entity_text.split()[0].lower() in stop_words
545
  or entity_text[0] in punctuation
 
561
  print(
562
  f"Removed leading punctuation from entity: {entity['surface']} --> {entity_text} ({entity['type']}"
563
  )
 
564
 
 
565
  # Remove stopwords and punctuation from the end
566
+ if len(entity_text.strip()) > 1:
567
+ while entity_text and (
568
+ entity_text.split()[-1].lower() in stop_words
569
+ or entity_text[-1] in punctuation
570
+ ):
571
+ if entity_text.split()[-1].lower() in stop_words:
572
+ stopword_len = (
573
+ len(entity_text.split()[-1]) + 1
574
+ ) # Adjust length for stopword and preceding space
575
+ entity_text = entity_text[
576
+ :-stopword_len
577
+ ] # Remove trailing stopword
578
+ rOffset -= stopword_len # Adjust the right offset
579
+ if DEBUG:
580
+ print(
581
+ f"Removed trailing stopword from entity: {entity['surface']} --> {entity_text} ({entity['type']}"
582
+ )
583
+ if entity_text:
584
+ if entity_text[-1] in punctuation:
585
+ entity_text = entity_text[
586
+ :-1
587
+ ] # Remove trailing punctuation
588
+ rOffset -= 1 # Adjust the right offset
589
+ if DEBUG:
590
+ print(
591
+ f"Removed trailing punctuation from entity: {entity['surface']} --> {entity_text} ({entity['type']}"
592
+ )
 
 
 
 
 
593
 
 
 
 
 
 
594
  # Skip certain entities based on rules
595
  if entity_text in string.punctuation:
596
  if DEBUG:
 
669
  entities.remove(entity)
670
  else:
671
  new_entities.append(entity)
672
+ else:
673
+ new_entities.append(entity)
674
  if DEBUG:
675
+ print(f"Remained entities in remove_trailing_stopwords: {len(new_entities)}")
676
  return new_entities
677
+
678
 
679
  class MultitaskTokenClassificationPipeline(Pipeline):
680