Update generic_ner.py
Browse files- generic_ner.py +32 -43
generic_ner.py
CHANGED
@@ -528,7 +528,7 @@ def remove_trailing_stopwords(entities):
|
|
528 |
and repairs the lOffset and rOffset accordingly.
|
529 |
"""
|
530 |
if DEBUG:
|
531 |
-
print(f"Initial entities: {len(entities)}")
|
532 |
new_entities = []
|
533 |
for entity in entities:
|
534 |
if "comp" not in entity["type"]:
|
@@ -540,7 +540,6 @@ def remove_trailing_stopwords(entities):
|
|
540 |
rOffset = entity.get("rOffset", original_len)
|
541 |
|
542 |
# Remove stopwords and punctuation from the beginning
|
543 |
-
i = 0
|
544 |
while entity_text and (
|
545 |
entity_text.split()[0].lower() in stop_words
|
546 |
or entity_text[0] in punctuation
|
@@ -562,48 +561,36 @@ def remove_trailing_stopwords(entities):
|
|
562 |
print(
|
563 |
f"Removed leading punctuation from entity: {entity['surface']} --> {entity_text} ({entity['type']}"
|
564 |
)
|
565 |
-
i += 1
|
566 |
|
567 |
-
i = 0
|
568 |
# Remove stopwords and punctuation from the end
|
569 |
-
|
570 |
-
|
571 |
-
|
572 |
-
|
573 |
-
|
574 |
-
|
575 |
-
|
576 |
-
|
577 |
-
|
578 |
-
|
579 |
-
|
580 |
-
|
581 |
-
|
582 |
-
|
583 |
-
|
584 |
-
|
585 |
-
|
586 |
-
|
587 |
-
|
588 |
-
|
589 |
-
|
590 |
-
|
591 |
-
|
592 |
-
|
593 |
-
|
594 |
-
|
595 |
-
|
596 |
-
# Exit loop if neither stopwords nor punctuation are found
|
597 |
-
break
|
598 |
-
|
599 |
-
iteration += 1
|
600 |
-
# print(f"ITERATION: {iteration} [{entity['surface']}] for {entity_text}")
|
601 |
|
602 |
-
if len(entity_text.strip()) == 1:
|
603 |
-
entities.remove(entity)
|
604 |
-
if DEBUG:
|
605 |
-
print(f"Skipping entity: {entity_text}")
|
606 |
-
continue
|
607 |
# Skip certain entities based on rules
|
608 |
if entity_text in string.punctuation:
|
609 |
if DEBUG:
|
@@ -682,10 +669,12 @@ def remove_trailing_stopwords(entities):
|
|
682 |
entities.remove(entity)
|
683 |
else:
|
684 |
new_entities.append(entity)
|
685 |
-
|
|
|
686 |
if DEBUG:
|
687 |
-
print(f"Remained entities: {len(new_entities)}")
|
688 |
return new_entities
|
|
|
689 |
|
690 |
class MultitaskTokenClassificationPipeline(Pipeline):
|
691 |
|
|
|
528 |
and repairs the lOffset and rOffset accordingly.
|
529 |
"""
|
530 |
if DEBUG:
|
531 |
+
print(f"Initial entities in remove_trailing_stopwords: {len(entities)}")
|
532 |
new_entities = []
|
533 |
for entity in entities:
|
534 |
if "comp" not in entity["type"]:
|
|
|
540 |
rOffset = entity.get("rOffset", original_len)
|
541 |
|
542 |
# Remove stopwords and punctuation from the beginning
|
|
|
543 |
while entity_text and (
|
544 |
entity_text.split()[0].lower() in stop_words
|
545 |
or entity_text[0] in punctuation
|
|
|
561 |
print(
|
562 |
f"Removed leading punctuation from entity: {entity['surface']} --> {entity_text} ({entity['type']}"
|
563 |
)
|
|
|
564 |
|
|
|
565 |
# Remove stopwords and punctuation from the end
|
566 |
+
if len(entity_text.strip()) > 1:
|
567 |
+
while entity_text and (
|
568 |
+
entity_text.split()[-1].lower() in stop_words
|
569 |
+
or entity_text[-1] in punctuation
|
570 |
+
):
|
571 |
+
if entity_text.split()[-1].lower() in stop_words:
|
572 |
+
stopword_len = (
|
573 |
+
len(entity_text.split()[-1]) + 1
|
574 |
+
) # Adjust length for stopword and preceding space
|
575 |
+
entity_text = entity_text[
|
576 |
+
:-stopword_len
|
577 |
+
] # Remove trailing stopword
|
578 |
+
rOffset -= stopword_len # Adjust the right offset
|
579 |
+
if DEBUG:
|
580 |
+
print(
|
581 |
+
f"Removed trailing stopword from entity: {entity['surface']} --> {entity_text} ({entity['type']}"
|
582 |
+
)
|
583 |
+
if entity_text:
|
584 |
+
if entity_text[-1] in punctuation:
|
585 |
+
entity_text = entity_text[
|
586 |
+
:-1
|
587 |
+
] # Remove trailing punctuation
|
588 |
+
rOffset -= 1 # Adjust the right offset
|
589 |
+
if DEBUG:
|
590 |
+
print(
|
591 |
+
f"Removed trailing punctuation from entity: {entity['surface']} --> {entity_text} ({entity['type']}"
|
592 |
+
)
|
|
|
|
|
|
|
|
|
|
|
593 |
|
|
|
|
|
|
|
|
|
|
|
594 |
# Skip certain entities based on rules
|
595 |
if entity_text in string.punctuation:
|
596 |
if DEBUG:
|
|
|
669 |
entities.remove(entity)
|
670 |
else:
|
671 |
new_entities.append(entity)
|
672 |
+
else:
|
673 |
+
new_entities.append(entity)
|
674 |
if DEBUG:
|
675 |
+
print(f"Remained entities in remove_trailing_stopwords: {len(new_entities)}")
|
676 |
return new_entities
|
677 |
+
|
678 |
|
679 |
class MultitaskTokenClassificationPipeline(Pipeline):
|
680 |
|