mjlee
committed on
Commit
·
0cd2c97
1
Parent(s):
d9b3b55
0708
Browse files- .gitignore +1 -0
- __pycache__/config.cpython-39.pyc +0 -0
- __pycache__/models.cpython-39.pyc +0 -0
- app.py +87 -4
- config.py +14 -0
- flagged/log.csv +2 -0
- models.py +71 -0
.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
*.ipynb
|
__pycache__/config.cpython-39.pyc
ADDED
Binary file (1.29 kB). View file
|
|
__pycache__/models.cpython-39.pyc
ADDED
Binary file (2.46 kB). View file
|
|
app.py
CHANGED
@@ -1,7 +1,90 @@
|
|
1 |
import gradio as gr
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
|
3 |
-
def greet(name):
|
4 |
-
return "Hello " + name + "!!"
|
5 |
|
6 |
-
demo = gr.Interface(fn=greet, inputs="text", outputs="text")
|
7 |
-
demo.launch()
|
|
|
1 |
import gradio as gr
|
2 |
+
from models import *
|
3 |
+
from huggingface_hub import hf_hub_download
|
4 |
+
import os
|
5 |
+
from config import *
|
6 |
+
# `cfg` is read below but no import in this file binds that name, so the
# original line raised NameError at start-up. Bind the config module to it.
import config as cfg

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else "cpu")

# Hub repositories and file names of the two classifier checkpoints.
ENTITY_REPO_ID = 'cccornflake/absa_v2_entity'
ENTITY_FILENAME = "entity_model.pt"

SENTIMENT_REPO_ID = 'cccornflake/absa_v2_sentiment'
SENTIMENT_FILENAME = "sentiment_model.pt"

print("downloading model...")
sen_model_file = hf_hub_download(repo_id=SENTIMENT_REPO_ID, filename=SENTIMENT_FILENAME)
entity_model_file = hf_hub_download(repo_id=ENTITY_REPO_ID, filename=ENTITY_FILENAME)

base_model = cfg.base_model

tokenizer = AutoTokenizer.from_pretrained(base_model)

# map_location remaps the stored tensors onto the device selected above.
# Without it, a checkpoint that was saved from a GPU cannot be loaded on a
# CPU-only host.
sen_model = Classifier(base_model, num_labels=2, device=device, tokenizer=tokenizer)
sen_model.load_state_dict(torch.load(sen_model_file, map_location=device))

entity_model = Classifier(base_model, num_labels=2, device=device, tokenizer=tokenizer)
entity_model.load_state_dict(torch.load(entity_model_file, map_location=device))
|
27 |
+
|
28 |
+
|
29 |
+
def infer(test_sentence):
    """Classify one sentence against every entry of ``entity_property_pair``.

    The sentence and the text form of each pair are tokenized together.
    ``entity_model`` decides whether the pair applies; only when it answers
    'True' is ``sen_model`` asked for the polarity.

    Args:
        test_sentence: The raw input text.

    Returns:
        "Too long sentence!" when the input is longer than 500 characters.
        Otherwise one "<pair> - <polarity>" line per accepted pair, joined
        with newlines (an empty string when no pair is accepted).
    """
    entity_model.to(device)
    entity_model.eval()
    sen_model.to(device)
    sen_model.eval()

    if len(test_sentence) > 500:
        return "Too long sentence!"

    sentence_text = test_sentence + "[SEP]"
    found = []

    for pair in entity_property_pair:
        pair_text = entity2str[pair] + "[SEP]"
        encoded = tokenizer(sentence_text, pair_text, padding='max_length', max_length=512, truncation=True)
        ids = encoded['input_ids']

        input_ids = torch.tensor([ids]).to(device)
        attention_mask = torch.tensor([encoded['attention_mask']]).to(device)

        # NOTE(review): token id 2 is treated as the separator marker here;
        # confirm it matches tokenizer.sep_token_id for the loaded vocabulary.
        first_sep = ids.index(2)
        span_start = first_sep + 2
        last_sep = ids.index(2, span_start)

        # Flag the positions from two past the first id-2 token up to, but
        # not including, the next id-2 token.
        span_flags = [1 if span_start <= pos < last_sep else 0 for pos in range(len(ids))]
        mask = torch.tensor([span_flags]).to(device)

        with torch.no_grad():
            ce_logits = entity_model(input_ids, attention_mask, mask)
            ce_predictions = torch.argmax(ce_logits, dim=-1)

        if tf_id_to_name[ce_predictions[0]] != 'True':
            continue

        with torch.no_grad():
            pc_logits = sen_model(input_ids, attention_mask, mask)
            pc_predictions = torch.argmax(pc_logits, dim=-1)

        pc_result = polarity_id_to_name[pc_predictions[0]]
        found.append(f"{pair} - {pc_result}")

    return '\n'.join(found)
|
77 |
+
|
78 |
+
# "Please upload an image." This blurb is not passed to the interface below.
# The literal is restored from its mis-decoded form, and the dangling line
# continuation that followed it is dropped.
article = "**이미지를 업로드하세요.**"

# Single text-in / text-out interface around infer().
demo = gr.Interface(
    fn=infer,
    inputs=gr.Textbox(type="text", label="Input Sentence"),
    outputs=gr.Textbox(type="text", label="Result Sentence"),
)

demo.launch(share=True)
|
87 |
+
|
88 |
+
|
89 |
|
|
|
|
|
90 |
|
|
|
|
config.py
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Pretrained checkpoint name; app.py hands it to AutoTokenizer.from_pretrained
# and to the Classifier constructor.
base_model = 'beomi/KcELECTRA-base'
|
2 |
+
|
3 |
+
entity_property_pair =['์ฃผ์ฐจ#์ผ๋ฐ', 'ํธ์์์ค#๋งํธ', 'ํธ์์์ค#์ผํ', 'ํ๊ตฐ#์ผ๋ฐ', 'ํ๊ตฐ#ํ๊ต ์ ๊ทผ์ฑ', 'ํ๊ตฐ#ํ์ ์ ๊ทผ์ฑ',
|
4 |
+
'ํ๊ตฐ#์์ก ํ๊ฒฝ', '์ธํ๋ผ#์ผ๋ฐ', '์ธํ๋ผ#์๊ถ', '์ธํ๋ผ#๊ตํต', '์ธํ๋ผ#๋ณ์', '์ธํ๋ผ#๋์ค๊ตํต', 'ํ๊ฒฝ#์ผ๋ฐ', 'ํ๊ฒฝ#์์', 'ํธ์์์ค#์ผ๋ฐ',
|
5 |
+
'ํ๊ฒฝ#๊ณต์', 'ํ๊ฒฝ#๋จ์ง ๊ด๋ฆฌ', 'ํ๊ฒฝ#๋ทฐ', 'ํ๊ฒฝ#์กฐ๊ฒฝ', 'ํ๊ฒฝ#๊ด๋ฆฌ๋น', '๊ตฌ์กฐ#์ง ๊ตฌ์กฐ', '๊ฐ๊ฒฉ#์์ธ', '์ ๋ง#์ผ๋ฐ', 'ํ๊ตฐ#์ ์น์']
|
6 |
+
|
7 |
+
# Text form of each label: every "#" and "/" is replaced by ", ".
entity2str = {
    label: label.replace("#", ", ").replace("/", ", ")
    for label in entity_property_pair
}
|
8 |
+
|
9 |
+
|
10 |
+
# Index -> name tables for the two classifiers, each with its inverse lookup.
tf_id_to_name = ['True', 'False']
tf_name_to_id = {name: idx for idx, name in enumerate(tf_id_to_name)}

polarity_id_to_name = ['positive', 'negative']
polarity_name_to_id = {name: idx for idx, name in enumerate(polarity_id_to_name)}
|
flagged/log.csv
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
Input Sentence,Result Sentence,flag,username,timestamp
|
2 |
+
hello,hello,,,2024-07-08 13:41:38.124456
|
models.py
CHANGED
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import os
|
3 |
+
import numpy as np
|
4 |
+
import torch
|
5 |
+
import torch.nn as nn
|
6 |
+
from tqdm import trange
|
7 |
+
from transformers import ElectraModel, AutoTokenizer, AutoModel
|
8 |
+
from transformers import AutoTokenizer, AutoConfig
|
9 |
+
from torch.utils.data import DataLoader, TensorDataset
|
10 |
+
from transformers import get_linear_schedule_with_warmup
|
11 |
+
from transformers import AdamW
|
12 |
+
from datasets import load_metric
|
13 |
+
from sklearn.metrics import f1_score
|
14 |
+
import pandas as pd
|
15 |
+
import copy
|
16 |
+
# from utils import evaluation, evaluation_f1
|
17 |
+
from torch.nn import functional as F
|
18 |
+
import re
|
19 |
+
from config import entity_property_pair
|
20 |
+
from tqdm import tqdm
|
21 |
+
from datasets import Dataset
|
22 |
+
import torch.nn as nn
|
23 |
+
from transformers import AutoModelForSequenceClassification
|
24 |
+
from transformers import ElectraModel
|
25 |
+
|
26 |
+
|
27 |
+
|
28 |
+
class Classifier(nn.Module):
    """ELECTRA encoder with a two-branch classification head.

    One branch projects the hidden vector at position 0; the other projects
    the average of the hidden vectors selected by a mask. The two projections
    are concatenated, passed through tanh and dropout, and mapped to
    ``num_labels`` logits.

    Args:
        base_model: Name or path handed to ``ElectraModel.from_pretrained``.
        num_labels: Number of output logits.
        device: Stored on the instance; not used inside this class.
        tokenizer: Its length sets the size of the token-embedding table.
    """

    def __init__(self, base_model, num_labels, device, tokenizer):
        super().__init__()
        self.num_labels = num_labels
        self.device = device

        # Honour the constructor arguments; the checkpoint name and the label
        # count were previously hard-coded and silently ignored them.
        self.electra = ElectraModel.from_pretrained(base_model, num_labels=num_labels)
        self.electra.resize_token_embeddings(len(tokenizer))

        hidden_size = self.electra.config.hidden_size
        self.fc1 = nn.Linear(hidden_size, 256)
        self.fc2 = nn.Linear(hidden_size, 512)
        self.fc3 = nn.Linear(256 + 512, num_labels)

        self.dropout = nn.Dropout(0.1)

    def forward(self, input_ids, attention_mask, entity_mask):
        """Return logits of shape [batch, num_labels].

        Args:
            input_ids: Token ids, [batch, seq].
            attention_mask: Attention mask for the encoder, [batch, seq].
            entity_mask: [batch, seq] mask whose non-zero positions are
                averaged for the second branch.
        """
        # Only the final layer is consumed, so the per-layer hidden states
        # are not requested.
        encoded = self.electra(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = encoded.last_hidden_state

        span_features = self.fc2(self.entity_average(last_hidden_state, entity_mask))

        # Project only position 0: nn.Linear acts per position, so this is
        # the same value as projecting the whole sequence and slicing.
        first_token_features = self.fc1(last_hidden_state[:, 0, :])

        combined = torch.cat([first_token_features, span_features], dim=-1)
        combined = self.dropout(torch.tanh(combined))
        return self.fc3(combined)

    @staticmethod
    def entity_average(hidden_output, e_mask):
        """Mask-weighted sum of hidden vectors over the count of non-zero mask entries.

        Args:
            hidden_output: [batch, seq, dim] hidden states.
            e_mask: [batch, seq] mask.

        Returns:
            [batch, dim] tensor. A row whose mask is entirely zero yields a
            zero vector.
        """
        e_mask_unsqueeze = e_mask.unsqueeze(1)  # [b, 1, seq]
        # Clamp the count so an all-zero mask divides by 1 instead of
        # producing NaN from 0 / 0.
        length_tensor = (e_mask != 0).sum(dim=1).unsqueeze(1).clamp(min=1)  # [b, 1]

        # [b, 1, seq] @ [b, seq, dim] = [b, 1, dim] -> [b, dim]
        sum_vector = torch.bmm(e_mask_unsqueeze.float(), hidden_output).squeeze(1)
        return sum_vector.float() / length_tensor.float()  # broadcasting
|