mjlee committed on
Commit
0cd2c97
·
1 Parent(s): d9b3b55
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ *.ipynb
__pycache__/config.cpython-39.pyc ADDED
Binary file (1.29 kB). View file
 
__pycache__/models.cpython-39.pyc ADDED
Binary file (2.46 kB). View file
 
app.py CHANGED
@@ -1,7 +1,90 @@
1
  import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
5
 
6
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- demo.launch()
 
1
  import gradio as gr
2
+ from models import *
3
+ from huggingface_hub import hf_hub_download
4
+ import os
5
+ from config import *
6
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else "cpu")

# Hub repositories and file names of the two fine-tuned checkpoints.
ENTITY_REPO_ID = 'cccornflake/absa_v2_entity'
ENTITY_FILENAME = "entity_model.pt"

SENTIMENT_REPO_ID = 'cccornflake/absa_v2_sentiment'
SENTIMENT_FILENAME = "sentiment_model.pt"

print("downloading model...")
sen_model_file = hf_hub_download(repo_id=SENTIMENT_REPO_ID, filename=SENTIMENT_FILENAME)
entity_model_file = hf_hub_download(repo_id=ENTITY_REPO_ID, filename=ENTITY_FILENAME)

# `base_model` is a module-level name of config.py and is already bound here
# by `from config import *`; there is no `cfg` object to read it from.
tokenizer = AutoTokenizer.from_pretrained(base_model)

# map_location remaps the stored tensors onto `device`, so a checkpoint that
# was saved from a GPU can still be loaded on a CPU-only host.
sen_model = Classifier(base_model, num_labels=2, device=device, tokenizer=tokenizer)
sen_model.load_state_dict(torch.load(sen_model_file, map_location=device))

entity_model = Classifier(base_model, num_labels=2, device=device, tokenizer=tokenizer)
entity_model.load_state_dict(torch.load(entity_model_file, map_location=device))
27
+
28
+
29
def infer(test_sentence):
    """Classify one sentence against every entity/property pair.

    Each pair in ``entity_property_pair`` is first scored by ``entity_model``.
    Only when that model answers ``'True'`` is ``sen_model`` run on the same
    inputs, and a ``"<pair> - <polarity>"`` line is recorded.

    Args:
        test_sentence: Text to analyse.

    Returns:
        The recorded lines joined with newlines (an empty string when no pair
        was accepted), or ``"Too long sentence!"`` when the input is longer
        than 500 characters.
    """
    # Both models are placed on the target device and switched to eval mode
    # on every call, entity model first.
    for model in (entity_model, sen_model):
        model.to(device)
        model.eval()

    found = []

    if len(test_sentence) > 500:
        return "Too long sentence!"

    for pair in entity_property_pair:
        encoded = tokenizer(
            test_sentence + "[SEP]",
            entity2str[pair] + "[SEP]",
            padding='max_length',
            max_length=512,
            truncation=True,
        )
        token_ids = encoded['input_ids']

        input_ids = torch.tensor([token_ids]).to(device)
        attention_mask = torch.tensor([encoded['attention_mask']]).to(device)

        # Mark the tokens lying between two positions after the first id-2
        # token and the next id-2 token.
        # NOTE(review): 2 is presumably the separator token id -- confirm
        # against the tokenizer in use.
        span_start = token_ids.index(2) + 2
        span_end = token_ids.index(2, span_start)
        span_flags = [
            1 if span_start <= position < span_end else 0
            for position in range(len(token_ids))
        ]
        mask = torch.tensor([span_flags]).to(device)

        with torch.no_grad():
            ce_logits = entity_model(input_ids, attention_mask, mask)
            ce_predictions = torch.argmax(ce_logits, dim=-1)

        if tf_id_to_name[ce_predictions[0]] != 'True':
            continue

        with torch.no_grad():
            pc_logits = sen_model(input_ids, attention_mask, mask)
            pc_predictions = torch.argmax(pc_logits, dim=-1)

        pc_result = polarity_id_to_name[pc_predictions[0]]
        found.append(f"{pair} - {pc_result}")

    return '\n'.join(found)
77
+
78
# Markdown blurb for the demo page. It is defined here but not passed to
# gr.Interface below.
article = "**이미지를 업로드하세요.**"

# Single text box in, single text box out, backed by infer().
demo = gr.Interface(
    fn=infer,
    inputs=gr.Textbox(type="text", label="Input Sentence"),
    outputs=gr.Textbox(type="text", label="Result Sentence"),
)

demo.launch(share=True)
87
+
88
+
89
 
 
 
90
 
 
 
config.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Model identifier handed to the tokenizer and the classifiers as `base_model`.
base_model = 'beomi/KcELECTRA-base'

# Labels in "<entity>#<property>" form; every label is evaluated per sentence.
entity_property_pair = [
    '주차#일반', '편의시설#마트', '편의시설#쇼핑', '학군#일반', '학군#학교 접근성', '학군#학원 접근성',
    '학군#양육 환경', '인프라#일반', '인프라#상권', '인프라#교통', '인프라#병원', '인프라#대중교통',
    '환경#일반', '환경#소음', '편의시설#일반',
    '환경#공원', '환경#단지 관리', '환경#뷰', '환경#조경', '환경#관리비', '구조#집 구조',
    '가격#시세', '전망#일반', '학군#유치원',
]

# Label -> text form of the label, with "#" and "/" each replaced by ", ".
entity2str = {
    pair: pair.replace("#", ", ").replace("/", ", ")
    for pair in entity_property_pair
}

# Class index <-> name tables for the two classifiers.
tf_id_to_name = ['True', 'False']
tf_name_to_id = {name: idx for idx, name in enumerate(tf_id_to_name)}

polarity_id_to_name = ['positive', 'negative']
polarity_name_to_id = {name: idx for idx, name in enumerate(polarity_id_to_name)}
flagged/log.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ Input Sentence,Result Sentence,flag,username,timestamp
2
+ hello,hello,,,2024-07-08 13:41:38.124456
models.py CHANGED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import numpy as np
4
+ import torch
5
+ import torch.nn as nn
6
+ from tqdm import trange
7
+ from transformers import ElectraModel, AutoTokenizer, AutoModel
8
+ from transformers import AutoTokenizer, AutoConfig
9
+ from torch.utils.data import DataLoader, TensorDataset
10
+ from transformers import get_linear_schedule_with_warmup
11
+ from transformers import AdamW
12
+ from datasets import load_metric
13
+ from sklearn.metrics import f1_score
14
+ import pandas as pd
15
+ import copy
16
+ # from utils import evaluation, evaluation_f1
17
+ from torch.nn import functional as F
18
+ import re
19
+ from config import entity_property_pair
20
+ from tqdm import tqdm
21
+ from datasets import Dataset
22
+ import torch.nn as nn
23
+ from transformers import AutoModelForSequenceClassification
24
+ from transformers import ElectraModel
25
+
26
+
27
+
28
class Classifier(nn.Module):
    """ELECTRA encoder followed by a small classification head.

    The head concatenates a 256-d projection of the first token's hidden
    state with a 512-d projection of the mean hidden state over a masked
    token span, then maps the result to ``num_labels`` logits.

    Args:
        base_model: Name or path given to ``ElectraModel.from_pretrained``.
        num_labels: Number of output classes.
        device: Stored on the instance as ``self.device``; not used by the
            class itself.
        tokenizer: Only its ``len()`` is used, to resize the token embeddings.
    """

    def __init__(self, base_model, num_labels, device, tokenizer):
        super().__init__()
        self.num_labels = num_labels
        self.device = device

        # Honour the caller's checkpoint and label count instead of fixed values.
        self.electra = ElectraModel.from_pretrained(base_model, num_labels=num_labels)
        self.electra.resize_token_embeddings(len(tokenizer))

        hidden_size = self.electra.config.hidden_size
        self.fc1 = nn.Linear(hidden_size, 256)
        self.fc2 = nn.Linear(hidden_size, 512)
        self.fc3 = nn.Linear(256 + 512, num_labels)

        self.dropout = nn.Dropout(0.1)

    def forward(self, input_ids, attention_mask, entity_mask):
        """Return classification logits of shape [batch, num_labels].

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.
            entity_mask: [batch, seq_len] mask whose non-zero positions select
                the tokens that are averaged.
        """
        outputs = self.electra(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs.last_hidden_state

        span_features = self.fc2(self.entity_average(last_hidden_state, entity_mask))

        # Only position 0 is consumed, so project that single vector rather
        # than every position of the sequence.
        first_token_features = self.fc1(last_hidden_state[:, 0, :])

        features = torch.cat([first_token_features, span_features], dim=-1)
        features = self.dropout(torch.tanh(features))
        return self.fc3(features)

    @staticmethod
    def entity_average(hidden_output, e_mask):
        """Average ``hidden_output`` over the positions selected by ``e_mask``.

        Args:
            hidden_output: [batch, seq_len, dim] hidden states.
            e_mask: [batch, seq_len] mask; non-zero entries are counted.

        Returns:
            [batch, dim] tensor. A row whose mask is entirely zero yields a
            zero vector instead of NaN.
        """
        e_mask_unsqueeze = e_mask.unsqueeze(1)  # [b, 1, seq_len]
        # Clamp the count so an empty mask does not divide by zero.
        length_tensor = (e_mask != 0).sum(dim=1).unsqueeze(1).clamp(min=1)  # [b, 1]

        # [b, 1, seq_len] @ [b, seq_len, dim] = [b, 1, dim] -> [b, dim]
        sum_vector = torch.bmm(e_mask_unsqueeze.float(), hidden_output).squeeze(1)
        return sum_vector.float() / length_tensor.float()  # broadcasting