Vu Minh Chien committed
Commit 06d9f7d · 1 Parent(s): 5a202c5

change predict rule

Files changed (3)
  1. Dockerfile +2 -2
  2. routes/predict.py +118 -109
  3. validate_optimization.py +2 -2
Dockerfile CHANGED
@@ -28,8 +28,8 @@ COPY requirements.txt .
 RUN --mount=type=secret,id=BITBUCKET_APP_PW,mode=0444,required=true \
     git clone https://vumichien:$(cat /run/secrets/BITBUCKET_APP_PW)@bitbucket.org/dtm-partners/meisai-check-ai.git && \
     cd meisai-check-ai && \
-    git checkout develop && \
-    git pull origin develop && \
+    git checkout staging && \
+    git pull origin staging && \
     cd ..
 
 # Install dependencies
routes/predict.py CHANGED
@@ -21,8 +21,14 @@ from mapping_lib.sub_subject_and_name_data_mapper import SubSubjectAndNameDataMapper
 from mapping_lib.sub_subject_location_data_mapper import SubSubjectLocationDataMapper
 from mapping_lib.abstract_similarity_mapper import AbstractSimilarityMapper
 from mapping_lib.name_and_abstract_mapper import NameAndAbstractDataMapper
-from mapping_lib.unit_similarity_mapper import UnitSimilarityMapper
-from mapping_lib.standard_name_mapper import StandardNameMapper
+from mapping_lib.unit_mapper import UnitMapper
+from mapping_lib.base_dictionary_mapper import BaseDictionaryMapper
+from common_lib.data_utilities import fillna_with_space
+from common_lib.string_utilities import (
+    preprocess_text,
+    ConversionType,
+    ConversionSettings,
+)
 
 from config import UPLOAD_DIR, OUTPUT_DIR
 from models import (
@@ -65,6 +71,21 @@ async def predict(
     # Load input data
     start_time = time.time()
     df_input_data = pd.read_csv(input_file_path)
+
+    # Preprocess data like in meisai-check-ai/predict.py
+    df_input_data["元名称"] = df_input_data["名称"]
+    df_input_data["名称"] = df_input_data["名称"].apply(
+        lambda x: (
+            preprocess_text(
+                x,
+                convert_kana=ConversionType.Z2H,
+                convert_alphabet=ConversionType.Z2H,
+                convert_digit=ConversionType.Z2H,
+            )
+            if pd.notna(x)
+            else ""
+        )
+    )
 
     # Ensure basic columns exist with default values
     basic_columns = {
@@ -83,9 +104,8 @@ async def predict(
         if col not in df_input_data.columns:
             df_input_data[col] = default_value
 
-    # Process data using the new mapping system similar to predict.py
+    # SubjectSimilarityMapper
     try:
-        # Subject mapping
         if sentence_service.df_subject_map_data is not None:
             subject_similarity_mapper = SubjectSimilarityMapper(
                 cached_embedding_helper=sentence_service.subject_cached_embedding_helper,
@@ -93,35 +113,29 @@ async def predict(
             )
 
             list_input_subject = df_input_data["科目"].unique()
-            df_subject_data = pd.DataFrame({"科目": list_input_subject})
+            df_subject_data = pd.DataFrame(list_input_subject, columns=["科目"])
 
-            subject_similarity_mapper.predict_input_optimized(df_input_data=df_subject_data)
+            subject_similarity_mapper.predict_input(df_input_data=df_subject_data)
 
-            output_subject_map = dict(
-                zip(df_subject_data["科目"], df_subject_data["出力_科目"])
-            )
-            df_input_data["標準科目"] = df_input_data["科目"].map(
-                output_subject_map
-            )
-            df_input_data["出力_科目"] = df_input_data["科目"].map(
-                output_subject_map
-            )
+            output_subject_map = dict(zip(df_subject_data["科目"], df_subject_data["出力_科目"]))
+            df_input_data["標準科目"] = df_input_data["科目"].map(output_subject_map)
+            df_input_data["出力_科目"] = df_input_data["標準科目"]
+            fillna_with_space(df_input_data)
 
     except Exception as e:
         print(f"Error processing SubjectSimilarityMapper: {e}")
         raise HTTPException(status_code=500, detail=str(e))
 
+    # StandardSubjectDataMapper
     try:
-        # Standard subject mapping
         if sentence_service.df_standard_subject_map_data is not None:
             standard_subject_data_mapper = StandardSubjectDataMapper(
                 df_map_data=sentence_service.df_standard_subject_map_data
             )
             df_output_data = standard_subject_data_mapper.map_data(
-                df_input_data=df_input_data,
-                input_key_columns=["出力_科目"],
-                in_place=True,
+                df_input_data=df_input_data, input_key_columns=["出力_科目"], in_place=True
             )
+            fillna_with_space(df_output_data)
         else:
             df_output_data = df_input_data.copy()
 
@@ -130,131 +144,127 @@ async def predict(
         # Continue with original data if standard subject mapping fails
         df_output_data = df_input_data.copy()
 
+    # SubSubjectSimilarityMapper
     try:
-        # Sub subject mapping
         if sentence_service.df_sub_subject_map_data is not None:
             sub_subject_similarity_mapper = SubSubjectSimilarityMapper(
                 cached_embedding_helper=sentence_service.sub_subject_cached_embedding_helper,
                 df_map_data=sentence_service.df_sub_subject_map_data,
             )
-            sub_subject_similarity_mapper.predict_input_optimized(
-                df_input_data=df_output_data
+            df_input_sub_subject = df_output_data[
+                ["科目", "標準科目", "出力_科目", "中科目", "分類"]
+            ].drop_duplicates()
+            sub_subject_similarity_mapper.predict_input(df_input_data=df_input_sub_subject)
+
+            sub_subject_map_key_columns = ["科目", "標準科目", "出力_科目", "中科目", "分類"]
+            sub_subject_map_data_columns = [
+                "出力_基準中科目",
+                "出力_中科目類似度",
+                "出力_中科目",
+                "外部・内部区分",
+            ]
+
+            sub_subject_data_mapper = BaseDictionaryMapper(
+                df_input_sub_subject, sub_subject_map_key_columns, sub_subject_map_data_columns
+            )
+            sub_subject_data_mapper.map_data(
+                df_input_data=df_output_data,
+                input_key_columns=sub_subject_map_key_columns,
+                in_place=True,
             )
-            df_output_data = df_output_data.fillna("")
+            fillna_with_space(df_output_data)
 
     except Exception as e:
         print(f"Error processing SubSubjectSimilarityMapper: {e}")
         raise HTTPException(status_code=500, detail=str(e))
 
-
+    # NameSimilarityMapper
     try:
-        # Name mapping
         if sentence_service.df_name_map_data is not None:
             name_sentence_mapper = NameSimilarityMapper(
                 cached_embedding_helper=sentence_service.name_cached_embedding_helper,
                 df_map_data=sentence_service.df_name_map_data,
             )
-            name_sentence_mapper.predict_input_optimized(df_input_data=df_output_data)
+            name_sentence_mapper.predict_input(df_input_data=df_output_data)
+            fillna_with_space(df_output_data)
 
     except Exception as e:
         print(f"Error processing NameSimilarityMapper: {e}")
         raise HTTPException(status_code=500, detail=str(e))
-    try:
-        sub_subject_location_mapper = SubSubjectLocationDataMapper()
-        sub_subject_location_mapper.map_location(df_output_data)
-    except Exception as e:
-        print(f"Error processing SubSubjectLocationDataMapper: {e}")
-        raise HTTPException(status_code=500, detail=str(e))
 
+    # SubSubjectAndNameDataMapper
     try:
-        # Sub subject and name mapping
         if sentence_service.df_sub_subject_and_name_map_data is not None:
-            sub_subject_and_name_mapper = SubSubjectAndNameDataMapper(
+            sub_subject_and_name_data_mapper = SubSubjectAndNameDataMapper(
                 df_map_data=sentence_service.df_sub_subject_and_name_map_data
             )
-            sub_subject_and_name_mapper.map_data(df_input_data=df_output_data)
+            sub_subject_and_name_data_mapper.map_data(df_input_data=df_output_data)
 
     except Exception as e:
         print(f"Error processing SubSubjectAndNameDataMapper: {e}")
         raise HTTPException(status_code=500, detail=str(e))
 
+    # UnitMapper
     try:
-        # Abstract mapping
-        if sentence_service.df_abstract_map_data is not None:
-            # Ensure required columns exist before AbstractSimilarityMapper
-            required_columns_for_abstract = {
-                "標準科目": "",
-                "摘要グループ": "",
-                "確定": "未確定",
-                "摘要": "",
-                "備考": "",
-            }
-
-            # Add missing columns with appropriate defaults
-            for col, default_val in required_columns_for_abstract.items():
-                if col not in df_output_data.columns:
-                    df_output_data[col] = default_val
-                    print(
-                        f"DEBUG: Added missing column '{col}' with default value '{default_val}'"
-                    )
-
-            # Ensure data types are correct (convert to string to avoid type issues)
-            for col in ["標準科目", "摘要グループ", "確定", "摘要", "備考"]:
-                if col in df_output_data.columns:
-                    df_output_data[col] = df_output_data[col].astype(str).fillna("")
-
+        if sentence_service.df_unit_map_data is not None:
+            unit_similarity_mapper = UnitMapper(
+                cached_embedding_helper=sentence_service.unit_cached_embedding_helper,
+                df_map_data=sentence_service.df_unit_map_data,
+            )
+            unit_map_key_columns = ["単位"]
+            df_input_unit = df_input_data[unit_map_key_columns].drop_duplicates()
+            unit_similarity_mapper.predict_input(df_input_data=df_input_unit)
+
+            output_unit_data_columns = ["出力_基準単位", "出力_単位類似度", "出力_集計用単位", "出力_標準単位"]
+            unit_data_mapper = BaseDictionaryMapper(
+                df_input_unit, unit_map_key_columns, output_unit_data_columns
+            )
+            _ = unit_data_mapper.map_data(
+                df_input_data=df_output_data, input_key_columns=unit_map_key_columns, in_place=True
+            )
+            fillna_with_space(df_output_data)
+    except Exception as e:
+        print(f"Error processing UnitMapper: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+    # AbstractSimilarityMapper
+    try:
+        if sentence_service.df_abstract_map_data is not None:
             abstract_similarity_mapper = AbstractSimilarityMapper(
                 cached_embedding_helper=sentence_service.abstract_cached_embedding_helper,
                 df_map_data=sentence_service.df_abstract_map_data,
             )
-            abstract_similarity_mapper.predict_input_optimized(df_input_data=df_output_data)
-
-            print(f"DEBUG: AbstractSimilarityMapper completed successfully")
+            abstract_similarity_mapper.predict_input(df_input_data=df_output_data)
 
     except Exception as e:
         print(f"Error processing AbstractSimilarityMapper: {e}")
         print(f"DEBUG: Full error traceback:")
-        import traceback
-
         traceback.print_exc()
         # Don't raise the exception, continue processing
         print(f"DEBUG: Continuing without AbstractSimilarityMapper...")
 
+    # NameAndAbstractDataMapper
     try:
-        # Name and abstract mapping
         if sentence_service.df_name_and_subject_map_data is not None:
             name_and_abstract_mapper = NameAndAbstractDataMapper(
                 df_map_data=sentence_service.df_name_and_subject_map_data
            )
-            df_output_data = name_and_abstract_mapper.map_data(df_output_data)
+            df_output_data["出力_項目名"] = df_output_data["出力_標準名称"]
+            _ = name_and_abstract_mapper.map_data(df_output_data)
+            fillna_with_space(df_output_data)
+            df_output_data["出力_項目名(中科目抜き)"] = df_output_data["出力_項目名"]
 
     except Exception as e:
         print(f"Error processing NameAndAbstractDataMapper: {e}")
         raise HTTPException(status_code=500, detail=str(e))
 
+    # SubSubjectLocationDataMapper
     try:
-        # Unit mapping
-        if sentence_service.df_unit_map_data is not None:
-            unit_mapper = UnitSimilarityMapper(
-                cached_embedding_helper=sentence_service.unit_cached_embedding_helper,
-                df_map_data=sentence_service.df_unit_map_data,
-            )
-            unit_mapper.predict_input_optimized(df_input_data=df_output_data)
-
-    except Exception as e:
-        print(f"Error processing UnitMapper: {e}")
-        raise HTTPException(status_code=500, detail=str(e))
-
-    try:
-        # Standard name mapping
-        if sentence_service.df_standard_name_map_data is not None:
-            standard_name_mapper = StandardNameMapper(
-                df_map_data=sentence_service.df_standard_name_map_data
-            )
-            df_output_data = standard_name_mapper.map_data(df_output_data)
-
+        sub_subject_location_mapper = SubSubjectLocationDataMapper()
+        sub_subject_location_mapper.map_location(df_output_data)
+        df_output_data["名称"] = df_output_data["元名称"]
     except Exception as e:
-        print(f"Error processing StandardNameMapper: {e}")
+        print(f"Error processing SubSubjectLocationDataMapper: {e}")
         raise HTTPException(status_code=500, detail=str(e))
 
     # Create output columns and ensure they have proper values
@@ -286,7 +296,6 @@ async def predict(
     for col, default_value in required_columns.items():
        if col not in df_output_data.columns:
            df_output_data[col] = default_value
-
     # Map output columns to match Excel structure
     # 出力_中科目 mapping - use the standard sub-subject from sub-subject mapper
     if "出力_中科目" in df_output_data.columns:
@@ -331,26 +340,26 @@ async def predict(
     print(f"Available columns after processing: {list(df_output_data.columns)}")
 
     # Final check and fallback for missing output columns
-    if (
-        "出力_中科目" not in df_output_data.columns
-        or df_output_data["出力_中科目"].eq("").all()
-    ):
-        df_output_data["出力_中科目"] = df_output_data.get("中科目", "")
-
-    if (
-        "出力_項目名" not in df_output_data.columns
-        or df_output_data["出力_項目名"].eq("").all()
-    ):
-        df_output_data["出力_項目名"] = df_output_data.get("名称", "")
-
-    if (
-        "出力_単位" not in df_output_data.columns
-        or df_output_data["出力_単位"].eq("").all()
-    ):
-        df_output_data["出力_単位"] = df_output_data.get("単位", "")
-
-    if "出力_確率度" not in df_output_data.columns:
-        df_output_data["出力_確率度"] = 0  # Default confidence score
+    # if (
+    #     "出力_中科目" not in df_output_data.columns
+    #     or df_output_data["出力_中科目"].eq("").all()
+    # ):
+    #     df_output_data["出力_中科目"] = df_output_data.get("中科目", "")
+
+    # if (
+    #     "出力_項目名" not in df_output_data.columns
+    #     or df_output_data["出力_項目名"].eq("").all()
+    # ):
+    #     df_output_data["出力_項目名"] = df_output_data.get("名称", "")
+
+    # if (
+    #     "出力_単位" not in df_output_data.columns
+    #     or df_output_data["出力_単位"].eq("").all()
+    # ):
+    #     df_output_data["出力_単位"] = df_output_data.get("単位", "")
+
+    # if "出力_確率度" not in df_output_data.columns:
+    #     df_output_data["出力_確率度"] = 0  # Default confidence score
 
     # Define output columns in exact order as shown in Excel
     output_columns = [
@@ -511,14 +520,14 @@ async def predict_raw(
     try:
         # Unit mapping
         if sentence_service.df_unit_map_data is not None:
-            unit_mapper = UnitSimilarityMapper(
+            unit_mapper = UnitMapper(
                 cached_embedding_helper=sentence_service.unit_cached_embedding_helper,
                 df_map_data=sentence_service.df_unit_map_data,
            )
             unit_mapper.predict_input(df_input_data=df_input_data)
 
     except Exception as e:
-        print(f"Error processing UnitSimilarityMapper: {e}")
+        print(f"Error processing UnitMapper: {e}")
         raise HTTPException(status_code=500, detail=str(e))
 
     # Ensure required columns exist
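
Note: the new preprocessing hunk normalizes 名称 to half-width (Z2H) before any similarity lookup, keeping the raw value in 元名称 so it can be restored at the end of the pipeline. A minimal sketch of the same idea, assuming common_lib is unavailable and using unicodedata NFKC as a rough stand-in for preprocess_text (NFKC folds full-width digits and alphabet to half-width, though it treats kana width differently from a strict Z2H conversion):

import unicodedata

import pandas as pd

def normalize_z2h(text: str) -> str:
    # Hypothetical stand-in for common_lib.string_utilities.preprocess_text:
    # NFKC folds full-width (zenkaku) ASCII and digits to half-width (hankaku).
    return unicodedata.normalize("NFKC", text)

df = pd.DataFrame({"名称": ["ポンプ１", None, "ＡＢＣ配管"]})
df["元名称"] = df["名称"]  # keep the original so it can be restored later
df["名称"] = df["名称"].apply(lambda x: normalize_z2h(x) if pd.notna(x) else "")
print(df)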
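
The recurring pattern in this rewrite — run the expensive similarity prediction only over deduplicated key rows, then broadcast the predicted columns back onto the full frame — is what BaseDictionaryMapper appears to provide. A rough sketch of that pattern in plain pandas (predict_units and the column values below are illustrative, not the mapping_lib API):

import pandas as pd

def predict_units(df_keys: pd.DataFrame) -> None:
    # Stand-in for UnitMapper.predict_input: annotate each unique 単位 in place.
    df_keys["出力_標準単位"] = df_keys["単位"].str.strip().str.lower()

df_output = pd.DataFrame({"単位": ["M2 ", "m2", "M2 ", "式"], "名称": ["a", "b", "c", "d"]})

# 1) Predict on unique keys only, as the diff does with drop_duplicates().
df_unit_keys = df_output[["単位"]].drop_duplicates()
predict_units(df_unit_keys)

# 2) Broadcast the results back over all rows (the BaseDictionaryMapper role).
df_output = df_output.merge(df_unit_keys, on="単位", how="left")
print(df_output)

With N input rows but only k distinct units, the embedding work drops from N predictions to k.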
validate_optimization.py CHANGED
@@ -25,7 +25,7 @@ class FileComparator:
         '出力_中科目',
         '出力_標準名称',
         '出力_項目名',
-        '出力_標準単位'
+        '出力_集計用単位'
     ]
 
     def load_original_data(self) -> pd.DataFrame:
@@ -236,7 +236,7 @@ def main():
     """Main function to compare two files"""
     # File paths
     original_file = "data/outputData_original.csv"
-    second_file = "data/outputData_api_v2.csv"
+    second_file = "data/outputData_api.csv"
 
     if not os.path.exists(original_file):
         print(f"❌ Original file not found: {original_file}")
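
The comparator now checks 出力_集計用単位 (instead of 出力_標準単位) between data/outputData_original.csv and data/outputData_api.csv. A hedged sketch of such a per-column comparison; FileComparator's actual logic is not shown in this diff, so the row-aligned match-rate calculation below is an assumption:

import pandas as pd

compare_columns = ["出力_中科目", "出力_標準名称", "出力_項目名", "出力_集計用単位"]

df_original = pd.read_csv("data/outputData_original.csv")
df_second = pd.read_csv("data/outputData_api.csv")

# Assumed row-aligned comparison: report a match rate for each output column.
for col in compare_columns:
    matches = df_original[col].fillna("").eq(df_second[col].fillna("")).mean()
    print(f"{col}: {matches:.1%} match")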