Update app.py
Browse files
app.py
CHANGED
|
@@ -261,63 +261,87 @@ def apply_vocabulary_correction(text):
|
|
| 261 |
text = text.replace(original, corrected)
|
| 262 |
return text
|
| 263 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 264 |
|
| 265 |
def create_bulk_paragraphs(text, max_chars=500):
|
| 266 |
"""
|
| 267 |
-
ํ
์คํธ๋ฅผ
|
|
|
|
|
|
|
|
|
|
|
|
|
| 268 |
|
| 269 |
Args:
|
| 270 |
-
text: ์
๋ ฅ ํ
์คํธ
|
| 271 |
-
max_chars: ์ต๋ ๋ฌธ์ ์ (๊ธฐ๋ณธ๊ฐ: 500)
|
| 272 |
|
| 273 |
Returns:
|
| 274 |
List[str]: ๋ฒํฌ ๋จ์๋ก ๋ถํ ๋ ํ
์คํธ ๋ฆฌ์คํธ
|
| 275 |
"""
|
| 276 |
-
|
| 277 |
-
|
| 278 |
if not paragraphs:
|
| 279 |
return []
|
| 280 |
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
current_length = 0
|
| 284 |
|
| 285 |
-
|
| 286 |
-
|
|
|
|
| 287 |
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
if
|
| 292 |
-
|
| 293 |
-
current_bulk = []
|
| 294 |
-
current_length = 0
|
| 295 |
|
| 296 |
-
#
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
# ํ์ฌ ๋ฒํฌ์ ์ถ๊ฐํ์ ๋ 500์๋ฅผ ์ด๊ณผํ๋ ๊ฒฝ์ฐ
|
| 300 |
-
if (
|
| 301 |
-
current_length + para_length + len(current_bulk) > max_chars
|
| 302 |
-
and current_bulk
|
| 303 |
-
):
|
| 304 |
-
# ํ์ฌ ๋ฒํฌ๋ฅผ ์์ฑํ๊ณ ์ ๋ฒํฌ ์์
|
| 305 |
-
bulks.append("\n".join(current_bulk))
|
| 306 |
-
current_bulk = [para]
|
| 307 |
-
current_length = para_length
|
| 308 |
else:
|
| 309 |
-
#
|
| 310 |
-
|
| 311 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 312 |
|
| 313 |
-
# ๋ง์ง๋ง
|
| 314 |
-
if
|
| 315 |
-
bulks.append(
|
| 316 |
|
| 317 |
return bulks
|
| 318 |
|
| 319 |
|
| 320 |
-
|
| 321 |
def process_bulk(bulk_text, bulk_index, max_retries=3, article_info=""):
|
| 322 |
"""
|
| 323 |
ํ๋์ ๋ฒํฌ๋ฅผ ํ์ดํ๋ผ์ธ์ผ๋ก ์ฒ๋ฆฌํฉ๋๋ค.
|
|
|
|
| 261 |
text = text.replace(original, corrected)
|
| 262 |
return text
|
| 263 |
|
| 264 |
+
import kss
|
| 265 |
+
|
| 266 |
+
def _combine_parts(parts: List[Tuple[int, str]]) -> str:
|
| 267 |
+
"""
|
| 268 |
+
parts: [(paragraph_index, sentence), ...]
|
| 269 |
+
๊ฐ์ ๋ฌธ๋จ์ ' ', ๋ค๋ฅธ ๋ฌธ๋จ์ '\n'์ผ๋ก ๊ฒฐํฉ (TS์ combineTexts ๊ท์น)
|
| 270 |
+
"""
|
| 271 |
+
if not parts:
|
| 272 |
+
return ""
|
| 273 |
+
result = parts[0][1]
|
| 274 |
+
last_idx = parts[0][0]
|
| 275 |
+
for p_idx, sent in parts[1:]:
|
| 276 |
+
if p_idx == last_idx:
|
| 277 |
+
result += " " + sent
|
| 278 |
+
else:
|
| 279 |
+
result += "\n" + sent
|
| 280 |
+
last_idx = p_idx
|
| 281 |
+
return result
|
| 282 |
|
| 283 |
def create_bulk_paragraphs(text, max_chars=500):
    """Split text into bulks by accumulating whole sentences.

    The text is split into paragraphs on ``'\\n'`` (blank or whitespace-only
    paragraphs are dropped), and each paragraph is passed to
    ``kss.split_sentences``. Sentences are accumulated into the current bulk
    until adding the next one would push the combined length past
    ``max_chars``; at that point the current bulk is closed and a new one is
    started with that sentence.

    - Sentences from the same paragraph are joined with a space (``' '``).
    - A paragraph change is joined with a newline (``'\\n'``).
    - A single sentence is never split further: a sentence longer than
      ``max_chars`` is placed in a bulk together with whatever follows only
      if it fits, otherwise it ends up in a bulk on its own.

    Args:
        text: Input text, possibly containing several paragraphs separated
            by ``'\\n'``. Non-string values are converted with ``str()``;
            ``None`` is treated as empty input.
        max_chars (int): Maximum number of characters per bulk (default 500).

    Returns:
        List[str]: The bulks, in order. Empty when there is no usable text.
    """
    # Without this guard, str(None) would yield the literal "None" and it
    # would be emitted as a bulk of its own.
    if text is None:
        return []

    # 1) Split into paragraphs, dropping blank / whitespace-only ones.
    paragraphs = [p.strip() for p in str(text).split("\n") if p.strip()]
    if not paragraphs:
        return []

    bulks: List[str] = []
    current_parts: List[Tuple[int, str]] = []  # (paragraph_index, sentence)
    current_len = 0

    for p_idx, paragraph in enumerate(paragraphs):
        # 2) Split each paragraph into sentences with KSS.
        for sent in kss.split_sentences(paragraph):
            sent = sent.strip()
            if not sent:
                continue

            # A separator (space or newline) always costs exactly one
            # character, so the prospective length is the same either way.
            if current_parts and current_len + 1 + len(sent) > max_chars:
                # Adding this sentence would overflow: flush the current
                # bulk and let this sentence start the next one.
                bulks.append(_combine_parts(current_parts))
                current_parts = []
                current_len = 0

            if current_parts:
                current_len += 1  # separator before a non-first sentence
            current_parts.append((p_idx, sent))
            current_len += len(sent)

    # Flush the trailing bulk.
    if current_parts:
        bulks.append(_combine_parts(current_parts))

    return bulks
|
| 343 |
|
| 344 |
|
|
|
|
| 345 |
def process_bulk(bulk_text, bulk_index, max_retries=3, article_info=""):
|
| 346 |
"""
|
| 347 |
ํ๋์ ๋ฒํฌ๋ฅผ ํ์ดํ๋ผ์ธ์ผ๋ก ์ฒ๋ฆฌํฉ๋๋ค.
|