ChloeLee22 committed on
Commit
aaca751
·
verified ·
1 Parent(s): cd1d660

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +60 -36
app.py CHANGED
@@ -261,63 +261,87 @@ def apply_vocabulary_correction(text):
261
  text = text.replace(original, corrected)
262
  return text
263
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
264
 
265
  def create_bulk_paragraphs(text, max_chars=500):
266
  """
267
- ํ…์ŠคํŠธ๋ฅผ 500์ž ๊ธฐ์ค€์œผ๋กœ ๋ฒŒํฌ ๋‹จ์œ„๋กœ ๋ถ„ํ• ํ•ฉ๋‹ˆ๋‹ค.
 
 
 
 
268
 
269
  Args:
270
- text: ์ž…๋ ฅ ํ…์ŠคํŠธ
271
- max_chars: ์ตœ๋Œ€ ๋ฌธ์ž ์ˆ˜ (๊ธฐ๋ณธ๊ฐ’: 500)
272
 
273
  Returns:
274
  List[str]: ๋ฒŒํฌ ๋‹จ์œ„๋กœ ๋ถ„ํ• ๋œ ํ…์ŠคํŠธ ๋ฆฌ์ŠคํŠธ
275
  """
276
- paragraphs = [p.strip() for p in text.split("\n") if p.strip()]
277
-
278
  if not paragraphs:
279
  return []
280
 
281
- bulks = []
282
- current_bulk = []
283
- current_length = 0
284
 
285
- for para in paragraphs:
286
- para_length = len(para)
 
287
 
288
- # ํ˜„์žฌ ๋ฌธ๋‹จ์ด 500์ž๋ฅผ ์ดˆ๊ณผํ•˜๋Š” ๊ฒฝ์šฐ
289
- if para_length > max_chars:
290
- # ํ˜„์žฌ ๋ฒŒํฌ๊ฐ€ ์žˆ๋‹ค๋ฉด ์ถ”๊ฐ€
291
- if current_bulk:
292
- bulks.append("\n".join(current_bulk))
293
- current_bulk = []
294
- current_length = 0
295
 
296
- # ๊ธด ๋ฌธ๋‹จ์€ ๋‹จ๋…์œผ๋กœ ์ฒ˜๋ฆฌ
297
- bulks.append(para)
298
- else:
299
- # ํ˜„์žฌ ๋ฒŒํฌ์— ์ถ”๊ฐ€ํ–ˆ์„ ๋•Œ 500์ž๋ฅผ ์ดˆ๊ณผํ•˜๋Š” ๊ฒฝ์šฐ
300
- if (
301
- current_length + para_length + len(current_bulk) > max_chars
302
- and current_bulk
303
- ):
304
- # ํ˜„์žฌ ๋ฒŒํฌ๋ฅผ ์™„์„ฑํ•˜๊ณ  ์ƒˆ ๋ฒŒํฌ ์‹œ์ž‘
305
- bulks.append("\n".join(current_bulk))
306
- current_bulk = [para]
307
- current_length = para_length
308
  else:
309
- # ํ˜„์žฌ ๋ฒŒํฌ์— ์ถ”๊ฐ€
310
- current_bulk.append(para)
311
- current_length += para_length
 
 
 
 
 
 
 
 
 
 
 
 
 
 
312
 
313
- # ๋งˆ์ง€๋ง‰ ๋ฒŒํฌ ์ถ”๊ฐ€
314
- if current_bulk:
315
- bulks.append("\n".join(current_bulk))
316
 
317
  return bulks
318
 
319
 
320
-
321
  def process_bulk(bulk_text, bulk_index, max_retries=3, article_info=""):
322
  """
323
  ํ•˜๋‚˜์˜ ๋ฒŒํฌ๋ฅผ ํŒŒ์ดํ”„๋ผ์ธ์œผ๋กœ ์ฒ˜๋ฆฌํ•ฉ๋‹ˆ๋‹ค.
 
261
  text = text.replace(original, corrected)
262
  return text
263
 
264
+ import kss
265
+
266
+ def _combine_parts(parts: List[Tuple[int, str]]) -> str:
267
+ """
268
+ parts: [(paragraph_index, sentence), ...]
269
+ ๊ฐ™์€ ๋ฌธ๋‹จ์€ ' ', ๋‹ค๋ฅธ ๋ฌธ๋‹จ์€ '\n'์œผ๋กœ ๊ฒฐํ•ฉ (TS์˜ combineTexts ๊ทœ์น™)
270
+ """
271
+ if not parts:
272
+ return ""
273
+ result = parts[0][1]
274
+ last_idx = parts[0][0]
275
+ for p_idx, sent in parts[1:]:
276
+ if p_idx == last_idx:
277
+ result += " " + sent
278
+ else:
279
+ result += "\n" + sent
280
+ last_idx = p_idx
281
+ return result
282
 
283
  def create_bulk_paragraphs(text, max_chars=500):
284
  """
285
+ ํ…์ŠคํŠธ๋ฅผ ๋ฌธ์žฅ ๋‹จ์œ„๋กœ ๋ˆ„์ ํ•˜๋‹ค๊ฐ€, ๊ฒฐํ•ฉ ๊ฒฐ๊ณผ ๊ธธ์ด๊ฐ€ max_chars๋ฅผ ๋„˜์œผ๋ฉด
286
+ ๊ทธ ์ง€์ ์—์„œ ๋Š๊ณ  ์ƒˆ ๋ฒŒํฌ๋ฅผ ์‹œ์ž‘ํ•ฉ๋‹ˆ๋‹ค.
287
+ - ๊ฐ™์€ ๋ฌธ๋‹จ ๋‚ด ๋ฌธ์žฅ๋“ค์€ ๊ณต๋ฐฑ(' ')์œผ๋กœ ๊ฒฐํ•ฉ
288
+ - ๋ฌธ๋‹จ์ด ๋ฐ”๋€Œ๋ฉด ๊ฐœํ–‰('\n')์œผ๋กœ ๊ฒฐํ•ฉ
289
+ - ๊ฐœ๋ณ„ ๋ฌธ์žฅ์€ ๋” ์ชผ๊ฐœ์ง€์ง€ ์•Š์Œ (๋ฌธ์žฅ ์ž์ฒด๊ฐ€ max๋ฅผ ๋„˜๋”๋ผ๋„ ๋‹จ๋…์œผ๋กœ ๋“ค์–ด๊ฐ)
290
 
291
  Args:
292
+ text (str): ์ž…๋ ฅ ํ…์ŠคํŠธ(์—ฌ๋Ÿฌ ๋ฌธ๋‹จ ๊ฐ€๋Šฅ, '\n' ๊ธฐ์ค€)
293
+ max_chars (int): ์ตœ๋Œ€ ๋ฌธ์ž ์ˆ˜ (๊ธฐ๋ณธ๊ฐ’: 500)
294
 
295
  Returns:
296
  List[str]: ๋ฒŒํฌ ๋‹จ์œ„๋กœ ๋ถ„ํ• ๋œ ํ…์ŠคํŠธ ๋ฆฌ์ŠคํŠธ
297
  """
298
+ # 1) ๋ฌธ๋‹จ ๋ถ„๋ฆฌ (๋นˆ ์ค„/๊ณต๋ฐฑ ๋ฌธ๋‹จ ์ œ๊ฑฐ)
299
+ paragraphs = [p.strip() for p in str(text).split("\n") if p.strip()]
300
  if not paragraphs:
301
  return []
302
 
303
+ # 2) ๊ฐ ๋ฌธ๋‹จ์„ KSS๋กœ ๋ฌธ์žฅ ๋ถ„๋ฆฌ
304
+ split_paragraphs: List[List[str]] = [list(kss.split_sentences(p)) for p in paragraphs]
 
305
 
306
+ bulks: List[str] = []
307
+ current_parts: List[Tuple[int, str]] = [] # (paragraph_index, sentence)
308
+ current_len = 0
309
 
310
+ for p_idx, sentences in enumerate(split_paragraphs):
311
+ for sent in sentences:
312
+ sent = sent.strip()
313
+ if not sent:
314
+ continue
 
 
315
 
316
+ # ํ˜„์žฌ ์ฒญํฌ์— ์ด ๋ฌธ์žฅ์„ ๋„ฃ์—ˆ์„ ๋•Œ ๊ธธ์ด ๊ณ„์‚ฐ
317
+ if not current_parts:
318
+ add_len = len(sent) # ์ฒซ ๋ฌธ์žฅ์€ ๊ตฌ๋ถ„์ž ์—†์Œ
 
 
 
 
 
 
 
 
 
319
  else:
320
+ # ๊ฐ™์€ ๋ฌธ๋‹จ=๊ณต๋ฐฑ 1, ๋‹ค๋ฅธ ๋ฌธ๋‹จ=๊ฐœํ–‰ 1 โ†’ ๊ธธ์ด ์ฐจ์ด๋Š” ์—†์œผ๋ฏ€๋กœ +1
321
+ add_len = 1 + len(sent)
322
+
323
+ # ๊ธธ์ด ์ดˆ๊ณผ๋ฉด, ํ˜„์žฌ ์ฒญํฌ๋ฅผ ๋จผ์ € ํ™•์ •
324
+ if current_parts and (current_len + add_len > max_chars):
325
+ bulks.append(_combine_parts(current_parts))
326
+ current_parts = []
327
+ current_len = 0
328
+ # ๊ณ„์†ํ•ด์„œ ์ด ๋ฌธ์žฅ์œผ๋กœ ์ƒˆ ์ฒญํฌ ์‹œ์ž‘
329
+
330
+ # ํ˜„์žฌ ์ฒญํฌ์— ๋ฌธ์žฅ ์ถ”๊ฐ€
331
+ if not current_parts:
332
+ current_parts.append((p_idx, sent))
333
+ current_len = len(sent)
334
+ else:
335
+ current_parts.append((p_idx, sent))
336
+ current_len += 1 + len(sent) # ๊ตฌ๋ถ„์ž(๊ณต๋ฐฑ/๊ฐœํ–‰=1) + ๋ฌธ์žฅ ๊ธธ์ด
337
 
338
+ # ๋งˆ์ง€๋ง‰ ์ฒญํฌ flush
339
+ if current_parts:
340
+ bulks.append(_combine_parts(current_parts))
341
 
342
  return bulks
343
 
344
 
 
345
  def process_bulk(bulk_text, bulk_index, max_retries=3, article_info=""):
346
  """
347
  ํ•˜๋‚˜์˜ ๋ฒŒํฌ๋ฅผ ํŒŒ์ดํ”„๋ผ์ธ์œผ๋กœ ์ฒ˜๋ฆฌํ•ฉ๋‹ˆ๋‹ค.