Zengyf-CVer committed on
Commit
940a520
1 Parent(s): 755ce29

v02 update

Browse files
Files changed (4) hide show
  1. README.md +1 -1
  2. app.py +57 -31
  3. data/test03.png +0 -0
  4. packages.txt +2 -1
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 🚀
4
  colorFrom: gray
5
  colorTo: indigo
6
  sdk: gradio
7
- sdk_version: 3.0.18
8
  app_file: app.py
9
  pinned: false
10
  license: gpl-3.0
 
4
  colorFrom: gray
5
  colorTo: indigo
6
  sdk: gradio
7
+ sdk_version: 3.0.26
8
  app_file: app.py
9
  pinned: false
10
  license: gpl-3.0
app.py CHANGED
@@ -1,20 +1,21 @@
1
- # OCR Translate v0.1
2
  # 创建人:曾逸夫
3
- # 创建时间:2022-06-14
4
- # email: [email protected]
5
- # 项目地址:https://gitee.com/CV_Lab/ocr-translate
6
 
7
  import os
8
 
 
 
9
  import gradio as gr
10
  import nltk
 
11
  import pytesseract
12
  from nltk.tokenize import sent_tokenize
13
  from transformers import MarianMTModel, MarianTokenizer
14
 
15
  nltk.download('punkt')
16
 
17
- OCR_TR_DESCRIPTION = '''# OCR Translate v0.1
18
  <div id="content_align">基于Tesseract的OCR翻译系统</div>'''
19
 
20
  # 图片路径
@@ -26,6 +27,7 @@ choices = os.popen('tesseract --list-langs').read().split('\n')[1:-1]
26
 
27
  # 翻译模型选择
28
  def model_choice(src="en", trg="zh"):
 
29
  # https://huggingface.co/Helsinki-NLP/opus-mt-en-zh
30
  model_name = f"Helsinki-NLP/opus-mt-{src}-{trg}" # 模型名称
31
 
@@ -55,23 +57,35 @@ def ocr_tesseract(img, languages):
55
  return ocr_str
56
 
57
 
58
- # 示例
59
- def set_example_image(example: list) -> dict:
60
- return gr.Image.update(value=example[0])
61
-
62
-
63
  # 清除
64
  def clear_content():
65
  return None
66
 
67
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  # 翻译
69
- def translate(input_text):
70
  # 参考:https://huggingface.co/docs/transformers/model_doc/marian
71
  if input_text is None or input_text == "":
72
  return "系统提示:没有可翻译的内容!"
73
 
74
- tokenizer, model = model_choice()
 
 
75
 
76
  translate_text = ""
77
  input_text_list = input_text.split("\n\n")
@@ -90,6 +104,7 @@ def translate(input_text):
90
 
91
  return translate_text[2:]
92
 
 
93
  def main():
94
 
95
  with gr.Blocks(css='style.css') as ocr_tr:
@@ -106,16 +121,31 @@ def main():
106
  with gr.Row():
107
  inputs_img = gr.Image(image_mode="RGB", source="upload", type="pil", label="图片")
108
  with gr.Row():
109
- inputs_lang = gr.CheckboxGroup(choices=choices, type="value", value=['eng'], label='语言')
 
 
 
110
 
111
  with gr.Row():
112
  clear_img_btn = gr.Button('Clear')
113
  ocr_btn = gr.Button(value='OCR 提取', variant="primary")
114
 
115
  with gr.Column():
116
- imgs_path = sorted(os.listdir(img_dir))
117
- example_images = gr.Dataset(components=[inputs_img],
118
- samples=[[f"{img_dir}/{i}"] for i in imgs_path])
 
 
 
 
 
 
 
 
 
 
 
 
119
 
120
  # -------------- 翻译 --------------
121
  with gr.Box():
@@ -124,29 +154,25 @@ def main():
124
  gr.Markdown("### Step 02: 翻译")
125
 
126
  with gr.Row():
127
- with gr.Column():
128
- with gr.Row():
129
- outputs_text = gr.Textbox(label="提取内容", lines=20)
130
- with gr.Row():
131
- clear_text_btn = gr.Button('Clear')
132
- translate_btn = gr.Button(value='翻译', variant="primary")
133
- with gr.Column():
134
- outputs_tr_text = gr.Textbox(label="翻译内容", lines=20)
135
 
136
  # ---------------------- OCR Tesseract ----------------------
137
  ocr_btn.click(fn=ocr_tesseract, inputs=[inputs_img, inputs_lang], outputs=[
138
  outputs_text,])
139
-
140
  clear_img_btn.click(fn=clear_content, inputs=[], outputs=[inputs_img])
141
 
142
- example_images.click(fn=set_example_image, inputs=[
143
- example_images,], outputs=[
144
- inputs_img,])
145
-
146
- # ---------------------- OCR Tesseract ----------------------
147
- translate_btn.click(fn=translate, inputs=[outputs_text], outputs=[outputs_tr_text])
148
  clear_text_btn.click(fn=clear_content, inputs=[], outputs=[outputs_text])
149
 
 
 
 
 
150
  ocr_tr.launch(inbrowser=True)
151
 
152
 
 
1
+ # OCR Translate v0.2
2
  # 创建人:曾逸夫
3
+ # 创建时间:2022-07-19
 
 
4
 
5
  import os
6
 
7
+ # os.system("apt-get install xclip")
8
+
9
  import gradio as gr
10
  import nltk
11
+ import pyclip
12
  import pytesseract
13
  from nltk.tokenize import sent_tokenize
14
  from transformers import MarianMTModel, MarianTokenizer
15
 
16
  nltk.download('punkt')
17
 
18
+ OCR_TR_DESCRIPTION = '''# OCR Translate v0.2
19
  <div id="content_align">基于Tesseract的OCR翻译系统</div>'''
20
 
21
  # 图片路径
 
27
 
28
  # 翻译模型选择
29
  def model_choice(src="en", trg="zh"):
30
+ # https://huggingface.co/Helsinki-NLP/opus-mt-zh-en
31
  # https://huggingface.co/Helsinki-NLP/opus-mt-en-zh
32
  model_name = f"Helsinki-NLP/opus-mt-{src}-{trg}" # 模型名称
33
 
 
57
  return ocr_str
58
 
59
 
 
 
 
 
 
60
  # 清除
61
  def clear_content():
62
  return None
63
 
64
 
65
+ # 复制到剪贴板
66
+ def cp_text(input_text):
67
+ # sudo apt-get install xclip
68
+ try:
69
+ pyclip.copy(input_text)
70
+ except Exception as e:
71
+ print("sudo apt-get install xclip")
72
+ print(e)
73
+
74
+
75
+ # 清除剪贴板
76
+ def cp_clear():
77
+ pyclip.clear()
78
+
79
+
80
  # 翻译
81
+ def translate(input_text, inputs_transStyle):
82
  # 参考:https://huggingface.co/docs/transformers/model_doc/marian
83
  if input_text is None or input_text == "":
84
  return "系统提示:没有可翻译的内容!"
85
 
86
+ # 选择翻译模型
87
+ trans_src, trans_trg = inputs_transStyle.split("-")[0], inputs_transStyle.split("-")[1]
88
+ tokenizer, model = model_choice(trans_src, trans_trg)
89
 
90
  translate_text = ""
91
  input_text_list = input_text.split("\n\n")
 
104
 
105
  return translate_text[2:]
106
 
107
+
108
  def main():
109
 
110
  with gr.Blocks(css='style.css') as ocr_tr:
 
121
  with gr.Row():
122
  inputs_img = gr.Image(image_mode="RGB", source="upload", type="pil", label="图片")
123
  with gr.Row():
124
+ inputs_lang = gr.CheckboxGroup(choices=["chi_sim", "eng"],
125
+ type="value",
126
+ value=['eng'],
127
+ label='语言')
128
 
129
  with gr.Row():
130
  clear_img_btn = gr.Button('Clear')
131
  ocr_btn = gr.Button(value='OCR 提取', variant="primary")
132
 
133
  with gr.Column():
134
+ with gr.Row():
135
+ outputs_text = gr.Textbox(label="提取内容", lines=20)
136
+ with gr.Row():
137
+ inputs_transStyle = gr.Radio(choices=["zh-en", "en-zh"],
138
+ type="value",
139
+ value="zh-en",
140
+ label='翻译模式')
141
+ with gr.Row():
142
+ clear_text_btn = gr.Button('Clear')
143
+ translate_btn = gr.Button(value='翻译', variant="primary")
144
+
145
+ with gr.Row():
146
+ example_list = [["./data/test.png", ["eng"]], ["./data/test02.png", ["eng"]],
147
+ ["./data/test03.png", ["chi_sim"]]]
148
+ gr.Examples(example_list, [inputs_img, inputs_lang], outputs_text, ocr_tesseract, cache_examples=False)
149
 
150
  # -------------- 翻译 --------------
151
  with gr.Box():
 
154
  gr.Markdown("### Step 02: 翻译")
155
 
156
  with gr.Row():
157
+ outputs_tr_text = gr.Textbox(label="翻译内容", lines=20)
158
+
159
+ with gr.Row():
160
+ cp_clear_btn = gr.Button(value='清除剪贴板')
161
+ cp_btn = gr.Button(value='复制到剪贴板', variant="primary")
 
 
 
162
 
163
  # ---------------------- OCR Tesseract ----------------------
164
  ocr_btn.click(fn=ocr_tesseract, inputs=[inputs_img, inputs_lang], outputs=[
165
  outputs_text,])
 
166
  clear_img_btn.click(fn=clear_content, inputs=[], outputs=[inputs_img])
167
 
168
+ # ---------------------- 翻译 ----------------------
169
+ translate_btn.click(fn=translate, inputs=[outputs_text, inputs_transStyle], outputs=[outputs_tr_text])
 
 
 
 
170
  clear_text_btn.click(fn=clear_content, inputs=[], outputs=[outputs_text])
171
 
172
+ # ---------------------- 复制到剪贴板 ----------------------
173
+ cp_btn.click(fn=cp_text, inputs=[outputs_tr_text], outputs=[])
174
+ cp_clear_btn.click(fn=cp_clear, inputs=[], outputs=[])
175
+
176
  ocr_tr.launch(inbrowser=True)
177
 
178
 
data/test03.png ADDED
packages.txt CHANGED
@@ -1 +1,2 @@
1
- tesseract-ocr-all
 
 
1
+ tesseract-ocr-all
2
+ xclip