mirror of
https://github.com/leigest519/ScreenCoder.git
synced 2026-02-13 10:12:46 +00:00
Initial commit
This commit is contained in:
181
UIED/detect_text/Text.py
Normal file
181
UIED/detect_text/Text.py
Normal file
@@ -0,0 +1,181 @@
|
||||
import cv2
|
||||
import numpy as np
|
||||
|
||||
|
||||
class Text:
    """A text region detected by OCR: a recognized string plus its bounding box.

    location is a dict with keys 'left', 'top', 'right', 'bottom'
    (pixel coordinates of the bounding rectangle).
    """
    def __init__(self, id, content, location):
        self.id = id                # numeric identifier
        self.content = content      # recognized string
        self.location = location    # {'left', 'top', 'right', 'bottom'}

        self.width = self.location['right'] - self.location['left']
        self.height = self.location['bottom'] - self.location['top']
        self.area = self.width * self.height
        # average horizontal space one character occupies; used as a gap
        # threshold when merging words into sentences
        # (NOTE: raises ZeroDivisionError for empty content)
        self.word_width = self.width / len(self.content)

    '''
    ********************************
    *** Relation with Other text ***
    ********************************
    '''
    def is_justified(self, ele_b, direction='h', max_bias_justify=4):
        '''
        Check if the element is aligned (justified) with ele_b
        :param max_bias_justify: maximum bias if two elements to be justified
        :param direction:
         - 'v': vertical up-down connection
         - 'h': horizontal left-right connection
        '''
        l_a = self.location
        l_b = ele_b.location
        # connected vertically - up and below
        if direction == 'v':
            # left and right should be justified
            if abs(l_a['left'] - l_b['left']) < max_bias_justify and abs(l_a['right'] - l_b['right']) < max_bias_justify:
                return True
            return False
        elif direction == 'h':
            # top and bottom should be justified
            if abs(l_a['top'] - l_b['top']) < max_bias_justify and abs(l_a['bottom'] - l_b['bottom']) < max_bias_justify:
                return True
            return False

    def is_on_same_line(self, text_b, direction='h', bias_gap=4, bias_justify=4):
        '''
        Check if the element is on the same row(direction='h') or column(direction='v') with text_b
        :param direction:
         - 'v': vertical up-down connection
         - 'h': horizontal left-right connection
        :return: True when the two boxes are justified and the gap between them is small
        '''
        l_a = self.location
        l_b = text_b.location
        # connected vertically - up and below
        if direction == 'v':
            # left and right should be justified
            if self.is_justified(text_b, direction='v', max_bias_justify=bias_justify):
                # top and bottom should be connected (small gap)
                if abs(l_a['bottom'] - l_b['top']) < bias_gap or abs(l_a['top'] - l_b['bottom']) < bias_gap:
                    return True
            return False
        elif direction == 'h':
            # top and bottom should be justified
            if self.is_justified(text_b, direction='h', max_bias_justify=bias_justify):
                # left and right should be connected (small gap)
                if abs(l_a['right'] - l_b['left']) < bias_gap or abs(l_a['left'] - l_b['right']) < bias_gap:
                    return True
            return False

    def is_intersected(self, text_b, bias):
        '''
        Check whether this element's box overlaps text_b's box
        :param bias: shrinks the intersection by this many pixels so that a
                     mere touch does not count as an overlap
        :return: True if the (shrunken) intersection area is positive, else False
        '''
        l_a = self.location
        l_b = text_b.location
        left_in = max(l_a['left'], l_b['left']) + bias
        top_in = max(l_a['top'], l_b['top']) + bias
        right_in = min(l_a['right'], l_b['right'])
        bottom_in = min(l_a['bottom'], l_b['bottom'])

        w_in = max(0, right_in - left_in)
        h_in = max(0, bottom_in - top_in)
        area_in = w_in * h_in
        if area_in > 0:
            return True
        # fixed: previously fell through and returned None; return an explicit
        # bool (both are falsy, so existing truthiness-based callers see no change)
        return False

    '''
    ***********************
    *** Revise the Text ***
    ***********************
    '''
    def merge_text(self, text_b):
        '''
        Absorb text_b into this element (in place): the bounding box becomes
        the union of the two boxes and the contents are joined left-to-right
        with a single space.
        '''
        text_a = self
        # decide reading order from the ORIGINAL boxes.  The previous code
        # compared lefts only after self.location had already been replaced by
        # the union box (whose 'left' is the minimum of both), so the swap
        # branch could never fire and text_a's content always came first.
        if text_a.location['left'] <= text_b.location['left']:
            left_element, right_element = text_a, text_b
        else:
            left_element, right_element = text_b, text_a
        merged_content = left_element.content + ' ' + right_element.content

        top = min(text_a.location['top'], text_b.location['top'])
        left = min(text_a.location['left'], text_b.location['left'])
        right = max(text_a.location['right'], text_b.location['right'])
        bottom = max(text_a.location['bottom'], text_b.location['bottom'])
        self.location = {'left': left, 'top': top, 'right': right, 'bottom': bottom}
        self.width = self.location['right'] - self.location['left']
        self.height = self.location['bottom'] - self.location['top']
        self.area = self.width * self.height

        self.content = merged_content
        self.word_width = self.width / len(self.content)

    def shrink_bound(self, binary_map):
        '''
        Tighten the bounding box to the foreground pixels in binary_map:
        each edge moves inward past any all-zero margin rows/columns.
        :param binary_map: 2-d binary image of the whole page (non-zero = ink)
        '''
        bin_clip = binary_map[self.location['top']:self.location['bottom'], self.location['left']:self.location['right']]
        height, width = np.shape(bin_clip)

        # state per edge: 0 = not inspected yet, 1 = blank margin found and
        # still scanning inward, -1 = edge settled
        shrink_top = 0
        shrink_bottom = 0
        for i in range(height):
            # top
            if shrink_top == 0:
                if sum(bin_clip[i]) == 0:
                    shrink_top = 1
                else:
                    shrink_top = -1
            elif shrink_top == 1:
                if sum(bin_clip[i]) != 0:
                    self.location['top'] += i
                    shrink_top = -1
            # bottom
            if shrink_bottom == 0:
                if sum(bin_clip[height-i-1]) == 0:
                    shrink_bottom = 1
                else:
                    shrink_bottom = -1
            elif shrink_bottom == 1:
                if sum(bin_clip[height-i-1]) != 0:
                    self.location['bottom'] -= i
                    shrink_bottom = -1

            if shrink_top == -1 and shrink_bottom == -1:
                break

        shrink_left = 0
        shrink_right = 0
        for j in range(width):
            # left
            if shrink_left == 0:
                if sum(bin_clip[:, j]) == 0:
                    shrink_left = 1
                else:
                    shrink_left = -1
            elif shrink_left == 1:
                if sum(bin_clip[:, j]) != 0:
                    self.location['left'] += j
                    shrink_left = -1
            # right
            if shrink_right == 0:
                if sum(bin_clip[:, width-j-1]) == 0:
                    shrink_right = 1
                else:
                    shrink_right = -1
            elif shrink_right == 1:
                if sum(bin_clip[:, width-j-1]) != 0:
                    self.location['right'] -= j
                    shrink_right = -1

            if shrink_left == -1 and shrink_right == -1:
                break
        self.width = self.location['right'] - self.location['left']
        self.height = self.location['bottom'] - self.location['top']
        self.area = self.width * self.height
        self.word_width = self.width / len(self.content)

    '''
    *********************
    *** Visualization ***
    *********************
    '''
    def visualize_element(self, img, color=(0, 0, 255), line=1, show=False):
        '''
        Draw this element's bounding box on img (in place)
        :param color: BGR rectangle color
        :param line: rectangle line thickness
        :param show: when True, also print the content and pop up a preview window
        '''
        loc = self.location
        cv2.rectangle(img, (loc['left'], loc['top']), (loc['right'], loc['bottom']), color, line)
        if show:
            print(self.content)
            cv2.imshow('text', img)
            cv2.waitKey()
            cv2.destroyWindow('text')
|
||||
BIN
UIED/detect_text/__pycache__/Text.cpython-312.pyc
Normal file
BIN
UIED/detect_text/__pycache__/Text.cpython-312.pyc
Normal file
Binary file not shown.
BIN
UIED/detect_text/__pycache__/Text.cpython-35.pyc
Normal file
BIN
UIED/detect_text/__pycache__/Text.cpython-35.pyc
Normal file
Binary file not shown.
BIN
UIED/detect_text/__pycache__/ocr.cpython-312.pyc
Normal file
BIN
UIED/detect_text/__pycache__/ocr.cpython-312.pyc
Normal file
Binary file not shown.
BIN
UIED/detect_text/__pycache__/ocr.cpython-35.pyc
Normal file
BIN
UIED/detect_text/__pycache__/ocr.cpython-35.pyc
Normal file
Binary file not shown.
BIN
UIED/detect_text/__pycache__/text_detection.cpython-312.pyc
Normal file
BIN
UIED/detect_text/__pycache__/text_detection.cpython-312.pyc
Normal file
Binary file not shown.
BIN
UIED/detect_text/__pycache__/text_detection.cpython-35.pyc
Normal file
BIN
UIED/detect_text/__pycache__/text_detection.cpython-35.pyc
Normal file
Binary file not shown.
55
UIED/detect_text/ocr.py
Normal file
55
UIED/detect_text/ocr.py
Normal file
@@ -0,0 +1,55 @@
|
||||
import cv2
|
||||
import os
|
||||
import requests
|
||||
import json
|
||||
from base64 import b64encode
|
||||
import time
|
||||
|
||||
|
||||
def Google_OCR_makeImageData(imgpath):
    '''
    Build the JSON request body for the Google Vision images:annotate endpoint
    :param imgpath: path of the image file to encode
    :return: UTF-8 encoded JSON bytes ready to POST
    '''
    with open(imgpath, 'rb') as f:
        ctxt = b64encode(f.read()).decode()
    img_req = {
        'image': {
            'content': ctxt
        },
        'features': [{
            'type': 'DOCUMENT_TEXT_DETECTION',
            # 'type': 'TEXT_DETECTION',
            'maxResults': 1
        }]
    }
    # fixed: the Vision API expects "requests" to be a *list* of request
    # objects; the previous code sent the bare dict, which the endpoint rejects
    return json.dumps({"requests": [img_req]}).encode()
|
||||
|
||||
|
||||
def ocr_detection_google(imgpath):
    '''
    Detect text in an image through the Google Vision OCR web API
    :param imgpath: path of the image to process
    :return: the list of textAnnotations with the first (full-page summary)
             entry dropped, or None when no text was found
    :raises Exception: when the API reports an error or returns no 'responses'
    '''
    start = time.perf_counter()
    url = 'https://vision.googleapis.com/v1/images:annotate'
    # NOTE(review): committing an API key to source control leaks the
    # credential -- prefer the GOOGLE_VISION_API_KEY environment variable;
    # the hard-coded value is kept only as a fallback for old setups.
    api_key = os.environ.get('GOOGLE_VISION_API_KEY',
                             'AIzaSyDUc4iOUASJQYkVwSomIArTKhE2C6bHK8U')  # *** Replace with your own Key ***
    imgdata = Google_OCR_makeImageData(imgpath)
    response = requests.post(url,
                             data=imgdata,
                             params={'key': api_key},
                             # fixed: the HTTP header name is 'Content-Type',
                             # not 'Content_Type'
                             headers={'Content-Type': 'application/json'})
    # print('*** Text Detection Time Taken:%.3fs ***' % (time.perf_counter() - start))
    print("*** Please replace the Google OCR key at detect_text/ocr.py line 28 with your own (apply in https://cloud.google.com/vision) ***")

    response_json = response.json()
    if 'error' in response_json:
        error_msg = response_json['error']
        if 'BILLING_DISABLED' in str(error_msg):
            raise Exception("Google Vision API requires billing to be enabled. Please:\n"
                            "1. Visit https://console.developers.google.com/billing/enable?project=718250946490\n"
                            "2. Enable billing for your project\n"
                            "3. Wait a few minutes for changes to propagate\n"
                            "4. Or use PaddleOCR instead by setting method='paddle'")
        else:
            raise Exception(f"Google Vision API error: {error_msg}")

    if 'responses' not in response_json:
        raise Exception(response_json)
    if response_json['responses'] == [{}]:
        # No Text
        return None
    else:
        # entry 0 is the whole-page aggregate; keep only the per-word boxes
        return response_json['responses'][0]['textAnnotations'][1:]
|
||||
176
UIED/detect_text/text_detection.py
Normal file
176
UIED/detect_text/text_detection.py
Normal file
@@ -0,0 +1,176 @@
|
||||
import detect_text.ocr as ocr
|
||||
from detect_text.Text import Text
|
||||
import numpy as np
|
||||
import cv2
|
||||
import json
|
||||
import time
|
||||
import os
|
||||
from os.path import join as pjoin
|
||||
|
||||
|
||||
def save_detection_json(file_path, texts, img_shape):
    '''
    Dump detected texts to a JSON file
    :param file_path: output json path
    :param texts: list of Text elements (need .id, .content, .location, .width, .height)
    :param img_shape: shape of the source image, e.g. (h, w, channel)
    '''
    output = {'img_shape': img_shape, 'texts': []}
    for text in texts:
        c = {'id': text.id, 'content': text.content}
        loc = text.location
        c['column_min'], c['row_min'], c['column_max'], c['row_max'] = loc['left'], loc['top'], loc['right'], loc['bottom']
        c['width'] = text.width
        c['height'] = text.height
        output['texts'].append(c)
    # use a context manager so the handle is closed (and the data flushed)
    # even if serialization fails -- the original never closed the file
    with open(file_path, 'w') as f_out:
        json.dump(output, f_out, indent=4)
|
||||
|
||||
|
||||
def visualize_texts(org_img, texts, shown_resize_height=None, show=False, write_path=None):
    '''
    Draw every text element's bounding box on a copy of the image
    :param shown_resize_height: if given, resize the *displayed* image to this height
    :param show: pop up a preview window when True
    :param write_path: when given, save the full-size annotated image here
    '''
    board = org_img.copy()
    for txt in texts:
        txt.visualize_element(board, line=2)

    preview = board
    if shown_resize_height is not None:
        # keep the aspect ratio while scaling to the requested height
        scaled_width = int(shown_resize_height * (board.shape[1] / board.shape[0]))
        preview = cv2.resize(board, (scaled_width, shown_resize_height))

    if show:
        cv2.imshow('texts', preview)
        cv2.waitKey(0)
        cv2.destroyWindow('texts')
    if write_path is not None:
        cv2.imwrite(write_path, board)
|
||||
|
||||
|
||||
def text_sentences_recognition(texts):
    '''
    Merge separate words detected by Google ocr into a sentence
    '''
    merged_any = True
    # keep sweeping until a full pass performs no merge
    while merged_any:
        merged_any = False
        accepted = []
        for word in texts:
            target = None
            for sentence in accepted:
                justify = 0.2 * min(word.height, sentence.height)
                gap = 2 * max(word.word_width, sentence.word_width)
                if word.is_on_same_line(sentence, 'h', bias_justify=justify, bias_gap=gap):
                    target = sentence
                    break
            if target is None:
                accepted.append(word)
            else:
                target.merge_text(word)
                merged_any = True
        texts = accepted.copy()

    # renumber ids sequentially after merging
    for idx, txt in enumerate(texts):
        txt.id = idx
    return texts
|
||||
|
||||
|
||||
def merge_intersected_texts(texts):
    '''
    Merge intersected texts (sentences or words)
    '''
    merged_any = True
    # keep sweeping until a full pass absorbs nothing
    while merged_any:
        merged_any = False
        kept = []
        for candidate in texts:
            absorbed = False
            for existing in kept:
                if candidate.is_intersected(existing, bias=2):
                    existing.merge_text(candidate)
                    absorbed = True
                    merged_any = True
                    break
            if not absorbed:
                kept.append(candidate)
        texts = kept.copy()
    return texts
|
||||
|
||||
|
||||
def text_cvt_orc_format(ocr_result):
    '''
    Convert Google OCR annotations into Text objects
    :param ocr_result: list of textAnnotation dicts, or None
    :return: list of Text elements with axis-aligned rectangle locations
    '''
    texts = []
    if ocr_result is None:
        return texts
    for idx, annotation in enumerate(ocr_result):
        vertices = annotation['boundingPoly']['vertices']
        content = annotation['description']
        xs = []
        ys = []
        valid = True
        for vertex in vertices:
            # a vertex clipped at the image border can miss a coordinate;
            # skip the whole annotation in that case
            if 'x' not in vertex or 'y' not in vertex:
                valid = False
                break
            xs.append(vertex['x'])
            ys.append(vertex['y'])
        if not valid:
            continue
        location = {'left': min(xs), 'top': min(ys),
                    'right': max(xs), 'bottom': max(ys)}
        texts.append(Text(idx, content, location))
    return texts
|
||||
|
||||
|
||||
def text_cvt_orc_format_paddle(paddle_result):
    '''
    Convert PaddleOCR output into Text objects
    :param paddle_result: list whose first item is a dict holding
        'rec_texts' (strings) and 'rec_polys' (point arrays)
    :return: list of Text elements (empty on unrecognized input)
    '''
    texts = []
    well_formed = (isinstance(paddle_result, list)
                   and len(paddle_result) > 0
                   and isinstance(paddle_result[0], dict))
    if not well_formed:
        print("Unrecognized paddle_result format or empty result:", paddle_result)
        return texts

    result = paddle_result[0]
    contents = result.get('rec_texts', [])
    polygons = result.get('rec_polys', [])

    next_id = 0
    for content, poly in zip(contents, polygons):
        # drop empty recognitions; their ids are not consumed
        if not content:
            continue
        pts = np.array(poly)
        location = {'left': int(np.min(pts[:, 0])), 'top': int(np.min(pts[:, 1])),
                    'right': int(np.max(pts[:, 0])), 'bottom': int(np.max(pts[:, 1]))}
        texts.append(Text(next_id, content, location))
        next_id += 1
    return texts
|
||||
|
||||
|
||||
def text_filter_noise(texts):
    '''
    Drop single-character detections that are likely OCR noise, keeping
    common one-character tokens ('a', punctuation, '$', '%', etc.)
    '''
    allowed_single = ['a', ',', '.', '!', '?', '$', '%', ':', '&', '+']
    kept = []
    for text in texts:
        is_noise = len(text.content) <= 1 and text.content.lower() not in allowed_single
        if not is_noise:
            kept.append(text)
    return kept
|
||||
|
||||
|
||||
def text_detection(input_file='../data/input/30800.jpg', output_file='../data/output', show=False, method='paddle', paddle_model=None):
    '''
    Detect text in an image, then visualize and save the results under
    <output_file>/ocr/<name>.png and <name>.json
    :param input_file: path of the input image
    :param output_file: root output directory
    :param show: display the annotated image when True
    :param method: google or paddle
    :param paddle_model: the preload paddle model for paddle ocr
    :raises ValueError: when method is neither 'google' nor 'paddle'
    '''
    start = time.perf_counter()
    name = input_file.split('/')[-1][:-4]
    ocr_root = pjoin(output_file, 'ocr')
    # fixed: make sure the output directory exists -- cv2.imwrite fails
    # silently and the json open() raises when it is missing
    os.makedirs(ocr_root, exist_ok=True)
    img = cv2.imread(input_file)

    if method == 'google':
        print('*** Detect Text through Google OCR ***')
        ocr_result = ocr.ocr_detection_google(input_file)
        texts = text_cvt_orc_format(ocr_result)
        texts = merge_intersected_texts(texts)
        texts = text_filter_noise(texts)
        texts = text_sentences_recognition(texts)
    elif method == 'paddle':
        # The import of the paddle ocr can be separate to the beginning of the program if you decide to use this method
        from paddleocr import PaddleOCR
        print('*** Detect Text through Paddle OCR ***')
        if paddle_model is None:
            paddle_model = PaddleOCR(use_angle_cls=True, lang="ch")
        result = paddle_model.ocr(input_file)
        texts = text_cvt_orc_format_paddle(result)
    else:
        raise ValueError('Method has to be "google" or "paddle"')

    visualize_texts(img, texts, shown_resize_height=800, show=show, write_path=pjoin(ocr_root, name+'.png'))
    save_detection_json(pjoin(ocr_root, name+'.json'), texts, img.shape)
    print("[Text Detection Completed in %.3f s] Input: %s Output: %s" % (time.perf_counter() - start, input_file, pjoin(ocr_root, name+'.json')))
|
||||
|
||||
|
||||
# text_detection()
|
||||
|
||||
Reference in New Issue
Block a user