Spaces:

Tzktz
/

Dit-document-layout-analysis

Running

Upload 7664 files

6fc683c verified over 1 year ago

1.73 kB

	import json
	import hashlib
	import io
	import os
	import base64
	from PIL import Image
	from tqdm import tqdm

	def calculate_md5(image):
	    """Return the hex MD5 digest of *image* encoded as JPEG.

	    The image is serialized into an in-memory buffer through its ``save``
	    method with ``format='JPEG'``; the digest is computed over those
	    encoded bytes, not over any file the image may have been loaded from.
	    """
	    with io.BytesIO() as jpeg_buffer:
	        image.save(jpeg_buffer, format='JPEG')
	        encoded_bytes = jpeg_buffer.getvalue()
	    return hashlib.md5(encoded_bytes).hexdigest()

	def process_files(directory):
	    """Build one TSV row per ``.json``/``.jpg`` pair found in *directory*.

	    For every ``<name>.json`` entry, the sibling ``<name>.jpg`` is opened and
	    a tab-separated row is produced with these columns, in order: MD5 of the
	    JPEG-encoded image, ``caption``, base64 of the JPEG-encoded image,
	    ``width``, ``height`` (all three read from the JSON document), and the
	    ``str()`` of a dict holding the JSON's ``noun_chunks`` and ``ref_exps``
	    under the keys ``phrase`` and ``expression_v1``.

	    Args:
	        directory: Path of the folder to scan (not recursive).

	    Returns:
	        A list of row strings, ordered by file name.

	    Raises:
	        OSError: If the matching ``.jpg`` cannot be opened (e.g. missing).
	        KeyError: If a JSON document lacks one of the expected keys.
	    """
	    json_suffix = '.json'
	    tsv_data = []

	    # os.listdir() order is arbitrary; sort so the output is reproducible.
	    for file in tqdm(sorted(os.listdir(directory))):
	        if not file.endswith(json_suffix):
	            continue

	        json_path = os.path.join(directory, file)
	        # Swap only the trailing suffix; str.replace would also rewrite any
	        # earlier '.json' occurring inside the file name.
	        jpg_path = os.path.join(directory, file[:-len(json_suffix)] + '.jpg')

	        # JSON text is UTF-8; do not depend on the locale's default encoding.
	        with open(json_path, 'r', encoding='utf-8') as json_file:
	            data = json.load(json_file)

	        # The context manager releases the image's file handle, which would
	        # otherwise stay open for every processed file.
	        with Image.open(jpg_path) as image:
	            md5 = calculate_md5(image)
	            with io.BytesIO() as buffer:
	                image.save(buffer, format='JPEG')
	                image_base64 = base64.b64encode(buffer.getvalue()).decode("utf-8")

	        # NOTE(review): caption is emitted verbatim; a tab or newline inside
	        # it would break the row layout -- confirm the input data is clean.
	        caption = data['caption']
	        width = data['width']
	        height = data['height']

	        combined_data_str = {'phrase': data['noun_chunks'], 'expression_v1': data['ref_exps']}

	        tsv_row = [md5, caption, image_base64, width, height, combined_data_str]
	        tsv_data.append('\t'.join(map(str, tsv_row)))

	    return tsv_data

	def write_tsv(tsv_data, output_file):
	    """Write the rows in *tsv_data* to *output_file*, one row per line.

	    Rows are joined with newline characters; nothing is appended after the
	    last row. The file is always written as UTF-8 so that non-ASCII text in
	    the rows does not depend on the platform's default encoding.

	    Args:
	        tsv_data: Iterable of already-formatted row strings.
	        output_file: Path of the file to create or overwrite.
	    """
	    with open(output_file, 'w', encoding='utf-8') as file:
	        file.write('\n'.join(tsv_data))

	if __name__ == '__main__':
	    # Script entry point: convert a folder of .json/.jpg pairs into one TSV.
	    # Both paths are optional positional arguments; with no arguments the
	    # script uses the original hard-coded locations.
	    import argparse  # local import: only needed when run as a script

	    parser = argparse.ArgumentParser(
	        description='Pack .json/.jpg pairs from a directory into a TSV file.'
	    )
	    parser.add_argument(
	        'directory', nargs='?', default='/tmp/grit',
	        help='folder containing the .json/.jpg pairs (default: %(default)s)',
	    )
	    parser.add_argument(
	        'output_file', nargs='?', default='/tmp/output.tsv',
	        help='path of the TSV file to write (default: %(default)s)',
	    )
	    args = parser.parse_args()

	    directory = args.directory
	    output_file = args.output_file
	    tsv_data = process_files(directory)
	    write_tsv(tsv_data, output_file)