Update block_parsor.py

Add Qwen, GPT, Gemini functions
This commit is contained in:
JimmyZhengyz
2025-07-30 13:03:50 +08:00
committed by GitHub
parent b10733e1e2
commit ef42f6b32e

View File

@@ -1,10 +1,10 @@
import os import os
import cv2 import cv2
import json import json
from utils import Doubao, encode_image, image_mask from utils import Doubao, Qwen, GPT, Gemini, encode_image, image_mask
DEFAULT_IMAGE_PATH = "data/input/test2.png" DEFAULT_IMAGE_PATH = "data/input/test2.png"
DEFAULT_API_PATH = "doubao_api.txt" DEFAULT_API_PATH = "doubao_api.txt" # Point this at the API key file for the model you use (e.g. doubao, qwen, gpt, gemini).
PROMPT_LIST = [ PROMPT_LIST = [
("header", "Please output the minimum bounding box of the header. Please output the bounding box in the format of <bbox>x1 y1 x2 y2</bbox>. Avoid the blank space in the header."), ("header", "Please output the minimum bounding box of the header. Please output the bounding box in the format of <bbox>x1 y1 x2 y2</bbox>. Avoid the blank space in the header."),
("sidebar", "Please output the minimum bounding box of the sidebar. Please output the bounding box in the format of <bbox>x1 y1 x2 y2</bbox>. Avoid meaningless blank space in the sidebar."), ("sidebar", "Please output the minimum bounding box of the sidebar. Please output the bounding box in the format of <bbox>x1 y1 x2 y2</bbox>. Avoid meaningless blank space in the sidebar."),
@@ -225,7 +225,7 @@ def save_bboxes_to_json(bboxes: dict[str, tuple[int, int, int, int]], image_path
# """ # """
# bboxes = {} # bboxes = {}
# current_image_path = image_path # current_image_path = image_path
# ark_client = Doubao(api_path) # ark_client = Doubao(api_path) # Change your client according to your needs: Qwen(api_path), GPT(api_path), Gemini(api_path)
# image = cv2.imread(image_path) # image = cv2.imread(image_path)
# if image is None: # if image is None:
@@ -309,7 +309,7 @@ if __name__ == "__main__":
print("=== Starting Simple Component Detection ===") print("=== Starting Simple Component Detection ===")
print(f"Input image: {image_path}") print(f"Input image: {image_path}")
print(f"API path: {api_path}") print(f"API path: {api_path}")
client = Doubao(api_path) client = Doubao(api_path) # Swap in the client for the model you want to use: Qwen(api_path), GPT(api_path), or Gemini(api_path)
bbox_content = client.ask(PROMPT_MERGE, encode_image(image_path)) bbox_content = client.ask(PROMPT_MERGE, encode_image(image_path))
print(f"Model response: {bbox_content}\n") print(f"Model response: {bbox_content}\n")
bboxes = parse_bboxes(bbox_content, image_path) bboxes = parse_bboxes(bbox_content, image_path)