| import json | |
| import string | |
def extract_object_name(text):
    """Return the phrase that follows the first standalone word "is" in *text*.

    The match is case-sensitive and only accepts "is" as a whole word, so
    occurrences embedded in other words (e.g. "This", "dish") are ignored.
    Everything after the first match is returned, even if it contains
    further "is" words.

    Args:
        text: The sentence to search.

    Returns:
        The text after the first standalone "is" with surrounding whitespace
        stripped, or ``None`` when *text* contains no standalone "is".
    """
    # Imported locally so this function stays self-contained.
    import re

    # maxsplit=1 keeps the whole remainder instead of cutting it at a later "is".
    parts = re.split(r"\bis\b", text, maxsplit=1)
    if len(parts) > 1:
        return parts[1].strip()
    return None
# Input annotation file and the output file that receives the added
# "llava_text" field.
text_pth = "/home/yuqian_fu/Projects/PSALM/check_text_select_scene_600_20250514.json"
save_path = "/home/yuqian_fu/Projects/PSALM/check_text_select_scene_600_objname_llavatext.json"

# Translation table that deletes every ASCII punctuation character; built once
# because it does not change between annotations.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)

new_data = []
sent_id = 0  # running count of processed annotations (debug aid)

with open(text_pth, "r", encoding="utf-8") as fp:
    datas = json.load(fp)

# Per the original comment, each entry of `datas` corresponds to one frame/image.
for data in datas:
    new_annos = []
    for anno in data["first_frame_anns"]:
        text = anno["text"]
        # Extract the phrase after "is".
        raw = extract_object_name(text)
        if raw is None:
            # No "is" found: fall back to the full text rather than crashing
            # on None.lower().
            raw = text
        # Lower-case and drop the word "green".
        result = raw.lower().replace("green", "")
        # Remove punctuation, then collapse any whitespace runs left behind
        # (including the gap where "green" used to be) to single spaces.
        sent = " ".join(result.translate(_PUNCT_TABLE).split())
        # The annotation dict is updated in place.
        anno["llava_text"] = sent
        new_annos.append(anno)
        sent_id += 1
    data["first_frame_anns"] = new_annos
    # Drop the "instruction" field if present; entries without it are tolerated.
    data.pop("instruction", None)
    new_data.append(data)

print("len of new_data: ", len(new_data))
with open(save_path, "w", encoding="utf-8") as fp:
    json.dump(new_data, fp)