import json

# Label printed beneath text that is not covered by any annotated span.
_UNLABELED = "0"


def _pad_pair(segment_text, segment_label):
    """Return ``(text_cell, label_cell)`` padded to a common width.

    Each cell is its input followed by a single separator space; the shorter
    cell is then right-padded with spaces so both cells have equal length and
    stay column-aligned when the two output lines are stacked.
    """
    text_cell = f"{segment_text} "
    label_cell = f"{segment_label} "
    width = max(len(text_cell), len(label_cell))
    return text_cell.ljust(width), label_cell.ljust(width)


def _gap_cells(gap_text):
    """Return aligned ``(text, label)`` cells for an un-annotated stretch.

    Leading and trailing whitespace are passed through unchanged, with the
    same number of spaces on the label line; the non-blank core in between is
    labelled with ``_UNLABELED``. An empty ``gap_text`` yields an empty list.
    """
    without_leading = gap_text.lstrip()
    core = without_leading.rstrip()
    leading = gap_text[: len(gap_text) - len(without_leading)]
    trailing = without_leading[len(core):]

    cells = []
    if leading:
        cells.append((leading, " " * len(leading)))
    if core:
        cells.append(_pad_pair(core, _UNLABELED))
    if trailing:
        cells.append((trailing, " " * len(trailing)))
    return cells


def _render_task(task):
    """Build the aligned ``(text_line, label_line)`` pair for one task.

    Reads ``task["data"]["text"]`` and, when present, the ``"result"`` list of
    the first entry in ``task["annotations"]``. Every result item must carry
    ``value.start``, ``value.end`` and a non-empty ``value.labels``; only the
    first label is used.

    Raises:
        KeyError / IndexError: if a task or result item lacks those fields.
    """
    full_text = task["data"]["text"]
    annotations = task.get("annotations") or []
    results = annotations[0].get("result", []) if annotations else []
    results = sorted(results, key=lambda item: item["value"]["start"])

    cells = []
    cursor = 0
    for item in results:
        value = item["value"]
        start, end = value["start"], value["end"]
        label = value["labels"][0]
        if cursor < start:
            cells.extend(_gap_cells(full_text[cursor:start]))
        cells.append(_pad_pair(full_text[start:end], label))
        # Never move the cursor backwards: a span nested inside (or
        # overlapping) an earlier one must not cause text that was already
        # emitted to be printed again as an unlabeled gap.
        cursor = max(cursor, end)

    if cursor < len(full_text):
        cells.extend(_gap_cells(full_text[cursor:]))

    text_line = "".join(text for text, _ in cells)
    label_line = "".join(label for _, label in cells)
    return text_line, label_line


def parse_json_dataset(path):
    """Print each task's text with its span labels aligned underneath.

    The file at ``path`` must contain a JSON list of task objects. For every
    task three things are written to stdout: the text line, the label line
    (``"0"`` under un-annotated text), and a blank separator produced by
    ``print("\\n")``.

    Args:
        path: Location of the JSON file, read as UTF-8.

    Returns:
        The decoded list of tasks, unmodified.
    """
    # Span offsets index into the decoded text, so the encoding is pinned
    # rather than left to the platform default.
    with open(path, "r", encoding="utf-8") as f:
        tasks = json.load(f)

    for task in tasks:
        text_line, label_line = _render_task(task)
        print(text_line)
        print(label_line)
        print("\n")
    return tasks


if __name__ == "__main__":
    parse_json_dataset("./datasets/annotated/ffmpeg_gemini_v1.json")