Files
clint-dataset/dataset_convertor.py
2026-04-08 17:27:11 +05:30

91 lines
3.1 KiB
Python

import json
def _pad_pair(segment_text, segment_label):
    """Return ``(text_cell, label_cell)`` padded with spaces to equal width.

    Each value first gets one trailing space as a separator; the shorter of
    the two is then right-padded so both cells span the same number of
    characters and stay aligned when printed on consecutive lines.
    """
    text = f"{segment_text} "
    label = f"{segment_label} "
    width = max(len(text), len(label))
    return text.ljust(width), label.ljust(width)


def _split_gap(gap_text):
    """Turn a stretch of unlabelled text into aligned ``(text, label)`` cells.

    Leading and trailing whitespace is copied verbatim and paired with an
    equally long run of spaces; the non-blank core (if any) is paired with
    the label ``"0"``. Returns a list of at most three cells, in order.
    """
    if not gap_text:
        return []

    lead_len = len(gap_text) - len(gap_text.lstrip())
    # For an all-whitespace gap rstrip() yields "", so clamp to keep the
    # whole gap classified as "leading" and the core empty.
    core_end = max(lead_len, len(gap_text.rstrip()))

    leading = gap_text[:lead_len]
    core = gap_text[lead_len:core_end]
    trailing = gap_text[core_end:]

    cells = []
    if leading:
        cells.append((leading, " " * len(leading)))
    if core:
        cells.append(_pad_pair(core, "0"))
    if trailing:
        cells.append((trailing, " " * len(trailing)))
    return cells


def parse_json_dataset(path):
    """Print each task's text with its span labels aligned on the line below.

    The file at *path* must contain a JSON list of task dicts. Every task
    needs ``task["data"]["text"]``; labelled spans are read from
    ``task["annotations"][0]["result"]``, where each item provides
    ``item["value"]["start"]``, ``["end"]`` and ``["labels"][0]``.
    Only the first annotation of a task is used.
    NOTE(review): this shape looks like a Label Studio export -- confirm.

    For each task three things are printed: the text line, the label line
    (unlabelled text is marked ``0``), and a ``"\\n"`` separator.

    Spans are processed in order of ``start``. A span that begins inside
    text that was already emitted is clipped to the unemitted remainder, and
    skipped entirely if nothing remains, so the text line never repeats
    characters.

    Args:
        path: Path of the JSON file to read (decoded as UTF-8).

    Returns:
        The parsed list of tasks, unmodified.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If a task or result item lacks one of the keys above.
    """
    # JSON interchange is UTF-8; do not depend on the platform's locale.
    with open(path, "r", encoding="utf-8") as f:
        tasks = json.load(f)

    for task in tasks:
        full_text = task["data"]["text"]

        annotations = task.get("annotations") or []
        results = []
        if annotations:
            # "result" may be missing or null; treat both as "no spans".
            results = annotations[0].get("result") or []
        spans = sorted(results, key=lambda item: item["value"]["start"])

        # Collect cells and join once instead of growing strings with +=.
        text_cells = []
        label_cells = []
        cursor = 0  # index of the first character not yet emitted

        for span in spans:
            value = span["value"]
            start = value["start"]
            end = value["end"]
            label = value["labels"][0]

            if start < cursor:
                # Overlaps an earlier span: never re-emit text.
                if end <= cursor:
                    continue
                start = cursor
            elif cursor < start:
                for text_part, label_part in _split_gap(full_text[cursor:start]):
                    text_cells.append(text_part)
                    label_cells.append(label_part)

            text_part, label_part = _pad_pair(full_text[start:end], label)
            text_cells.append(text_part)
            label_cells.append(label_part)
            cursor = end

        if cursor < len(full_text):
            for text_part, label_part in _split_gap(full_text[cursor:]):
                text_cells.append(text_part)
                label_cells.append(label_part)

        print("".join(text_cells))
        print("".join(label_cells))
        # print() adds its own newline, so this leaves two blank lines.
        print("\n")

    return tasks
if __name__ == "__main__":
    # Script entry point: render the bundled annotated dataset to stdout.
    # The returned task list is not needed here and is discarded.
    parse_json_dataset("./datasets/annotated/ffmpeg_gemini_v1.json")