91 lines
3.1 KiB
Python
91 lines
3.1 KiB
Python
import json
|
|
|
|
|
|
def _align_pair(segment_text, segment_label):
    """Return ``(text_cell, label_cell)`` padded with spaces to equal width.

    Each value gets one trailing space as a column separator; the shorter
    of the two cells is then right-padded so both occupy the same number
    of characters.
    """
    text_cell = f"{segment_text} "
    label_cell = f"{segment_label} "
    width = max(len(text_cell), len(label_cell))
    return text_cell.ljust(width), label_cell.ljust(width)


def _gap_cells(gap_text):
    """Turn a stretch of unannotated text into aligned ``(text, label)`` cells.

    Leading and trailing whitespace is passed through verbatim, with an
    equal number of spaces on the label row. The non-blank core between
    them is rendered as a segment labelled ``"0"``. Empty input yields an
    empty list.
    """
    if not gap_text:
        return []

    without_lead = gap_text.lstrip()
    core = without_lead.rstrip()
    leading = gap_text[: len(gap_text) - len(without_lead)]
    trailing = without_lead[len(core):]

    cells = []
    if leading:
        cells.append((leading, " " * len(leading)))
    if core:
        cells.append(_align_pair(core, "0"))
    if trailing:
        cells.append((trailing, " " * len(trailing)))
    return cells


def parse_json_dataset(path):
    """Load an annotated JSON dataset and print each task as two aligned rows.

    The file must contain a JSON list of tasks. For every task the function
    reads ``task["data"]["text"]`` and, when present, the spans in
    ``task["annotations"][0]["result"]``; only the first annotation entry is
    used. Each span supplies ``value["start"]``, ``value["end"]`` and
    ``value["labels"][0]``. Spans are rendered in ascending ``start`` order;
    text not covered by any span is labelled ``"0"``.

    For each task three ``print`` calls are made: the text row, the label
    row (padded so labels sit under their text), and ``"\\n"`` as a spacer.

    Args:
        path: Path of the JSON file to read (decoded as UTF-8).

    Returns:
        The parsed list of tasks, unmodified.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If a task or span lacks one of the keys listed above.
    """
    # JSON interchange is UTF-8; do not depend on the platform's locale default.
    with open(path, "r", encoding="utf-8") as f:
        tasks = json.load(f)

    for task in tasks:
        full_text = task["data"]["text"]

        annotations = task.get("annotations") or []
        # ``or []`` also covers a "result" key that is present but null.
        results = (annotations[0].get("result") or []) if annotations else []
        results = sorted(results, key=lambda item: item["value"]["start"])

        cells = []
        cursor = 0
        for annotation in results:
            value = annotation["value"]
            start = value["start"]
            end = value["end"]
            label = value["labels"][0]

            if cursor < start:
                cells.extend(_gap_cells(full_text[cursor:start]))
            cells.append(_align_pair(full_text[start:end], label))
            # Never move backwards: a span nested in (or overlapping) an
            # earlier one must not cause already-rendered text to be
            # emitted again as an unlabelled gap.
            cursor = max(cursor, end)

        if cursor < len(full_text):
            cells.extend(_gap_cells(full_text[cursor:]))

        print("".join(text_cell for text_cell, _ in cells))
        print("".join(label_cell for _, label_cell in cells))
        print("\n")

    return tasks
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: render the dataset at this fixed relative path.
    dataset_path = "./datasets/annotated/ffmpeg_gemini_v1.json"
    parse_json_dataset(dataset_path)
|