minilm testing
This commit is contained in:
90
dataset_convertor.py
Normal file
90
dataset_convertor.py
Normal file
@@ -0,0 +1,90 @@
|
||||
import json
|
||||
|
||||
|
||||
def _align_segment(segment_text, segment_label):
    """Pad a text chunk and its label to a common width.

    One trailing space is appended to each of the two values, then the
    shorter result is right-padded with spaces so both have equal length.

    Args:
        segment_text: The slice of source text to display.
        segment_label: The label shown underneath that slice.

    Returns:
        A ``(text_part, label_part)`` tuple of equal-length strings.
    """
    text = f"{segment_text} "
    label = f"{segment_label} "
    width = max(len(text), len(label))
    return text.ljust(width), label.ljust(width)


def _split_gap(gap_text):
    """Turn a stretch of unannotated text into aligned (text, label) pieces.

    Leading and trailing whitespace is passed through unchanged, paired
    with a run of spaces of the same length; the non-whitespace core in
    between is labelled ``"0"``.

    Args:
        gap_text: Text lying between two annotated spans (may be empty).

    Returns:
        A list of ``(text_part, label_part)`` tuples, in display order.
    """
    if not gap_text:
        return []

    without_lead = gap_text.lstrip()
    leading = gap_text[:len(gap_text) - len(without_lead)]
    core = without_lead.rstrip()
    trailing = without_lead[len(core):]

    pieces = []
    if leading:
        pieces.append((leading, " " * len(leading)))
    if core:
        pieces.append(_align_segment(core, "0"))
    if trailing:
        pieces.append((trailing, " " * len(trailing)))
    return pieces


def parse_json_dataset(path):
    """Load an annotated JSON dataset and print text/label lines per task.

    The file must contain a JSON list of tasks. For each task the text is
    read from ``task["data"]["text"]`` and the spans from the ``"result"``
    list of the first entry in ``task["annotations"]`` (if any). Each span
    supplies ``value["start"]``, ``value["end"]`` and ``value["labels"][0]``.
    Spans are processed in order of ``start``; text between spans is
    labelled ``"0"``. Two lines are printed per task (the text line and the
    label line beneath it), followed by blank lines.

    Args:
        path: Path of the JSON file to read.

    Returns:
        The parsed list of tasks, unmodified.

    Raises:
        KeyError: If a task or span lacks one of the keys listed above.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(path, "r", encoding="utf-8") as f:
        tasks = json.load(f)

    for task in tasks:
        full_text = task["data"]["text"]

        annotations = task.get("annotations") or []
        results = annotations[0].get("result", []) if annotations else []
        results = sorted(results, key=lambda item: item["value"]["start"])

        pieces = []
        cursor = 0
        for annotation in results:
            value = annotation["value"]
            start = value["start"]
            end = value["end"]
            label = value["labels"][0]

            if cursor < start:
                pieces.extend(_split_gap(full_text[cursor:start]))

            pieces.append(_align_segment(full_text[start:end], label))
            # Never move backwards: a span nested inside an earlier one must
            # not cause the already-emitted tail to be printed again as gap.
            cursor = max(cursor, end)

        if cursor < len(full_text):
            pieces.extend(_split_gap(full_text[cursor:]))

        print("".join(text_part for text_part, _ in pieces))
        print("".join(label_part for _, label_part in pieces))
        print("\n")

    return tasks
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: convert the hard-coded annotated dataset file.
    dataset_path = "./datasets/annotated/ffmpeg_gemini_v1.json"
    parse_json_dataset(dataset_path)
|
||||
Reference in New Issue
Block a user