Files
clint-dataset/dataset_analysis.py
2026-04-07 22:00:40 +05:30

62 lines
2.1 KiB
Python

import json
from pathlib import Path
import os
from enums import TokenLabel
def _iter_raw_labels(data):
    """Yield every raw label stored in ``data``, validating containers on the way.

    Walks item -> "annotations" -> "result" -> "value" -> "labels". A missing
    key at any level is treated as empty; a value of the wrong JSON type
    raises ``ValueError``.
    """
    for item in data:
        if not isinstance(item, dict):
            raise ValueError("Dataset items must be JSON objects.")
        annotations = item.get("annotations", [])
        if not isinstance(annotations, list):
            raise ValueError("Annotations must be a list.")
        for annotation in annotations:
            if not isinstance(annotation, dict):
                raise ValueError("Each annotation must be a JSON object.")
            results = annotation.get("result", [])
            if not isinstance(results, list):
                raise ValueError("Annotation results must be a list.")
            for result in results:
                if not isinstance(result, dict):
                    raise ValueError("Each annotation result must be a JSON object.")
                value = result.get("value", {})
                if not isinstance(value, dict):
                    raise ValueError("Result value must be a JSON object.")
                labels = value.get("labels", [])
                if not isinstance(labels, list):
                    raise ValueError("Result labels must be a list.")
                yield from labels


def parse_annotated(path):
    """Count how often each ``TokenLabel`` occurs in an annotated dataset file.

    The file must hold a JSON array. Each item may carry an ``"annotations"``
    list, each annotation a ``"result"`` list, and each result a ``"value"``
    object whose ``"labels"`` list contains the labels to count. A label may
    be given either as a ``TokenLabel`` member name or as a member value; both
    are counted under the member name.

    Args:
        path: Location of the UTF-8 encoded JSON file.

    Returns:
        A dict mapping every ``TokenLabel`` member name to its number of
        occurrences (``0`` for labels that never appear).

    Raises:
        ValueError: If the document does not have the structure described
            above, or if a label matches no ``TokenLabel`` name or value.
            Errors from reading the file or decoding the JSON propagate
            unchanged.
    """
    data = json.loads(Path(path).read_text(encoding="utf-8"))
    if not isinstance(data, list):
        raise ValueError("Annotated dataset must be a JSON array.")

    counts = {label.name: 0 for label in TokenLabel}
    name_by_value = {label.value: label.name for label in TokenLabel}

    for label in _iter_raw_labels(data):
        try:
            # The keys of ``counts`` are exactly the member names, so a name
            # match is tried first and a value match second.
            key = label if label in counts else name_by_value[label]
        except (KeyError, TypeError):
            # KeyError: no such name or value. TypeError: the label is an
            # unhashable JSON value (list/object) and cannot be a label at all.
            raise ValueError(f"Unknown label: {label}") from None
        counts[key] += 1
    return counts
def main():
    """Interactively pick an annotated dataset and print its label counts.

    Collects every ``*.json`` file below ``./datasets/annotated/``, prints a
    numbered menu, reads the chosen number from stdin, and prints the result
    of ``parse_annotated`` for that file as indented JSON.

    Raises:
        SystemExit: If no dataset is found, or if the entered text is not an
            integer that appears in the menu.
    """
    dataset_root = "./datasets/annotated/"

    annotated_dataset_list = []
    for dirpath, _dirnames, filenames in os.walk(dataset_root):
        for filename in filenames:
            if filename.endswith(".json"):
                annotated_dataset_list.append(os.path.join(dirpath, filename))

    # Without this guard the user is prompted for a number that cannot exist.
    if not annotated_dataset_list:
        raise SystemExit(f"No .json files found under {dataset_root}")

    # Number-based menu used to select the file.
    for number, file in enumerate(annotated_dataset_list):
        print(f"{number}: {file}")

    answer = input("Enter the number of the annotated dataset to analyze: ")
    try:
        choice = int(answer)
    except ValueError:
        raise SystemExit(f"Not a valid number: {answer!r}") from None
    # Reject negatives explicitly: list indexing would otherwise accept them
    # and silently pick a file counted from the end of the menu.
    if not 0 <= choice < len(annotated_dataset_list):
        raise SystemExit(
            f"Choice must be between 0 and {len(annotated_dataset_list) - 1}."
        )

    counts = parse_annotated(annotated_dataset_list[choice])
    print(json.dumps(counts, indent=2))


if __name__ == "__main__":
    main()