Files
clint-dataset/dataset_analysis.py
2026-04-08 17:27:11 +05:30

71 lines
2.5 KiB
Python

import json
from pathlib import Path
import os
from enums import TokenLabel
def _require_type(obj, expected_type, message):
    """Return *obj* unchanged, or raise ``ValueError(message)`` on a type mismatch."""
    if not isinstance(obj, expected_type):
        raise ValueError(message)
    return obj


def _iter_labels(data):
    """Yield each raw label stored under annotations -> result -> value -> labels.

    Structure is validated lazily while walking, so a malformed entry raises
    ``ValueError`` when it is reached rather than up front.
    """
    for item in data:
        _require_type(item, dict, "Dataset items must be JSON objects.")
        annotations = _require_type(
            item.get("annotations", []), list, "Annotations must be a list."
        )
        for annotation in annotations:
            _require_type(annotation, dict, "Annotations must be JSON objects.")
            results = _require_type(
                annotation.get("result", []),
                list,
                "Annotation results must be a list.",
            )
            for result in results:
                _require_type(result, dict, "Annotation results must be JSON objects.")
                value = _require_type(
                    result.get("value", {}), dict, "Result value must be a JSON object."
                )
                yield from _require_type(
                    value.get("labels", []), list, "Result labels must be a list."
                )


def parse_annotated(path):
    """Count how often each ``TokenLabel`` occurs in one annotated dataset.

    The file at *path* must contain a JSON array. Every item may carry an
    ``"annotations"`` list, every annotation a ``"result"`` list, and every
    result a ``"value"`` object holding a ``"labels"`` list. Missing keys are
    treated as empty. A label may be written either as a ``TokenLabel`` member
    name or as a member value; a name match takes precedence.

    Args:
        path: Location of the UTF-8 encoded JSON file.

    Returns:
        A dict mapping every ``TokenLabel`` member name to its number of
        occurrences; labels that never occur map to 0.

    Raises:
        ValueError: If the document does not have the structure described
            above, or if a label is neither a ``TokenLabel`` name nor value.
    """
    data = json.loads(Path(path).read_text(encoding="utf-8"))
    if not isinstance(data, list):
        raise ValueError("Annotated dataset must be a JSON array.")

    # The keys of ``counts`` double as the set of valid member names.
    counts = {label.name: 0 for label in TokenLabel}
    label_values = {label.value: label.name for label in TokenLabel}

    for label in _iter_labels(data):
        try:
            key = label if label in counts else label_values[label]
        except (KeyError, TypeError):
            # TypeError: the label is unhashable (e.g. a nested list/object),
            # so it cannot be a known name or value either.
            raise ValueError(f"Unknown label: {label}") from None
        counts[key] += 1
    return counts
if __name__ == "__main__":
    # Interactive entry point: list every *.json file below the annotated
    # dataset directory, let the user pick one by number (or "a" for all),
    # and print the resulting label counts as indented JSON.
    dataset_root = "./datasets/annotated/"

    # os.walk yields entries in arbitrary order; sorting keeps the menu
    # numbering stable between runs.
    annotated_dataset_list = sorted(
        os.path.join(dirpath, filename)
        for dirpath, _dirnames, filenames in os.walk(dataset_root)
        for filename in filenames
        if filename.endswith(".json")
    )

    # Number-based menu used to select a file.
    for index, dataset_path in enumerate(annotated_dataset_list):
        print(f"{index}: {dataset_path}")
    print("a: all annotated datasets")

    selection = input(
        "Enter the number of the annotated dataset to analyze: "
    ).strip()

    if selection.lower() == "a":
        combined_counts = {label.name: 0 for label in TokenLabel}
        for dataset_path in annotated_dataset_list:
            for label, value in parse_annotated(dataset_path).items():
                combined_counts[label] += value
        print(json.dumps(combined_counts, indent=2))
    else:
        try:
            index = int(selection)
        except ValueError:
            raise SystemExit(f"Invalid selection: {selection!r}") from None
        # Reject negative numbers too: they would otherwise index from the
        # end of the list and pick an entry the menu never offered.
        if not 0 <= index < len(annotated_dataset_list):
            raise SystemExit(f"Invalid selection: {selection!r}")
        counts = parse_annotated(annotated_dataset_list[index])
        print(json.dumps(counts, indent=2))