62 lines
2.1 KiB
Python
62 lines
2.1 KiB
Python
import json
|
|
from pathlib import Path
|
|
import os
|
|
|
|
from enums import TokenLabel
|
|
|
|
|
|
def _iter_labels(items):
    """Yield every label found in the annotation results of *items*.

    Walks item -> "annotations" -> "result" -> "value" -> "labels". A missing
    key at any level is treated as empty; a present key of the wrong type
    raises ``ValueError`` so that malformed input never surfaces as an
    ``AttributeError`` from a ``.get`` call on a non-dict.
    """
    for item in items:
        if not isinstance(item, dict):
            raise ValueError("Dataset items must be JSON objects.")
        annotations = item.get("annotations", [])
        if not isinstance(annotations, list):
            raise ValueError("Annotations must be a list.")

        for annotation in annotations:
            if not isinstance(annotation, dict):
                raise ValueError("Annotations must be JSON objects.")
            results = annotation.get("result", [])
            if not isinstance(results, list):
                raise ValueError("Annotation results must be a list.")

            for result in results:
                if not isinstance(result, dict):
                    raise ValueError("Annotation results must be JSON objects.")
                value = result.get("value", {})
                if not isinstance(value, dict):
                    raise ValueError("Result value must be a JSON object.")
                labels = value.get("labels", [])
                if not isinstance(labels, list):
                    raise ValueError("Result labels must be a list.")
                yield from labels


def parse_annotated(path):
    """Count how often each token label occurs in an annotated dataset.

    The file at *path* must contain a JSON array of objects. Each object may
    have an ``"annotations"`` list, each annotation a ``"result"`` list, and
    each result a ``"value"`` object whose ``"labels"`` list holds the labels
    to count. Missing keys are treated as empty.

    A label is matched against the ``TokenLabel`` member names first and
    against the member values second; either way it is counted under the
    member's name.

    Args:
        path: Location of the UTF-8 encoded JSON file.

    Returns:
        A dict mapping every ``TokenLabel`` member name to its number of
        occurrences, including zero for labels that never appear.

    Raises:
        ValueError: If the file is not valid JSON, the document does not have
            the structure described above, or a label is neither a
            ``TokenLabel`` name nor a ``TokenLabel`` value.
        OSError: If the file cannot be read.
    """
    data = json.loads(Path(path).read_text(encoding="utf-8"))
    if not isinstance(data, list):
        raise ValueError("Annotated dataset must be a JSON array.")

    counts = {label.name: 0 for label in TokenLabel}
    label_values = {label.value: label.name for label in TokenLabel}

    for label in _iter_labels(data):
        try:
            # The keys of ``counts`` are exactly the member names, so it
            # doubles as the name lookup; fall back to the value lookup.
            key = label if label in counts else label_values[label]
        except (KeyError, TypeError):
            # KeyError: not a known name or value.
            # TypeError: unhashable label, e.g. a nested JSON array/object.
            raise ValueError(f"Unknown label: {label}") from None
        counts[key] += 1

    return counts
|
|
|
|
|
|
if __name__ == "__main__":
    # Interactive entry point: list every .json file found under the
    # annotated-dataset directory, let the user pick one by number, and print
    # that dataset's label counts as indented JSON.
    dataset_dir = "./datasets/annotated/"

    annotated_dataset_list = []
    for dirpath, _dirnames, filenames in os.walk(dataset_dir):
        annotated_dataset_list.extend(
            os.path.join(dirpath, filename)
            for filename in filenames
            if filename.endswith(".json")
        )

    if not annotated_dataset_list:
        # Nothing to choose from; prompting would only end in an IndexError.
        raise SystemExit(f"No .json files found under {dataset_dir}")

    # number based menu to match file
    for index, dataset_path in enumerate(annotated_dataset_list):
        print(f"{index}: {dataset_path}")

    choice = input("Enter the number of the annotated dataset to analyze: ")
    try:
        selected = int(choice)
    except ValueError:
        raise SystemExit(f"Not a number: {choice!r}") from None

    # Negative numbers are rejected explicitly: list indexing would otherwise
    # accept them and silently pick a file counted from the end of the menu.
    if not 0 <= selected < len(annotated_dataset_list):
        raise SystemExit(
            f"Choice must be between 0 and {len(annotated_dataset_list) - 1}."
        )

    counts = parse_annotated(annotated_dataset_list[selected])
    print(json.dumps(counts, indent=2))
|