"""Count token labels in an annotated JSON dataset.

Run as a script to pick one of the ``.json`` files under ``ANNOTATED_DIR``
from a numbered menu and print its per-label counts.
"""

import json
import os
from pathlib import Path

from enums import TokenLabel

# Directory that the command-line menu scans (recursively) for datasets.
ANNOTATED_DIR = "./datasets/annotated/"


def _require(obj, expected_type, message):
    """Return *obj* unchanged, or raise ``ValueError(message)`` on a type mismatch."""
    if not isinstance(obj, expected_type):
        raise ValueError(message)
    return obj


def parse_annotated(path):
    """Return how often each ``TokenLabel`` occurs in an annotated dataset.

    The file at *path* must hold a JSON array. Every item may carry an
    ``"annotations"`` list; every annotation may carry a ``"result"`` list;
    every result may carry a ``"value"`` object with a ``"labels"`` list.
    Missing keys are treated as empty. A label may be written either as a
    ``TokenLabel`` member name or as a member value.

    Args:
        path: Location of the UTF-8 encoded JSON file (anything ``Path``
            accepts).

    Returns:
        A dict mapping every ``TokenLabel`` member name to its number of
        occurrences; labels that never occur are present with a count of 0.

    Raises:
        ValueError: If the document does not have the structure described
            above, or if a label matches no ``TokenLabel`` name or value.
        OSError: If the file cannot be read.
        json.JSONDecodeError: If the file is not valid JSON (this is itself
            a ``ValueError`` subclass).
    """
    data = json.loads(Path(path).read_text(encoding="utf-8"))
    _require(data, list, "Annotated dataset must be a JSON array.")

    counts = {label.name: 0 for label in TokenLabel}

    # One lookup table from "name or value" to the canonical member name.
    # Names are inserted last so they win if a name collides with some
    # other member's value.
    canonical = {label.value: label.name for label in TokenLabel}
    canonical.update((label.name, label.name) for label in TokenLabel)

    for item in data:
        _require(item, dict, "Each dataset item must be a JSON object.")
        annotations = _require(
            item.get("annotations", []), list, "Annotations must be a list."
        )
        for annotation in annotations:
            _require(annotation, dict, "Each annotation must be a JSON object.")
            results = _require(
                annotation.get("result", []),
                list,
                "Annotation results must be a list.",
            )
            for result in results:
                _require(result, dict, "Each result must be a JSON object.")
                value = _require(
                    result.get("value", {}),
                    dict,
                    "Result value must be a JSON object.",
                )
                labels = _require(
                    value.get("labels", []), list, "Result labels must be a list."
                )
                for label in labels:
                    try:
                        key = canonical[label]
                    except (KeyError, TypeError):
                        # TypeError: the label is unhashable (a nested JSON
                        # list/object), so it cannot match any known label.
                        raise ValueError(f"Unknown label: {label}") from None
                    counts[key] += 1

    return counts


def _find_json_files(root):
    """Return the sorted paths of every ``.json`` file below *root*."""
    # os.walk yields entries in arbitrary order; sorting keeps the menu
    # numbering stable from one run to the next.
    return sorted(
        os.path.join(dirpath, filename)
        for dirpath, _dirnames, filenames in os.walk(root)
        for filename in filenames
        if filename.endswith(".json")
    )


def main():
    """Show a numbered menu of datasets and print the counts for the chosen one."""
    datasets = _find_json_files(ANNOTATED_DIR)
    if not datasets:
        raise SystemExit(f"No .json files found under {ANNOTATED_DIR}")

    for index, dataset in enumerate(datasets):
        print(f"{index}: {dataset}")

    choice = input("Enter the number of the annotated dataset to analyze: ")
    try:
        selected = int(choice)
    except ValueError:
        raise SystemExit(f"Not a number: {choice!r}") from None
    # Reject negatives explicitly: list indexing would otherwise silently
    # pick a file counted from the end of the menu.
    if not 0 <= selected < len(datasets):
        raise SystemExit(
            f"Selection must be between 0 and {len(datasets) - 1}, got {selected}"
        )

    counts = parse_annotated(datasets[selected])
    print(json.dumps(counts, indent=2))


if __name__ == "__main__":
    main()