added base datasets
This commit is contained in:
61
dataset_analysis.py
Normal file
61
dataset_analysis.py
Normal file
@@ -0,0 +1,61 @@
|
||||
import json
|
||||
from pathlib import Path
|
||||
import os
|
||||
|
||||
from enums import TokenLabel
|
||||
|
||||
|
||||
def _iter_labels(tasks):
    """Yield every raw label found in *tasks*, validating the nesting on the way.

    Labels are read from ``task["annotations"][*]["result"][*]["value"]["labels"]``.
    A missing key at any level simply contributes no labels; a present key of
    the wrong JSON type raises ``ValueError``.
    """
    for item in tasks:
        if not isinstance(item, dict):
            raise ValueError("Dataset items must be JSON objects.")
        annotations = item.get("annotations", [])
        if not isinstance(annotations, list):
            raise ValueError("Annotations must be a list.")

        for annotation in annotations:
            if not isinstance(annotation, dict):
                raise ValueError("Each annotation must be a JSON object.")
            results = annotation.get("result", [])
            if not isinstance(results, list):
                raise ValueError("Annotation results must be a list.")

            for result in results:
                if not isinstance(result, dict):
                    raise ValueError("Each annotation result must be a JSON object.")
                value = result.get("value", {})
                if not isinstance(value, dict):
                    raise ValueError("Result value must be a JSON object.")
                labels = value.get("labels", [])
                if not isinstance(labels, list):
                    raise ValueError("Result labels must be a list.")
                yield from labels


def parse_annotated(path):
    """Count how often each ``TokenLabel`` occurs in an annotated dataset.

    The file at *path* must contain a JSON array of task objects. A label may
    be written either as a ``TokenLabel`` member name or as a member value;
    if a string is both a name and a value, it is counted under the name.

    Args:
        path: Location of the UTF-8 encoded JSON file.

    Returns:
        A dict mapping every ``TokenLabel`` member name to its number of
        occurrences (``0`` for labels that never appear).

    Raises:
        ValueError: If the file is not valid JSON, the top level is not an
            array, any nested level has the wrong type, or a label matches
            no ``TokenLabel`` name or value.
        OSError: If the file cannot be read.
    """
    data = json.loads(Path(path).read_text(encoding="utf-8"))
    if not isinstance(data, list):
        raise ValueError("Annotated dataset must be a JSON array.")

    counts = {label.name: 0 for label in TokenLabel}

    # Single lookup table covering both spellings of a label. Names are
    # merged last so they win over a value that happens to equal a name.
    canonical = {label.value: label.name for label in TokenLabel}
    canonical.update((name, name) for name in counts)

    for label in _iter_labels(data):
        try:
            key = canonical[label]
        except (KeyError, TypeError):
            # TypeError covers unhashable labels (e.g. a nested list), which
            # are just another kind of unknown label for the caller.
            raise ValueError(f"Unknown label: {label}") from None
        counts[key] += 1

    return counts
|
||||
|
||||
|
||||
if __name__ == "__main__":
    path = "./datasets/annotated/"

    # Collect every JSON file below the dataset directory.
    annotated_dataset_list = []
    for root, _dirs, filenames in os.walk(path):
        for filename in filenames:
            if filename.endswith(".json"):
                annotated_dataset_list.append(os.path.join(root, filename))

    # os.walk order depends on the filesystem; sort so the menu numbering is
    # the same on every run.
    annotated_dataset_list.sort()

    if not annotated_dataset_list:
        raise SystemExit(f"No .json files found under {path}")

    # Number-based menu for choosing a file.
    for index, file in enumerate(annotated_dataset_list):
        print(f"{index}: {file}")
    choice = input("Enter the number of the annotated dataset to analyze: ")

    try:
        selected = int(choice)
    except ValueError:
        raise SystemExit(f"Invalid selection: {choice!r} is not a number.") from None
    # Reject negatives explicitly: Python would otherwise index from the end
    # and silently pick a file that does not match the number shown.
    if not 0 <= selected < len(annotated_dataset_list):
        raise SystemExit(
            f"Invalid selection: choose a number between 0 and "
            f"{len(annotated_dataset_list) - 1}."
        )

    path = annotated_dataset_list[selected]
    counts = parse_annotated(path)
    print(json.dumps(counts, indent=2))
|
||||
Reference in New Issue
Block a user