import argparse
import datetime as dt
import json
import os
import secrets
import string
from pathlib import Path
from typing import Iterable

from google import genai
from loguru import logger

from .enums import TokenLabel

DEFAULT_BATCH_SIZE = 20
DEFAULT_MODEL = "gemini-2.5-flash-lite"
DEFAULT_RAW_LOG_PATH = "logs/gemini_raw.log"
DEFAULT_CONVERT_INPUT_DIR = "datasets/preannotated"
DEFAULT_CONVERT_OUTPUT_DIR = "datasets/annotated"

LABEL_NAMES = [label.name for label in TokenLabel]
VALUE_TO_NAME = {label.value: label.name for label in TokenLabel}
ID_ALPHABET = string.ascii_uppercase + string.digits


def _chunked(items: list[int], size: int) -> Iterable[list[int]]:
    """Yield consecutive slices of ``items`` containing at most ``size`` elements."""
    if size < 1:
        raise ValueError("batch_size must be at least 1.")
    for start in range(0, len(items), size):
        yield items[start : start + size]


def _generate_id() -> str:
    """Generate a random id of the form ``XXX-XXXXXX`` from uppercase letters and digits."""
    return (
        f"{''.join(secrets.choice(ID_ALPHABET) for _ in range(3))}"
        f"-{''.join(secrets.choice(ID_ALPHABET) for _ in range(6))}"
    )


def _all_occurrences(text: str, span: str) -> list[int]:
    """Return the start index of every (possibly overlapping) occurrence of ``span`` in ``text``."""
    occurrences = []
    start = 0
    while True:
        idx = text.find(span, start)
        if idx == -1:
            break
        occurrences.append(idx)
        start = idx + 1
    return occurrences


def _build_prompt(texts: list[str]) -> str:
    """Build the pre-annotation prompt listing the allowed labels and output rules."""
    labels = ", ".join(LABEL_NAMES)
    return (
        "You are a token pre-annotator. For each input text, return JSON with tagged "
        "token/subword/word/span labels.\n"
        f"Allowed labels: {labels}.\n"
        "Rules:\n"
        "- Output ONLY valid JSON (no markdown).\n"
        "- Return a JSON array with the same length/order as the input.\n"
        "- Each item must be an object: "
        '{"text": "", "tags": [{"span": "", "label": "