added base datasets

This commit is contained in:
2026-04-07 22:00:40 +05:30
commit ec6fbe40e4
14 changed files with 9253 additions and 0 deletions

11
.gitignore vendored Normal file
View File

@@ -0,0 +1,11 @@
# Python-generated files
__pycache__/
*.py[oc]
build/
dist/
wheels/
*.egg-info
# Virtual environments
.venv
logs/

1
.python-version Normal file
View File

@@ -0,0 +1 @@
3.12

73
README.md Normal file
View File

@@ -0,0 +1,73 @@
# clint-dataset
Dataset for labelling queries containing tasks in natural language, with a focus on command-line operations pertaining to popular CLI tools.
These queries were generated by prompting various commercially available LLMs and were pre-annotated using Gemini-2.5-flash-lite. They were then converted to a Label Studio-supported format, after which the annotations were manually revised.
## Setup
```bash
uv sync
```
Set your Gemini API key:
```bash
export GEMINI_API_KEY="your-key"
```
Optional environment variables:
```bash
export GEMINI_MODEL="gemini-2.5-flash-lite"
export GEMINI_RAW_LOG_FILE="logs/gemini_raw.log"
```
## Pre-annotate raw datasets
Raw datasets live in `datasets/raw` and contain:
```json
[{ "text": "Trim the first 15 seconds from 'video.mp4'." }]
```
Run the pre-annotator:
```bash
uv run python main.py --mode preannotate --input-dir datasets/raw --output-dir datasets/preannotated --batch-size 20
```
Output format (per item):
```json
{
"text": "Trim the first 15 seconds from 'video.mp4'.",
"tags": [
{ "span": "Trim", "label": "ACTION" },
{ "span": "15", "label": "NUMBER" }
]
}
```
Raw Gemini responses are logged to `logs/gemini_raw.log` (override with `--raw-log-file` or `GEMINI_RAW_LOG_FILE`).
## Convert preannotated → annotated
Convert pre-annotated files to Label Studio-style annotated JSON:
```bash
uv run python main.py --mode convert --input-dir datasets/preannotated --output-dir datasets/annotated
```
The converter generates IDs in `XXX-XXXXXX` format for annotation results and sets `annotations[].id` to a sequential number.
## Analyze annotated datasets
`dataset_analysis.parse_annotated(path)` returns a dict of label counts:
```python
from dataset_analysis import parse_annotated
counts = parse_annotated("datasets/annotated/ffmpeg_gpt_v1.json")
print(counts)
```

61
dataset_analysis.py Normal file
View File

@@ -0,0 +1,61 @@
import json
from pathlib import Path
import os
from enums import TokenLabel
def parse_annotated(path):
    """Count label occurrences in a Label Studio annotated dataset.

    Parameters
    ----------
    path : str | Path
        Path to a JSON file containing a list of Label Studio tasks.

    Returns
    -------
    dict[str, int]
        Mapping of every ``TokenLabel`` name to its occurrence count
        (labels that never appear are present with count 0).

    Raises
    ------
    ValueError
        If the file structure deviates from the expected Label Studio
        layout or an unknown label string is encountered.
    """
    data = json.loads(Path(path).read_text(encoding="utf-8"))
    if not isinstance(data, list):
        raise ValueError("Annotated dataset must be a JSON array.")
    counts = {label.name: 0 for label in TokenLabel}
    # Annotated data may carry either enum names or enum values
    # (RAW_PHRASE's serialized value is the string "0"), so accept both.
    known_names = {label.name for label in TokenLabel}
    value_to_name = {label.value: label.name for label in TokenLabel}
    for item in data:
        annotations = item.get("annotations", [])
        if not isinstance(annotations, list):
            raise ValueError("Annotations must be a list.")
        for annotation in annotations:
            results = annotation.get("result", [])
            if not isinstance(results, list):
                raise ValueError("Annotation results must be a list.")
            for result in results:
                labels = result.get("value", {}).get("labels", [])
                if not isinstance(labels, list):
                    raise ValueError("Result labels must be a list.")
                for label in labels:
                    if label in known_names:
                        counts[label] += 1
                    elif label in value_to_name:
                        counts[value_to_name[label]] += 1
                    else:
                        raise ValueError(f"Unknown label: {label}")
    return counts
if __name__ == "__main__":
    root = "./datasets/annotated/"
    # Collect every annotated JSON file beneath the annotated-datasets folder.
    annotated_dataset_list = [
        os.path.join(dirpath, filename)
        for dirpath, _dirnames, filenames in os.walk(root)
        for filename in filenames
        if filename.endswith(".json")
    ]
    # Number-based menu so the user can pick which dataset to analyze.
    # enumerate avoids the O(n^2) (and duplicate-unsafe) list.index lookup.
    for index, dataset_path in enumerate(annotated_dataset_list):
        print(f"{index}: {dataset_path}")
    choice = input("Enter the number of the annotated dataset to analyze: ")
    counts = parse_annotated(annotated_dataset_list[int(choice)])
    print(json.dumps(counts, indent=2))

2
dataset_augmentor.py Normal file
View File

@@ -0,0 +1,2 @@

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,377 @@
[
{
"text": "Trim the first 15 seconds from 'Summer Vacation (Italy) 2024.mp4' and save it."
},
{
"text": "Split the file 'The.Bear.S03E05.720p.mkv' into three equal segments."
},
{
"text": "Stitch together 'intro_sequence.mov' and 'Main_Feature_V2_Final.mp4' into a single output."
},
{
"text": "From 'interview_recording_01 (Backup).avi', slice out the segment from 00:05:00 to 00:10:00."
},
{
"text": "Take all videos in the /raw_footage/ folder and stitch them in alphabetical order."
},
{
"text": "For every file matching 'clip_??_scene.mp4', trim the last 2 seconds."
},
{
"text": "In the directory /exports/, find videos longer than 10 minutes and split them at the halfway point."
},
{
"text": "Splicing 'Logo_Overlay.png' at the start of 'Product_Demo_Final_1080p.mp4' is the first step."
},
{
"text": "Remove the middle 20 seconds from 'Vlog #42 - My New House.mp4' starting at 02:30."
},
{
"text": "If a video has a resolution of 3840x2160, split it into four 4K quadrants."
},
{
"text": "Stitch 'Part_A.mp4', 'Part_B.mp4', and 'Part_C.mp4' using a crossfade transition."
},
{
"text": "For each video in the 'Daily_Uploads' folder, trim the intro if it is longer than 5 seconds."
},
{
"text": "The file 'Family.Reunion.(2023).Part.1.mkv' needs to be trimmed to just the first hour."
},
{
"text": "Filter for all .mov files with a bitrate under 2000kbps and stitch them into 'low_res_compilation.mov'."
},
{
"text": "Apply a split every 30 seconds to the file 'security_cam_08-24-2026.mp4'."
},
{
"text": "Join 'scene_01_take_05.mp4' and 'scene_01_take_06.mp4' but trim the slate from the start of each."
},
{
"text": "Using a glob pattern 'vid_*.mp4', find all matching files and stitch them together."
},
{
"text": "With 'Wedding_Highlight_Video_(Extended_Version).mp4', slice the section between 12:45 and 15:20."
},
{
"text": "For any video containing 'DRAFT' in the filename, trim the last 10 seconds of silence."
},
{
"text": "Split 'concert_full_set.mkv' into separate files based on the metadata chapter markers."
},
{
"text": "Stitch 'b-roll_city_traffic.mp4' with 'audio_track_02.wav' and then trim it to 30 seconds."
},
{
"text": "Iterate through 'C:/Users/Media/Desktop/*.avi' and split each into 60-second clips."
},
{
"text": "If the duration of 'presentation_recording.mp4' exceeds 1 hour, split it into two parts."
},
{
"text": "Trim '01_Intro.mp4' to end exactly at 00:00:15."
},
{
"text": "Splicing 'outro_credits.mp4' onto the end of all videos in the 'Finished' folder is required."
},
{
"text": "From 'Nature_Documentary_S01E01_The_Forest.mp4', remove the segments with no audio."
},
{
"text": "Stitch all videos with the 'HDR' tag in their metadata into a single reel."
},
{
"text": "For every .mp4 file in the current directory, trim the first 100 frames."
},
{
"text": "Split 'long_tutorial_v3.mp4' at the 10:00, 20:00, and 30:00 marks."
},
{
"text": "The video 'Mountain_Climb_(Edited).mov' needs the last 5 minutes trimmed off."
},
{
"text": "Find files matching the regex '^scene_[0-9]{3}\\.mp4$' and stitch them sequentially."
},
{
"text": "Combine 'logo_animation_fixed.mp4' with 'main_content_v4.mp4' and 'social_media_tags.mp4'."
},
{
"text": "Slice 'podcasting_session_01.mp4' every time a silence longer than 2 seconds is detected."
},
{
"text": "If the file size is greater than 5GB, split '4K_Drone_Footage_Raw.mp4' into 2GB chunks."
},
{
"text": "Trim the video '2026-04-07_Log_Entry.mp4' to start at the first detected motion."
},
{
"text": "For each video in 'project_x', stitch them together if they share the same frame rate."
},
{
"text": "Splice 'emergency_alert_broadcast.mp4' into the middle of 'regular_programming.ts' at 15:00."
},
{
"text": "With 'The_Grand_Budapest_Hotel_Trailer.mp4', trim the black frames from the beginning and end."
},
{
"text": "Split 'GOPR0012.MP4' into segments whenever the camera GPS coordinates change."
},
{
"text": "Stitch 'Clip (1).mp4', 'Clip (2).mp4', and 'Clip (3).mp4' in numerical order."
},
{
"text": "For all videos in 'Archive' with a 4:3 aspect ratio, trim them to 1 minute."
},
{
"text": "Trim the audio-only portion from 'lecture_series_04.m4v' and stitch it with 'slides_04.mp4'."
},
{
"text": "Using the file 'interview_subject_A.mp4', slice out every instance where the speaker says 'um'."
},
{
"text": "Split the video 'test_pattern_v12.avi' every 500MB."
},
{
"text": "Stitch the files listed in 'file_list.txt' into a seamless video."
},
{
"text": "For any video created before 2025, trim the metadata and splice a new header."
},
{
"text": "Take 'my_cool_video.mp4' and 'another_one.mp4' and stitch them together."
},
{
"text": "The file 'Succession.S04E10.1080p.mkv' should be split at the 30-minute mark."
},
{
"text": "Trim the start of 'morning_routine_vlog.mp4' by 12 seconds."
},
{
"text": "Splice a 2-second black screen between 'intro.mp4' and 'body.mp4'."
},
{
"text": "Find videos with 'H.264' codec and stitch them into a compilation called 'H264_Reel.mp4'."
},
{
"text": "For each video in the folder 'TikTok_Drafts', trim the last 0.5 seconds."
},
{
"text": "Split 'CCTV_Storage_Disk_A.mp4' into hourly intervals."
},
{
"text": "From 'Cinematic_Shot_[001].mov', remove the first 2 seconds of camera shake."
},
{
"text": "Stitch 'Top_10_Countdown_Part_10.mp4' through 'Top_10_Countdown_Part_1.mp4' in reverse order."
},
{
"text": "Trim 'cooking_tutorial_raw.mp4' based on the bookmarks in the file metadata."
},
{
"text": "For all files matching '*_backup.mp4', split them in half and delete the second half."
},
{
"text": "Splice 'watermark_fixed.png' into the corner of 'product_render.mp4' for its entire duration."
},
{
"text": "With 'Breaking_Bad_S05E16.mkv', trim the 'previously on' segment."
},
{
"text": "Split the file 'ambience_loop_forest.mp4' exactly at the point where the loop repeats."
},
{
"text": "Stitch 'phone_video_01.mp4' and 'phone_video_02.mp4' after rotating the second one."
},
{
"text": "For every video in 'Dailies/Scene_5/', trim the clapperboard at the start."
},
{
"text": "In 'webinar_recap.mp4', slice out the Q&A session from 45:00 to the end."
},
{
"text": "If a video is 1920x1080, splice the 'standard_definition_warning.mp4' to the front."
},
{
"text": "Split 'gameplay_recording_3hr.mp4' into three 1-hour files."
},
{
"text": "Trim 'Birthday_Party_(Edited).mp4' so it ends right before the candles are blown out."
},
{
"text": "Stitch 'camera_1_view.mp4' and 'camera_2_view.mp4' into a side-by-side split screen."
},
{
"text": "For each .mkv file in the 'Movies' folder, trim the first 30 seconds of credits."
},
{
"text": "Using the pattern 'shot_v[0-9].mp4', stitch the highest version numbers together."
},
{
"text": "From 'stock_footage_clouds.mov', trim the sections where the sun is obscured."
},
{
"text": "Split 'unboxing_video_long.mp4' at every scene change detected with 0.4 threshold."
},
{
"text": "Stitch 'audio_narration.mp3' onto 'silent_presentation.mp4' and trim to the shorter length."
},
{
"text": "For any video with 'vertical' in the name, trim the top and bottom to make it 16:9."
},
{
"text": "Slice 'long_running_process.mp4' to only include the first and last 5 minutes."
},
{
"text": "If the video 'test_render_01.mp4' has no audio track, splice in 'white_noise.wav'."
},
{
"text": "Trim the intro and outro of all videos in the 'Course_Modules' subfolder."
},
{
"text": "Split '4k_landscape_timelapse.mp4' into 250 individual image frames."
},
{
"text": "Stitch 'wedding_ceremony.mp4' and 'wedding_reception.mp4' with a 5-second fade."
},
{
"text": "For each file in 'temp_renders/', if it is less than 1 second long, delete it."
},
{
"text": "Trim 'The_Dark_Knight_Trailer (2008).mp4' to only include the Joker scenes."
},
{
"text": "Splicing 'end_card_v3.mp4' to every video in 'Youtube_Uploads' is the goal."
},
{
"text": "Split 'CCTV_04_07_26.mp4' whenever motion is detected in the 'gate' region."
},
{
"text": "With 'tutorial_recording.mp4', slice the segment from 05:00 to 07:00 and save as 'Highlight.mp4'."
},
{
"text": "Stitch all videos in 'raw_clips' that were recorded at 60fps."
},
{
"text": "For any video named 'Untitled (Copy).mp4', trim it to 10 seconds and rename it."
},
{
"text": "Split 'audio_sync_test.mp4' into separate video and audio streams."
},
{
"text": "Trim 'Gym_Workout_Session.mp4' to remove the rest periods between sets."
},
{
"text": "Stitch 'A.mp4', 'B.mp4', and 'C.mp4' but reverse the order to 'C.mp4', 'B.mp4', 'A.mp4'."
},
{
"text": "For each video in 'Client_Review', splice the 'DRAFT' watermark across the middle."
},
{
"text": "Find videos with duration > 300s and split them into 60s clips."
},
{
"text": "The video 'Street_Food_Tour [HDR].mov' needs the first 2 minutes trimmed."
},
{
"text": "Splice 'transition_effect.mp4' between every video in the 'Montage' folder during stitching."
},
{
"text": "From 'Interview_with_CEO_Final.mp4', trim the first 3 seconds of dead air."
},
{
"text": "Split '24_hour_surveillance_feed.mp4' into 24 one-hour segments."
},
{
"text": "Stitch 'Intro (English).mp4' with 'Content.mp4' and 'Outro (English).mp4'."
},
{
"text": "For each file in the 'Renders' folder, if the width is 720, trim it to 5 seconds."
},
{
"text": "Trim 'Space_X_Launch_Live.mkv' to start at T-minus 10 seconds."
},
{
"text": "Using glob '*_scene_*.mp4', stitch clips with matching scene numbers together."
},
{
"text": "Slice 'nature_walk_4k.mp4' to remove the shaky footage at the 12-minute mark."
},
{
"text": "Split 'Podcast_Ep_12.mp4' at the timestamps provided in the 'chapters.json' file."
},
{
"text": "Stitch 'Scene_1.mp4' and 'Scene_1_Alt_End.mp4' and trim the overlap."
},
{
"text": "For all videos in 'Archive' with the file extension .flv, stitch them into one .mp4."
},
{
"text": "Trim the last 30 frames from every video in the 'Animation_Export' folder."
},
{
"text": "Splice 'Sponsor_Segment.mp4' into 'Gaming_Video_01.mp4' at the 8-minute mark."
},
{
"text": "If a video has stereo audio, split it into two mono-audio video files."
},
{
"text": "With 'Travel_Vlog_Ep1 (Final_Draft).mp4', trim the end where the camera falls."
},
{
"text": "Split 'Concert_Multicam.mp4' into 4 separate files, one for each camera angle."
},
{
"text": "Stitch all videos in 'Dailies' that have a creation date of '2026-04-07'."
},
{
"text": "For each video in the 'Trash' folder, trim it to 0 seconds (effectively clear content)."
},
{
"text": "Trim 'Drone_Shot_05.mov' to the first 1000 frames."
},
{
"text": "Slice 'Long_Lecture.mp4' to extract only the slides that contain the word 'Biology'."
},
{
"text": "Split 'Heavy_Metal_Music_Video.mp4' at every beat of the drum (120 BPM)."
},
{
"text": "Stitch 'Part_01_v1.mp4' through 'Part_10_v1.mp4' using the 'list' command."
},
{
"text": "For every video in 'exports/', trim the filename prefix 'FINAL_' and then stitch them."
},
{
"text": "Trim 'Ocean_Waves_Ambient.mp4' to be a perfect 10-second loop."
},
{
"text": "Splice 'Director_Commentary.ac3' into 'The_Movie_Title.mkv' as a second audio track."
},
{
"text": "If the file 'clip_a.mp4' is shorter than 'clip_b.mp4', stitch them in that order."
},
{
"text": "Split 'NASA_Space_Station_Feed.mp4' whenever the video bitrate drops to zero."
},
{
"text": "Trim the last 5 seconds from 'My_Daughter's_First_Steps.mp4'."
},
{
"text": "Stitch 'Intro.mp4' with every file in 'Chapters/' and save them as individual episodes."
},
{
"text": "For each video in 'Pending', trim the first 10 seconds and move to 'Ready'."
},
{
"text": "Slice 'Webinar_Recording_2026.mp4' at 01:20:00 and discard the rest."
},
{
"text": "Split 'Slow_Motion_Reference.mp4' into segments of 120 frames each."
},
{
"text": "Stitch all .mp4 files in 'Folder A' and 'Folder B' into a single master file."
},
{
"text": "Trim 'The_Last_of_Us_S01E01.mkv' to end right when the music starts."
}
]

View File

@@ -0,0 +1,200 @@
[
{
"text": "Convert interview.mp4 to H.265 and reduce the file size under 120MB"
},
{
"text": "Take all the MOV files in my Downloads folder and convert them to MP4"
},
{
"text": "Strip the audio from promo_video.mp4 and replace it with background_music.mp3"
},
{
"text": "Extract a clip from 00:02:15 to 00:05:44 from lecture.mkv without re-encoding"
},
{
"text": "Convert every PNG in this directory into a single timelapse video at 24fps"
},
{
"text": "Add burnt-in subtitles from subtitles.srt to hello_film.mp4"
},
{
"text": "Compress all my drone videos to be under 50MB each, keeping 1080p"
},
{
"text": "Extract the audio track from podcast_episode_12.mp4 as a FLAC file"
},
{
"text": "Watermark every video in the /exports folder with logo.png in the bottom right"
},
{
"text": "Resize input.mov to 720p and convert to WebM"
},
{
"text": "Take a screenshot from the video at the 30-second mark"
},
{
"text": "Merge part1.mp4, part2.mp4, and part3.mp4 into one file"
},
{
"text": "Normalize the loudness of all audio files in my podcast folder to -16 LUFS"
},
{
"text": "Speed up the timelapse footage to 4x without changing the audio pitch"
},
{
"text": "Create a side-by-side comparison video of before.mp4 and after.mp4"
},
{
"text": "Convert the MKV to MP4 keeping all subtitle and audio tracks"
},
{
"text": "Extract one frame every 5 seconds from the dashcam_recording video file and save them to frames/"
},
{
"text": "Add a 3-second black fade-in and fade-out to presentation clip video file"
},
{
"text": "Loop the henry.avi exactly 10 times and export as a single video"
},
{
"text": "Crop 39fjsai.mp4 to a vertical 9:16 format for Instagram"
},
{
"text": "Remove the first 10 seconds from all the clips in the /raw folder"
},
{
"text": "Convert my audio files from MP3 to AAC at 192kbps"
},
{
"text": "Create a GIF from the first 4 seconds of the youtube_download.webm starting at 1:12"
},
{
"text": "Reduce video bitrate to 1200kbps and encode audio to 64kbps aac of to___encode.mp4 and save it as output.mkv"
},
{
"text": "Rotate the hea3434gvjj__www.download.com.mp4 to 90 degrees clockwise"
},
{
"text": "Extract all keyframes from claivido_final.mkv as JPEG images"
},
{
"text": "Add chapter markers and a cover thumbnail to podcast.mp4"
},
{
"text": "Convert the entire /recordings folder from WAV to MP3 at 320kbps"
},
{
"text": "Stabilize all the videos created today and save them to stabilized folder"
},
{
"text": "Scale down all images in the sike directory to a max width of 1920px"
},
{
"text": "Re-encode stupid_video.avi using nvenc and with hevc at 1080p"
},
{
"text": "Create a picture-in-picture with webcam.mp4 overlaid on screen_recording.mp4 in bottom right"
},
{
"text": "Convert all my .avi files to MKV with lossless encoding"
},
{
"text": "Reverse all video files containing '_reverse_needed_' for a rewind effect and save them to a directory starting with reversed"
},
{
"text": "Re-wrap the H.264 stream from .ts to .mp4 with no re-encode"
},
{
"text": "Convert the RAW image sequence from /timelapse to a 4K ProRes video"
},
{
"text": "Split the 3-hour conference recording into 30-minute segments"
},
{
"text": "Remove the audio entirely from all videos in the /silent folder"
},
{
"text": "Create a low-res proxy version of the 4K footage for offline editing"
},
{
"text": "Encode the video with 2-pass encoding for a target bitrate of 2Mbps"
},
{
"text": "Add a scrolling lower-third text overlay to the interview"
},
{
"text": "Export audio as stereo WAV at 48kHz for DaVinci Resolve"
},
{
"text": "Take every file in the folder and generate a 10-second preview clip from the middle"
},
{
"text": "Convert the SRT subtitle file to WebVTT format"
},
{
"text": "Deinterlace the video files in /old_footage"
},
{
"text": "Create an optimized animated WebP from the video loop"
},
{
"text": "Stack 203jfh.mp4 and 29jfj21112.webm and aosidjgf22.mkv vertically into one tall video and save it as output"
},
{
"text": "Convert all images in the product folder to WebP with 80% quality"
},
{
"text": "Extract just the chapter 3 portion based on chapter metadata of 029jghijdf__asuihttp.mp4"
},
{
"text": "Compress all JPEGs in the folder to a max of 200KB per image"
},
{
"text": "Convert and re-tag all WMA files to AAC, preserving all metadata"
},
{
"text": "Create a 1:1 square version of all videos created last thursday with padding"
},
{
"text": "Trim the last 30 seconds off every clip in my /raw directory"
},
{
"text": "Add a podcast intro and outro to each episode file in the folder"
},
{
"text": "Mux the separate video_new_final-release.h264 and 12831245_audio.aac streams into an MP4"
},
{
"text": "Take the interview clip and output it in every social format: 16:9, 9:16, and 1:1"
},
{
"text": "Encode a lossless copy for archival and a compressed copy for sharing"
},
{
"text": "Apply CLAHE contrast enhancement to all images in the dataset folder"
},
{
"text": "Add chapter info from chapters.txt into the output MKV"
},
{
"text": "Create a video slideshow from photos in slideshow_photos/ with 3-second holds and a Ken Burns effect"
},
{
"text": "Convert the audio to mono for podcast distribution"
},
{
"text": "Batch-resize all product photos in product/ to exactly 800x800 with white padding"
},
{
"text": "Extract all frames between 10s and 20s in 290fj3.mp4 as PNG files"
},
{
"text": "Build a mosaic of all the clip thumbnails into one image grid"
},
{
"text": "Re-encode everything in the archive with AV1"
},
{
"text": "Clip the first 5 seconds from all lecture recordings and save separately"
}
]

23
enums.py Normal file
View File

@@ -0,0 +1,23 @@
from enum import Enum
class TokenLabel(Enum):
    """Closed set of token-level labels used to annotate query spans.

    Every member's value equals its name, except RAW_PHRASE, whose
    serialized value is the literal string "0" (annotated data may carry
    either the names or these values).
    """

    ACTION = "ACTION"
    TARGET = "TARGET"
    PREPOSITION = "PREPOSITION"
    CONJUNCTION = "CONJUNCTION"
    MODIFIER = "MODIFIER"
    OBJECT = "OBJECT"
    VALUE_TOKEN = "VALUE_TOKEN"
    NUMBER = "NUMBER"
    UNIT = "UNIT"
    TIME = "TIME"
    DATE = "DATE"
    ORDINAL = "ORDINAL"
    COMPARATOR = "COMPARATOR"
    FILTER_HINT = "FILTER_HINT"
    PATH = "PATH"
    PATTERN = "PATTERN"
    URL = "URL"
    TIMESTAMP = "TIMESTAMP"
    RAW_PHRASE = "0"  # fallback when no other label applies; stored as "0"

359
main.py Normal file
View File

@@ -0,0 +1,359 @@
import argparse
import datetime as dt
import json
import os
import secrets
import string
from pathlib import Path
from typing import Iterable
from google import genai
from loguru import logger
from .enums import TokenLabel
# Number of texts sent to Gemini per request when --batch-size is not given.
DEFAULT_BATCH_SIZE = 20
# Gemini model used when neither --model nor GEMINI_MODEL is set.
DEFAULT_MODEL = "gemini-2.5-flash-lite"
# Raw Gemini responses are appended here (override via --raw-log-file
# or GEMINI_RAW_LOG_FILE).
DEFAULT_RAW_LOG_PATH = "logs/gemini_raw.log"
# Default input/output directories for the "convert" mode.
DEFAULT_CONVERT_INPUT_DIR = "datasets/preannotated"
DEFAULT_CONVERT_OUTPUT_DIR = "datasets/annotated"
# Canonical label names, plus a map from enum values back to names
# (RAW_PHRASE's value is "0", so both spellings are accepted).
LABEL_NAMES = [label.name for label in TokenLabel]
VALUE_TO_NAME = {label.value: label.name for label in TokenLabel}
# Character pool for the random XXX-XXXXXX identifiers.
ID_ALPHABET = string.ascii_uppercase + string.digits
def _chunked(items: list[int], size: int) -> Iterable[list[int]]:
if size < 1:
raise ValueError("batch_size must be at least 1.")
for start in range(0, len(items), size):
yield items[start : start + size]
def _generate_id() -> str:
    """Return a random identifier in XXX-XXXXXX form (uppercase/digits)."""
    prefix = "".join(secrets.choice(ID_ALPHABET) for _ in range(3))
    suffix = "".join(secrets.choice(ID_ALPHABET) for _ in range(6))
    return f"{prefix}-{suffix}"
def _all_occurrences(text: str, span: str) -> list[int]:
occurrences = []
start = 0
while True:
idx = text.find(span, start)
if idx == -1:
break
occurrences.append(idx)
start = idx + 1
return occurrences
def _build_prompt(texts: list[str]) -> str:
    """Compose the Gemini instruction prompt for a batch of input texts."""
    allowed = ", ".join(LABEL_NAMES)
    lines = [
        "You are a token pre-annotator. For each input text, return JSON with tagged ",
        "token/subword/word/span labels.\n",
        f"Allowed labels: {allowed}.\n",
        "Rules:\n",
        "- Output ONLY valid JSON (no markdown).\n",
        "- Return a JSON array with the same length/order as the input.\n",
        "- Each item must be an object: ",
        '{"text": "<original>", "tags": [{"span": "<exact substring>", "label": "<LABEL>"}]}.\n',
        "- The span must be an exact substring of the original text.\n",
        "- Use RAW_PHRASE when no other label applies.\n\n",
        f"Input texts: {json.dumps(texts, ensure_ascii=True)}",
    ]
    return "".join(lines)
def _normalize_label(label: str) -> str:
    """Map *label* (an enum name or enum value) to the canonical enum name.

    Raises ValueError for any string that is neither.
    """
    if label in LABEL_NAMES:
        return label
    try:
        return VALUE_TO_NAME[label]
    except KeyError:
        raise ValueError(f"Unknown label: {label}") from None
def _normalize_result(text: str, result: dict) -> dict:
    """Validate one Gemini result object and return it with canonical labels.

    The result must echo *text* exactly and carry a list of tags whose
    spans are substrings of *text*; labels are normalized to enum names.

    Raises ValueError on any structural or span mismatch.
    """
    if result.get("text") != text:
        raise ValueError("Gemini result text does not match input text.")
    tags = result.get("tags")
    if not isinstance(tags, list):
        raise ValueError("Gemini result tags must be a list.")
    cleaned: list[dict] = []
    for tag in tags:
        if not isinstance(tag, dict):
            raise ValueError("Each tag must be an object.")
        span, label = tag.get("span"), tag.get("label")
        if not isinstance(span, str) or not isinstance(label, str):
            raise ValueError("Each tag must include string span and label fields.")
        if span not in text:
            raise ValueError(f"Span not found in text: {span}")
        cleaned.append({"span": span, "label": _normalize_label(label)})
    return {"text": text, "tags": cleaned}
def preannotate_tokens(
    texts: list[str], client: genai.Client, model: str
) -> list[dict]:
    """Ask Gemini to tag a batch of texts and return normalized results.

    Parameters
    ----------
    texts : list[str]
        Batch of input texts; the response must match its length and order.
    client : genai.Client
        Configured Google Gen AI client.
    model : str
        Model name passed to ``generate_content``.

    Returns
    -------
    list[dict]
        One ``{"text", "tags"}`` dict per input, validated by
        ``_normalize_result``.

    Raises
    ------
    ValueError
        On an empty response, unparsable/mismatched JSON, or invalid tags.
    """
    prompt = _build_prompt(texts)
    response = client.models.generate_content(model=model, contents=prompt)
    if response.text is None:
        raise ValueError("Gemini returned an empty response.")
    # Log the verbatim response before any cleanup so the raw log is faithful.
    logger.bind(raw_gemini=True).trace(response.text)
    payload = response.text.strip()
    # Models sometimes wrap JSON in markdown fences despite the prompt's
    # "no markdown" rule; strip them so valid payloads still parse.
    if payload.startswith("```"):
        payload = payload.removeprefix("```json").removeprefix("```").strip()
        payload = payload.removesuffix("```").strip()
    parsed = json.loads(payload)
    if not isinstance(parsed, list) or len(parsed) != len(texts):
        raise ValueError("Gemini response must be a JSON array matching input length.")
    return [_normalize_result(text, result) for text, result in zip(texts, parsed)]
def _load_raw_records(path: Path) -> list[dict]:
raw = json.loads(path.read_text(encoding="utf-8"))
if not isinstance(raw, list):
raise ValueError(f"Dataset {path} must be a JSON array.")
for idx, item in enumerate(raw):
if not isinstance(item, dict) or "text" not in item:
raise ValueError(
f"Dataset {path} item {idx} must be an object with a text field."
)
return raw
def _load_preannotated_records(path: Path) -> list[dict]:
raw = json.loads(path.read_text(encoding="utf-8"))
if not isinstance(raw, list):
raise ValueError(f"Dataset {path} must be a JSON array.")
for idx, item in enumerate(raw):
if not isinstance(item, dict) or "text" not in item:
raise ValueError(
f"Dataset {path} item {idx} must be an object with a text field."
)
if "tags" not in item or not isinstance(item["tags"], list):
raise ValueError(
f"Dataset {path} item {idx} must include a tags list for conversion."
)
return raw
def _build_labelstudio_results(text: str, tags: list[dict]) -> list[dict]:
    """Convert span/label tags into Label Studio "result" entries for *text*.

    Repeated identical spans are assigned to successive occurrences in
    *text* (first tag -> first occurrence, second tag -> second, ...).

    Raises ValueError when a tag is malformed or there are more tags for a
    span than occurrences of that span in *text*.
    """
    # Per-span list of match offsets, and how many of them have been consumed.
    occurrences_map: dict[str, list[int]] = {}
    occurrence_index: dict[str, int] = {}
    results = []
    for tag in tags:
        if not isinstance(tag, dict):
            raise ValueError("Each tag must be an object.")
        span = tag.get("span")
        label = tag.get("label")
        if not isinstance(span, str) or not isinstance(label, str):
            raise ValueError("Each tag must include string span and label fields.")
        if span not in occurrences_map:
            # Compute all occurrences lazily, once per distinct span.
            occurrences_map[span] = _all_occurrences(text, span)
            occurrence_index[span] = 0
        occurrences = occurrences_map[span]
        idx = occurrence_index[span]
        if idx >= len(occurrences):
            # Also covers the span-absent case (zero occurrences).
            raise ValueError(f"Span not found in text: {span}")
        start = occurrences[idx]
        end = start + len(span)
        occurrence_index[span] = idx + 1
        results.append(
            {
                "value": {
                    "start": start,
                    "end": end,
                    "text": span,
                    "labels": [_normalize_label(label)],
                },
                "id": _generate_id(),
                "from_name": "label",
                "to_name": "text",
                "type": "labels",
                "origin": "manual",
            }
        )
    return results
def _preannotate_dataset(
    input_path: Path,
    output_path: Path,
    client: genai.Client,
    model: str,
    batch_size: int,
) -> None:
    """Pre-annotate one raw dataset file with Gemini and write the result.

    Reads a JSON array of ``{"text": ...}`` records from *input_path*,
    tags each text in batches of *batch_size*, and writes the records
    (with an added "tags" list) to *output_path* as indented JSON.

    Raises ValueError if the input is malformed or any item ends up
    without a result.
    """
    raw_items = _load_raw_records(input_path)
    texts = [item["text"] for item in raw_items]
    # Results are placed by index so completeness can be verified at the end.
    annotated_items: list[dict | None] = [None] * len(raw_items)
    logger.info("Pre-annotating {} records from {}", len(texts), input_path)
    for batch_indices in _chunked(list(range(len(texts))), batch_size):
        batch_texts = [texts[idx] for idx in batch_indices]
        logger.debug(
            "Sending batch {}-{} (size {}) to Gemini",
            batch_indices[0],
            batch_indices[-1],
            len(batch_texts),
        )
        batch_results = preannotate_tokens(batch_texts, client, model)
        for idx, result in zip(batch_indices, batch_results):
            # Shallow-copy so the loaded record is not mutated in place.
            item = dict(raw_items[idx])
            item["tags"] = result["tags"]
            annotated_items[idx] = item
    if any(item is None for item in annotated_items):
        raise ValueError("Pre-annotation failed to produce results for all items.")
    output_path.write_text(
        json.dumps(annotated_items, indent=2, ensure_ascii=True), encoding="utf-8"
    )
    logger.info("Wrote pre-annotated dataset to {}", output_path)
def _convert_preannotated_dataset(input_path: Path, output_path: Path) -> None:
    """Convert one pre-annotated dataset into Label Studio annotated JSON.

    Each input record becomes a Label Studio task with a single annotation
    whose results are built from the record's tags. Task and result IDs use
    the random XXX-XXXXXX format; annotation IDs are sequential (1-based).

    Raises ValueError if the input file is malformed or a tag cannot be
    located in its text.
    """
    preannotated_items = _load_preannotated_records(input_path)
    # One UTC timestamp shared by every task/annotation generated in this run.
    now_iso = dt.datetime.now(dt.UTC).isoformat()
    tasks = []
    logger.info("Converting {} records from {}", len(preannotated_items), input_path)
    for index, item in enumerate(preannotated_items):
        text = item["text"]
        tags = item["tags"]
        task_id = _generate_id()
        annotation_id = index + 1
        results = _build_labelstudio_results(text, tags)
        annotation = {
            "id": annotation_id,
            # NOTE(review): hard-coded annotator id — confirm it matches the
            # intended Label Studio user account.
            "completed_by": 2,
            "result": results,
            "was_cancelled": False,
            "ground_truth": False,
            "created_at": now_iso,
            "updated_at": now_iso,
            "draft_created_at": now_iso,
            "lead_time": 0.0,
            "prediction": {},
            "result_count": len(results),
            "unique_id": _generate_id(),
            "import_id": None,
            "last_action": None,
            "bulk_created": False,
            "task": task_id,
            "project": None,
            "updated_by": None,
            "parent_prediction": None,
            "parent_annotation": None,
            "last_created_by": None,
        }
        tasks.append(
            {
                "id": task_id,
                "annotations": [annotation],
                "file_upload": None,
                "drafts": [],
                "predictions": [],
                "data": {"text": text},
                "meta": {},
                "created_at": now_iso,
                "updated_at": now_iso,
                "allow_skip": True,
                "inner_id": index + 1,
                "total_annotations": 1,
                "cancelled_annotations": 0,
                "total_predictions": 0,
                "comment_count": 0,
                "unresolved_comment_count": 0,
                "last_comment_updated_at": None,
                "project": None,
                "updated_by": None,
                "comment_authors": [],
            }
        )
    output_path.write_text(
        json.dumps(tasks, indent=2, ensure_ascii=True), encoding="utf-8"
    )
    logger.info("Wrote converted annotated dataset to {}", output_path)
def main():
    """CLI entry point: pre-annotate raw datasets or convert pre-annotated ones.

    Modes:
    - preannotate (default): send each dataset in --input-dir to Gemini in
      batches and write tagged copies to --output-dir. Requires
      GEMINI_API_KEY in the environment.
    - convert: turn tagged datasets into Label Studio-style annotated JSON.

    Raises ValueError when required configuration or input files are missing.
    """
    parser = argparse.ArgumentParser(description="Pre-annotate datasets with Gemini.")
    parser.add_argument(
        "--mode",
        choices=["preannotate", "convert"],
        default="preannotate",
    )
    parser.add_argument("--input-dir", default=None)
    parser.add_argument("--output-dir", default=None)
    # Process a single file instead of every *.json in --input-dir.
    parser.add_argument("--input-file", default=None)
    parser.add_argument("--batch-size", type=int, default=DEFAULT_BATCH_SIZE)
    parser.add_argument("--model", default=os.getenv("GEMINI_MODEL", DEFAULT_MODEL))
    parser.add_argument(
        "--raw-log-file",
        default=os.getenv("GEMINI_RAW_LOG_FILE", DEFAULT_RAW_LOG_PATH),
    )
    args = parser.parse_args()
    if args.mode == "preannotate":
        # Route raw Gemini responses (records bound with raw_gemini=True)
        # to a dedicated log sink, verbatim (message-only format).
        raw_log_path = Path(args.raw_log_file)
        raw_log_path.parent.mkdir(parents=True, exist_ok=True)
        logger.add(
            raw_log_path,
            level="TRACE",
            filter=lambda record: record["extra"].get("raw_gemini") is True,
            format="{message}",
        )
        api_key = os.getenv("GEMINI_API_KEY")
        if not api_key:
            raise ValueError("GEMINI_API_KEY must be set in the environment.")
        client = genai.Client(api_key=api_key)
        input_dir = Path(args.input_dir or "datasets/raw")
        output_dir = Path(args.output_dir or "datasets/preannotated")
        output_dir.mkdir(parents=True, exist_ok=True)
        if args.input_file:
            input_paths = [Path(args.input_file)]
        else:
            input_paths = sorted(input_dir.glob("*.json"))
        if not input_paths:
            raise ValueError(f"No input datasets found in {input_dir}.")
        logger.info(
            "Starting pre-annotation: model={}, batch_size={}, input_dir={}, output_dir={}",
            args.model,
            args.batch_size,
            input_dir,
            output_dir,
        )
        for input_path in input_paths:
            # Output keeps the same file name under the output directory.
            output_path = output_dir / input_path.name
            _preannotate_dataset(
                input_path, output_path, client, args.model, args.batch_size
            )
        return
    # convert mode: no Gemini client or raw-response log sink needed.
    input_dir = Path(args.input_dir or DEFAULT_CONVERT_INPUT_DIR)
    output_dir = Path(args.output_dir or DEFAULT_CONVERT_OUTPUT_DIR)
    output_dir.mkdir(parents=True, exist_ok=True)
    if args.input_file:
        input_paths = [Path(args.input_file)]
    else:
        input_paths = sorted(input_dir.glob("*.json"))
    if not input_paths:
        raise ValueError(f"No input datasets found in {input_dir}.")
    logger.info(
        "Starting conversion: input_dir={}, output_dir={}",
        input_dir,
        output_dir,
    )
    for input_path in input_paths:
        output_path = output_dir / input_path.name
        _convert_preannotated_dataset(input_path, output_path)

12
pyproject.toml Normal file
View File

@@ -0,0 +1,12 @@
[project]
name = "clint-dataset"
version = "0.1.0"
description = "Tooling for pre-annotating and converting natural-language CLI task queries into Label Studio datasets"
readme = "README.md"
requires-python = ">=3.12"
dependencies = [
"google-genai>=1.70.0",
"label-studio>=1.23.0",
"label-studio-ml>=1.0.9",
"loguru>=0.7.3",
]

3101
uv.lock generated Normal file

File diff suppressed because it is too large Load Diff