added base datasets

This commit is contained in:
2026-04-07 22:00:40 +05:30
commit ec6fbe40e4
14 changed files with 9253 additions and 0 deletions

11
.gitignore vendored Normal file
View File

@@ -0,0 +1,11 @@
# Python-generated files
__pycache__/
*.py[oc]
build/
dist/
wheels/
*.egg-info
# Virtual environments
.venv
logs/

1
.python-version Normal file
View File

@@ -0,0 +1 @@
3.12

73
README.md Normal file
View File

@@ -0,0 +1,73 @@
# clint-dataset
Dataset for labelling queries containing tasks in natural language, with a focus on command-line operations pertaining to popular CLI tools.
These queries were generated by prompting various commercially available LLMs and were pre-annotated using Gemini-2.5-flash-lite. They were then converted to a Label Studio-supported format, after which the annotations were manually revised.
## Setup
```bash
uv sync
```
Set your Gemini API key:
```bash
export GEMINI_API_KEY="your-key"
```
Optional environment variables:
```bash
export GEMINI_MODEL="gemini-2.5-flash-lite"
export GEMINI_RAW_LOG_FILE="logs/gemini_raw.log"
```
## Pre-annotate raw datasets
Raw datasets live in `datasets/raw` and contain:
```json
[{ "text": "Trim the first 15 seconds from 'video.mp4'." }]
```
Run the pre-annotator:
```bash
uv run python main.py --mode preannotate --input-dir datasets/raw --output-dir datasets/preannotated --batch-size 20
```
Output format (per item):
```json
{
"text": "Trim the first 15 seconds from 'video.mp4'.",
"tags": [
{ "span": "Trim", "label": "ACTION" },
{ "span": "15", "label": "NUMBER" }
]
}
```
Raw Gemini responses are logged to `logs/gemini_raw.log` (override with `--raw-log-file` or `GEMINI_RAW_LOG_FILE`).
## Convert preannotated → annotated
Convert pre-annotated files to Label Studio-style annotated JSON:
```bash
uv run python main.py --mode convert --input-dir datasets/preannotated --output-dir datasets/annotated
```
The converter generates IDs in `XXX-XXXXXX` format for annotation results and sets `annotations[].id` to a sequential number.
## Analyze annotated datasets
`dataset_analysis.parse_annotated(path)` returns a dict of label counts:
```python
from dataset_analysis import parse_annotated
counts = parse_annotated("datasets/annotated/ffmpeg_gpt_v1.json")
print(counts)
```

61
dataset_analysis.py Normal file
View File

@@ -0,0 +1,61 @@
import json
from pathlib import Path
import os
from enums import TokenLabel
def parse_annotated(path):
    """Count label occurrences in a Label Studio annotated dataset.

    Parameters
    ----------
    path : str | Path
        Path to a JSON file containing a list of Label Studio tasks.

    Returns
    -------
    dict[str, int]
        Mapping of every ``TokenLabel`` name to its occurrence count
        (labels that never appear are present with count 0).

    Raises
    ------
    ValueError
        If the file structure deviates from the expected Label Studio
        layout or an unknown label string is encountered.
    """
    data = json.loads(Path(path).read_text(encoding="utf-8"))
    if not isinstance(data, list):
        raise ValueError("Annotated dataset must be a JSON array.")
    counts = {label.name: 0 for label in TokenLabel}
    # Annotated data may carry either enum names or enum values
    # (RAW_PHRASE's serialized value is the string "0"), so accept both.
    known_names = {label.name for label in TokenLabel}
    value_to_name = {label.value: label.name for label in TokenLabel}
    for item in data:
        annotations = item.get("annotations", [])
        if not isinstance(annotations, list):
            raise ValueError("Annotations must be a list.")
        for annotation in annotations:
            results = annotation.get("result", [])
            if not isinstance(results, list):
                raise ValueError("Annotation results must be a list.")
            for result in results:
                labels = result.get("value", {}).get("labels", [])
                if not isinstance(labels, list):
                    raise ValueError("Result labels must be a list.")
                for label in labels:
                    if label in known_names:
                        counts[label] += 1
                    elif label in value_to_name:
                        counts[value_to_name[label]] += 1
                    else:
                        raise ValueError(f"Unknown label: {label}")
    return counts
if __name__ == "__main__":
    root = "./datasets/annotated/"
    # Collect every annotated JSON file beneath the annotated-datasets folder.
    annotated_dataset_list = [
        os.path.join(dirpath, filename)
        for dirpath, _dirnames, filenames in os.walk(root)
        for filename in filenames
        if filename.endswith(".json")
    ]
    # Number-based menu so the user can pick which dataset to analyze.
    # enumerate avoids the O(n^2) (and duplicate-unsafe) list.index lookup.
    for index, dataset_path in enumerate(annotated_dataset_list):
        print(f"{index}: {dataset_path}")
    choice = input("Enter the number of the annotated dataset to analyze: ")
    counts = parse_annotated(annotated_dataset_list[int(choice)])
    print(json.dumps(counts, indent=2))

2
dataset_augmentor.py Normal file
View File

@@ -0,0 +1,2 @@

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,377 @@
[
{
"text": "Trim the first 15 seconds from 'Summer Vacation (Italy) 2024.mp4' and save it."
},
{
"text": "Split the file 'The.Bear.S03E05.720p.mkv' into three equal segments."
},
{
"text": "Stitch together 'intro_sequence.mov' and 'Main_Feature_V2_Final.mp4' into a single output."
},
{
"text": "From 'interview_recording_01 (Backup).avi', slice out the segment from 00:05:00 to 00:10:00."
},
{
"text": "Take all videos in the /raw_footage/ folder and stitch them in alphabetical order."
},
{
"text": "For every file matching 'clip_??_scene.mp4', trim the last 2 seconds."
},
{
"text": "In the directory /exports/, find videos longer than 10 minutes and split them at the halfway point."
},
{
"text": "Splicing 'Logo_Overlay.png' at the start of 'Product_Demo_Final_1080p.mp4' is the first step."
},
{
"text": "Remove the middle 20 seconds from 'Vlog #42 - My New House.mp4' starting at 02:30."
},
{
"text": "If a video has a resolution of 3840x2160, split it into four 4K quadrants."
},
{
"text": "Stitch 'Part_A.mp4', 'Part_B.mp4', and 'Part_C.mp4' using a crossfade transition."
},
{
"text": "For each video in the 'Daily_Uploads' folder, trim the intro if it is longer than 5 seconds."
},
{
"text": "The file 'Family.Reunion.(2023).Part.1.mkv' needs to be trimmed to just the first hour."
},
{
"text": "Filter for all .mov files with a bitrate under 2000kbps and stitch them into 'low_res_compilation.mov'."
},
{
"text": "Apply a split every 30 seconds to the file 'security_cam_08-24-2026.mp4'."
},
{
"text": "Join 'scene_01_take_05.mp4' and 'scene_01_take_06.mp4' but trim the slate from the start of each."
},
{
"text": "Using a glob pattern 'vid_*.mp4', find all matching files and stitch them together."
},
{
"text": "With 'Wedding_Highlight_Video_(Extended_Version).mp4', slice the section between 12:45 and 15:20."
},
{
"text": "For any video containing 'DRAFT' in the filename, trim the last 10 seconds of silence."
},
{
"text": "Split 'concert_full_set.mkv' into separate files based on the metadata chapter markers."
},
{
"text": "Stitch 'b-roll_city_traffic.mp4' with 'audio_track_02.wav' and then trim it to 30 seconds."
},
{
"text": "Iterate through 'C:/Users/Media/Desktop/*.avi' and split each into 60-second clips."
},
{
"text": "If the duration of 'presentation_recording.mp4' exceeds 1 hour, split it into two parts."
},
{
"text": "Trim '01_Intro.mp4' to end exactly at 00:00:15."
},
{
"text": "Splicing 'outro_credits.mp4' onto the end of all videos in the 'Finished' folder is required."
},
{
"text": "From 'Nature_Documentary_S01E01_The_Forest.mp4', remove the segments with no audio."
},
{
"text": "Stitch all videos with the 'HDR' tag in their metadata into a single reel."
},
{
"text": "For every .mp4 file in the current directory, trim the first 100 frames."
},
{
"text": "Split 'long_tutorial_v3.mp4' at the 10:00, 20:00, and 30:00 marks."
},
{
"text": "The video 'Mountain_Climb_(Edited).mov' needs the last 5 minutes trimmed off."
},
{
"text": "Find files matching the regex '^scene_[0-9]{3}\\.mp4$' and stitch them sequentially."
},
{
"text": "Combine 'logo_animation_fixed.mp4' with 'main_content_v4.mp4' and 'social_media_tags.mp4'."
},
{
"text": "Slice 'podcasting_session_01.mp4' every time a silence longer than 2 seconds is detected."
},
{
"text": "If the file size is greater than 5GB, split '4K_Drone_Footage_Raw.mp4' into 2GB chunks."
},
{
"text": "Trim the video '2026-04-07_Log_Entry.mp4' to start at the first detected motion."
},
{
"text": "For each video in 'project_x', stitch them together if they share the same frame rate."
},
{
"text": "Splice 'emergency_alert_broadcast.mp4' into the middle of 'regular_programming.ts' at 15:00."
},
{
"text": "With 'The_Grand_Budapest_Hotel_Trailer.mp4', trim the black frames from the beginning and end."
},
{
"text": "Split 'GOPR0012.MP4' into segments whenever the camera GPS coordinates change."
},
{
"text": "Stitch 'Clip (1).mp4', 'Clip (2).mp4', and 'Clip (3).mp4' in numerical order."
},
{
"text": "For all videos in 'Archive' with a 4:3 aspect ratio, trim them to 1 minute."
},
{
"text": "Trim the audio-only portion from 'lecture_series_04.m4v' and stitch it with 'slides_04.mp4'."
},
{
"text": "Using the file 'interview_subject_A.mp4', slice out every instance where the speaker says 'um'."
},
{
"text": "Split the video 'test_pattern_v12.avi' every 500MB."
},
{
"text": "Stitch the files listed in 'file_list.txt' into a seamless video."
},
{
"text": "For any video created before 2025, trim the metadata and splice a new header."
},
{
"text": "Take 'my_cool_video.mp4' and 'another_one.mp4' and stitch them together."
},
{
"text": "The file 'Succession.S04E10.1080p.mkv' should be split at the 30-minute mark."
},
{
"text": "Trim the start of 'morning_routine_vlog.mp4' by 12 seconds."
},
{
"text": "Splice a 2-second black screen between 'intro.mp4' and 'body.mp4'."
},
{
"text": "Find videos with 'H.264' codec and stitch them into a compilation called 'H264_Reel.mp4'."
},
{
"text": "For each video in the folder 'TikTok_Drafts', trim the last 0.5 seconds."
},
{
"text": "Split 'CCTV_Storage_Disk_A.mp4' into hourly intervals."
},
{
"text": "From 'Cinematic_Shot_[001].mov', remove the first 2 seconds of camera shake."
},
{
"text": "Stitch 'Top_10_Countdown_Part_10.mp4' through 'Top_10_Countdown_Part_1.mp4' in reverse order."
},
{
"text": "Trim 'cooking_tutorial_raw.mp4' based on the bookmarks in the file metadata."
},
{
"text": "For all files matching '*_backup.mp4', split them in half and delete the second half."
},
{
"text": "Splice 'watermark_fixed.png' into the corner of 'product_render.mp4' for its entire duration."
},
{
"text": "With 'Breaking_Bad_S05E16.mkv', trim the 'previously on' segment."
},
{
"text": "Split the file 'ambience_loop_forest.mp4' exactly at the point where the loop repeats."
},
{
"text": "Stitch 'phone_video_01.mp4' and 'phone_video_02.mp4' after rotating the second one."
},
{
"text": "For every video in 'Dailies/Scene_5/', trim the clapperboard at the start."
},
{
"text": "In 'webinar_recap.mp4', slice out the Q&A session from 45:00 to the end."
},
{
"text": "If a video is 1920x1080, splice the 'standard_definition_warning.mp4' to the front."
},
{
"text": "Split 'gameplay_recording_3hr.mp4' into three 1-hour files."
},
{
"text": "Trim 'Birthday_Party_(Edited).mp4' so it ends right before the candles are blown out."
},
{
"text": "Stitch 'camera_1_view.mp4' and 'camera_2_view.mp4' into a side-by-side split screen."
},
{
"text": "For each .mkv file in the 'Movies' folder, trim the first 30 seconds of credits."
},
{
"text": "Using the pattern 'shot_v[0-9].mp4', stitch the highest version numbers together."
},
{
"text": "From 'stock_footage_clouds.mov', trim the sections where the sun is obscured."
},
{
"text": "Split 'unboxing_video_long.mp4' at every scene change detected with 0.4 threshold."
},
{
"text": "Stitch 'audio_narration.mp3' onto 'silent_presentation.mp4' and trim to the shorter length."
},
{
"text": "For any video with 'vertical' in the name, trim the top and bottom to make it 16:9."
},
{
"text": "Slice 'long_running_process.mp4' to only include the first and last 5 minutes."
},
{
"text": "If the video 'test_render_01.mp4' has no audio track, splice in 'white_noise.wav'."
},
{
"text": "Trim the intro and outro of all videos in the 'Course_Modules' subfolder."
},
{
"text": "Split '4k_landscape_timelapse.mp4' into 250 individual image frames."
},
{
"text": "Stitch 'wedding_ceremony.mp4' and 'wedding_reception.mp4' with a 5-second fade."
},
{
"text": "For each file in 'temp_renders/', if it is less than 1 second long, delete it."
},
{
"text": "Trim 'The_Dark_Knight_Trailer (2008).mp4' to only include the Joker scenes."
},
{
"text": "Splicing 'end_card_v3.mp4' to every video in 'Youtube_Uploads' is the goal."
},
{
"text": "Split 'CCTV_04_07_26.mp4' whenever motion is detected in the 'gate' region."
},
{
"text": "With 'tutorial_recording.mp4', slice the segment from 05:00 to 07:00 and save as 'Highlight.mp4'."
},
{
"text": "Stitch all videos in 'raw_clips' that were recorded at 60fps."
},
{
"text": "For any video named 'Untitled (Copy).mp4', trim it to 10 seconds and rename it."
},
{
"text": "Split 'audio_sync_test.mp4' into separate video and audio streams."
},
{
"text": "Trim 'Gym_Workout_Session.mp4' to remove the rest periods between sets."
},
{
"text": "Stitch 'A.mp4', 'B.mp4', and 'C.mp4' but reverse the order to 'C.mp4', 'B.mp4', 'A.mp4'."
},
{
"text": "For each video in 'Client_Review', splice the 'DRAFT' watermark across the middle."
},
{
"text": "Find videos with duration > 300s and split them into 60s clips."
},
{
"text": "The video 'Street_Food_Tour [HDR].mov' needs the first 2 minutes trimmed."
},
{
"text": "Splice 'transition_effect.mp4' between every video in the 'Montage' folder during stitching."
},
{
"text": "From 'Interview_with_CEO_Final.mp4', trim the first 3 seconds of dead air."
},
{
"text": "Split '24_hour_surveillance_feed.mp4' into 24 one-hour segments."
},
{
"text": "Stitch 'Intro (English).mp4' with 'Content.mp4' and 'Outro (English).mp4'."
},
{
"text": "For each file in the 'Renders' folder, if the width is 720, trim it to 5 seconds."
},
{
"text": "Trim 'Space_X_Launch_Live.mkv' to start at T-minus 10 seconds."
},
{
"text": "Using glob '*_scene_*.mp4', stitch clips with matching scene numbers together."
},
{
"text": "Slice 'nature_walk_4k.mp4' to remove the shaky footage at the 12-minute mark."
},
{
"text": "Split 'Podcast_Ep_12.mp4' at the timestamps provided in the 'chapters.json' file."
},
{
"text": "Stitch 'Scene_1.mp4' and 'Scene_1_Alt_End.mp4' and trim the overlap."
},
{
"text": "For all videos in 'Archive' with the file extension .flv, stitch them into one .mp4."
},
{
"text": "Trim the last 30 frames from every video in the 'Animation_Export' folder."
},
{
"text": "Splice 'Sponsor_Segment.mp4' into 'Gaming_Video_01.mp4' at the 8-minute mark."
},
{
"text": "If a video has stereo audio, split it into two mono-audio video files."
},
{
"text": "With 'Travel_Vlog_Ep1 (Final_Draft).mp4', trim the end where the camera falls."
},
{
"text": "Split 'Concert_Multicam.mp4' into 4 separate files, one for each camera angle."
},
{
"text": "Stitch all videos in 'Dailies' that have a creation date of '2026-04-07'."
},
{
"text": "For each video in the 'Trash' folder, trim it to 0 seconds (effectively clear content)."
},
{
"text": "Trim 'Drone_Shot_05.mov' to the first 1000 frames."
},
{
"text": "Slice 'Long_Lecture.mp4' to extract only the slides that contain the word 'Biology'."
},
{
"text": "Split 'Heavy_Metal_Music_Video.mp4' at every beat of the drum (120 BPM)."
},
{
"text": "Stitch 'Part_01_v1.mp4' through 'Part_10_v1.mp4' using the 'list' command."
},
{
"text": "For every video in 'exports/', trim the filename prefix 'FINAL_' and then stitch them."
},
{
"text": "Trim 'Ocean_Waves_Ambient.mp4' to be a perfect 10-second loop."
},
{
"text": "Splice 'Director_Commentary.ac3' into 'The_Movie_Title.mkv' as a second audio track."
},
{
"text": "If the file 'clip_a.mp4' is shorter than 'clip_b.mp4', stitch them in that order."
},
{
"text": "Split 'NASA_Space_Station_Feed.mp4' whenever the video bitrate drops to zero."
},
{
"text": "Trim the last 5 seconds from 'My_Daughter's_First_Steps.mp4'."
},
{
"text": "Stitch 'Intro.mp4' with every file in 'Chapters/' and save them as individual episodes."
},
{
"text": "For each video in 'Pending', trim the first 10 seconds and move to 'Ready'."
},
{
"text": "Slice 'Webinar_Recording_2026.mp4' at 01:20:00 and discard the rest."
},
{
"text": "Split 'Slow_Motion_Reference.mp4' into segments of 120 frames each."
},
{
"text": "Stitch all .mp4 files in 'Folder A' and 'Folder B' into a single master file."
},
{
"text": "Trim 'The_Last_of_Us_S01E01.mkv' to end right when the music starts."
}
]

View File

@@ -0,0 +1,200 @@
[
{
"text": "Convert interview.mp4 to H.265 and reduce the file size under 120MB"
},
{
"text": "Take all the MOV files in my Downloads folder and convert them to MP4"
},
{
"text": "Strip the audio from promo_video.mp4 and replace it with background_music.mp3"
},
{
"text": "Extract a clip from 00:02:15 to 00:05:44 from lecture.mkv without re-encoding"
},
{
"text": "Convert every PNG in this directory into a single timelapse video at 24fps"
},
{
"text": "Add burnt-in subtitles from subtitles.srt to hello_film.mp4"
},
{
"text": "Compress all my drone videos to be under 50MB each, keeping 1080p"
},
{
"text": "Extract the audio track from podcast_episode_12.mp4 as a FLAC file"
},
{
"text": "Watermark every video in the /exports folder with logo.png in the bottom right"
},
{
"text": "Resize input.mov to 720p and convert to WebM"
},
{
"text": "Take a screenshot from the video at the 30-second mark"
},
{
"text": "Merge part1.mp4, part2.mp4, and part3.mp4 into one file"
},
{
"text": "Normalize the loudness of all audio files in my podcast folder to -16 LUFS"
},
{
"text": "Speed up the timelapse footage to 4x without changing the audio pitch"
},
{
"text": "Create a side-by-side comparison video of before.mp4 and after.mp4"
},
{
"text": "Convert the MKV to MP4 keeping all subtitle and audio tracks"
},
{
"text": "Extract one frame every 5 seconds from the dashcam_recording video file and save them to frames/"
},
{
"text": "Add a 3-second black fade-in and fade-out to presentation clip video file"
},
{
"text": "Loop the henry.avi exactly 10 times and export as a single video"
},
{
"text": "Crop 39fjsai.mp4 to a vertical 9:16 format for Instagram"
},
{
"text": "Remove the first 10 seconds from all the clips in the /raw folder"
},
{
"text": "Convert my audio files from MP3 to AAC at 192kbps"
},
{
"text": "Create a GIF from the first 4 seconds of the youtube_download.webm starting at 1:12"
},
{
"text": "Reduce video bitrate to 1200kbps and encode audio to 64kbps aac of to___encode.mp4 and save it as output.mkv"
},
{
"text": "Rotate the hea3434gvjj__www.download.com.mp4 to 90 degrees clockwise"
},
{
"text": "Extract all keyframes from claivido_final.mkv as JPEG images"
},
{
"text": "Add chapter markers and a cover thumbnail to podcast.mp4"
},
{
"text": "Convert the entire /recordings folder from WAV to MP3 at 320kbps"
},
{
"text": "Stabilize all the videos created today and save them to stabilized folder"
},
{
"text": "Scale down all images in the sike directory to a max width of 1920px"
},
{
"text": "Re-encode stupid_video.avi using nvenc and with hevc at 1080p"
},
{
"text": "Create a picture-in-picture with webcam.mp4 overlaid on screen_recording.mp4 in bottom right"
},
{
"text": "Convert all my .avi files to MKV with lossless encoding"
},
{
"text": "Reverse all video files containing '_reverse_needed_' for a rewind effect and save them to a directory starting with reversed"
},
{
"text": "Re-wrap the H.264 stream from .ts to .mp4 with no re-encode"
},
{
"text": "Convert the RAW image sequence from /timelapse to a 4K ProRes video"
},
{
"text": "Split the 3-hour conference recording into 30-minute segments"
},
{
"text": "Remove the audio entirely from all videos in the /silent folder"
},
{
"text": "Create a low-res proxy version of the 4K footage for offline editing"
},
{
"text": "Encode the video with 2-pass encoding for a target bitrate of 2Mbps"
},
{
"text": "Add a scrolling lower-third text overlay to the interview"
},
{
"text": "Export audio as stereo WAV at 48kHz for DaVinci Resolve"
},
{
"text": "Take every file in the folder and generate a 10-second preview clip from the middle"
},
{
"text": "Convert the SRT subtitle file to WebVTT format"
},
{
"text": "Deinterlace the video files in /old_footage"
},
{
"text": "Create an optimized animated WebP from the video loop"
},
{
"text": "Stack 203jfh.mp4 and 29jfj21112.webm and aosidjgf22.mkv vertically into one tall video and save it as output"
},
{
"text": "Convert all images in the product folder to WebP with 80% quality"
},
{
"text": "Extract just the chapter 3 portion based on chapter metadata of 029jghijdf__asuihttp.mp4"
},
{
"text": "Compress all JPEGs in the folder to a max of 200KB per image"
},
{
"text": "Convert and re-tag all WMA files to AAC, preserving all metadata"
},
{
"text": "Create a 1:1 square version of all videos created last thursday with padding"
},
{
"text": "Trim the last 30 seconds off every clip in my /raw directory"
},
{
"text": "Add a podcast intro and outro to each episode file in the folder"
},
{
"text": "Mux the separate video_new_final-release.h264 and 12831245_audio.aac streams into an MP4"
},
{
"text": "Take the interview clip and output it in every social format: 16:9, 9:16, and 1:1"
},
{
"text": "Encode a lossless copy for archival and a compressed copy for sharing"
},
{
"text": "Apply CLAHE contrast enhancement to all images in the dataset folder"
},
{
"text": "Add chapter info from chapters.txt into the output MKV"
},
{
"text": "Create a video slideshow from photos in slideshow_photos/ with 3-second holds and a Ken Burns effect"
},
{
"text": "Convert the audio to mono for podcast distribution"
},
{
"text": "Batch-resize all product photos in product/ to exactly 800x800 with white padding"
},
{
"text": "Extract all frames between 10s and 20s in 290fj3.mp4 as PNG files"
},
{
"text": "Build a mosaic of all the clip thumbnails into one image grid"
},
{
"text": "Re-encode everything in the archive with AV1"
},
{
"text": "Clip the first 5 seconds from all lecture recordings and save separately"
}
]

23
enums.py Normal file
View File

@@ -0,0 +1,23 @@
from enum import Enum
class TokenLabel(Enum):
    """Closed set of token-level labels used to annotate query spans.

    Every member's value equals its name, except RAW_PHRASE, whose
    serialized value is the literal string "0" (annotated data may carry
    either the names or these values).
    """

    ACTION = "ACTION"
    TARGET = "TARGET"
    PREPOSITION = "PREPOSITION"
    CONJUNCTION = "CONJUNCTION"
    MODIFIER = "MODIFIER"
    OBJECT = "OBJECT"
    VALUE_TOKEN = "VALUE_TOKEN"
    NUMBER = "NUMBER"
    UNIT = "UNIT"
    TIME = "TIME"
    DATE = "DATE"
    ORDINAL = "ORDINAL"
    COMPARATOR = "COMPARATOR"
    FILTER_HINT = "FILTER_HINT"
    PATH = "PATH"
    PATTERN = "PATTERN"
    URL = "URL"
    TIMESTAMP = "TIMESTAMP"
    RAW_PHRASE = "0"  # fallback when no other label applies; stored as "0"

359
main.py Normal file
View File

@@ -0,0 +1,359 @@
import argparse
import datetime as dt
import json
import os
import secrets
import string
from pathlib import Path
from typing import Iterable
from google import genai
from loguru import logger
from .enums import TokenLabel
# Number of texts sent to Gemini per request when --batch-size is not given.
DEFAULT_BATCH_SIZE = 20
# Gemini model used when neither --model nor GEMINI_MODEL is set.
DEFAULT_MODEL = "gemini-2.5-flash-lite"
# Raw Gemini responses are appended here (override via --raw-log-file
# or GEMINI_RAW_LOG_FILE).
DEFAULT_RAW_LOG_PATH = "logs/gemini_raw.log"
# Default input/output directories for the "convert" mode.
DEFAULT_CONVERT_INPUT_DIR = "datasets/preannotated"
DEFAULT_CONVERT_OUTPUT_DIR = "datasets/annotated"
# Canonical label names, plus a map from enum values back to names
# (RAW_PHRASE's value is "0", so both spellings are accepted).
LABEL_NAMES = [label.name for label in TokenLabel]
VALUE_TO_NAME = {label.value: label.name for label in TokenLabel}
# Character pool for the random XXX-XXXXXX identifiers.
ID_ALPHABET = string.ascii_uppercase + string.digits
def _chunked(items: list[int], size: int) -> Iterable[list[int]]:
if size < 1:
raise ValueError("batch_size must be at least 1.")
for start in range(0, len(items), size):
yield items[start : start + size]
def _generate_id() -> str:
    """Return a random identifier in XXX-XXXXXX form (uppercase/digits)."""
    prefix = "".join(secrets.choice(ID_ALPHABET) for _ in range(3))
    suffix = "".join(secrets.choice(ID_ALPHABET) for _ in range(6))
    return f"{prefix}-{suffix}"
def _all_occurrences(text: str, span: str) -> list[int]:
occurrences = []
start = 0
while True:
idx = text.find(span, start)
if idx == -1:
break
occurrences.append(idx)
start = idx + 1
return occurrences
def _build_prompt(texts: list[str]) -> str:
    """Compose the Gemini instruction prompt for a batch of input texts."""
    allowed = ", ".join(LABEL_NAMES)
    lines = [
        "You are a token pre-annotator. For each input text, return JSON with tagged ",
        "token/subword/word/span labels.\n",
        f"Allowed labels: {allowed}.\n",
        "Rules:\n",
        "- Output ONLY valid JSON (no markdown).\n",
        "- Return a JSON array with the same length/order as the input.\n",
        "- Each item must be an object: ",
        '{"text": "<original>", "tags": [{"span": "<exact substring>", "label": "<LABEL>"}]}.\n',
        "- The span must be an exact substring of the original text.\n",
        "- Use RAW_PHRASE when no other label applies.\n\n",
        f"Input texts: {json.dumps(texts, ensure_ascii=True)}",
    ]
    return "".join(lines)
def _normalize_label(label: str) -> str:
    """Map *label* (an enum name or enum value) to the canonical enum name.

    Raises ValueError for any string that is neither.
    """
    if label in LABEL_NAMES:
        return label
    try:
        return VALUE_TO_NAME[label]
    except KeyError:
        raise ValueError(f"Unknown label: {label}") from None
def _normalize_result(text: str, result: dict) -> dict:
    """Validate one Gemini result object and return it with canonical labels.

    The result must echo *text* exactly and carry a list of tags whose
    spans are substrings of *text*; labels are normalized to enum names.

    Raises ValueError on any structural or span mismatch.
    """
    if result.get("text") != text:
        raise ValueError("Gemini result text does not match input text.")
    tags = result.get("tags")
    if not isinstance(tags, list):
        raise ValueError("Gemini result tags must be a list.")
    cleaned: list[dict] = []
    for tag in tags:
        if not isinstance(tag, dict):
            raise ValueError("Each tag must be an object.")
        span, label = tag.get("span"), tag.get("label")
        if not isinstance(span, str) or not isinstance(label, str):
            raise ValueError("Each tag must include string span and label fields.")
        if span not in text:
            raise ValueError(f"Span not found in text: {span}")
        cleaned.append({"span": span, "label": _normalize_label(label)})
    return {"text": text, "tags": cleaned}
def preannotate_tokens(
    texts: list[str], client: genai.Client, model: str
) -> list[dict]:
    """Ask Gemini to tag a batch of texts and return normalized results.

    Parameters
    ----------
    texts : list[str]
        Batch of input texts; the response must match its length and order.
    client : genai.Client
        Configured Google Gen AI client.
    model : str
        Model name passed to ``generate_content``.

    Returns
    -------
    list[dict]
        One ``{"text", "tags"}`` dict per input, validated by
        ``_normalize_result``.

    Raises
    ------
    ValueError
        On an empty response, unparsable/mismatched JSON, or invalid tags.
    """
    prompt = _build_prompt(texts)
    response = client.models.generate_content(model=model, contents=prompt)
    if response.text is None:
        raise ValueError("Gemini returned an empty response.")
    # Log the verbatim response before any cleanup so the raw log is faithful.
    logger.bind(raw_gemini=True).trace(response.text)
    payload = response.text.strip()
    # Models sometimes wrap JSON in markdown fences despite the prompt's
    # "no markdown" rule; strip them so valid payloads still parse.
    if payload.startswith("```"):
        payload = payload.removeprefix("```json").removeprefix("```").strip()
        payload = payload.removesuffix("```").strip()
    parsed = json.loads(payload)
    if not isinstance(parsed, list) or len(parsed) != len(texts):
        raise ValueError("Gemini response must be a JSON array matching input length.")
    return [_normalize_result(text, result) for text, result in zip(texts, parsed)]
def _load_raw_records(path: Path) -> list[dict]:
raw = json.loads(path.read_text(encoding="utf-8"))
if not isinstance(raw, list):
raise ValueError(f"Dataset {path} must be a JSON array.")
for idx, item in enumerate(raw):
if not isinstance(item, dict) or "text" not in item:
raise ValueError(
f"Dataset {path} item {idx} must be an object with a text field."
)
return raw
def _load_preannotated_records(path: Path) -> list[dict]:
raw = json.loads(path.read_text(encoding="utf-8"))
if not isinstance(raw, list):
raise ValueError(f"Dataset {path} must be a JSON array.")
for idx, item in enumerate(raw):
if not isinstance(item, dict) or "text" not in item:
raise ValueError(
f"Dataset {path} item {idx} must be an object with a text field."
)
if "tags" not in item or not isinstance(item["tags"], list):
raise ValueError(
f"Dataset {path} item {idx} must include a tags list for conversion."
)
return raw
def _build_labelstudio_results(text: str, tags: list[dict]) -> list[dict]:
    """Convert span/label tags into Label Studio "result" entries for *text*.

    Repeated identical spans are assigned to successive occurrences in
    *text* (first tag -> first occurrence, second tag -> second, ...).

    Raises ValueError when a tag is malformed or there are more tags for a
    span than occurrences of that span in *text*.
    """
    # Per-span list of match offsets, and how many of them have been consumed.
    occurrences_map: dict[str, list[int]] = {}
    occurrence_index: dict[str, int] = {}
    results = []
    for tag in tags:
        if not isinstance(tag, dict):
            raise ValueError("Each tag must be an object.")
        span = tag.get("span")
        label = tag.get("label")
        if not isinstance(span, str) or not isinstance(label, str):
            raise ValueError("Each tag must include string span and label fields.")
        if span not in occurrences_map:
            # Compute all occurrences lazily, once per distinct span.
            occurrences_map[span] = _all_occurrences(text, span)
            occurrence_index[span] = 0
        occurrences = occurrences_map[span]
        idx = occurrence_index[span]
        if idx >= len(occurrences):
            # Also covers the span-absent case (zero occurrences).
            raise ValueError(f"Span not found in text: {span}")
        start = occurrences[idx]
        end = start + len(span)
        occurrence_index[span] = idx + 1
        results.append(
            {
                "value": {
                    "start": start,
                    "end": end,
                    "text": span,
                    "labels": [_normalize_label(label)],
                },
                "id": _generate_id(),
                "from_name": "label",
                "to_name": "text",
                "type": "labels",
                "origin": "manual",
            }
        )
    return results
def _preannotate_dataset(
    input_path: Path,
    output_path: Path,
    client: genai.Client,
    model: str,
    batch_size: int,
) -> None:
    """Pre-annotate one raw dataset file with Gemini and write the result.

    Reads a JSON array of ``{"text": ...}`` records from *input_path*,
    tags each text in batches of *batch_size*, and writes the records
    (with an added "tags" list) to *output_path* as indented JSON.

    Raises ValueError if the input is malformed or any item ends up
    without a result.
    """
    raw_items = _load_raw_records(input_path)
    texts = [item["text"] for item in raw_items]
    # Results are placed by index so completeness can be verified at the end.
    annotated_items: list[dict | None] = [None] * len(raw_items)
    logger.info("Pre-annotating {} records from {}", len(texts), input_path)
    for batch_indices in _chunked(list(range(len(texts))), batch_size):
        batch_texts = [texts[idx] for idx in batch_indices]
        logger.debug(
            "Sending batch {}-{} (size {}) to Gemini",
            batch_indices[0],
            batch_indices[-1],
            len(batch_texts),
        )
        batch_results = preannotate_tokens(batch_texts, client, model)
        for idx, result in zip(batch_indices, batch_results):
            # Shallow-copy so the loaded record is not mutated in place.
            item = dict(raw_items[idx])
            item["tags"] = result["tags"]
            annotated_items[idx] = item
    if any(item is None for item in annotated_items):
        raise ValueError("Pre-annotation failed to produce results for all items.")
    output_path.write_text(
        json.dumps(annotated_items, indent=2, ensure_ascii=True), encoding="utf-8"
    )
    logger.info("Wrote pre-annotated dataset to {}", output_path)
def _convert_preannotated_dataset(input_path: Path, output_path: Path) -> None:
    """Convert one pre-annotated dataset into Label Studio annotated JSON.

    Each input record becomes a Label Studio task with a single annotation
    whose results are built from the record's tags. Task and result IDs use
    the random XXX-XXXXXX format; annotation IDs are sequential (1-based).

    Raises ValueError if the input file is malformed or a tag cannot be
    located in its text.
    """
    preannotated_items = _load_preannotated_records(input_path)
    # One UTC timestamp shared by every task/annotation generated in this run.
    now_iso = dt.datetime.now(dt.UTC).isoformat()
    tasks = []
    logger.info("Converting {} records from {}", len(preannotated_items), input_path)
    for index, item in enumerate(preannotated_items):
        text = item["text"]
        tags = item["tags"]
        task_id = _generate_id()
        annotation_id = index + 1
        results = _build_labelstudio_results(text, tags)
        annotation = {
            "id": annotation_id,
            # NOTE(review): hard-coded annotator id — confirm it matches the
            # intended Label Studio user account.
            "completed_by": 2,
            "result": results,
            "was_cancelled": False,
            "ground_truth": False,
            "created_at": now_iso,
            "updated_at": now_iso,
            "draft_created_at": now_iso,
            "lead_time": 0.0,
            "prediction": {},
            "result_count": len(results),
            "unique_id": _generate_id(),
            "import_id": None,
            "last_action": None,
            "bulk_created": False,
            "task": task_id,
            "project": None,
            "updated_by": None,
            "parent_prediction": None,
            "parent_annotation": None,
            "last_created_by": None,
        }
        tasks.append(
            {
                "id": task_id,
                "annotations": [annotation],
                "file_upload": None,
                "drafts": [],
                "predictions": [],
                "data": {"text": text},
                "meta": {},
                "created_at": now_iso,
                "updated_at": now_iso,
                "allow_skip": True,
                "inner_id": index + 1,
                "total_annotations": 1,
                "cancelled_annotations": 0,
                "total_predictions": 0,
                "comment_count": 0,
                "unresolved_comment_count": 0,
                "last_comment_updated_at": None,
                "project": None,
                "updated_by": None,
                "comment_authors": [],
            }
        )
    output_path.write_text(
        json.dumps(tasks, indent=2, ensure_ascii=True), encoding="utf-8"
    )
    logger.info("Wrote converted annotated dataset to {}", output_path)
def main():
    """CLI entry point: pre-annotate raw datasets or convert pre-annotated ones.

    Modes:
    - preannotate (default): send each dataset in --input-dir to Gemini in
      batches and write tagged copies to --output-dir. Requires
      GEMINI_API_KEY in the environment.
    - convert: turn tagged datasets into Label Studio-style annotated JSON.

    Raises ValueError when required configuration or input files are missing.
    """
    parser = argparse.ArgumentParser(description="Pre-annotate datasets with Gemini.")
    parser.add_argument(
        "--mode",
        choices=["preannotate", "convert"],
        default="preannotate",
    )
    parser.add_argument("--input-dir", default=None)
    parser.add_argument("--output-dir", default=None)
    # Process a single file instead of every *.json in --input-dir.
    parser.add_argument("--input-file", default=None)
    parser.add_argument("--batch-size", type=int, default=DEFAULT_BATCH_SIZE)
    parser.add_argument("--model", default=os.getenv("GEMINI_MODEL", DEFAULT_MODEL))
    parser.add_argument(
        "--raw-log-file",
        default=os.getenv("GEMINI_RAW_LOG_FILE", DEFAULT_RAW_LOG_PATH),
    )
    args = parser.parse_args()
    if args.mode == "preannotate":
        # Route raw Gemini responses (records bound with raw_gemini=True)
        # to a dedicated log sink, verbatim (message-only format).
        raw_log_path = Path(args.raw_log_file)
        raw_log_path.parent.mkdir(parents=True, exist_ok=True)
        logger.add(
            raw_log_path,
            level="TRACE",
            filter=lambda record: record["extra"].get("raw_gemini") is True,
            format="{message}",
        )
        api_key = os.getenv("GEMINI_API_KEY")
        if not api_key:
            raise ValueError("GEMINI_API_KEY must be set in the environment.")
        client = genai.Client(api_key=api_key)
        input_dir = Path(args.input_dir or "datasets/raw")
        output_dir = Path(args.output_dir or "datasets/preannotated")
        output_dir.mkdir(parents=True, exist_ok=True)
        if args.input_file:
            input_paths = [Path(args.input_file)]
        else:
            input_paths = sorted(input_dir.glob("*.json"))
        if not input_paths:
            raise ValueError(f"No input datasets found in {input_dir}.")
        logger.info(
            "Starting pre-annotation: model={}, batch_size={}, input_dir={}, output_dir={}",
            args.model,
            args.batch_size,
            input_dir,
            output_dir,
        )
        for input_path in input_paths:
            # Output keeps the same file name under the output directory.
            output_path = output_dir / input_path.name
            _preannotate_dataset(
                input_path, output_path, client, args.model, args.batch_size
            )
        return
    # convert mode: no Gemini client or raw-response log sink needed.
    input_dir = Path(args.input_dir or DEFAULT_CONVERT_INPUT_DIR)
    output_dir = Path(args.output_dir or DEFAULT_CONVERT_OUTPUT_DIR)
    output_dir.mkdir(parents=True, exist_ok=True)
    if args.input_file:
        input_paths = [Path(args.input_file)]
    else:
        input_paths = sorted(input_dir.glob("*.json"))
    if not input_paths:
        raise ValueError(f"No input datasets found in {input_dir}.")
    logger.info(
        "Starting conversion: input_dir={}, output_dir={}",
        input_dir,
        output_dir,
    )
    for input_path in input_paths:
        output_path = output_dir / input_path.name
        _convert_preannotated_dataset(input_path, output_path)

12
pyproject.toml Normal file
View File

@@ -0,0 +1,12 @@
[project]
name = "clint-dataset"
version = "0.1.0"
description = "Tooling for pre-annotating and converting natural-language CLI task queries into Label Studio datasets"
readme = "README.md"
requires-python = ">=3.12"
dependencies = [
"google-genai>=1.70.0",
"label-studio>=1.23.0",
"label-studio-ml>=1.0.9",
"loguru>=0.7.3",
]

3101
uv.lock generated Normal file

File diff suppressed because it is too large Load Diff