added base datasets
This commit is contained in:
11
.gitignore
vendored
Normal file
11
.gitignore
vendored
Normal file
@@ -0,0 +1,11 @@
|
||||
# Python-generated files
|
||||
__pycache__/
|
||||
*.py[oc]
|
||||
build/
|
||||
dist/
|
||||
wheels/
|
||||
*.egg-info
|
||||
|
||||
# Virtual environments
|
||||
.venv
|
||||
logs/
|
||||
1
.python-version
Normal file
1
.python-version
Normal file
@@ -0,0 +1 @@
|
||||
3.12
|
||||
73
README.md
Normal file
73
README.md
Normal file
@@ -0,0 +1,73 @@
|
||||
# clint-dataset
|
||||
|
||||
Dataset for labelling queries containing tasks in natural language, with a focus on command-line operations pertaining to popular CLI tools.
|
||||
|
||||
These queries were generated by prompting various commercially available LLMs and were pre-annotated using Gemini-2.5-flash-lite. They were then converted to a Label Studio–supported format, after which the annotations were manually revised.
|
||||
|
||||
## Setup
|
||||
|
||||
```bash
|
||||
uv sync
|
||||
```
|
||||
|
||||
Set your Gemini API key:
|
||||
|
||||
```bash
|
||||
export GEMINI_API_KEY="your-key"
|
||||
```
|
||||
|
||||
Optional environment variables:
|
||||
|
||||
```bash
|
||||
export GEMINI_MODEL="gemini-2.5-flash-lite"
|
||||
export GEMINI_RAW_LOG_FILE="logs/gemini_raw.log"
|
||||
```
|
||||
|
||||
## Pre-annotate raw datasets
|
||||
|
||||
Raw datasets live in `datasets/raw` and contain:
|
||||
|
||||
```json
|
||||
[{ "text": "Trim the first 15 seconds from 'video.mp4'." }]
|
||||
```
|
||||
|
||||
Run the pre-annotator:
|
||||
|
||||
```bash
|
||||
uv run python main.py --mode preannotate --input-dir datasets/raw --output-dir datasets/preannotated --batch-size 20
|
||||
```
|
||||
|
||||
Output format (per item):
|
||||
|
||||
```json
|
||||
{
|
||||
"text": "Trim the first 15 seconds from 'video.mp4'.",
|
||||
"tags": [
|
||||
{ "span": "Trim", "label": "ACTION" },
|
||||
{ "span": "15", "label": "NUMBER" }
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
Raw Gemini responses are logged to `logs/gemini_raw.log` (override with `--raw-log-file` or `GEMINI_RAW_LOG_FILE`).
|
||||
|
||||
## Convert preannotated → annotated
|
||||
|
||||
Convert pre-annotated files to Label Studio–style annotated JSON:
|
||||
|
||||
```bash
|
||||
uv run python main.py --mode convert --input-dir datasets/preannotated --output-dir datasets/annotated
|
||||
```
|
||||
|
||||
The converter generates IDs in `XXX-XXXXXX` format for annotation results and sets `annotations[].id` to a sequential number.
|
||||
|
||||
## Analyze annotated datasets
|
||||
|
||||
`dataset_analysis.parse_annotated(path)` returns a dict of label counts:
|
||||
|
||||
```python
|
||||
from dataset_analysis import parse_annotated
|
||||
|
||||
counts = parse_annotated("datasets/annotated/ffmpeg_gpt_v1.json")
|
||||
print(counts)
|
||||
```
|
||||
61
dataset_analysis.py
Normal file
61
dataset_analysis.py
Normal file
@@ -0,0 +1,61 @@
|
||||
import json
|
||||
from pathlib import Path
|
||||
import os
|
||||
|
||||
from enums import TokenLabel
|
||||
|
||||
|
||||
def parse_annotated(path):
    """Count token-label occurrences in a Label Studio annotated export.

    Args:
        path: Path (str or Path) to a JSON file containing a list of
            annotated tasks in Label Studio export layout.

    Returns:
        Dict mapping every ``TokenLabel`` name to its occurrence count
        (labels that never appear map to 0).

    Raises:
        ValueError: If the file is not a JSON array, any nested structure
            deviates from the expected layout, or an unknown label occurs.
    """
    data = json.loads(Path(path).read_text(encoding="utf-8"))
    if not isinstance(data, list):
        raise ValueError("Annotated dataset must be a JSON array.")

    counts = {label.name: 0 for label in TokenLabel}
    # Annotations may carry either the enum name (e.g. "ACTION") or the
    # enum value (e.g. "0" for RAW_PHRASE); accept both spellings.
    known_names = set(counts)
    value_to_name = {label.value: label.name for label in TokenLabel}

    for item in data:
        annotations = item.get("annotations", [])
        if not isinstance(annotations, list):
            raise ValueError("Annotations must be a list.")
        for annotation in annotations:
            results = annotation.get("result", [])
            if not isinstance(results, list):
                raise ValueError("Annotation results must be a list.")
            for result in results:
                labels = result.get("value", {}).get("labels", [])
                if not isinstance(labels, list):
                    raise ValueError("Result labels must be a list.")
                for label in labels:
                    if label in known_names:
                        key = label
                    elif label in value_to_name:
                        key = value_to_name[label]
                    else:
                        raise ValueError(f"Unknown label: {label}")
                    counts[key] += 1

    return counts
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Discover every annotated JSON dataset; sorted() keeps the menu
    # ordering deterministic across runs (os.walk order is not).
    root = Path("./datasets/annotated/")
    annotated_dataset_list = [str(p) for p in sorted(root.rglob("*.json"))]

    # Number-based menu to pick the file to analyze. enumerate avoids the
    # O(n^2) list.index() lookup per printed row.
    for index, file in enumerate(annotated_dataset_list):
        print(f"{index}: {file}")
    choice = input("Enter the number of the annotated dataset to analyze: ")
    path = annotated_dataset_list[int(choice)]
    counts = parse_annotated(path)
    print(json.dumps(counts, indent=2))
|
||||
2
dataset_augmentor.py
Normal file
2
dataset_augmentor.py
Normal file
@@ -0,0 +1,2 @@
|
||||
|
||||
|
||||
1
datasets/annotated/ffmpeg_gemini_v1.json
Normal file
1
datasets/annotated/ffmpeg_gemini_v1.json
Normal file
File diff suppressed because one or more lines are too long
1
datasets/annotated/ffmpeg_gpt_v1.json
Normal file
1
datasets/annotated/ffmpeg_gpt_v1.json
Normal file
File diff suppressed because one or more lines are too long
5031
datasets/preannotated/ffmpeg_gemini_v1.json
Normal file
5031
datasets/preannotated/ffmpeg_gemini_v1.json
Normal file
File diff suppressed because it is too large
Load Diff
377
datasets/raw/ffmpeg_gemini_v1.json
Normal file
377
datasets/raw/ffmpeg_gemini_v1.json
Normal file
@@ -0,0 +1,377 @@
|
||||
[
|
||||
{
|
||||
"text": "Trim the first 15 seconds from 'Summer Vacation (Italy) 2024.mp4' and save it."
|
||||
},
|
||||
{
|
||||
"text": "Split the file 'The.Bear.S03E05.720p.mkv' into three equal segments."
|
||||
},
|
||||
{
|
||||
"text": "Stitch together 'intro_sequence.mov' and 'Main_Feature_V2_Final.mp4' into a single output."
|
||||
},
|
||||
{
|
||||
"text": "From 'interview_recording_01 (Backup).avi', slice out the segment from 00:05:00 to 00:10:00."
|
||||
},
|
||||
{
|
||||
"text": "Take all videos in the /raw_footage/ folder and stitch them in alphabetical order."
|
||||
},
|
||||
{
|
||||
"text": "For every file matching 'clip_??_scene.mp4', trim the last 2 seconds."
|
||||
},
|
||||
{
|
||||
"text": "In the directory /exports/, find videos longer than 10 minutes and split them at the halfway point."
|
||||
},
|
||||
{
|
||||
"text": "Splicing 'Logo_Overlay.png' at the start of 'Product_Demo_Final_1080p.mp4' is the first step."
|
||||
},
|
||||
{
|
||||
"text": "Remove the middle 20 seconds from 'Vlog #42 - My New House.mp4' starting at 02:30."
|
||||
},
|
||||
{
|
||||
"text": "If a video has a resolution of 3840x2160, split it into four 4K quadrants."
|
||||
},
|
||||
{
|
||||
"text": "Stitch 'Part_A.mp4', 'Part_B.mp4', and 'Part_C.mp4' using a crossfade transition."
|
||||
},
|
||||
{
|
||||
"text": "For each video in the 'Daily_Uploads' folder, trim the intro if it is longer than 5 seconds."
|
||||
},
|
||||
{
|
||||
"text": "The file 'Family.Reunion.(2023).Part.1.mkv' needs to be trimmed to just the first hour."
|
||||
},
|
||||
{
|
||||
"text": "Filter for all .mov files with a bitrate under 2000kbps and stitch them into 'low_res_compilation.mov'."
|
||||
},
|
||||
{
|
||||
"text": "Apply a split every 30 seconds to the file 'security_cam_08-24-2026.mp4'."
|
||||
},
|
||||
{
|
||||
"text": "Join 'scene_01_take_05.mp4' and 'scene_01_take_06.mp4' but trim the slate from the start of each."
|
||||
},
|
||||
{
|
||||
"text": "Using a glob pattern 'vid_*.mp4', find all matching files and stitch them together."
|
||||
},
|
||||
{
|
||||
"text": "With 'Wedding_Highlight_Video_(Extended_Version).mp4', slice the section between 12:45 and 15:20."
|
||||
},
|
||||
{
|
||||
"text": "For any video containing 'DRAFT' in the filename, trim the last 10 seconds of silence."
|
||||
},
|
||||
{
|
||||
"text": "Split 'concert_full_set.mkv' into separate files based on the metadata chapter markers."
|
||||
},
|
||||
{
|
||||
"text": "Stitch 'b-roll_city_traffic.mp4' with 'audio_track_02.wav' and then trim it to 30 seconds."
|
||||
},
|
||||
{
|
||||
"text": "Iterate through 'C:/Users/Media/Desktop/*.avi' and split each into 60-second clips."
|
||||
},
|
||||
{
|
||||
"text": "If the duration of 'presentation_recording.mp4' exceeds 1 hour, split it into two parts."
|
||||
},
|
||||
{
|
||||
"text": "Trim '01_Intro.mp4' to end exactly at 00:00:15."
|
||||
},
|
||||
{
|
||||
"text": "Splicing 'outro_credits.mp4' onto the end of all videos in the 'Finished' folder is required."
|
||||
},
|
||||
{
|
||||
"text": "From 'Nature_Documentary_S01E01_The_Forest.mp4', remove the segments with no audio."
|
||||
},
|
||||
{
|
||||
"text": "Stitch all videos with the 'HDR' tag in their metadata into a single reel."
|
||||
},
|
||||
{
|
||||
"text": "For every .mp4 file in the current directory, trim the first 100 frames."
|
||||
},
|
||||
{
|
||||
"text": "Split 'long_tutorial_v3.mp4' at the 10:00, 20:00, and 30:00 marks."
|
||||
},
|
||||
{
|
||||
"text": "The video 'Mountain_Climb_(Edited).mov' needs the last 5 minutes trimmed off."
|
||||
},
|
||||
{
|
||||
"text": "Find files matching the regex '^scene_[0-9]{3}\\.mp4$' and stitch them sequentially."
|
||||
},
|
||||
{
|
||||
"text": "Combine 'logo_animation_fixed.mp4' with 'main_content_v4.mp4' and 'social_media_tags.mp4'."
|
||||
},
|
||||
{
|
||||
"text": "Slice 'podcasting_session_01.mp4' every time a silence longer than 2 seconds is detected."
|
||||
},
|
||||
{
|
||||
"text": "If the file size is greater than 5GB, split '4K_Drone_Footage_Raw.mp4' into 2GB chunks."
|
||||
},
|
||||
{
|
||||
"text": "Trim the video '2026-04-07_Log_Entry.mp4' to start at the first detected motion."
|
||||
},
|
||||
{
|
||||
"text": "For each video in 'project_x', stitch them together if they share the same frame rate."
|
||||
},
|
||||
{
|
||||
"text": "Splice 'emergency_alert_broadcast.mp4' into the middle of 'regular_programming.ts' at 15:00."
|
||||
},
|
||||
{
|
||||
"text": "With 'The_Grand_Budapest_Hotel_Trailer.mp4', trim the black frames from the beginning and end."
|
||||
},
|
||||
{
|
||||
"text": "Split 'GOPR0012.MP4' into segments whenever the camera GPS coordinates change."
|
||||
},
|
||||
{
|
||||
"text": "Stitch 'Clip (1).mp4', 'Clip (2).mp4', and 'Clip (3).mp4' in numerical order."
|
||||
},
|
||||
{
|
||||
"text": "For all videos in 'Archive' with a 4:3 aspect ratio, trim them to 1 minute."
|
||||
},
|
||||
{
|
||||
"text": "Trim the audio-only portion from 'lecture_series_04.m4v' and stitch it with 'slides_04.mp4'."
|
||||
},
|
||||
{
|
||||
"text": "Using the file 'interview_subject_A.mp4', slice out every instance where the speaker says 'um'."
|
||||
},
|
||||
{
|
||||
"text": "Split the video 'test_pattern_v12.avi' every 500MB."
|
||||
},
|
||||
{
|
||||
"text": "Stitch the files listed in 'file_list.txt' into a seamless video."
|
||||
},
|
||||
{
|
||||
"text": "For any video created before 2025, trim the metadata and splice a new header."
|
||||
},
|
||||
{
|
||||
"text": "Take 'my_cool_video.mp4' and 'another_one.mp4' and stitch them together."
|
||||
},
|
||||
{
|
||||
"text": "The file 'Succession.S04E10.1080p.mkv' should be split at the 30-minute mark."
|
||||
},
|
||||
{
|
||||
"text": "Trim the start of 'morning_routine_vlog.mp4' by 12 seconds."
|
||||
},
|
||||
{
|
||||
"text": "Splice a 2-second black screen between 'intro.mp4' and 'body.mp4'."
|
||||
},
|
||||
{
|
||||
"text": "Find videos with 'H.264' codec and stitch them into a compilation called 'H264_Reel.mp4'."
|
||||
},
|
||||
{
|
||||
"text": "For each video in the folder 'TikTok_Drafts', trim the last 0.5 seconds."
|
||||
},
|
||||
{
|
||||
"text": "Split 'CCTV_Storage_Disk_A.mp4' into hourly intervals."
|
||||
},
|
||||
{
|
||||
"text": "From 'Cinematic_Shot_[001].mov', remove the first 2 seconds of camera shake."
|
||||
},
|
||||
{
|
||||
"text": "Stitch 'Top_10_Countdown_Part_10.mp4' through 'Top_10_Countdown_Part_1.mp4' in reverse order."
|
||||
},
|
||||
{
|
||||
"text": "Trim 'cooking_tutorial_raw.mp4' based on the bookmarks in the file metadata."
|
||||
},
|
||||
{
|
||||
"text": "For all files matching '*_backup.mp4', split them in half and delete the second half."
|
||||
},
|
||||
{
|
||||
"text": "Splice 'watermark_fixed.png' into the corner of 'product_render.mp4' for its entire duration."
|
||||
},
|
||||
{
|
||||
"text": "With 'Breaking_Bad_S05E16.mkv', trim the 'previously on' segment."
|
||||
},
|
||||
{
|
||||
"text": "Split the file 'ambience_loop_forest.mp4' exactly at the point where the loop repeats."
|
||||
},
|
||||
{
|
||||
"text": "Stitch 'phone_video_01.mp4' and 'phone_video_02.mp4' after rotating the second one."
|
||||
},
|
||||
{
|
||||
"text": "For every video in 'Dailies/Scene_5/', trim the clapperboard at the start."
|
||||
},
|
||||
{
|
||||
"text": "In 'webinar_recap.mp4', slice out the Q&A session from 45:00 to the end."
|
||||
},
|
||||
{
|
||||
"text": "If a video is 1920x1080, splice the 'standard_definition_warning.mp4' to the front."
|
||||
},
|
||||
{
|
||||
"text": "Split 'gameplay_recording_3hr.mp4' into three 1-hour files."
|
||||
},
|
||||
{
|
||||
"text": "Trim 'Birthday_Party_(Edited).mp4' so it ends right before the candles are blown out."
|
||||
},
|
||||
{
|
||||
"text": "Stitch 'camera_1_view.mp4' and 'camera_2_view.mp4' into a side-by-side split screen."
|
||||
},
|
||||
{
|
||||
"text": "For each .mkv file in the 'Movies' folder, trim the first 30 seconds of credits."
|
||||
},
|
||||
{
|
||||
"text": "Using the pattern 'shot_v[0-9].mp4', stitch the highest version numbers together."
|
||||
},
|
||||
{
|
||||
"text": "From 'stock_footage_clouds.mov', trim the sections where the sun is obscured."
|
||||
},
|
||||
{
|
||||
"text": "Split 'unboxing_video_long.mp4' at every scene change detected with 0.4 threshold."
|
||||
},
|
||||
{
|
||||
"text": "Stitch 'audio_narration.mp3' onto 'silent_presentation.mp4' and trim to the shorter length."
|
||||
},
|
||||
{
|
||||
"text": "For any video with 'vertical' in the name, trim the top and bottom to make it 16:9."
|
||||
},
|
||||
{
|
||||
"text": "Slice 'long_running_process.mp4' to only include the first and last 5 minutes."
|
||||
},
|
||||
{
|
||||
"text": "If the video 'test_render_01.mp4' has no audio track, splice in 'white_noise.wav'."
|
||||
},
|
||||
{
|
||||
"text": "Trim the intro and outro of all videos in the 'Course_Modules' subfolder."
|
||||
},
|
||||
{
|
||||
"text": "Split '4k_landscape_timelapse.mp4' into 250 individual image frames."
|
||||
},
|
||||
{
|
||||
"text": "Stitch 'wedding_ceremony.mp4' and 'wedding_reception.mp4' with a 5-second fade."
|
||||
},
|
||||
{
|
||||
"text": "For each file in 'temp_renders/', if it is less than 1 second long, delete it."
|
||||
},
|
||||
{
|
||||
"text": "Trim 'The_Dark_Knight_Trailer (2008).mp4' to only include the Joker scenes."
|
||||
},
|
||||
{
|
||||
"text": "Splicing 'end_card_v3.mp4' to every video in 'Youtube_Uploads' is the goal."
|
||||
},
|
||||
{
|
||||
"text": "Split 'CCTV_04_07_26.mp4' whenever motion is detected in the 'gate' region."
|
||||
},
|
||||
{
|
||||
"text": "With 'tutorial_recording.mp4', slice the segment from 05:00 to 07:00 and save as 'Highlight.mp4'."
|
||||
},
|
||||
{
|
||||
"text": "Stitch all videos in 'raw_clips' that were recorded at 60fps."
|
||||
},
|
||||
{
|
||||
"text": "For any video named 'Untitled (Copy).mp4', trim it to 10 seconds and rename it."
|
||||
},
|
||||
{
|
||||
"text": "Split 'audio_sync_test.mp4' into separate video and audio streams."
|
||||
},
|
||||
{
|
||||
"text": "Trim 'Gym_Workout_Session.mp4' to remove the rest periods between sets."
|
||||
},
|
||||
{
|
||||
"text": "Stitch 'A.mp4', 'B.mp4', and 'C.mp4' but reverse the order to 'C.mp4', 'B.mp4', 'A.mp4'."
|
||||
},
|
||||
{
|
||||
"text": "For each video in 'Client_Review', splice the 'DRAFT' watermark across the middle."
|
||||
},
|
||||
{
|
||||
"text": "Find videos with duration > 300s and split them into 60s clips."
|
||||
},
|
||||
{
|
||||
"text": "The video 'Street_Food_Tour [HDR].mov' needs the first 2 minutes trimmed."
|
||||
},
|
||||
{
|
||||
"text": "Splice 'transition_effect.mp4' between every video in the 'Montage' folder during stitching."
|
||||
},
|
||||
{
|
||||
"text": "From 'Interview_with_CEO_Final.mp4', trim the first 3 seconds of dead air."
|
||||
},
|
||||
{
|
||||
"text": "Split '24_hour_surveillance_feed.mp4' into 24 one-hour segments."
|
||||
},
|
||||
{
|
||||
"text": "Stitch 'Intro (English).mp4' with 'Content.mp4' and 'Outro (English).mp4'."
|
||||
},
|
||||
{
|
||||
"text": "For each file in the 'Renders' folder, if the width is 720, trim it to 5 seconds."
|
||||
},
|
||||
{
|
||||
"text": "Trim 'Space_X_Launch_Live.mkv' to start at T-minus 10 seconds."
|
||||
},
|
||||
{
|
||||
"text": "Using glob '*_scene_*.mp4', stitch clips with matching scene numbers together."
|
||||
},
|
||||
{
|
||||
"text": "Slice 'nature_walk_4k.mp4' to remove the shaky footage at the 12-minute mark."
|
||||
},
|
||||
{
|
||||
"text": "Split 'Podcast_Ep_12.mp4' at the timestamps provided in the 'chapters.json' file."
|
||||
},
|
||||
{
|
||||
"text": "Stitch 'Scene_1.mp4' and 'Scene_1_Alt_End.mp4' and trim the overlap."
|
||||
},
|
||||
{
|
||||
"text": "For all videos in 'Archive' with the file extension .flv, stitch them into one .mp4."
|
||||
},
|
||||
{
|
||||
"text": "Trim the last 30 frames from every video in the 'Animation_Export' folder."
|
||||
},
|
||||
{
|
||||
"text": "Splice 'Sponsor_Segment.mp4' into 'Gaming_Video_01.mp4' at the 8-minute mark."
|
||||
},
|
||||
{
|
||||
"text": "If a video has stereo audio, split it into two mono-audio video files."
|
||||
},
|
||||
{
|
||||
"text": "With 'Travel_Vlog_Ep1 (Final_Draft).mp4', trim the end where the camera falls."
|
||||
},
|
||||
{
|
||||
"text": "Split 'Concert_Multicam.mp4' into 4 separate files, one for each camera angle."
|
||||
},
|
||||
{
|
||||
"text": "Stitch all videos in 'Dailies' that have a creation date of '2026-04-07'."
|
||||
},
|
||||
{
|
||||
"text": "For each video in the 'Trash' folder, trim it to 0 seconds (effectively clear content)."
|
||||
},
|
||||
{
|
||||
"text": "Trim 'Drone_Shot_05.mov' to the first 1000 frames."
|
||||
},
|
||||
{
|
||||
"text": "Slice 'Long_Lecture.mp4' to extract only the slides that contain the word 'Biology'."
|
||||
},
|
||||
{
|
||||
"text": "Split 'Heavy_Metal_Music_Video.mp4' at every beat of the drum (120 BPM)."
|
||||
},
|
||||
{
|
||||
"text": "Stitch 'Part_01_v1.mp4' through 'Part_10_v1.mp4' using the 'list' command."
|
||||
},
|
||||
{
|
||||
"text": "For every video in 'exports/', trim the filename prefix 'FINAL_' and then stitch them."
|
||||
},
|
||||
{
|
||||
"text": "Trim 'Ocean_Waves_Ambient.mp4' to be a perfect 10-second loop."
|
||||
},
|
||||
{
|
||||
"text": "Splice 'Director_Commentary.ac3' into 'The_Movie_Title.mkv' as a second audio track."
|
||||
},
|
||||
{
|
||||
"text": "If the file 'clip_a.mp4' is shorter than 'clip_b.mp4', stitch them in that order."
|
||||
},
|
||||
{
|
||||
"text": "Split 'NASA_Space_Station_Feed.mp4' whenever the video bitrate drops to zero."
|
||||
},
|
||||
{
|
||||
"text": "Trim the last 5 seconds from 'My_Daughter's_First_Steps.mp4'."
|
||||
},
|
||||
{
|
||||
"text": "Stitch 'Intro.mp4' with every file in 'Chapters/' and save them as individual episodes."
|
||||
},
|
||||
{
|
||||
"text": "For each video in 'Pending', trim the first 10 seconds and move to 'Ready'."
|
||||
},
|
||||
{
|
||||
"text": "Slice 'Webinar_Recording_2026.mp4' at 01:20:00 and discard the rest."
|
||||
},
|
||||
{
|
||||
"text": "Split 'Slow_Motion_Reference.mp4' into segments of 120 frames each."
|
||||
},
|
||||
{
|
||||
"text": "Stitch all .mp4 files in 'Folder A' and 'Folder B' into a single master file."
|
||||
},
|
||||
{
|
||||
"text": "Trim 'The_Last_of_Us_S01E01.mkv' to end right when the music starts."
|
||||
}
|
||||
]
|
||||
200
datasets/raw/ffmpeg_gpt_v1.json
Normal file
200
datasets/raw/ffmpeg_gpt_v1.json
Normal file
@@ -0,0 +1,200 @@
|
||||
[
|
||||
{
|
||||
"text": "Convert interview.mp4 to H.265 and reduce the file size under 120MB"
|
||||
},
|
||||
{
|
||||
"text": "Take all the MOV files in my Downloads folder and convert them to MP4"
|
||||
},
|
||||
{
|
||||
"text": "Strip the audio from promo_video.mp4 and replace it with background_music.mp3"
|
||||
},
|
||||
{
|
||||
"text": "Extract a clip from 00:02:15 to 00:05:44 from lecture.mkv without re-encoding"
|
||||
},
|
||||
{
|
||||
"text": "Convert every PNG in this directory into a single timelapse video at 24fps"
|
||||
},
|
||||
{
|
||||
"text": "Add burnt-in subtitles from subtitles.srt to hello_film.mp4"
|
||||
},
|
||||
{
|
||||
"text": "Compress all my drone videos to be under 50MB each, keeping 1080p"
|
||||
},
|
||||
{
|
||||
"text": "Extract the audio track from podcast_episode_12.mp4 as a FLAC file"
|
||||
},
|
||||
{
|
||||
"text": "Watermark every video in the /exports folder with logo.png in the bottom right"
|
||||
},
|
||||
{
|
||||
"text": "Resize input.mov to 720p and convert to WebM"
|
||||
},
|
||||
{
|
||||
"text": "Take a screenshot from the video at the 30-second mark"
|
||||
},
|
||||
{
|
||||
"text": "Merge part1.mp4, part2.mp4, and part3.mp4 into one file"
|
||||
},
|
||||
{
|
||||
"text": "Normalize the loudness of all audio files in my podcast folder to -16 LUFS"
|
||||
},
|
||||
{
|
||||
"text": "Speed up the timelapse footage to 4x without changing the audio pitch"
|
||||
},
|
||||
{
|
||||
"text": "Create a side-by-side comparison video of before.mp4 and after.mp4"
|
||||
},
|
||||
{
|
||||
"text": "Convert the MKV to MP4 keeping all subtitle and audio tracks"
|
||||
},
|
||||
{
|
||||
"text": "Extract one frame every 5 seconds from the dashcam_recording video file and save them to frames/"
|
||||
},
|
||||
{
|
||||
"text": "Add a 3-second black fade-in and fade-out to presentation clip video file"
|
||||
},
|
||||
{
|
||||
"text": "Loop the henry.avi exactly 10 times and export as a single video"
|
||||
},
|
||||
{
|
||||
"text": "Crop 39fjsai.mp4 to a vertical 9:16 format for Instagram"
|
||||
},
|
||||
{
|
||||
"text": "Remove the first 10 seconds from all the clips in the /raw folder"
|
||||
},
|
||||
{
|
||||
"text": "Convert my audio files from MP3 to AAC at 192kbps"
|
||||
},
|
||||
{
|
||||
"text": "Create a GIF from the first 4 seconds of the youtube_download.webm starting at 1:12"
|
||||
},
|
||||
{
|
||||
"text": "Reduce video bitrate to 1200kbps and encode audio to 64kbps aac of to___encode.mp4 and save it as output.mkv"
|
||||
},
|
||||
{
|
||||
"text": "Rotate the hea3434gvjj__www.download.com.mp4 to 90 degrees clockwise"
|
||||
},
|
||||
{
|
||||
"text": "Extract all keyframes from claivido_final.mkv as JPEG images"
|
||||
},
|
||||
{
|
||||
"text": "Add chapter markers and a cover thumbnail to podcast.mp4"
|
||||
},
|
||||
{
|
||||
"text": "Convert the entire /recordings folder from WAV to MP3 at 320kbps"
|
||||
},
|
||||
{
|
||||
"text": "Stabilize all the videos created today and save them to stabilized folder"
|
||||
},
|
||||
{
|
||||
"text": "Scale down all images in the sike directory to a max width of 1920px"
|
||||
},
|
||||
{
|
||||
"text": "Re-encode stupid_video.avi using nvenc and with hevc at 1080p"
|
||||
},
|
||||
{
|
||||
"text": "Create a picture-in-picture with webcam.mp4 overlaid on screen_recording.mp4 in bottom right"
|
||||
},
|
||||
{
|
||||
"text": "Convert all my .avi files to MKV with lossless encoding"
|
||||
},
|
||||
{
|
||||
"text": "Reverse all video files containing '_reverse_needed_' for a rewind effect and save them to a directory starting with reversed"
|
||||
},
|
||||
{
|
||||
"text": "Re-wrap the H.264 stream from .ts to .mp4 with no re-encode"
|
||||
},
|
||||
{
|
||||
"text": "Convert the RAW image sequence from /timelapse to a 4K ProRes video"
|
||||
},
|
||||
{
|
||||
"text": "Split the 3-hour conference recording into 30-minute segments"
|
||||
},
|
||||
{
|
||||
"text": "Remove the audio entirely from all videos in the /silent folder"
|
||||
},
|
||||
{
|
||||
"text": "Create a low-res proxy version of the 4K footage for offline editing"
|
||||
},
|
||||
{
|
||||
"text": "Encode the video with 2-pass encoding for a target bitrate of 2Mbps"
|
||||
},
|
||||
{
|
||||
"text": "Add a scrolling lower-third text overlay to the interview"
|
||||
},
|
||||
{
|
||||
"text": "Export audio as stereo WAV at 48kHz for DaVinci Resolve"
|
||||
},
|
||||
{
|
||||
"text": "Take every file in the folder and generate a 10-second preview clip from the middle"
|
||||
},
|
||||
{
|
||||
"text": "Convert the SRT subtitle file to WebVTT format"
|
||||
},
|
||||
{
|
||||
"text": "Deinterlace the video files in /old_footage"
|
||||
},
|
||||
{
|
||||
"text": "Create an optimized animated WebP from the video loop"
|
||||
},
|
||||
{
|
||||
"text": "Stack 203jfh.mp4 and 29jfj21112.webm and aosidjgf22.mkv vertically into one tall video and save it as output"
|
||||
},
|
||||
{
|
||||
"text": "Convert all images in the product folder to WebP with 80% quality"
|
||||
},
|
||||
{
|
||||
"text": "Extract just the chapter 3 portion based on chapter metadata of 029jghijdf__asuihttp.mp4"
|
||||
},
|
||||
{
|
||||
"text": "Compress all JPEGs in the folder to a max of 200KB per image"
|
||||
},
|
||||
{
|
||||
"text": "Convert and re-tag all WMA files to AAC, preserving all metadata"
|
||||
},
|
||||
{
|
||||
"text": "Create a 1:1 square version of all videos created last thursday with padding"
|
||||
},
|
||||
{
|
||||
"text": "Trim the last 30 seconds off every clip in my /raw directory"
|
||||
},
|
||||
{
|
||||
"text": "Add a podcast intro and outro to each episode file in the folder"
|
||||
},
|
||||
{
|
||||
"text": "Mux the separate video_new_final-release.h264 and 12831245_audio.aac streams into an MP4"
|
||||
},
|
||||
{
|
||||
"text": "Take the interview clip and output it in every social format: 16:9, 9:16, and 1:1"
|
||||
},
|
||||
{
|
||||
"text": "Encode a lossless copy for archival and a compressed copy for sharing"
|
||||
},
|
||||
{
|
||||
"text": "Apply CLAHE contrast enhancement to all images in the dataset folder"
|
||||
},
|
||||
{
|
||||
"text": "Add chapter info from chapters.txt into the output MKV"
|
||||
},
|
||||
{
|
||||
"text": "Create a video slideshow from photos in slideshow_photos/ with 3-second holds and a Ken Burns effect"
|
||||
},
|
||||
{
|
||||
"text": "Convert the audio to mono for podcast distribution"
|
||||
},
|
||||
{
|
||||
"text": "Batch-resize all product photos in product/ to exactly 800x800 with white padding"
|
||||
},
|
||||
{
|
||||
"text": "Extract all frames between 10s and 20s in 290fj3.mp4 as PNG files"
|
||||
},
|
||||
{
|
||||
"text": "Build a mosaic of all the clip thumbnails into one image grid"
|
||||
},
|
||||
{
|
||||
"text": "Re-encode everything in the archive with AV1"
|
||||
},
|
||||
{
|
||||
"text": "Clip the first 5 seconds from all lecture recordings and save separately"
|
||||
}
|
||||
]
|
||||
23
enums.py
Normal file
23
enums.py
Normal file
@@ -0,0 +1,23 @@
|
||||
from enum import Enum
|
||||
|
||||
|
||||
class TokenLabel(Enum):
    """Closed set of token-level labels for annotating CLI task queries.

    For every member except RAW_PHRASE the value equals the name, so
    lookups by either name or value resolve to the same label.
    """

    ACTION = "ACTION"
    TARGET = "TARGET"
    PREPOSITION = "PREPOSITION"
    CONJUNCTION = "CONJUNCTION"
    MODIFIER = "MODIFIER"
    OBJECT = "OBJECT"
    VALUE_TOKEN = "VALUE_TOKEN"
    NUMBER = "NUMBER"
    UNIT = "UNIT"
    TIME = "TIME"
    DATE = "DATE"
    ORDINAL = "ORDINAL"
    COMPARATOR = "COMPARATOR"
    FILTER_HINT = "FILTER_HINT"
    PATH = "PATH"
    PATTERN = "PATTERN"
    URL = "URL"
    TIMESTAMP = "TIMESTAMP"
    # Catch-all label used when no other label applies.
    # NOTE(review): value "0" deliberately differs from the name —
    # presumably to match an external export format; confirm before changing.
    RAW_PHRASE = "0"
|
||||
359
main.py
Normal file
359
main.py
Normal file
@@ -0,0 +1,359 @@
|
||||
import argparse
import datetime as dt
import json
import os
import secrets
import string
from pathlib import Path
from typing import Iterable

from google import genai
from loguru import logger

# Absolute import: main.py is executed as a script (`python main.py`,
# per README), where a relative `from .enums import ...` raises
# ImportError ("attempted relative import with no known parent package").
# dataset_analysis.py already imports enums the same absolute way.
from enums import TokenLabel

DEFAULT_BATCH_SIZE = 20
DEFAULT_MODEL = "gemini-2.5-flash-lite"
DEFAULT_RAW_LOG_PATH = "logs/gemini_raw.log"
DEFAULT_CONVERT_INPUT_DIR = "datasets/preannotated"
DEFAULT_CONVERT_OUTPUT_DIR = "datasets/annotated"
# Model output may use either enum names or enum values; keep both maps.
LABEL_NAMES = [label.name for label in TokenLabel]
VALUE_TO_NAME = {label.value: label.name for label in TokenLabel}
# Uppercase letters + digits used to build Label Studio-style result IDs.
ID_ALPHABET = string.ascii_uppercase + string.digits
|
||||
|
||||
|
||||
def _chunked(items: list[int], size: int) -> Iterable[list[int]]:
|
||||
if size < 1:
|
||||
raise ValueError("batch_size must be at least 1.")
|
||||
for start in range(0, len(items), size):
|
||||
yield items[start : start + size]
|
||||
|
||||
|
||||
def _generate_id() -> str:
    """Return a random result ID in ``XXX-XXXXXX`` form (A-Z, 0-9)."""
    prefix = "".join(secrets.choice(ID_ALPHABET) for _ in range(3))
    suffix = "".join(secrets.choice(ID_ALPHABET) for _ in range(6))
    return f"{prefix}-{suffix}"
|
||||
|
||||
|
||||
def _all_occurrences(text: str, span: str) -> list[int]:
|
||||
occurrences = []
|
||||
start = 0
|
||||
while True:
|
||||
idx = text.find(span, start)
|
||||
if idx == -1:
|
||||
break
|
||||
occurrences.append(idx)
|
||||
start = idx + 1
|
||||
return occurrences
|
||||
|
||||
|
||||
def _build_prompt(texts: list[str]) -> str:
    """Assemble the Gemini pre-annotation prompt for one batch of texts.

    The prompt enumerates the allowed labels, states the strict JSON
    output rules, and embeds the batch as an ASCII-safe JSON array.
    """
    allowed = ", ".join(LABEL_NAMES)
    payload = json.dumps(texts, ensure_ascii=True)
    segments = [
        "You are a token pre-annotator. For each input text, return JSON with tagged ",
        "token/subword/word/span labels.\n",
        f"Allowed labels: {allowed}.\n",
        "Rules:\n",
        "- Output ONLY valid JSON (no markdown).\n",
        "- Return a JSON array with the same length/order as the input.\n",
        "- Each item must be an object: ",
        '{"text": "<original>", "tags": [{"span": "<exact substring>", "label": "<LABEL>"}]}.\n',
        "- The span must be an exact substring of the original text.\n",
        "- Use RAW_PHRASE when no other label applies.\n\n",
        f"Input texts: {payload}",
    ]
    return "".join(segments)
|
||||
|
||||
|
||||
def _normalize_label(label: str) -> str:
    """Canonicalize a model-emitted label to a ``TokenLabel`` member name.

    Accepts either the enum name (e.g. ``"ACTION"``) or the enum value
    (e.g. ``"0"`` for RAW_PHRASE). Raises ValueError for anything else.
    """
    if label in VALUE_TO_NAME:
        return VALUE_TO_NAME[label]
    if label in LABEL_NAMES:
        return label
    raise ValueError(f"Unknown label: {label}")
|
||||
|
||||
|
||||
def _normalize_result(text: str, result: dict) -> dict:
    """Validate one Gemini batch item and canonicalize its tag labels.

    Returns ``{"text": <text>, "tags": [{"span", "label"}, ...]}`` with
    every label normalized via ``_normalize_label``.

    Raises:
        ValueError: If the echoed text mismatches the input, ``tags`` is
            not a list, a tag entry is malformed, or a span is not an
            exact substring of the text.
    """
    if result.get("text") != text:
        raise ValueError("Gemini result text does not match input text.")
    tags = result.get("tags")
    if not isinstance(tags, list):
        raise ValueError("Gemini result tags must be a list.")

    cleaned: list[dict] = []
    for entry in tags:
        if not isinstance(entry, dict):
            raise ValueError("Each tag must be an object.")
        span = entry.get("span")
        label = entry.get("label")
        if not (isinstance(span, str) and isinstance(label, str)):
            raise ValueError("Each tag must include string span and label fields.")
        if span not in text:
            raise ValueError(f"Span not found in text: {span}")
        cleaned.append({"span": span, "label": _normalize_label(label)})
    return {"text": text, "tags": cleaned}
|
||||
|
||||
|
||||
def preannotate_tokens(
    texts: list[str], client: genai.Client, model: str
) -> list[dict]:
    """Ask Gemini to tag a batch of texts; return normalized results.

    One ``{"text", "tags"}`` dict is returned per input, in input order.

    Raises:
        ValueError: If the model returns no text, non-JSON output, or a
            JSON array whose length differs from the input batch.
    """
    response = client.models.generate_content(
        model=model, contents=_build_prompt(texts)
    )
    raw = response.text
    if raw is None:
        raise ValueError("Gemini returned an empty response.")
    # Mirror the untouched model output to the dedicated raw-log sink.
    logger.bind(raw_gemini=True).trace(raw)
    parsed = json.loads(raw)
    if not isinstance(parsed, list) or len(parsed) != len(texts):
        raise ValueError("Gemini response must be a JSON array matching input length.")
    return [_normalize_result(text, item) for text, item in zip(texts, parsed)]
|
||||
|
||||
|
||||
def _load_raw_records(path: Path) -> list[dict]:
    """Read a raw dataset file and validate its basic shape.

    The file must contain a JSON array of objects, each carrying a
    ``"text"`` field.  Returns the parsed list unchanged; raises
    ``ValueError`` on any structural violation.
    """
    records = json.loads(path.read_text(encoding="utf-8"))
    if not isinstance(records, list):
        raise ValueError(f"Dataset {path} must be a JSON array.")
    for position, record in enumerate(records):
        if not isinstance(record, dict) or "text" not in record:
            raise ValueError(
                f"Dataset {path} item {position} must be an object with a text field."
            )
    return records
|
||||
|
||||
|
||||
def _load_preannotated_records(path: Path) -> list[dict]:
    """Read a pre-annotated dataset file and validate its shape.

    A pre-annotated record is a raw record (object with a ``"text"``
    field) that additionally carries a ``"tags"`` list.  Delegates the
    array/text validation to ``_load_raw_records`` instead of repeating
    it, then enforces the extra ``tags`` requirement.

    Raises ``ValueError`` on any structural violation.
    """
    records = _load_raw_records(path)
    for idx, item in enumerate(records):
        if "tags" not in item or not isinstance(item["tags"], list):
            raise ValueError(
                f"Dataset {path} item {idx} must include a tags list for conversion."
            )
    return records
|
||||
|
||||
|
||||
def _build_labelstudio_results(text: str, tags: list[dict]) -> list[dict]:
    """Convert span/label tags into Label Studio ``result`` entries.

    Each tag is resolved to concrete character offsets in *text*.  When
    the same span string appears in several tags, successive tags are
    mapped to successive occurrences of that span in *text*; running out
    of occurrences raises ``ValueError``.
    """
    positions: dict[str, list[int]] = {}  # span -> all start offsets in text
    next_slot: dict[str, int] = {}  # span -> next unused occurrence index
    entries: list[dict] = []
    for tag in tags:
        if not isinstance(tag, dict):
            raise ValueError("Each tag must be an object.")
        span = tag.get("span")
        label = tag.get("label")
        if not isinstance(span, str) or not isinstance(label, str):
            raise ValueError("Each tag must include string span and label fields.")
        if span not in positions:
            positions[span] = _all_occurrences(text, span)
            next_slot[span] = 0
        slot = next_slot[span]
        if slot >= len(positions[span]):
            raise ValueError(f"Span not found in text: {span}")
        start = positions[span][slot]
        next_slot[span] = slot + 1
        entries.append(
            {
                "value": {
                    "start": start,
                    "end": start + len(span),
                    "text": span,
                    "labels": [_normalize_label(label)],
                },
                "id": _generate_id(),
                "from_name": "label",
                "to_name": "text",
                "type": "labels",
                "origin": "manual",
            }
        )
    return entries
|
||||
|
||||
|
||||
def _preannotate_dataset(
    input_path: Path,
    output_path: Path,
    client: genai.Client,
    model: str,
    batch_size: int,
) -> None:
    """Run Gemini pre-annotation over one raw dataset file.

    Reads *input_path*, tags every record in batches of *batch_size*
    (preserving original record order and any extra fields), and writes
    the annotated copy to *output_path* as indented ASCII JSON.

    Raises ``ValueError`` if any record ends up without a result.
    """
    records = _load_raw_records(input_path)
    texts = [record["text"] for record in records]
    # Results are written back by original index so batching never
    # reorders the dataset.
    annotated: list[dict | None] = [None for _ in records]

    logger.info("Pre-annotating {} records from {}", len(texts), input_path)
    for indices in _chunked(list(range(len(texts))), batch_size):
        batch = [texts[i] for i in indices]
        logger.debug(
            "Sending batch {}-{} (size {}) to Gemini",
            indices[0],
            indices[-1],
            len(batch),
        )
        for i, tagged in zip(indices, preannotate_tokens(batch, client, model)):
            enriched = dict(records[i])
            enriched["tags"] = tagged["tags"]
            annotated[i] = enriched

    if any(entry is None for entry in annotated):
        raise ValueError("Pre-annotation failed to produce results for all items.")

    output_path.write_text(
        json.dumps(annotated, indent=2, ensure_ascii=True), encoding="utf-8"
    )
    logger.info("Wrote pre-annotated dataset to {}", output_path)
|
||||
|
||||
|
||||
def _convert_preannotated_dataset(input_path: Path, output_path: Path) -> None:
    """Convert one pre-annotated dataset into a Label Studio task export.

    Reads span/label tags from *input_path*, resolves them to character
    offsets via ``_build_labelstudio_results``, and writes a JSON array of
    Label Studio task objects (one task with one annotation per record)
    to *output_path*.
    """
    preannotated_items = _load_preannotated_records(input_path)
    # One shared timestamp for every created/updated field in this export.
    now_iso = dt.datetime.now(dt.UTC).isoformat()
    tasks: list[dict] = []
    logger.info("Converting {} records from {}", len(preannotated_items), input_path)
    for index, item in enumerate(preannotated_items):
        text = item["text"]
        tags = item["tags"]
        task_id = _generate_id()
        # Annotation ids are 1-based sequence numbers within this file.
        annotation_id = index + 1
        results = _build_labelstudio_results(text, tags)
        annotation = {
            "id": annotation_id,
            # NOTE(review): hard-coded annotator id 2 — confirm it matches
            # the intended Label Studio user in the target project.
            "completed_by": 2,
            "result": results,
            "was_cancelled": False,
            "ground_truth": False,
            "created_at": now_iso,
            "updated_at": now_iso,
            "draft_created_at": now_iso,
            "lead_time": 0.0,
            "prediction": {},
            "result_count": len(results),
            "unique_id": _generate_id(),
            "import_id": None,
            "last_action": None,
            "bulk_created": False,
            "task": task_id,
            "project": None,
            "updated_by": None,
            "parent_prediction": None,
            "parent_annotation": None,
            "last_created_by": None,
        }
        tasks.append(
            {
                "id": task_id,
                "annotations": [annotation],
                "file_upload": None,
                "drafts": [],
                "predictions": [],
                "data": {"text": text},
                "meta": {},
                "created_at": now_iso,
                "updated_at": now_iso,
                "allow_skip": True,
                # Tasks are numbered from 1 within the export file.
                "inner_id": index + 1,
                "total_annotations": 1,
                "cancelled_annotations": 0,
                "total_predictions": 0,
                "comment_count": 0,
                "unresolved_comment_count": 0,
                "last_comment_updated_at": None,
                "project": None,
                "updated_by": None,
                "comment_authors": [],
            }
        )

    output_path.write_text(
        json.dumps(tasks, indent=2, ensure_ascii=True), encoding="utf-8"
    )
    logger.info("Wrote converted annotated dataset to {}", output_path)
|
||||
|
||||
|
||||
def _resolve_input_paths(input_dir: Path, input_file: str | None) -> list[Path]:
    """Return the dataset files to process.

    Uses the explicit *input_file* when given; otherwise every ``*.json``
    in *input_dir*, sorted.  Raises ``ValueError`` when nothing is found.
    """
    if input_file:
        return [Path(input_file)]
    paths = sorted(input_dir.glob("*.json"))
    if not paths:
        raise ValueError(f"No input datasets found in {input_dir}.")
    return paths


def main():
    """CLI entry point.

    ``--mode preannotate`` tags raw datasets with Gemini;
    ``--mode convert`` turns pre-annotated datasets into Label Studio
    task exports.  Both modes mirror input filenames into the output
    directory.

    Raises ``ValueError`` when ``GEMINI_API_KEY`` is missing (preannotate
    mode) or when no input datasets are found.
    """
    parser = argparse.ArgumentParser(description="Pre-annotate datasets with Gemini.")
    parser.add_argument(
        "--mode",
        choices=["preannotate", "convert"],
        default="preannotate",
    )
    parser.add_argument("--input-dir", default=None)
    parser.add_argument("--output-dir", default=None)
    parser.add_argument("--input-file", default=None)
    parser.add_argument("--batch-size", type=int, default=DEFAULT_BATCH_SIZE)
    parser.add_argument("--model", default=os.getenv("GEMINI_MODEL", DEFAULT_MODEL))
    parser.add_argument(
        "--raw-log-file",
        default=os.getenv("GEMINI_RAW_LOG_FILE", DEFAULT_RAW_LOG_PATH),
    )
    args = parser.parse_args()

    if args.mode == "preannotate":
        # Route raw Gemini responses (records bound with raw_gemini=True)
        # to a dedicated log file.
        raw_log_path = Path(args.raw_log_file)
        raw_log_path.parent.mkdir(parents=True, exist_ok=True)
        logger.add(
            raw_log_path,
            level="TRACE",
            filter=lambda record: record["extra"].get("raw_gemini") is True,
            format="{message}",
        )

        api_key = os.getenv("GEMINI_API_KEY")
        if not api_key:
            raise ValueError("GEMINI_API_KEY must be set in the environment.")

        client = genai.Client(api_key=api_key)
        input_dir = Path(args.input_dir or "datasets/raw")
        output_dir = Path(args.output_dir or "datasets/preannotated")
        output_dir.mkdir(parents=True, exist_ok=True)

        input_paths = _resolve_input_paths(input_dir, args.input_file)

        logger.info(
            "Starting pre-annotation: model={}, batch_size={}, input_dir={}, output_dir={}",
            args.model,
            args.batch_size,
            input_dir,
            output_dir,
        )
        for input_path in input_paths:
            _preannotate_dataset(
                input_path,
                output_dir / input_path.name,
                client,
                args.model,
                args.batch_size,
            )
        return

    # convert mode
    input_dir = Path(args.input_dir or DEFAULT_CONVERT_INPUT_DIR)
    output_dir = Path(args.output_dir or DEFAULT_CONVERT_OUTPUT_DIR)
    output_dir.mkdir(parents=True, exist_ok=True)

    input_paths = _resolve_input_paths(input_dir, args.input_file)

    logger.info(
        "Starting conversion: input_dir={}, output_dir={}",
        input_dir,
        output_dir,
    )
    for input_path in input_paths:
        _convert_preannotated_dataset(input_path, output_dir / input_path.name)
|
||||
|
||||
|
||||
# Allow the module to be imported without side effects; run the CLI only
# when executed as a script.
if __name__ == "__main__":
    main()
|
||||
12
pyproject.toml
Normal file
12
pyproject.toml
Normal file
@@ -0,0 +1,12 @@
|
||||
[project]
|
||||
name = "clint-dataset"
|
||||
version = "0.1.0"
|
||||
description = "Pre-annotate CLI-task queries with Gemini and convert them to Label Studio format"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.12"
|
||||
dependencies = [
|
||||
"google-genai>=1.70.0",
|
||||
"label-studio>=1.23.0",
|
||||
"label-studio-ml>=1.0.9",
|
||||
"loguru>=0.7.3",
|
||||
]
|
||||
Reference in New Issue
Block a user