commit 836e754c3440e4f7cdf3d780c7f234c2bae33461 Author: kura Date: Wed Mar 18 15:36:08 2026 +0800 init diff --git a/README.md b/README.md new file mode 100644 index 0000000..53382bb --- /dev/null +++ b/README.md @@ -0,0 +1,58 @@ +# CrossSubtitle-AI + +基于 `Tauri v2 + Vue 3 + Pinia + Tailwind CSS` 的本地优先字幕工作台,覆盖以下 MVP 链路: + +- 导入音视频文件并创建任务队列 +- 使用 `ffmpeg` 抽取 16kHz 单声道 WAV +- 执行基础 VAD 切分并生成语音片段时间轴 +- 进入 Whisper 转录/翻译环节 +- 可选接入 OpenAI-compatible 接口生成中文译文 +- 实时推送任务进度和字幕片段 +- 导出 `SRT / VTT / ASS` + +## 目录结构 + +- `src/`: Vue 前端界面、Pinia 状态、字幕编辑器 +- `src-tauri/src/audio.rs`: 音频抽取与 WAV 读取 +- `src-tauri/src/vad.rs`: VAD API 与基础能量检测实现 +- `src-tauri/src/whisper.rs`: Whisper 接口层 +- `src-tauri/src/translate.rs`: OpenAI-compatible 滑动窗口翻译 +- `src-tauri/src/subtitle.rs`: SRT / VTT / ASS 导出 +- `src-tauri/src/task.rs`: 任务编排与事件广播 + +## 当前实现说明 + +- 当前仓库已补齐完整工程骨架与核心数据流。 +- 由于本机环境缺少 Rust 工具链,本次未能执行 `cargo check` 或 `tauri dev`。 +- `whisper.rs` 目前提供了稳定的接口与任务路由,但真实 `whisper-rs` 推理仍需在安装 Rust 后继续接入具体模型调用。 +- `vad.rs` 已提供可用的 VAD 模块 API,默认实现为能量检测回退;如果你后续放入 Silero ONNX,可在此模块内替换为 `ort` 推理。 + +## 运行前准备 + +1. 安装 Rust 工具链。 +2. 安装 `ffmpeg`,并确保可通过命令行直接调用。 +3. 安装前端依赖: + +```bash +npm install +``` + +4. 如需中文翻译,配置环境变量: + +```bash +export OPENAI_API_BASE=https://your-openai-compatible-endpoint/v1 +export OPENAI_API_KEY=your_api_key +export OPENAI_MODEL=gpt-4o-mini +``` + +5. 启动桌面应用: + +```bash +npm run dev +``` + +## 下一步建议 + +- 将 `src-tauri/src/whisper.rs` 的占位实现替换为真实 `whisper-rs` 推理。 +- 在 `src-tauri/src/vad.rs` 接入 Silero VAD ONNX Runtime。 +- 加入文件选择器、任务恢复、批量导出与测试用例。 diff --git a/agent.md b/agent.md new file mode 100644 index 0000000..3702cb8 --- /dev/null +++ b/agent.md @@ -0,0 +1,337 @@ +CrossSubtitle-AI 技术需求与开发规划书 +1. 项目愿景 +CrossSubtitle-AI 的目标是打造一款 极简、高性能、完全本地优先 的视频转录与翻译工具,面向多语言视频内容生产场景,最终输出高质量字幕文件。 + +核心价值: + +本地运行,保护音视频内容隐私 +利用 AI PC 的 GPU / NPU / CoreML 加速,实现高吞吐推理 +减少 Whisper 在静音、背景音、纯音乐场景下的“幻听”问题 +支持全球主流语言识别,并稳定转换为中文或英文字幕 +提供实时预览、可编辑、可导出的完整字幕工作流 +2. 
技术目标 +2.1 功能目标 +支持视频/音频文件导入 +自动抽取音频并标准化预处理 +基于 VAD 切分有效语音区间 +执行本地 Whisper 转录/翻译 +根据目标语言选择不同翻译策略 +实时展示字幕生成进度 +支持导出 SRT / ASS / VTT +2.2 非功能目标 +启动快、包体小、资源占用可控 +支持 macOS / Windows +推理链路可扩展,便于后续接入更多模型 +模块解耦,便于替换 VAD、ASR、翻译后端 +前后端通信稳定,可观察任务状态和错误信息 +3. 技术栈选型 +模块 技术选型 说明 +表现层(UI) Vue 3 + Vite + Tailwind CSS + Pinia 轻量、开发效率高、组件化好 +容器层(Desktop) Tauri v2 + Rust 包体小、性能高、系统能力强 +音频处理 FFmpeg Sidecar 统一抽流与重采样 +语音活动检测 Silero VAD + ONNX Runtime (ort) 过滤静音/噪声/纯音乐,提高识别质量 +转录引擎 whisper-rs 本地推理,支持多平台硬件加速 +翻译引擎 Whisper Native / OpenAI-compatible API / 本地 LLM 按目标语言动态选择策略 +字幕导出 Rust 原生实现 统一输出 SRT / ASS / VTT +4. 系统架构设计 +整体处理链路如下: + +用户导入视频或音频文件 +Rust 调用 FFmpeg 提取并转码为标准 PCM +VAD 模块分析音频,生成语音片段时间轴 +Whisper 模块对片段执行转录或直译 +若目标为中文,则进入 LLM 翻译链路 +实时将结果推送给 Vue 前端展示 +用户校对字幕并导出目标格式 +5. 核心模块设计 +5.1 音频预处理管线 +输入 +视频文件:mp4 / mkv / mov / avi +音频文件:mp3 / wav / m4a / flac +处理步骤 +1. FFmpeg 抽流与标准化 +统一转换为: + +采样率:16kHz +位深:16bit +声道:mono +输出格式:PCM 或 WAV +建议命令逻辑: + +ffmpeg -i input.mp4 -ac 1 -ar 16000 -f wav output.wav +2. PCM 解码与缓存 +Rust 读取 WAV/PCM 数据,转换为: + +Vec +采样值范围归一化至 [-1.0, 1.0]。 + +3. 
Silero VAD 检测 +使用 ort 加载 silero_vad.onnx,输出语音片段: + +Vec<(f32, f32)> +每个元组表示: + +起始时间(秒) +结束时间(秒) +预期收益 +跳过静音与非语音段 +降低 Whisper 幻听概率 +减少无效推理时长 +提升整体处理速度 +5.2 智能转录与翻译引擎 +路由策略 +路径 A:源语言 -> 英文 +直接使用 Whisper 原生翻译能力: + +task = translate +适用场景: + +非英语语音转英文字幕 +对实时性要求较高 +尽量减少外部翻译调用 +路径 B:源语言 -> 中文 +分两步: + +Whisper 转录原文 +task = transcribe +将原文批量送入 LLM 翻译为中文 +适用场景: + +非英语内容转中文 +需要更自然的语义表达 +对专有名词、人称、上下文一致性要求更高 +5.3 上下文关联翻译 +为避免逐句翻译导致语义断裂,翻译模块采用 滑动窗口策略。 + +策略设计 +每批发送 10~15 条字幕 +携带上一批末尾若干条作为上下文 +保留角色称呼、专有名词、代词指代一致性 +示例结构 +{ + "context": [ + "上一段末尾字幕1", + "上一段末尾字幕2" + ], + "segments": [ + "当前待翻译字幕1", + "当前待翻译字幕2" + ] +} +目标 +提升“你/您”“他/她”等代词一致性 +减少断句错误 +提高术语统一性 +5.4 实时同步与前端预览 +Rust 后端在每个阶段通过事件通知前端: + +文件入队 +抽流进度 +VAD 进度 +Whisper 推理进度 +翻译进度 +单条字幕产出 +任务完成/失败 +Tauri 事件建议: + +window.emit("task:progress", payload) +window.emit("task:segment", payload) +window.emit("task:error", payload) +window.emit("task:done", payload) +前端 UI 应支持: + +多文件任务队列 +当前任务状态可视化 +实时字幕流式展示 +双栏对照编辑 +时间戳定位 +5.5 字幕导出模块 +支持格式 +SRT +ASS +VTT +输出要求 +时间轴准确 +支持多语言文本 +保留编辑后的最终内容 +ASS 可扩展样式配置 +字幕数据结构建议统一为: + +type SubtitleSegment = { + id: string + start: number + end: number + sourceText: string + translatedText?: string +} +6. 数据流设计 +建议建立统一任务模型: + +type TaskStatus = + | 'queued' + | 'extracting' + | 'vad_processing' + | 'transcribing' + | 'translating' + | 'completed' + | 'failed' +type SubtitleTask = { + id: string + filePath: string + fileName: string + sourceLang?: string + targetLang: 'zh' | 'en' + status: TaskStatus + progress: number + segments: SubtitleSegment[] + error?: string +} +7. 
模块划分建议 +Rust 侧 +audio.rs +负责: + +调用 FFmpeg +管理音频抽取与格式标准化 +输出 WAV/PCM 文件路径 +vad.rs +负责: + +加载 ONNX Runtime +执行 Silero VAD +返回语音时间片段 +whisper.rs +负责: + +加载 Whisper 模型 +执行转录/翻译 +汇报处理进度 +translate.rs +负责: + +对接 OpenAI-compatible API 或本地模型 +管理上下文窗口翻译 +返回中文结果 +subtitle.rs +负责: + +统一字幕结构 +导出 SRT / ASS / VTT +task.rs +负责: + +管理任务生命周期 +协调各模块串联 +向前端广播事件 +Vue 侧 +页面建议 +文件拖拽上传页 +任务队列面板 +字幕预览与编辑面板 +导出设置面板 +Pinia Store 划分 +useTaskStore +useSubtitleStore +useSettingsStore +8. 开发路线图 +第一阶段:环境与音频引擎(Week 1) +Rust +集成 tauri-plugin-shell +打通 FFmpeg sidecar 调用 +实现 audio.rs +实现 vad.rs 基础版本 +Vue +搭建基础界面 +实现多文件拖拽上传 +建立任务队列状态管理 +交付物 +可导入文件 +可抽取音频 +可生成 VAD 时间片段 +第二阶段:Whisper 推理核心(Week 1-2) +Rust +实现 whisper.rs +支持模型动态加载 +支持 large-v3-turbo 及量化模型 +加入平台加速检测逻辑 +Vue +显示 VAD / 转录进度条 +展示实时字幕输出 +交付物 +完整本地转录链路可跑通 +前端实时看到字幕增量结果 +第三阶段:翻译中枢(Week 2) +Rust +实现 translate.rs +对接 OpenAI-compatible 接口 +实现滑动窗口翻译缓存 +Vue +双栏字幕编辑器 +支持原文/译文联动显示 +点击时间戳跳转定位 +交付物 +中英目标语言翻译流程打通 +具备可编辑字幕界面 +第四阶段:优化与打包(Week 3) +性能 +优化批处理与内存占用 +Windows 侧尝试 Vulkan 加速 +macOS 侧适配 CoreML +工程化 +增加错误处理与日志 +配置 Github Actions +自动构建 Windows / macOS 安装包 +交付物 +可分发桌面应用 +基本可用的发布版本 +9. 风险与关键问题 +1. Whisper 幻听问题不能只靠模型解决 +必须结合: + +VAD 过滤 +合理切片 +静音抑制 +长音频分段策略 +2. 不同平台硬件加速差异较大 +需要预留能力检测和回退机制: + +可用 GPU/NPU 就启用 +不可用时回退 CPU +前端明确提示当前推理模式 +3. 翻译质量依赖上下文设计 +中文输出质量主要取决于: + +分批粒度 +上下文拼接方式 +专有名词缓存策略 +4. 超长文件需要任务恢复与断点信息 +后续可考虑: + +中间结果缓存 +崩溃恢复 +已完成片段复用 +10. 第一版 MVP 范围建议 +为了尽快落地,建议第一版先做: + +单文件处理 +本地 Whisper 转录 +VAD 过滤 +英文直译 / 中文二段翻译 +SRT 导出 +简单实时预览 +先不做: + +视频预览播放器 +ASS 高级样式编辑 +批量复杂任务编排 +云端模型管理 +自动术语表 +11. 
给 Codex 的可执行 Prompt +模块 A:VAD 实现 +请在 Rust 的 Tauri v2 项目中实现一个 `src-tauri/src/vad.rs` 模块,使用 `ort` 加载 Silero VAD ONNX 模型。输入为 `Vec` 格式的 16kHz 单声道音频采样,输出为 `Vec<(f32, f32)>`,表示检测到语音的开始和结束秒数。请封装成结构清晰、内存安全的 API,并处理模型初始化、推理失败和阈值配置。 +模块 B:Whisper 命令封装 +请基于 `whisper-rs` 为 Tauri v2 编写一个 Rust Command。命令接收 WAV 文件路径、源语言、目标语言和任务 ID。如果目标语言是英文,则使用 Whisper 的 `translate` 任务;否则使用 `transcribe`。处理过程中通过 `window.emit` 持续向前端发送进度事件和分段结果事件。请把实现拆分到 `src-tauri/src/whisper.rs`,并提供可复用的数据结构。 +模块 C:翻译滑动窗口 +请在 Rust 中实现 `src-tauri/src/translate.rs`,对接 OpenAI-compatible Chat Completions 接口。输入是一组带时间轴的字幕片段,目标是将原文翻译成中文。要求实现滑动窗口翻译:每批 10 到 15 条,并附带上一批末尾若干条作为上下文,以保证人称和术语一致。请返回结构化结果,并处理接口超时、重试和 JSON 解析失败。 +模块 D:字幕导出 +请在 Rust 中实现一个 `subtitle.rs` 模块,接收统一的字幕片段结构,支持导出为 SRT、VTT 和 ASS 三种格式。要求时间格式正确,支持多语言文本,并为 ASS 预留基础样式配置。 \ No newline at end of file diff --git a/index.html b/index.html new file mode 100644 index 0000000..a7cbc8a --- /dev/null +++ b/index.html @@ -0,0 +1,12 @@ + + + + + + CrossSubtitle-AI + + +
+ + + diff --git a/package.json b/package.json new file mode 100644 index 0000000..4d029b7 --- /dev/null +++ b/package.json @@ -0,0 +1,25 @@ +{ + "name": "crosssubtitle-ai", + "version": "0.1.0", + "private": true, + "type": "module", + "scripts": { + "dev": "vite", + "build": "vue-tsc --noEmit && vite build", + "preview": "vite preview" + }, + "dependencies": { + "@tauri-apps/api": "^2.0.0", + "pinia": "^2.1.7", + "vue": "^3.5.13" + }, + "devDependencies": { + "@vitejs/plugin-vue": "^5.2.1", + "autoprefixer": "^10.4.20", + "postcss": "^8.4.49", + "tailwindcss": "^3.4.16", + "typescript": "^5.7.2", + "vite": "^6.0.3", + "vue-tsc": "^2.1.10" + } +} diff --git a/postcss.config.js b/postcss.config.js new file mode 100644 index 0000000..2e7af2b --- /dev/null +++ b/postcss.config.js @@ -0,0 +1,6 @@ +export default { + plugins: { + tailwindcss: {}, + autoprefixer: {}, + }, +} diff --git a/src-tauri/Cargo.toml b/src-tauri/Cargo.toml new file mode 100644 index 0000000..252b167 --- /dev/null +++ b/src-tauri/Cargo.toml @@ -0,0 +1,25 @@ +[package] +name = "crosssubtitle-ai" +version = "0.1.0" +edition = "2021" + +[lib] +name = "crosssubtitle_ai_lib" +crate-type = ["staticlib", "cdylib", "rlib"] + +[build-dependencies] +tauri-build = { version = "2.0.3", features = [] } + +[dependencies] +anyhow = "1.0" +hound = "3.5" +reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] } +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" +tauri = { version = "2.0.6", features = [] } +tempfile = "3.14" +thiserror = "2.0" +tokio = { version = "1.42", features = ["macros", "rt-multi-thread", "time"] } +uuid = { version = "1.11", features = ["serde", "v4"] } +walkdir = "2.5" + diff --git a/src-tauri/build.rs b/src-tauri/build.rs new file mode 100644 index 0000000..d860e1e --- /dev/null +++ b/src-tauri/build.rs @@ -0,0 +1,3 @@ +fn main() { + tauri_build::build() +} diff --git a/src-tauri/capabilities/default.json 
b/src-tauri/capabilities/default.json new file mode 100644 index 0000000..be76cab --- /dev/null +++ b/src-tauri/capabilities/default.json @@ -0,0 +1,11 @@ +{ + "$schema": "../gen/schemas/desktop-schema.json", + "identifier": "default", + "description": "Default capability for CrossSubtitle-AI", + "windows": ["main"], + "permissions": [ + "core:event:default", + "core:path:default", + "core:window:default" + ] +} diff --git a/src-tauri/src/audio.rs b/src-tauri/src/audio.rs new file mode 100644 index 0000000..1111560 --- /dev/null +++ b/src-tauri/src/audio.rs @@ -0,0 +1,62 @@ +use std::{ + fs, + path::{Path, PathBuf}, + process::Command, +}; + +use anyhow::{anyhow, Context, Result}; + +pub struct AudioPipeline; + +impl AudioPipeline { + pub fn extract_to_wav(input_path: &str, workspace: &Path) -> Result { + fs::create_dir_all(workspace) + .with_context(|| format!("failed to create workspace: {}", workspace.display()))?; + + let output_path = workspace.join("normalized.wav"); + let status = Command::new("ffmpeg") + .arg("-y") + .arg("-i") + .arg(input_path) + .arg("-ac") + .arg("1") + .arg("-ar") + .arg("16000") + .arg("-f") + .arg("wav") + .arg(&output_path) + .status() + .context("failed to launch ffmpeg, please install ffmpeg and ensure it is in PATH")?; + + if !status.success() { + return Err(anyhow!("ffmpeg exited with status: {status}")); + } + + Ok(output_path) + } + + pub fn load_wav_f32(path: &Path) -> Result> { + let mut reader = + hound::WavReader::open(path).with_context(|| format!("failed to open {}", path.display()))?; + let spec = reader.spec(); + + if spec.channels != 1 { + return Err(anyhow!( + "expected mono wav, found {} channels in {}", + spec.channels, + path.display() + )); + } + + let samples = reader + .samples::() + .map(|sample| { + sample + .map(|value| value as f32 / i16::MAX as f32) + .map_err(anyhow::Error::from) + }) + .collect::>>()?; + + Ok(samples) + } +} diff --git a/src-tauri/src/lib.rs b/src-tauri/src/lib.rs new file mode 100644 
index 0000000..7dc6c1b --- /dev/null +++ b/src-tauri/src/lib.rs @@ -0,0 +1,63 @@ +mod audio; +mod models; +mod state; +mod subtitle; +mod task; +mod translate; +mod vad; +mod whisper; + +use models::{StartTaskPayload, SubtitleSegment, SubtitleTask}; +use state::AppState; + +#[tauri::command] +async fn start_subtitle_task( + app: tauri::AppHandle, + window: tauri::Window, + state: tauri::State<'_, AppState>, + payload: StartTaskPayload, +) -> std::result::Result { + task::start_task(app, window, state, payload) + .await + .map_err(error_to_string) +} + +#[tauri::command] +fn list_tasks(state: tauri::State<'_, AppState>) -> std::result::Result, String> { + task::list_tasks(state).map_err(error_to_string) +} + +#[tauri::command] +fn update_segment_text( + state: tauri::State<'_, AppState>, + segment: SubtitleSegment, +) -> std::result::Result { + task::update_segment_text(state, segment).map_err(error_to_string) +} + +#[tauri::command] +fn export_subtitles( + state: tauri::State<'_, AppState>, + task_id: String, + format: String, +) -> std::result::Result { + task::export_task(state, task_id, format).map_err(error_to_string) +} + +fn error_to_string(error: anyhow::Error) -> String { + format!("{error:#}") +} + +#[cfg_attr(mobile, tauri::mobile_entry_point)] +pub fn run() { + tauri::Builder::default() + .manage(AppState::default()) + .invoke_handler(tauri::generate_handler![ + start_subtitle_task, + list_tasks, + update_segment_text, + export_subtitles + ]) + .run(tauri::generate_context!()) + .expect("error while running tauri application"); +} diff --git a/src-tauri/src/main.rs b/src-tauri/src/main.rs new file mode 100644 index 0000000..76ba9b7 --- /dev/null +++ b/src-tauri/src/main.rs @@ -0,0 +1,3 @@ +fn main() { + crosssubtitle_ai_lib::run(); +} diff --git a/src-tauri/src/models.rs b/src-tauri/src/models.rs new file mode 100644 index 0000000..9524d79 --- /dev/null +++ b/src-tauri/src/models.rs @@ -0,0 +1,88 @@ +use serde::{Deserialize, Serialize}; + 
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub enum TaskStatus { + Queued, + Extracting, + VadProcessing, + Transcribing, + Translating, + Completed, + Failed, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[serde(rename_all = "snake_case")] +pub enum TargetLanguage { + Zh, + En, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct SubtitleSegment { + pub id: String, + pub task_id: String, + pub start: f32, + pub end: f32, + pub source_text: String, + pub translated_text: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct SubtitleTask { + pub id: String, + pub file_path: String, + pub file_name: String, + pub source_lang: Option, + pub target_lang: TargetLanguage, + pub status: TaskStatus, + pub progress: f32, + pub segments: Vec, + pub error: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct StartTaskPayload { + pub file_path: String, + pub source_lang: Option, + pub target_lang: TargetLanguage, + pub whisper_model_path: Option, + pub vad_model_path: Option, +} + +#[derive(Debug, Clone, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct ProgressEvent { + pub task_id: String, + pub status: TaskStatus, + pub progress: f32, + pub message: String, +} + +#[derive(Debug, Clone, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct SegmentEvent { + pub task_id: String, + pub segment: SubtitleSegment, +} + +#[derive(Debug, Clone, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct ErrorEvent { + pub task_id: String, + pub message: String, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct TranslationConfig { + pub api_base: String, + pub api_key: String, + pub model: String, + pub batch_size: usize, + pub context_size: usize, +} diff --git 
a/src-tauri/src/state.rs b/src-tauri/src/state.rs
new file mode 100644
index 0000000..4187846
--- /dev/null
+++ b/src-tauri/src/state.rs
@@ -0,0 +1,47 @@
+//! In-memory task store shared across Tauri commands.
+use std::{collections::HashMap, sync::Mutex};
+
+use anyhow::{anyhow, Result};
+
+use crate::models::{SubtitleSegment, SubtitleTask};
+
+/// Global application state: all subtitle tasks keyed by task id.
+#[derive(Default)]
+pub struct AppState {
+    tasks: Mutex<HashMap<String, SubtitleTask>>,
+}
+
+impl AppState {
+    /// Inserts or replaces a task under its id.
+    pub fn upsert_task(&self, task: SubtitleTask) -> Result<()> {
+        let mut guard = self.tasks.lock().map_err(|_| anyhow!("task store poisoned"))?;
+        guard.insert(task.id.clone(), task);
+        Ok(())
+    }
+
+    /// Returns a clone of the task, or an error if the id is unknown.
+    pub fn get_task(&self, task_id: &str) -> Result<SubtitleTask> {
+        let guard = self.tasks.lock().map_err(|_| anyhow!("task store poisoned"))?;
+        guard
+            .get(task_id)
+            .cloned()
+            .ok_or_else(|| anyhow!("task not found: {task_id}"))
+    }
+
+    /// Lists all tasks, sorted by id descending.
+    /// NOTE(review): ids look like v4 UUIDs, so this is not creation order — confirm intent.
+    pub fn list_tasks(&self) -> Result<Vec<SubtitleTask>> {
+        let guard = self.tasks.lock().map_err(|_| anyhow!("task store poisoned"))?;
+        let mut tasks = guard.values().cloned().collect::<Vec<_>>();
+        tasks.sort_by(|left, right| right.id.cmp(&left.id));
+        Ok(tasks)
+    }
+
+    /// Replaces an existing segment in place, or appends it if absent.
+    /// Returns a clone of the updated task.
+    pub fn update_segment(&self, segment: SubtitleSegment) -> Result<SubtitleTask> {
+        let mut guard = self.tasks.lock().map_err(|_| anyhow!("task store poisoned"))?;
+        let task = guard
+            .get_mut(&segment.task_id)
+            .ok_or_else(|| anyhow!("task not found: {}", segment.task_id))?;
+
+        match task.segments.iter_mut().find(|item| item.id == segment.id) {
+            Some(existing) => *existing = segment,
+            None => task.segments.push(segment),
+        }
+
+        Ok(task.clone())
+    }
+}
diff --git a/src-tauri/src/subtitle.rs b/src-tauri/src/subtitle.rs
new file mode 100644
index 0000000..844ebf9
--- /dev/null
+++ b/src-tauri/src/subtitle.rs
@@ -0,0 +1,143 @@
+use anyhow::{anyhow, Result};
+
+use crate::models::SubtitleSegment;
+
+/// Subtitle export formats supported by the renderer.
+#[derive(Debug, Clone, Copy)]
+pub enum SubtitleFormat {
+    Srt,
+    Vtt,
+    Ass,
+}
+
+impl SubtitleFormat {
+    /// File extension (without the dot) for this format.
+    pub fn extension(&self) -> &'static str {
+        match self {
+            Self::Srt => "srt",
+            Self::Vtt => "vtt",
+            Self::Ass => "ass",
+        }
+    }
+}
+
+impl 
TryFrom<&str> for SubtitleFormat {
+    type Error = anyhow::Error;
+
+    /// Parses a case-insensitive format name ("srt" / "vtt" / "ass").
+    fn try_from(value: &str) -> Result<Self> {
+        match value.to_lowercase().as_str() {
+            "srt" => Ok(Self::Srt),
+            "vtt" => Ok(Self::Vtt),
+            "ass" => Ok(Self::Ass),
+            _ => Err(anyhow!("unsupported subtitle format: {value}")),
+        }
+    }
+}
+
+/// Basic ASS style knobs; colours use ASS &HAABBGGRR notation.
+#[derive(Debug, Clone)]
+pub struct AssStyle {
+    pub name: String,
+    pub font_name: String,
+    pub font_size: u32,
+    pub primary_colour: String,
+    pub outline_colour: String,
+}
+
+impl Default for AssStyle {
+    fn default() -> Self {
+        Self {
+            name: "Default".to_string(),
+            font_name: "Arial".to_string(),
+            font_size: 22,
+            primary_colour: "&H00FFFFFF".to_string(),
+            outline_colour: "&H00000000".to_string(),
+        }
+    }
+}
+
+/// Renders segments into the requested format.
+/// Translated text, when present, takes precedence over source text.
+pub fn render(segments: &[SubtitleSegment], format: SubtitleFormat) -> String {
+    match format {
+        SubtitleFormat::Srt => render_srt(segments),
+        SubtitleFormat::Vtt => render_vtt(segments),
+        SubtitleFormat::Ass => render_ass(segments, AssStyle::default()),
+    }
+}
+
+fn render_srt(segments: &[SubtitleSegment]) -> String {
+    segments
+        .iter()
+        .enumerate()
+        .map(|(index, segment)| {
+            format!(
+                "{}\n{} --> {}\n{}\n",
+                index + 1,
+                format_timestamp(segment.start, ","),
+                format_timestamp(segment.end, ","),
+                segment.translated_text.as_deref().unwrap_or(&segment.source_text)
+            )
+        })
+        .collect::<Vec<_>>()
+        .join("\n")
+}
+
+fn render_vtt(segments: &[SubtitleSegment]) -> String {
+    let body = segments
+        .iter()
+        .map(|segment| {
+            format!(
+                "{} --> {}\n{}\n",
+                format_timestamp(segment.start, "."),
+                format_timestamp(segment.end, "."),
+                segment.translated_text.as_deref().unwrap_or(&segment.source_text)
+            )
+        })
+        .collect::<Vec<_>>()
+        .join("\n");
+
+    format!("WEBVTT\n\n{}", body)
+}
+
+fn render_ass(segments: &[SubtitleSegment], style: AssStyle) -> String {
+    let header = format!(
+        "[Script Info]\nScriptType: v4.00+\nCollisions: Normal\nPlayResX: 1280\nPlayResY: 720\n\n[V4+ Styles]\nFormat: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding\nStyle: {},{},{},{},&H000000FF,{},&H64000000,0,0,0,0,100,100,0,0,1,2,1,2,32,32,24,1\n\n[Events]\nFormat: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\n",
+        style.name, style.font_name, style.font_size, style.primary_colour, style.outline_colour
+    );
+
+    let body = segments
+        .iter()
+        .map(|segment| {
+            let text = segment
+                .translated_text
+                .as_deref()
+                .unwrap_or(&segment.source_text)
+                .replace('\n', "\\N");
+            // Dialogue fields: Layer,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text.
+            // Name and Effect stay empty; no whitespace may precede the Start/End timestamps.
+            format!(
+                "Dialogue: 0,{},{},{},,0,0,0,,{}",
+                format_ass_timestamp(segment.start),
+                format_ass_timestamp(segment.end),
+                style.name,
+                text
+            )
+        })
+        .collect::<Vec<_>>()
+        .join("\n");
+
+    format!("{header}{body}\n")
+}
+
+/// SRT/VTT timestamp: HH:MM:SS<sep>mmm (SRT uses "," and VTT uses ".").
+fn format_timestamp(seconds: f32, separator: &str) -> String {
+    let millis = (seconds * 1000.0).round() as u64;
+    let hours = millis / 3_600_000;
+    let minutes = (millis % 3_600_000) / 60_000;
+    let secs = (millis % 60_000) / 1_000;
+    let ms = millis % 1_000;
+    format!("{hours:02}:{minutes:02}:{secs:02}{separator}{ms:03}")
+}
+
+/// ASS timestamp: H:MM:SS.cc (centisecond precision, unpadded hours).
+fn format_ass_timestamp(seconds: f32) -> String {
+    let centis = (seconds * 100.0).round() as u64;
+    let hours = centis / 360_000;
+    let minutes = (centis % 360_000) / 6_000;
+    let secs = (centis % 6_000) / 100;
+    let cs = centis % 100;
+    format!("{hours}:{minutes:02}:{secs:02}.{cs:02}")
+}
diff --git a/src-tauri/src/task.rs b/src-tauri/src/task.rs
new file mode 100644
index 0000000..ff277b7
--- /dev/null
+++ b/src-tauri/src/task.rs
@@ -0,0 +1,210 @@
+use std::{
+    fs,
+    path::PathBuf,
+};
+
+use anyhow::{Context, Result};
+use tauri::{Emitter, Manager, Window};
+use uuid::Uuid;
+
+use crate::{
+    audio::AudioPipeline,
+    models::{
+        ErrorEvent, ProgressEvent, StartTaskPayload, SubtitleSegment, SubtitleTask, TargetLanguage,
+        TaskStatus, TranslationConfig,
+    },
+    state::AppState,
+    subtitle::{render, 
SubtitleFormat}, + translate::Translator, + vad::{VadConfig, VadEngine}, + whisper::WhisperEngine, +}; + +pub async fn start_task( + app: tauri::AppHandle, + window: Window, + state: tauri::State<'_, AppState>, + payload: StartTaskPayload, +) -> Result { + let file_path = PathBuf::from(&payload.file_path); + let task = SubtitleTask { + id: Uuid::new_v4().to_string(), + file_name: file_path + .file_name() + .and_then(|item| item.to_str()) + .unwrap_or("unknown") + .to_string(), + file_path: payload.file_path.clone(), + source_lang: payload.source_lang.clone(), + target_lang: payload.target_lang.clone(), + status: TaskStatus::Queued, + progress: 0.0, + segments: Vec::new(), + error: None, + }; + + state.upsert_task(task.clone())?; + + let task_for_spawn = task.clone(); + let payload_for_spawn = payload.clone(); + let app_handle = app.clone(); + let app_handle_for_error = app.clone(); + let window_handle = window.clone(); + let task_id = task.id.clone(); + + tauri::async_runtime::spawn(async move { + if let Err(error) = run_pipeline(app_handle, window_handle.clone(), task_for_spawn, payload_for_spawn).await { + if let Ok(mut failed_task) = app_handle_for_error.state::().get_task(&task_id) { + failed_task.status = TaskStatus::Failed; + failed_task.error = Some(error.to_string()); + let _ = app_handle_for_error.state::().upsert_task(failed_task); + } + let _ = emit_error(&window_handle, &task_id, &error.to_string()); + } + }); + + Ok(task) +} + +async fn run_pipeline( + app: tauri::AppHandle, + window: Window, + mut task: SubtitleTask, + payload: StartTaskPayload, +) -> Result<()> { + let app_state = app.state::(); + let workspace = std::env::temp_dir().join("crosssubtitle-ai").join(&task.id); + + set_status(&window, &app_state, &mut task, TaskStatus::Extracting, 8.0, "正在抽取音频")?; + let wav_path = AudioPipeline::extract_to_wav(&payload.file_path, &workspace)?; + + set_status(&window, &app_state, &mut task, TaskStatus::VadProcessing, 22.0, "正在分析语音片段")?; + let samples = 
AudioPipeline::load_wav_f32(&wav_path)?; + let vad = VadEngine::new(payload.vad_model_path.clone(), VadConfig::default())?; + let speech_ranges = vad.detect_segments(&samples); + + set_status(&window, &app_state, &mut task, TaskStatus::Transcribing, 45.0, "正在执行 Whisper")?; + let whisper = WhisperEngine::new(payload.whisper_model_path.clone()); + let mut segments = whisper.infer_segments( + &wav_path, + &task.id, + task.source_lang.as_deref(), + &task.target_lang, + &speech_ranges, + )?; + + for segment in &segments { + window.emit( + "task:segment", + crate::models::SegmentEvent { + task_id: task.id.clone(), + segment: segment.clone(), + }, + )?; + } + + task.segments = segments.clone(); + app_state.upsert_task(task.clone())?; + + if matches!(task.target_lang, TargetLanguage::Zh) { + if let Some(config) = load_translation_config() { + set_status(&window, &app_state, &mut task, TaskStatus::Translating, 72.0, "正在生成中文译文")?; + let translator = Translator::new(config)?; + segments = translator.translate_to_zh(&segments).await?; + task.segments = segments.clone(); + app_state.upsert_task(task.clone())?; + + for segment in segments { + window.emit( + "task:segment", + crate::models::SegmentEvent { + task_id: task.id.clone(), + segment, + }, + )?; + } + } + } + + task.status = TaskStatus::Completed; + task.progress = 100.0; + app_state.upsert_task(task.clone())?; + window.emit("task:done", task)?; + Ok(()) +} + +fn load_translation_config() -> Option { + let api_base = std::env::var("OPENAI_API_BASE").ok()?; + let api_key = std::env::var("OPENAI_API_KEY").ok()?; + let model = std::env::var("OPENAI_MODEL").unwrap_or_else(|_| "gpt-4o-mini".to_string()); + Some(TranslationConfig { + api_base, + api_key, + model, + batch_size: 12, + context_size: 3, + }) +} + +fn set_status( + window: &Window, + state: &AppState, + task: &mut SubtitleTask, + status: TaskStatus, + progress: f32, + message: &str, +) -> Result<()> { + task.status = status.clone(); + task.progress = progress; + 
state.upsert_task(task.clone())?; + window.emit( + "task:progress", + ProgressEvent { + task_id: task.id.clone(), + status, + progress, + message: message.to_string(), + }, + )?; + Ok(()) +} + +pub fn update_segment_text(state: tauri::State<'_, AppState>, segment: SubtitleSegment) -> Result { + state.update_segment(segment) +} + +pub fn list_tasks(state: tauri::State<'_, AppState>) -> Result> { + state.list_tasks() +} + +pub fn export_task(state: tauri::State<'_, AppState>, task_id: String, format: String) -> Result { + let task = state.get_task(&task_id)?; + let format = SubtitleFormat::try_from(format.as_str())?; + let content = render(&task.segments, format); + + let stem = PathBuf::from(&task.file_name) + .file_stem() + .and_then(|item| item.to_str()) + .unwrap_or("subtitle"); + + let output_dir = std::env::current_dir() + .context("failed to get current directory")? + .join("exports"); + fs::create_dir_all(&output_dir)?; + + let output_path = output_dir.join(format!("{stem}.{}", format.extension())); + fs::write(&output_path, content)?; + + Ok(output_path.display().to_string()) +} + +fn emit_error(window: &Window, task_id: &str, message: &str) -> Result<()> { + window.emit( + "task:error", + ErrorEvent { + task_id: task_id.to_string(), + message: message.to_string(), + }, + )?; + Ok(()) +} diff --git a/src-tauri/src/translate.rs b/src-tauri/src/translate.rs new file mode 100644 index 0000000..d65929e --- /dev/null +++ b/src-tauri/src/translate.rs @@ -0,0 +1,174 @@ +use std::time::Duration; + +use anyhow::{anyhow, Context, Result}; +use reqwest::Client; +use serde::{Deserialize, Serialize}; +use tokio::time::sleep; + +use crate::models::{SubtitleSegment, TranslationConfig}; + +#[derive(Debug, Serialize)] +struct ChatCompletionRequest { + model: String, + messages: Vec, + temperature: f32, + response_format: ResponseFormat, +} + +#[derive(Debug, Serialize)] +struct ChatMessage { + role: String, + content: String, +} + +#[derive(Debug, Serialize)] +struct 
ResponseFormat { + #[serde(rename = "type")] + format_type: String, +} + +#[derive(Debug, Deserialize)] +struct ChatCompletionResponse { + choices: Vec, +} + +#[derive(Debug, Deserialize)] +struct ChatChoice { + message: ChatMessageContent, +} + +#[derive(Debug, Deserialize)] +struct ChatMessageContent { + content: String, +} + +#[derive(Debug, Deserialize)] +struct TranslationResponse { + translations: Vec, +} + +#[derive(Debug, Deserialize)] +struct TranslatedRow { + id: String, + text: String, +} + +pub struct Translator { + client: Client, + config: TranslationConfig, +} + +impl Translator { + pub fn new(config: TranslationConfig) -> Result { + let client = Client::builder() + .timeout(Duration::from_secs(60)) + .build() + .context("failed to build translation client")?; + + Ok(Self { client, config }) + } + + pub async fn translate_to_zh(&self, segments: &[SubtitleSegment]) -> Result> { + let batch_size = self.config.batch_size.clamp(10, 15); + let context_size = self.config.context_size.min(5); + let mut translated = segments.to_vec(); + + for batch_start in (0..segments.len()).step_by(batch_size) { + let batch_end = (batch_start + batch_size).min(segments.len()); + let context_start = batch_start.saturating_sub(context_size); + let context = &segments[context_start..batch_start]; + let batch = &segments[batch_start..batch_end]; + let rows = self.request_translation(context, batch).await?; + + for row in rows { + if let Some(segment) = translated.iter_mut().find(|item| item.id == row.id) { + segment.translated_text = Some(row.text); + } + } + } + + Ok(translated) + } + + async fn request_translation( + &self, + context: &[SubtitleSegment], + batch: &[SubtitleSegment], + ) -> Result> { + let context_text = if context.is_empty() { + "无".to_string() + } else { + context + .iter() + .map(|item| format!("{}: {}", item.id, item.source_text)) + .collect::>() + .join("\n") + }; + + let batch_text = batch + .iter() + .map(|item| format!("{}: {}", item.id, 
item.source_text)) + .collect::>() + .join("\n"); + + let request = ChatCompletionRequest { + model: self.config.model.clone(), + temperature: 0.2, + response_format: ResponseFormat { + format_type: "json_object".to_string(), + }, + messages: vec![ + ChatMessage { + role: "system".to_string(), + content: "你是专业字幕翻译助手。请保持人称、术语和语气一致,只输出 JSON。".to_string(), + }, + ChatMessage { + role: "user".to_string(), + content: format!( + "把以下字幕翻译成简体中文。上下文:\n{}\n\n待翻译片段:\n{}\n\n请返回 {{\"translations\":[{{\"id\":\"seg-0001\",\"text\":\"译文\"}}]}}", + context_text, batch_text + ), + }, + ], + }; + + let url = format!( + "{}/chat/completions", + self.config.api_base.trim_end_matches('/') + ); + + let mut last_error: Option = None; + for attempt in 0..3 { + let response = self + .client + .post(&url) + .bearer_auth(&self.config.api_key) + .json(&request) + .send() + .await; + + match response { + Ok(response) => { + let response = response.error_for_status().context("translation http error")?; + let payload: ChatCompletionResponse = response.json().await.context("invalid response body")?; + let content = payload + .choices + .first() + .ok_or_else(|| anyhow!("translation response missing choices"))? 
+                    .message
+                    .content
+                    .clone();
+                    let rows: TranslationResponse =
+                        serde_json::from_str(&content).context("translation json parse failed")?;
+                    return Ok(rows.translations);
+                }
+                Err(error) => {
+                    // Transport-level failure: remember it and retry with linear backoff.
+                    last_error = Some(error.into());
+                    sleep(Duration::from_millis(500 * (attempt + 1) as u64)).await;
+                }
+            }
+        }
+
+        Err(last_error.unwrap_or_else(|| anyhow!("translation request failed")))
+    }
+}
diff --git a/src-tauri/src/vad.rs b/src-tauri/src/vad.rs
new file mode 100644
index 0000000..efe3480
--- /dev/null
+++ b/src-tauri/src/vad.rs
@@ -0,0 +1,100 @@
+use std::path::PathBuf;
+
+use anyhow::{anyhow, Result};
+
+/// Tuning parameters for the energy-based VAD fallback.
+#[derive(Debug, Clone)]
+pub struct VadConfig {
+    pub sample_rate: usize,
+    pub threshold: f32,
+    pub min_speech_ms: usize,
+    pub min_silence_ms: usize,
+    pub pad_ms: usize,
+}
+
+impl Default for VadConfig {
+    fn default() -> Self {
+        Self {
+            sample_rate: 16_000,
+            threshold: 0.015,
+            min_speech_ms: 250,
+            min_silence_ms: 180,
+            pad_ms: 120,
+        }
+    }
+}
+
+/// Voice-activity detector. Currently an energy-threshold fallback; the
+/// stored `model_path` is reserved for a future Silero ONNX backend.
+pub struct VadEngine {
+    #[allow(dead_code)]
+    model_path: Option<PathBuf>,
+    config: VadConfig,
+}
+
+impl VadEngine {
+    /// Builds an engine, validating the optional model path up front.
+    /// NOTE(review): generic parameters were reconstructed from a mangled
+    /// source; confirm `Option<String>` against the caller in task.rs.
+    pub fn new(model_path: Option<String>, config: VadConfig) -> Result<Self> {
+        let model_path = model_path.map(PathBuf::from);
+        if let Some(path) = &model_path {
+            if !path.exists() {
+                return Err(anyhow!("vad model not found: {}", path.display()));
+            }
+        }
+
+        Ok(Self { model_path, config })
+    }
+
+    /// Returns `(start_sec, end_sec)` speech spans. Falls back to a single
+    /// whole-file span when nothing crosses the energy threshold, so the
+    /// downstream transcriber always has input.
+    pub fn detect_segments(&self, samples: &[f32]) -> Vec<(f32, f32)> {
+        // 20 ms analysis frames; the *_frames thresholds below are in frames.
+        let frame_size = (self.config.sample_rate / 50).max(1);
+        let min_speech_frames = (self.config.min_speech_ms / 20).max(1);
+        let min_silence_frames = (self.config.min_silence_ms / 20).max(1);
+        let pad_seconds = self.config.pad_ms as f32 / 1000.0;
+        let total_seconds = samples.len() as f32 / self.config.sample_rate as f32;
+
+        // Mean absolute amplitude per frame as a cheap energy proxy.
+        let mut frames = Vec::new();
+        for chunk in samples.chunks(frame_size) {
+            let energy = chunk.iter().map(|sample| sample.abs()).sum::<f32>() / chunk.len() as f32;
+            frames.push(energy);
+        }
+
+        let mut result = Vec::new();
+        let mut start_frame: Option<usize> = None;
+        let mut silent_frames = 0usize;
+
+        for (index, energy) in frames.iter().enumerate() {
+            if *energy >= self.config.threshold {
+                if start_frame.is_none() {
+                    start_frame = Some(index);
+                }
+                silent_frames = 0;
+                continue;
+            }
+
+            if let Some(start) = start_frame {
+                silent_frames += 1;
+                if silent_frames >= min_silence_frames {
+                    // Speech ended at the last non-silent frame.
+                    let end_frame = index.saturating_sub(silent_frames);
+                    if end_frame.saturating_sub(start) >= min_speech_frames {
+                        let start_sec = (start * frame_size) as f32 / self.config.sample_rate as f32;
+                        let end_sec = ((end_frame + 1) * frame_size) as f32 / self.config.sample_rate as f32;
+                        // FIX: clamp the padded end so a segment never extends
+                        // past the actual audio duration.
+                        result.push((
+                            (start_sec - pad_seconds).max(0.0),
+                            (end_sec + pad_seconds).min(total_seconds),
+                        ));
+                    }
+                    start_frame = None;
+                    silent_frames = 0;
+                }
+            }
+        }
+
+        // Flush a segment that was still open when the audio ended.
+        if let Some(start) = start_frame {
+            let end_frame = frames.len().saturating_sub(1);
+            if end_frame.saturating_sub(start) >= min_speech_frames {
+                let start_sec = (start * frame_size) as f32 / self.config.sample_rate as f32;
+                let end_sec = ((end_frame + 1) * frame_size) as f32 / self.config.sample_rate as f32;
+                // FIX: same clamp as above for the trailing segment.
+                result.push((
+                    (start_sec - pad_seconds).max(0.0),
+                    (end_sec + pad_seconds).min(total_seconds),
+                ));
+            }
+        }
+
+        // Fallback: never return an empty timeline for non-empty audio.
+        if result.is_empty() && !samples.is_empty() {
+            result.push((0.0, total_seconds));
+        }
+
+        result
+    }
+}
diff --git a/src-tauri/src/whisper.rs b/src-tauri/src/whisper.rs
new file mode 100644
index 0000000..92c7366
--- /dev/null
+++ b/src-tauri/src/whisper.rs
@@ -0,0 +1,71 @@
+use std::path::Path;
+
+use anyhow::{anyhow, Result};
+
+use crate::models::{SubtitleSegment, TargetLanguage};
+
+#[derive(Debug, Clone, Copy)]
+pub enum WhisperTaskKind {
+    Transcribe,
+    Translate,
+}
+
+/// Interface layer over the (not-yet-wired) whisper-rs backend.
+/// NOTE(review): `Option<String>` reconstructed from a mangled source;
+/// confirm against the task.rs call site.
+pub struct WhisperEngine {
+    model_path: Option<String>,
+}
+
+impl WhisperEngine {
+    pub fn new(model_path: Option<String>) -> Self {
+        Self { model_path }
+    }
+
+    pub fn infer_segments(
+        &self,
+        wav_path: &Path,
+        task_id: &str,
+        source_lang: Option<&str>,
+        target_lang: &TargetLanguage,
+        speech_ranges: &[(f32, f32)],
+    ) -> Result<Vec<SubtitleSegment>> {
+        // Require a configured local ggml model before doing any work.
+        let Some(model_path) = &self.model_path else {
+            return Err(anyhow!(
+                "whisper model path is missing. Please provide a local ggml model path for task {}",
+                task_id
+            ));
+        };
+
+        if !Path::new(model_path).exists() {
+            return Err(anyhow!("whisper model not found: {model_path}"));
+        }
+
+        let _wav_path = wav_path; /* audio is unused until real whisper-rs inference lands */
+        let _task_kind = match target_lang {
+            TargetLanguage::En => WhisperTaskKind::Translate,
+            TargetLanguage::Zh => WhisperTaskKind::Transcribe,
+        };
+
+        // Placeholder output: one stub segment per VAD speech range so the
+        // rest of the pipeline (translation, export, UI) can be exercised.
+        let mut segments = Vec::new();
+        for (index, (start, end)) in speech_ranges.iter().enumerate() {
+            segments.push(SubtitleSegment {
+                id: format!("seg-{:04}", index + 1),
+                task_id: task_id.to_string(),
+                start: *start,
+                end: *end,
+                source_text: format!(
+                    "[待接入 whisper-rs] {} -> {},源语言:{},模型:{}",
+                    start,
+                    end,
+                    source_lang.unwrap_or("auto"),
+                    model_path
+                ),
+                /* English target uses Whisper's native translate task, so a
+                   placeholder translation is pre-filled; Chinese goes through
+                   the external LLM path and stays None here. */
+                translated_text: if matches!(target_lang, TargetLanguage::En) {
+                    Some("Pending native Whisper translation".to_string())
+                } else {
+                    None
+                },
+            });
+        }
+
+        Ok(segments)
+    }
+}
diff --git a/src-tauri/tauri.conf.json b/src-tauri/tauri.conf.json
new file mode 100644
index 0000000..c403451
--- /dev/null
+++ b/src-tauri/tauri.conf.json
@@ -0,0 +1,30 @@
+{
+  "$schema": "https://schema.tauri.app/config/2",
+  "productName": "CrossSubtitle-AI",
+  "version": "0.1.0",
+  "identifier": "com.crosssubtitle.ai",
+  "build": {
+    "beforeDevCommand": "npm run dev",
+    "beforeBuildCommand": "npm run build",
+    "frontendDist": "../dist",
+    "devUrl": "http://localhost:1420"
+  },
+  "app": {
+    "windows": [
+      {
+        "title": "CrossSubtitle-AI",
+        "width": 1480,
+        "height": 920,
+        "resizable": true
+      }
+    ],
+    "security": {
+      "csp": null
+    }
+  },
+  "bundle": {
+    "active": true,
+    "targets": "all",
+    "icon": []
+  }
+}
diff --git a/src/App.vue b/src/App.vue
new file mode 100644
index 0000000..87f5dfe
--- /dev/null
+++ b/src/App.vue
@@ -0,0 +1,119 @@
+
+
+
diff --git a/src/components/SubtitleEditor.vue b/src/components/SubtitleEditor.vue
new file mode 100644
index
0000000..96470f0 --- /dev/null +++ b/src/components/SubtitleEditor.vue @@ -0,0 +1,82 @@ + + +