From 2a057e6917a3f7a4a4e68cc9d72694b7b13de892 Mon Sep 17 00:00:00 2001 From: kura Date: Thu, 19 Mar 2026 11:54:44 +0800 Subject: [PATCH] =?UTF-8?q?mac=E6=89=93=E5=8C=85?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .cargo/config.toml | 3 + src-tauri/Cargo.lock | 68 +++++ src-tauri/Cargo.toml | 4 + src-tauri/src/lib.rs | 132 ++++++++++ src-tauri/src/models.rs | 13 + src-tauri/src/task.rs | 84 +++++-- src-tauri/src/whisper.rs | 97 ++++++-- src-tauri/tauri.conf.json | 11 +- src/App.vue | 260 +++++++++++++------ src/components/SubtitleEditor.vue | 82 +++--- src/components/TaskQueue.vue | 53 ++-- src/lib/types.ts | 9 + src/stores/tasks.ts | 32 ++- src/style.css | 400 +++++++++++++++++++++++++++++- 14 files changed, 1055 insertions(+), 193 deletions(-) create mode 100644 .cargo/config.toml diff --git a/.cargo/config.toml b/.cargo/config.toml new file mode 100644 index 0000000..179a4fa --- /dev/null +++ b/.cargo/config.toml @@ -0,0 +1,3 @@ +[env] +MACOSX_DEPLOYMENT_TARGET = "10.15" +CMAKE_OSX_DEPLOYMENT_TARGET = "10.15" diff --git a/src-tauri/Cargo.lock b/src-tauri/Cargo.lock index c49640e..62e172b 100644 --- a/src-tauri/Cargo.lock +++ b/src-tauri/Cargo.lock @@ -479,6 +479,8 @@ dependencies = [ "anyhow", "hound", "ndarray", + "objc2-app-kit", + "objc2-foundation", "ort", "reqwest 0.12.28", "serde", @@ -2219,8 +2221,38 @@ checksum = "d49e936b501e5c5bf01fda3a9452ff86dc3ea98ad5f283e1455153142d97518c" dependencies = [ "bitflags 2.11.0", "block2", + "libc", "objc2", + "objc2-cloud-kit", + "objc2-core-data", "objc2-core-foundation", + "objc2-core-graphics", + "objc2-core-image", + "objc2-core-text", + "objc2-core-video", + "objc2-foundation", + "objc2-quartz-core", +] + +[[package]] +name = "objc2-cloud-kit" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73ad74d880bb43877038da939b7427bba67e9dd42004a18b809ba7d87cee241c" +dependencies = [ + "bitflags 2.11.0", + "objc2", + "objc2-foundation", +] + +[[package]] +name = "objc2-core-data" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b402a653efbb5e82ce4df10683b6b28027616a2715e90009947d50b8dd298fa" +dependencies = [ + "bitflags 2.11.0", + "objc2", "objc2-foundation", ] @@ -2248,6 +2280,41 @@ dependencies = [ "objc2-io-surface", ] +[[package]] +name = "objc2-core-image" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5d563b38d2b97209f8e861173de434bd0214cf020e3423a52624cd1d989f006" +dependencies = [ + "objc2", + "objc2-foundation", +] + +[[package]] +name = "objc2-core-text" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0cde0dfb48d25d2b4862161a4d5fcc0e3c24367869ad306b0c9ec0073bfed92d" +dependencies = [ + "bitflags 2.11.0", + "objc2", + "objc2-core-foundation", + "objc2-core-graphics", +] + +[[package]] +name = "objc2-core-video" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d425caf1df73233f29fd8a5c3e5edbc30d2d4307870f802d18f00d83dc5141a6" +dependencies = [ + "bitflags 2.11.0", + "objc2", + "objc2-core-foundation", + "objc2-core-graphics", + "objc2-io-surface", +] + [[package]] name = "objc2-encode" version = "4.1.0" @@ -2271,6 +2338,7 @@ checksum = "e3e0adef53c21f888deb4fa59fc59f7eb17404926ee8a6f59f5df0fd7f9f3272" dependencies = [ "bitflags 2.11.0", "block2", + "libc", "objc2", "objc2-core-foundation", ] diff --git a/src-tauri/Cargo.toml b/src-tauri/Cargo.toml index 594c2fe..5dd36d5 100644 --- a/src-tauri/Cargo.toml +++ b/src-tauri/Cargo.toml @@ -26,3 +26,7 @@ tokio = { version = "1.42", features = ["macros", "rt-multi-thread", "time"] } uuid = { version = "1.11", features = ["serde", "v4"] } walkdir = "2.5" whisper-rs = "0.16" + +[target.'cfg(target_os = "macos")'.dependencies] +objc2-app-kit = "0.3.2" +objc2-foundation = { version = "0.3.2", features = ["objc2-core-foundation"] } diff --git a/src-tauri/src/lib.rs b/src-tauri/src/lib.rs index fb6419f..0f4a212 100644 --- a/src-tauri/src/lib.rs +++ b/src-tauri/src/lib.rs @@ -9,6 +9,21 @@ mod whisper; use models::{StartTaskPayload, SubtitleSegment, SubtitleTask}; use state::AppState; +use tauri::{ + menu::{MenuBuilder, MenuItemBuilder, PredefinedMenuItem, SubmenuBuilder}, + AppHandle, Emitter, Manager, PhysicalSize, Size, +}; +#[cfg(target_os = "macos")] +use objc2_app_kit::NSWindow; +#[cfg(target_os = "macos")] +use objc2_foundation::NSSize; + +const WINDOW_RATIO_WIDTH: f64 = 16.0; +const WINDOW_RATIO_HEIGHT: f64 = 10.0; +const DEFAULT_WINDOW_WIDTH: u32 = 1440; +const DEFAULT_WINDOW_HEIGHT: u32 = 900; +const MIN_WINDOW_WIDTH: u32 = 1280; +const MIN_WINDOW_HEIGHT: u32 = 800; #[tauri::command] async fn start_subtitle_task( @@ -53,6 +68,12 @@ pub fn run() { tauri::Builder::default() .plugin(tauri_plugin_dialog::init()) .manage(AppState::default()) + .setup(|app| { + configure_window(app.handle())?; + #[cfg(target_os = "macos")] + configure_macos_menu(app.handle())?; + Ok(()) + }) .invoke_handler(tauri::generate_handler![ start_subtitle_task, list_tasks, @@ -62,3 +83,114 @@ pub fn run() { .run(tauri::generate_context!()) .expect("error while running tauri application"); } + +fn configure_window(app: &AppHandle) -> tauri::Result<()> { + if let Some(window) = app.get_webview_window("main") { + window.set_min_size(Some(Size::Physical(PhysicalSize::new( + MIN_WINDOW_WIDTH, + MIN_WINDOW_HEIGHT, + ))))?; + window.set_size(Size::Physical(PhysicalSize::new( + DEFAULT_WINDOW_WIDTH, + DEFAULT_WINDOW_HEIGHT, + )))?; + #[cfg(target_os = "macos")] + apply_macos_aspect_ratio(&window)?; + } + Ok(()) +} + +#[cfg(target_os = "macos")] +fn configure_macos_menu(app: &AppHandle) -> tauri::Result<()> { + let app_name = app.package_info().name.clone(); + + let app_menu = SubmenuBuilder::new(app, app_name) + .item(&PredefinedMenuItem::about(app, None, None)?) + .separator() + .item(&PredefinedMenuItem::services(app, None)?) + .separator() + .item(&PredefinedMenuItem::hide(app, None)?) + .item(&PredefinedMenuItem::hide_others(app, None)?) + .item(&PredefinedMenuItem::show_all(app, None)?) + .separator() + .item(&PredefinedMenuItem::quit(app, None)?) + .build()?; + + let file_menu = SubmenuBuilder::new(app, "文件") + .item(&MenuItemBuilder::with_id("pick_files", "选择媒体文件").accelerator("CmdOrCtrl+O").build(app)?) + .separator() + .item(&MenuItemBuilder::with_id("export_srt", "导出 SRT").build(app)?) + .item(&MenuItemBuilder::with_id("export_vtt", "导出 VTT").build(app)?) + .item(&MenuItemBuilder::with_id("export_ass", "导出 ASS").build(app)?) + .build()?; + + let edit_menu = SubmenuBuilder::new(app, "编辑") + .item(&PredefinedMenuItem::undo(app, None)?) + .item(&PredefinedMenuItem::redo(app, None)?) + .separator() + .item(&PredefinedMenuItem::cut(app, None)?) + .item(&PredefinedMenuItem::copy(app, None)?) + .item(&PredefinedMenuItem::paste(app, None)?) + .item(&PredefinedMenuItem::select_all(app, None)?) + .build()?; + + let settings_menu = SubmenuBuilder::new(app, "设置") + .item(&MenuItemBuilder::with_id("toggle_advanced", "显示或隐藏高级设置").build(app)?) + .item(&MenuItemBuilder::with_id("toggle_bilingual", "切换双语导出").build(app)?) + .item(&MenuItemBuilder::with_id("reset_models", "恢复默认模型路径").build(app)?) + .build()?; + + let window_menu = SubmenuBuilder::new(app, "窗口") + .item(&PredefinedMenuItem::minimize(app, None)?) + .item(&PredefinedMenuItem::maximize(app, None)?) + .separator() + .item(&PredefinedMenuItem::close_window(app, None)?) + .build()?; + + let menu = MenuBuilder::new(app) + .item(&app_menu) + .item(&file_menu) + .item(&edit_menu) + .item(&settings_menu) + .item(&window_menu) + .build()?; + + app.set_menu(menu)?; + app.on_menu_event(|app, event| { + let action = match event.id().0.as_str() { + "pick_files" => Some("pick-files"), + "export_srt" => Some("export-srt"), + "export_vtt" => Some("export-vtt"), + "export_ass" => Some("export-ass"), + "toggle_advanced" => Some("toggle-advanced"), + "toggle_bilingual" => Some("toggle-bilingual"), + "reset_models" => Some("reset-models"), + _ => None, + }; + + if let Some(action) = action { + let _ = app.emit("menu:action", action); + } + }); + + Ok(()) +} + +#[cfg(not(target_os = "macos"))] +fn configure_macos_menu(_app: &AppHandle) -> tauri::Result<()> { + Ok(()) +} + +#[cfg(target_os = "macos")] +fn apply_macos_aspect_ratio(window: &tauri::WebviewWindow) -> tauri::Result<()> { + let ns_window = window.ns_window()?; + let ns_window = ns_window.cast::(); + let aspect_ratio = NSSize::new(WINDOW_RATIO_WIDTH, WINDOW_RATIO_HEIGHT); + + unsafe { + let ns_window = &*ns_window; + ns_window.setContentAspectRatio(aspect_ratio); + } + + Ok(()) +} diff --git a/src-tauri/src/models.rs b/src-tauri/src/models.rs index 0e926c8..f152fc1 100644 --- a/src-tauri/src/models.rs +++ b/src-tauri/src/models.rs @@ -89,6 +89,19 @@ pub struct ErrorEvent { pub message: String, } +#[derive(Debug, Clone, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct LogEvent { + pub task_id: String, + pub message: String, +} + +#[derive(Debug, Clone, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct ResetSegmentsEvent { + pub task_id: String, +} + #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(rename_all = "camelCase")] pub struct TranslationConfig { diff --git a/src-tauri/src/task.rs b/src-tauri/src/task.rs index b307178..0d54781 100644 --- a/src-tauri/src/task.rs +++ b/src-tauri/src/task.rs @@ -10,8 +10,8 @@ use uuid::Uuid; use crate::{ audio::AudioPipeline, models::{ - ErrorEvent, OutputMode, ProgressEvent, StartTaskPayload, SubtitleSegment, SubtitleTask, - TaskStatus, TranslationConfig, + ErrorEvent, LogEvent, OutputMode, ProgressEvent, ResetSegmentsEvent, StartTaskPayload, + SubtitleSegment, SubtitleTask, TaskStatus, TranslationConfig, }, state::AppState, subtitle::{render, SubtitleFormat}, @@ -94,16 +94,28 @@ async fn run_pipeline( let should_translate = matches!(payload.output_mode, OutputMode::Translate); set_status(&window, &app_state, &mut task, TaskStatus::Extracting, 8.0, "正在抽取音频")?; + emit_log(&window, &task.id, format!("task: input file={}", payload.file_path))?; let wav_path = AudioPipeline::extract_to_wav(&payload.file_path, &workspace)?; + emit_log(&window, &task.id, format!("audio: normalized wav={}", wav_path.display()))?; set_status(&window, &app_state, &mut task, TaskStatus::VadProcessing, 22.0, "正在分析语音片段")?; let samples = AudioPipeline::load_wav_f32(&wav_path)?; let vad = VadEngine::new(payload.vad_model_path.clone(), VadConfig::default())?; let speech_ranges = vad.detect_segments(&samples); + emit_log( + &window, + &task.id, + format!("vad: detected {} speech ranges", speech_ranges.len()), + )?; set_status(&window, &app_state, &mut task, TaskStatus::Transcribing, 45.0, "正在执行 Whisper")?; let whisper = WhisperEngine::new(payload.whisper_model_path.clone()); let task_id_for_progress = task.id.clone(); + let task_id_for_segment = task.id.clone(); + let task_id_for_reset = task.id.clone(); + let task_id_for_log = task.id.clone(); + let app_state_for_segment = app_state.clone(); + let app_state_for_reset = app_state.clone(); let mut segments = whisper.infer_segments( &wav_path, &task.id, @@ -124,18 +136,44 @@ async fn run_pipeline( )?; Ok(()) }, + || { + if let Ok(mut current_task) = app_state_for_reset.get_task(&task_id_for_reset) { + current_task.segments.clear(); + let _ = app_state_for_reset.upsert_task(current_task); + } + window.emit( + "task:segments_reset", + ResetSegmentsEvent { + task_id: task_id_for_reset.clone(), + }, + )?; + Ok(()) + }, + |segment| { + if let Ok(mut current_task) = app_state_for_segment.get_task(&task_id_for_segment) { + if let Some(existing) = current_task + .segments + .iter_mut() + .find(|item| item.id == segment.id) + { + *existing = segment.clone(); + } else { + current_task.segments.push(segment.clone()); + } + let _ = app_state_for_segment.upsert_task(current_task); + } + window.emit( + "task:segment", + crate::models::SegmentEvent { + task_id: task_id_for_segment.clone(), + segment, + }, + )?; + Ok(()) + }, + |message| emit_log(&window, &task_id_for_log, message), )?; - for segment in &segments { - window.emit( - "task:segment", - crate::models::SegmentEvent { - task_id: task.id.clone(), - segment: segment.clone(), - }, - )?; - } - task.segments = segments.clone(); app_state.upsert_task(task.clone())?; @@ -218,15 +256,16 @@ pub fn export_task(state: tauri::State<'_, AppState>, task_id: String, format: S let format = SubtitleFormat::try_from(format.as_str())?; let content = render(&task.segments, format, task.bilingual_output); - let file_name_path = PathBuf::from(&task.file_name); - let stem = file_name_path + let source_path = PathBuf::from(&task.file_path); + let stem = source_path .file_stem() .and_then(|item| item.to_str()) .unwrap_or("subtitle"); - let output_dir = std::env::current_dir() - .context("failed to get current directory")? - .join("exports"); + let output_dir = source_path + .parent() + .map(PathBuf::from) + .unwrap_or(std::env::current_dir().context("failed to get current directory")?); fs::create_dir_all(&output_dir)?; let output_path = output_dir.join(format!("{stem}.{}", format.extension())); @@ -245,3 +284,14 @@ fn emit_error(window: &Window, task_id: &str, message: &str) -> Result<()> { )?; Ok(()) } + +fn emit_log(window: &Window, task_id: &str, message: String) -> Result<()> { + window.emit( + "task:log", + LogEvent { + task_id: task_id.to_string(), + message, + }, + )?; + Ok(()) +} diff --git a/src-tauri/src/whisper.rs b/src-tauri/src/whisper.rs index 1d2a91e..cfff86b 100644 --- a/src-tauri/src/whisper.rs +++ b/src-tauri/src/whisper.rs @@ -2,7 +2,8 @@ use std::path::Path; use anyhow::{anyhow, Context, Result}; use whisper_rs::{ - FullParams, SamplingStrategy, WhisperContext, WhisperContextParameters, + get_lang_str, install_logging_hooks, FullParams, SamplingStrategy, WhisperContext, + WhisperContextParameters, }; use crate::models::{SubtitleSegment, TargetLanguage}; @@ -13,6 +14,7 @@ pub struct WhisperEngine { impl WhisperEngine { pub fn new(model_path: Option) -> Self { + install_logging_hooks(); Self { model_path } } @@ -25,6 +27,9 @@ impl WhisperEngine { should_translate: bool, speech_ranges: &[(f32, f32)], mut on_progress: F, + mut on_reset_segments: impl FnMut() -> Result<()>, + mut on_segment: impl FnMut(SubtitleSegment) -> Result<()>, + mut on_log: impl FnMut(String) -> Result<()>, ) -> Result> where F: FnMut(f32) -> Result<()>, @@ -49,14 +54,22 @@ impl WhisperEngine { ) .with_context(|| format!("failed to load whisper model: {model_path}"))?; let mut state = context.create_state().context("failed to create whisper state")?; + let detected_language = resolve_source_language(&mut state, &audio, source_lang) + .context("failed to resolve source language")?; + + if let Some(lang) = detected_language { + on_log(format!("whisper: source language={lang}"))?; + } else { + on_log("whisper: source language unresolved, fallback to auto decode".to_string())?; + } let mut segments = Vec::new(); - eprintln!( + on_log(format!( "whisper: processing {} speech ranges (normalized from {}), coverage={:.1}%", normalized_ranges.len(), speech_ranges.len(), speech_coverage_ratio(&normalized_ranges, total_seconds) * 100.0 - ); + ))?; for (range_index, (start, end)) in normalized_ranges.iter().enumerate() { let clip = slice_audio(&audio, *start, *end); if clip.is_empty() { @@ -74,10 +87,12 @@ impl WhisperEngine { *start, *end, task_id, - source_lang, + detected_language, target_lang, should_translate, segments.len(), + &mut on_segment, + &mut on_log, )?; segments.extend(clip_segments); @@ -94,14 +109,15 @@ impl WhisperEngine { || (total_seconds > 45.0 && vad_text_len < (total_seconds / 2.4) as usize)); if should_retry_full_audio { - eprintln!( + on_log(format!( "whisper: VAD result looks incomplete, retrying full audio (segments={}, chars={}, end={:.2}s/{:.2}s, coverage={:.1}%)", segments.len(), vad_text_len, vad_end, total_seconds, vad_coverage * 100.0 - ); + ))?; + on_reset_segments()?; let full_audio_segments = transcribe_clip( &mut state, &audio, @@ -109,23 +125,27 @@ impl WhisperEngine { 0.0, total_seconds, task_id, - source_lang, + detected_language, target_lang, should_translate, 0, + &mut on_segment, + &mut on_log, )?; if should_prefer_full_audio(&segments, &full_audio_segments, total_seconds) { - eprintln!( + on_log(format!( "whisper: using full-audio transcript (vad_segments={}, full_segments={})", segments.len(), full_audio_segments.len() - ); + ))?; segments = full_audio_segments; + } else { + segments.iter().cloned().try_for_each(&mut on_segment)?; } } - eprintln!("whisper: total emitted segments={}", segments.len()); + on_log(format!("whisper: total emitted segments={}", segments.len()))?; Ok(segments) } } @@ -142,6 +162,8 @@ fn transcribe_clip( _target_lang: &TargetLanguage, _should_translate: bool, segment_offset: usize, + on_segment: &mut impl FnMut(SubtitleSegment) -> Result<()>, + on_log: &mut impl FnMut(String) -> Result<()>, ) -> Result> { let mut params = FullParams::new(SamplingStrategy::Greedy { best_of: 1 }); params.set_n_threads(4); @@ -151,21 +173,28 @@ fn transcribe_clip( params.set_print_timestamps(false); params.set_token_timestamps(false); params.set_translate(false); - if let Some(lang) = source_lang { - params.set_language(Some(lang)); + match source_lang { + Some(lang) => { + params.set_detect_language(false); + params.set_language(Some(lang)); + } + None => { + params.set_detect_language(true); + params.set_language(None); + } } state.full(params, clip).context("whisper inference failed")?; let num_segments = state.full_n_segments(); - eprintln!( + on_log(format!( "whisper: range #{}, {:.2}-{:.2}s, samples={}, segments={}", range_index + 1, start, end, clip.len(), num_segments - ); + ))?; let mut results = Vec::new(); for offset in 0..num_segments { @@ -180,19 +209,21 @@ fn transcribe_clip( if text.is_empty() { continue; } - eprintln!("whisper text: {}", text); + on_log(format!("whisper text: {}", text))?; let local_start = segment.start_timestamp() as f32 / 100.0; let local_end = segment.end_timestamp() as f32 / 100.0; - results.push(SubtitleSegment { + let emitted = SubtitleSegment { id: format!("seg-{:04}", segment_offset + results.len() + 1), task_id: task_id.to_string(), start: start + local_start, end: start + local_end, source_text: text.clone(), translated_text: None, - }); + }; + on_segment(emitted.clone())?; + results.push(emitted); } Ok(results) @@ -301,3 +332,35 @@ fn should_prefer_full_audio( || full_end > vad_end + 2.0 || (total_seconds > 30.0 && full_end + 1.5 >= total_seconds && vad_end + 3.0 < total_seconds) } + +fn resolve_source_language<'a>( + state: &mut whisper_rs::WhisperState, + audio: &[f32], + source_lang: Option<&'a str>, +) -> Result> { + match source_lang.map(str::trim).filter(|lang| !lang.is_empty()) { + Some("auto") | None => { + let detect_samples = audio.len().min(16_000 * 30); + let sample = &audio[..detect_samples]; + state + .pcm_to_mel(sample, 4) + .context("failed to build mel spectrogram for language detection")?; + let (lang_id, probabilities) = state + .lang_detect(0, 4) + .context("whisper language detection failed")?; + let lang = get_lang_str(lang_id) + .ok_or_else(|| anyhow!("unknown whisper language id: {lang_id}"))?; + let probability = probabilities + .get(lang_id as usize) + .copied() + .unwrap_or_default(); + + if probability < 0.35 { + Ok(None) + } else { + Ok(Some(lang)) + } + } + Some(lang) => Ok(Some(lang)), + } +} diff --git a/src-tauri/tauri.conf.json b/src-tauri/tauri.conf.json index c403451..e3041f2 100644 --- a/src-tauri/tauri.conf.json +++ b/src-tauri/tauri.conf.json @@ -13,8 +13,10 @@ "windows": [ { "title": "CrossSubtitle-AI", - "width": 1480, - "height": 920, + "width": 1440, + "height": 900, + "minWidth": 1280, + "minHeight": 800, "resizable": true } ], @@ -25,6 +27,9 @@ "bundle": { "active": true, "targets": "all", - "icon": [] + "icon": [], + "macOS": { + "minimumSystemVersion": "10.15" + } } } diff --git a/src/App.vue b/src/App.vue index 0d17d23..618ab3f 100644 --- a/src/App.vue +++ b/src/App.vue @@ -1,6 +1,7 @@