| """ |
| Multi-Speaker Audiobook Generator |
| Different voices for narrator + characters |
| Engines: Premium AI (Qwen) + Premium English (ElevenLabs) |
| |
| Features: |
| - Auto-detect characters from text OR manual character definition |
| - Assign different voices to narrator and each character |
| - Per-segment TTS with the right voice |
| - Supports 17 languages across two engines |
| """ |
|
|
| import os |
| import base64 |
| import json |
| import pathlib |
| import shutil |
| import struct |
| import subprocess |
| import tempfile |
| import time |
| import re |
|
|
| import gradio as gr |
| import requests as http_requests |
| from openai import OpenAI |
|
|
| try: |
| import pypdf |
| HAS_PYPDF = True |
| except ImportError: |
| HAS_PYPDF = False |
|
|
| try: |
| import docx |
| HAS_DOCX = True |
| except ImportError: |
| HAS_DOCX = False |
|
|
| |
| |
| |
# --- Service configuration ---------------------------------------------------
# Model name requested from the DashScope OpenAI-compatible endpoint.
OMNI_MODEL = "qwen3.5-omni-plus"
DASHSCOPE_BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"

# Both ElevenLabs endpoints hang off the same versioned API root.
_ELEVENLABS_API_ROOT = "https://api.elevenlabs.io/v1"
ELEVENLABS_TTS_URL = _ELEVENLABS_API_ROOT + "/text-to-speech"
ELEVENLABS_CLONE_URL = _ELEVENLABS_API_ROOT + "/voices/add"

# Segments longer than this many characters are split before synthesis.
MAX_CHARS_PER_SEGMENT = 1500
|
|
| |
| |
| |
# --- Supported languages -----------------------------------------------------
# Languages synthesised through the Qwen engine (listed first in the dropdown).
_QWEN_LANGUAGE_NAMES = (
    "English",
    "Chinese (Mandarin)",
    "Japanese",
    "Korean",
    "German",
    "French",
    "Russian",
    "Portuguese",
    "Spanish",
    "Italian",
    "Arabic",
)

# Languages / accents synthesised through the ElevenLabs engine.
_ELEVENLABS_LANGUAGE_NAMES = (
    "English (US)",
    "English (UK)",
    "English (AU)",
    "Swahili",
    "Amharic",
    "Afrikaans",
)

# Maps each display name to its engine config; insertion order is the
# order in which the names appear in the language dropdown.
LANGUAGES = {
    **{name: {"engine": "qwen"} for name in _QWEN_LANGUAGE_NAMES},
    **{name: {"engine": "elevenlabs"} for name in _ELEVENLABS_LANGUAGE_NAMES},
}
|
|
| |
| |
| |
# --- Qwen voices ---------------------------------------------------------------
# (voice name, style description) pairs; the UI label is "<name> -- <style>".
_QWEN_VOICE_TABLE = (
    ("Jennifer", "Cinematic female"),
    ("Serena", "Gentle female"),
    ("Katerina", "Mature female"),
    ("Bella", "Elegant female"),
    ("Vivian", "Professional female"),
    ("Mia", "Young female"),
    ("Seren", "Calm female"),
    ("Dolce", "Sweet female"),
    ("Ethan", "Warm male"),
    ("Ryan", "Dramatic male"),
    ("Kai", "Soothing male"),
    ("Neil", "Precise male"),
    ("Lenn", "Rational male"),
    ("Arthur", "Classic male"),
    ("Eldric Sage", "Authoritative male"),
    ("Vincent", "Theatrical male"),
    ("Andre", "Deep male"),
    ("Aiden", "Young male"),
)

QWEN_VOICES = [f"{voice} -- {style}" for voice, style in _QWEN_VOICE_TABLE]

# Gender pools used when a voice has to be picked automatically.
# "female" contains the substring "male", so the male pool must exclude it.
QWEN_FEMALE_VOICES = [label for label in QWEN_VOICES if "female" in label]
QWEN_MALE_VOICES = [
    label for label in QWEN_VOICES
    if "male" in label and "female" not in label
]
QWEN_NARRATOR_VOICES = [
    "Jennifer -- Cinematic female",
    "Eldric Sage -- Authoritative male",
]
|
|
# --- ElevenLabs voices -----------------------------------------------------------
# (name, voice id, description) rows for the stock voices offered in the UI.
_ELEVENLABS_VOICE_ROWS = (
    ("Rachel", "21m00Tcm4TlvDq8ikWAM", "Calm female (US)"),
    ("Drew", "29vD33N1CtxCmqQRPOHJ", "Rounded male (US)"),
    ("Clyde", "2EiwWnXFnvU5JabPnv8n", "Deep male (US)"),
    ("Paul", "5Q0t7uMcjvnagumLfvZi", "News male (US)"),
    ("Antoni", "ErXwobaYiN019PkySvjV", "Rounded male (US)"),
    ("Daniel", "onwK4e9ZLuTAKqWW03F9", "British male"),
    ("George", "JBFqnCBsd6RMkjVDRZzb", "Warm British male"),
    ("Callum", "N2lVS1w4EtoT3dr4eOWO", "Intense male"),
    ("Charlie", "IKne3meq5aSn9XLyUdCD", "Australian male"),
    ("Matilda", "XrExE9yKIg1WjnnlVkGX", "Australian female"),
    ("Freya", "jsCqWAovK2LkecY7zXl4", "Young female"),
)

ELEVENLABS_VOICE_LIST = [
    {"name": voice_name, "id": voice_id, "desc": voice_desc}
    for voice_name, voice_id, voice_desc in _ELEVENLABS_VOICE_ROWS
]

# Dropdown labels, in the same "<name> -- <description>" form as the Qwen voices.
ELEVENLABS_VOICES = [
    "{name} -- {desc}".format(**entry) for entry in ELEVENLABS_VOICE_LIST
]

# UI quality choice -> ElevenLabs model id.
ELEVENLABS_MODELS = dict(
    quality="eleven_multilingual_v2",
    balanced="eleven_flash_v2_5",
    expressive="eleven_v3",
)
|
|
|
|
def get_voice_name(label):
    """Return the bare voice name from a "<name> -- <description>" label."""
    name_part, _sep, _description = label.partition("--")
    return name_part.strip()
|
|
|
|
def get_el_voice_id(label):
    """Resolve a dropdown label to its ElevenLabs voice id.

    Falls back to the id of the first entry in ELEVENLABS_VOICE_LIST when the
    label's name does not match any known voice.
    """
    wanted = get_voice_name(label)
    matching_ids = (
        entry["id"] for entry in ELEVENLABS_VOICE_LIST if entry["name"] == wanted
    )
    return next(matching_ids, ELEVENLABS_VOICE_LIST[0]["id"])
|
|
|
|
| |
| |
| |
def base64_to_wav(b64_data, output_path, sample_rate=24000, channels=1, bits_per_sample=16):
    """Decode base64 audio data and write it to ``output_path`` as a PCM WAV file.

    The decoded bytes are written verbatim after a 44-byte RIFF/WAVE header, so
    they are treated as raw PCM samples in the format described by the keyword
    arguments.

    Args:
        b64_data: Base64-encoded audio payload (str or bytes).
        output_path: Destination file path; overwritten if it exists.
        sample_rate: Samples per second written to the header (default 24000).
        channels: Channel count written to the header (default 1).
        bits_per_sample: Sample width in bits written to the header (default 16).
    """
    audio_bytes = base64.b64decode(b64_data)
    data_size = len(audio_bytes)
    block_align = channels * bits_per_sample // 8
    byte_rate = sample_rate * block_align

    header = struct.pack(
        "<4sI4s4sIHHIIHH4sI",
        b"RIFF", 36 + data_size,  # RIFF chunk size = file size minus the first 8 bytes
        b"WAVE",
        b"fmt ", 16,              # "fmt " sub-chunk is 16 bytes long for PCM
        1,                        # audio format tag 1 = uncompressed PCM
        channels, sample_rate, byte_rate, block_align, bits_per_sample,
        b"data", data_size,
    )
    with open(output_path, "wb") as f:
        f.write(header)
        f.write(audio_bytes)
|
|
|
|
def concatenate_wavs(wav_files, output_path):
    """Join several WAV files into ``output_path`` using ffmpeg's concat demuxer.

    Args:
        wav_files: Ordered list of input WAV paths. An empty list is a no-op
            (no output file is created); a single path is simply copied.
        output_path: Destination WAV path.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    if not wav_files:
        return
    if len(wav_files) == 1:
        shutil.copy2(wav_files[0], output_path)
        return

    list_file = output_path + ".txt"
    try:
        with open(list_file, "w", encoding="utf-8") as f:
            for wav in wav_files:
                # Inside a single-quoted concat entry a literal quote must be
                # written as '\'' or ffmpeg mis-parses the path.
                escaped = str(wav).replace("'", "'\\''")
                f.write(f"file '{escaped}'\n")
        subprocess.run(
            ["ffmpeg", "-y", "-f", "concat", "-safe", "0",
             "-i", list_file, "-c", "copy", output_path],
            capture_output=True, check=True,
        )
    finally:
        # Remove the temporary list even when ffmpeg fails or is missing.
        if os.path.exists(list_file):
            os.remove(list_file)
|
|
|
|
def generate_silence(dur, path):
    """Write ``dur`` seconds of 24 kHz mono 16-bit PCM silence to ``path`` via ffmpeg.

    Raises subprocess.CalledProcessError when ffmpeg exits with an error.
    """
    silent_source = ["-f", "lavfi", "-i", "anullsrc=r=24000:cl=mono"]
    output_spec = ["-t", str(dur), "-acodec", "pcm_s16le", path]
    command = ["ffmpeg", "-y", *silent_source, *output_spec]
    subprocess.run(command, capture_output=True, check=True)
|
|
|
|
| |
| |
| |
def extract_text_from_file(filepath):
    """Return the plain text contained in a .pdf, .docx/.doc or text file.

    PDF pages and DOCX paragraphs are stripped, empty ones are skipped, and the
    rest are joined with blank lines. Any other extension is read as UTF-8
    text, with undecodable bytes replaced.

    Raises:
        gr.Error: If the optional parser needed for the extension is missing.
    """
    ext = os.path.splitext(filepath)[1].lower()

    if ext == ".pdf":
        if not HAS_PYPDF:
            raise gr.Error("pypdf not installed.")
        reader = pypdf.PdfReader(filepath)
        page_texts = []
        for page in reader.pages:
            # Extract once per page (it is costly) and drop whitespace-only pages.
            page_text = (page.extract_text() or "").strip()
            if page_text:
                page_texts.append(page_text)
        return "\n\n".join(page_texts)

    if ext in (".docx", ".doc"):
        if not HAS_DOCX:
            raise gr.Error("python-docx not installed.")
        # NOTE(review): legacy binary .doc files are routed here as well;
        # confirm python-docx can actually open them.
        doc = docx.Document(filepath)
        return "\n\n".join(p.text.strip() for p in doc.paragraphs if p.text.strip())

    with open(filepath, "r", encoding="utf-8", errors="replace") as f:
        return f.read()
|
|
|
|
| |
| |
| |
def auto_detect_characters(client, text):
    """Use AI to detect characters with genders and split text into speaker segments.

    Only the first ~8000 characters are sent to the model. Any remainder is
    appended as a final "Narrator" segment so no input text is lost.

    Args:
        client: OpenAI-compatible client used for the chat completion.
        text: Full story text.

    Returns:
        ``(characters, segments)``: ``characters`` is a list of
        ``{"name", "gender"}`` dicts that always contains "Narrator";
        ``segments`` is a list of ``{"speaker", "text"}`` dicts in reading order.
        If the model reply is unusable, the whole text is returned as a single
        Narrator segment.
    """
    analysis_limit = 8000

    def narrator_only():
        # Fallback: narrate the complete text with a single voice.
        return (
            [{"name": "Narrator", "gender": "neutral"}],
            [{"speaker": "Narrator", "text": text}],
        )

    # Split off the part the model will not see; prefer a whitespace boundary
    # so the hand-over does not happen in the middle of a word.
    head, tail = text, ""
    if len(text) > analysis_limit:
        cut = max(text.rfind(" ", 0, analysis_limit), text.rfind("\n", 0, analysis_limit))
        if cut < analysis_limit // 2:
            cut = analysis_limit
        head, tail = text[:cut], text[cut:]

    response = client.chat.completions.create(
        model=OMNI_MODEL, modalities=["text"],
        messages=[
            {
                "role": "system",
                "content": (
                    "You are a script analyst. Analyze the text and identify all speaking characters. "
                    "Then split the text into segments, each tagged with who is speaking.\n\n"
                    "Output ONLY valid JSON with this exact structure:\n"
                    "{\n"
                    ' "characters": [\n'
                    ' {"name": "Narrator", "gender": "neutral"},\n'
                    ' {"name": "Elena", "gender": "female"},\n'
                    ' {"name": "Grandfather", "gender": "male"}\n'
                    " ],\n"
                    ' "segments": [\n'
                    ' {"speaker": "Narrator", "text": "The old lighthouse stood at the edge of the world."},\n'
                    ' {"speaker": "Elena", "text": "One day, I\'ll follow that sun to wherever it goes."},\n'
                    ' {"speaker": "Narrator", "text": "The gulls said nothing."}\n'
                    " ]\n"
                    "}\n\n"
                    "Rules:\n"
                    '1. "Narrator" is always first, gender "neutral"\n'
                    '2. Detect character genders from names, pronouns, and context (he/she/etc)\n'
                    "3. If gender unclear, use 'neutral'\n"
                    "4. Detect character names from dialogue tags (said, whispered, asked, etc.)\n"
                    "5. Keep segments in original order, include ALL text\n"
                    "6. Merge consecutive segments by the same speaker\n"
                    "7. Output ONLY the JSON, no markdown, no backticks"
                ),
            },
            {"role": "user", "content": f"Analyze and split into speaker segments:\n\n{head}"},
        ],
    )

    raw = (response.choices[0].message.content or "").strip()
    # Models sometimes wrap the JSON in a code fence, with or without "json".
    raw = re.sub(r'^```(?:json)?\s*', '', raw)
    raw = re.sub(r'\s*```$', '', raw)

    try:
        data = json.loads(raw)
    except json.JSONDecodeError as e:
        print(f"[CharDetect] JSON parse failed: {e}")
        print(f"[CharDetect] Raw: {raw[:500]}")
        return narrator_only()

    if not isinstance(data, dict):
        print(f"[CharDetect] Unexpected JSON shape: {type(data).__name__}")
        return narrator_only()

    raw_segments = data.get("segments")
    segments = [s for s in raw_segments if isinstance(s, dict)] if isinstance(raw_segments, list) else []
    if not segments:
        return narrator_only()

    raw_chars = data.get("characters")
    char_list = [
        c for c in raw_chars if isinstance(c, dict) and c.get("name")
    ] if isinstance(raw_chars, list) else []
    if not any(c["name"] == "Narrator" for c in char_list):
        char_list.insert(0, {"name": "Narrator", "gender": "neutral"})

    if tail.strip():
        # Text beyond the analysis window was never attributed; narrate it.
        segments.append({"speaker": "Narrator", "text": tail.strip()})

    return char_list, segments
|
|
|
|
def inject_audio_tags(client, text):
    """Inject ElevenLabs v3 audio tags for emotional narration.

    Best-effort: if ``client`` is falsy, the request fails, or the model
    returns nothing, the original ``text`` is returned unchanged.

    Args:
        client: OpenAI-compatible client, or a falsy value to skip tagging.
        text: Text to annotate.

    Returns:
        The tagged text, or ``text`` itself when tagging was not possible.
    """
    if not client:
        return text
    try:
        response = client.chat.completions.create(
            model=OMNI_MODEL, modalities=["text"],
            messages=[
                {
                    "role": "system",
                    "content": (
                        "Add ElevenLabs v3 audio tags to this text for expressive narration. "
                        "Tags are words in [brackets] like [whispers], [excited], [sighs], [laughs], "
                        "[softly], [firmly], [dramatically]. Place before the phrase they apply to. "
                        "Use sparingly (1 tag per 2-3 sentences). Output ONLY the tagged text."
                    ),
                },
                {"role": "user", "content": text},
            ],
        )
        tagged = (response.choices[0].message.content or "").strip()
    except Exception as e:
        # Tagging is optional; log the reason instead of hiding it and carry on.
        print(f"[AudioTags] Tagging failed, using untagged text: {e}")
        return text
    # An empty reply must not replace real text with silence.
    return tagged or text
|
|
|
|
| |
| |
| |
def tts_qwen(client, text, voice, language, translate, chunk_idx, tmp_dir):
    """Synthesise one chunk with the Qwen omni model and save it as a WAV file.

    Streams the completion, collects the base64 audio pieces from each delta
    and writes them to ``seg_<chunk_idx>.wav`` inside ``tmp_dir``.

    Returns:
        ``(wav_path, None)`` on success, otherwise ``(None, error_message)``.
    """
    output_wav = os.path.join(tmp_dir, f"seg_{chunk_idx:04d}.wav")

    if translate:
        sys_prompt = f"Translate English to {language} and narrate expressively. ONLY spoken {language}."
        user_text = f"Translate and narrate:\n\n{text}"
    else:
        sys_prompt = "Narrate expressively as an audiobook. ONLY narration."
        user_text = f"Narrate:\n\n{text}"

    conversation = [
        {"role": "system", "content": sys_prompt},
        {"role": "user", "content": user_text},
    ]

    try:
        stream = client.chat.completions.create(
            model=OMNI_MODEL,
            messages=conversation,
            modalities=["text", "audio"],
            audio={"voice": voice, "format": "wav"},
            stream=True,
            stream_options={"include_usage": True},
        )

        b64_pieces = []
        for event in stream:
            if not event.choices:
                continue
            audio = getattr(event.choices[0].delta, "audio", None)
            if not audio:
                continue
            # The audio delta may be a plain dict or an object with a .data attribute.
            if isinstance(audio, dict) and "data" in audio:
                b64_pieces.append(audio["data"])
            elif hasattr(audio, "data") and audio.data:
                b64_pieces.append(audio.data)

        if not b64_pieces:
            return None, "No audio received"

        base64_to_wav("".join(b64_pieces), output_wav)
        return output_wav, None
    except Exception as e:
        return None, str(e)
|
|
|
|
| |
| |
| |
def tts_elevenlabs(text, voice_id, api_key, model_id, chunk_idx, tmp_dir):
    """Synthesise one chunk through the ElevenLabs TTS endpoint.

    The response body is saved as ``el_<chunk_idx>.mp3`` in ``tmp_dir`` and
    then converted by ffmpeg to a 24 kHz mono 16-bit WAV next to it.

    Returns:
        ``(wav_path, None)`` on success, otherwise ``(None, error_message)``.
    """
    stem = os.path.join(tmp_dir, f"el_{chunk_idx:04d}")
    output_mp3 = stem + ".mp3"
    output_wav = stem + ".wav"

    request_options = dict(
        headers={"xi-api-key": api_key, "Content-Type": "application/json"},
        json={
            "text": text,
            "model_id": model_id,
            "voice_settings": {"stability": 0.5, "similarity_boost": 0.75, "style": 0.3},
        },
        timeout=120,
    )

    try:
        resp = http_requests.post(f"{ELEVENLABS_TTS_URL}/{voice_id}", **request_options)
        print(f"[EL] Seg {chunk_idx}: status={resp.status_code}, {len(resp.content)} bytes")

        if resp.status_code == 200:
            with open(output_mp3, "wb") as f:
                f.write(resp.content)
            to_wav = ["ffmpeg", "-y", "-i", output_mp3, "-ar", "24000", "-ac", "1",
                      "-acodec", "pcm_s16le", output_wav]
            subprocess.run(to_wav, capture_output=True, check=True)
            return output_wav, None

        print(f"[EL] Error: {resp.text[:300]}")
        return None, f"TTS failed ({resp.status_code})"
    except Exception as e:
        return None, str(e)
|
|
|
|
| |
| |
| |
# Number of leading characters sent to the model when splitting text by speaker.
_MANUAL_ANALYSIS_CHAR_LIMIT = 8000


def _split_for_analysis(text, limit=_MANUAL_ANALYSIS_CHAR_LIMIT):
    """Return (head, tail) with head at most ``limit`` chars, cut at whitespace when possible."""
    if len(text) <= limit:
        return text, ""
    cut = max(text.rfind(" ", 0, limit), text.rfind("\n", 0, limit))
    if cut < limit // 2:
        # No usable whitespace near the limit: fall back to a hard cut.
        cut = limit
    return text[:cut], text[cut:]


def _segment_with_manual_characters(client, text, manual_chars_json, progress):
    """Split ``text`` by speaker using user-supplied names; narrator-only on any failure."""
    try:
        manual_data = json.loads(manual_chars_json) if manual_chars_json else {}
        characters = manual_data.get("characters", ["Narrator"])
        char_genders = {c: "neutral" for c in characters}
        progress(0.05, desc="Splitting text with your characters...")
        head, tail = _split_for_analysis(text)
        response = client.chat.completions.create(
            model=OMNI_MODEL, modalities=["text"],
            messages=[
                {
                    "role": "system",
                    "content": (
                        f"Split this text into segments by speaker. The characters are: {', '.join(characters)}. "
                        "Output ONLY valid JSON: "
                        # Plain (non-f) literal: braces must be single here.
                        '{"segments": [{"speaker": "Name", "text": "..."}]}'
                        " Include ALL text. No markdown."
                    ),
                },
                {"role": "user", "content": head},
            ],
        )
        raw = response.choices[0].message.content.strip()
        raw = re.sub(r'^```(?:json)?\s*', '', raw)
        raw = re.sub(r'\s*```$', '', raw)
        data = json.loads(raw)
        segments = data.get("segments", [{"speaker": "Narrator", "text": text}])
        if "segments" in data and segments and tail.strip():
            # The model never saw the tail; narrate it so no text is dropped.
            segments = list(segments) + [{"speaker": "Narrator", "text": tail.strip()}]
        return characters, char_genders, segments
    except Exception as e:
        print(f"[Manual] Parse error: {e}")
        return ["Narrator"], {"Narrator": "neutral"}, [{"speaker": "Narrator", "text": text}]


def _build_voice_map(characters, char_genders, voice_assignments, engine):
    """Map the first eight characters to a voice label (user choice first, else gender-based default)."""
    # Gender pools are loop-invariant, so build them once.
    el_female = [f"{v['name']} -- {v['desc']}" for v in ELEVENLABS_VOICE_LIST
                 if "female" in v["desc"].lower()]
    el_male = [f"{v['name']} -- {v['desc']}" for v in ELEVENLABS_VOICE_LIST
               if "male" in v["desc"].lower() and "female" not in v["desc"].lower()]

    char_voice_map = {}
    male_idx, female_idx = 0, 0
    for i, char in enumerate(characters[:8]):
        chosen = voice_assignments[i] if i < len(voice_assignments) else None
        if chosen:
            char_voice_map[char] = chosen
            continue

        gender = char_genders.get(char, "neutral")
        if engine == "elevenlabs":
            if gender == "male" and el_male:
                char_voice_map[char] = el_male[male_idx % len(el_male)]
                male_idx += 1
            elif gender == "female" and el_female:
                char_voice_map[char] = el_female[female_idx % len(el_female)]
                female_idx += 1
            else:
                char_voice_map[char] = ELEVENLABS_VOICES[0]
        elif char == "Narrator":
            char_voice_map[char] = QWEN_NARRATOR_VOICES[0]
        elif gender == "male":
            char_voice_map[char] = QWEN_MALE_VOICES[male_idx % len(QWEN_MALE_VOICES)]
            male_idx += 1
        elif gender == "female":
            char_voice_map[char] = QWEN_FEMALE_VOICES[female_idx % len(QWEN_FEMALE_VOICES)]
            female_idx += 1
        else:
            char_voice_map[char] = QWEN_VOICES[i % len(QWEN_VOICES)]
    return char_voice_map


def _split_segment_text(seg_text, limit):
    """Split text at sentence ends into pieces of at most ``limit`` chars (an over-long sentence stays whole)."""
    if len(seg_text) <= limit:
        return [seg_text]
    pieces = []
    current = ""
    for sentence in re.split(r'(?<=[.!?])\s+', seg_text):
        if len(current) + len(sentence) + 1 <= limit:
            current = (current + " " + sentence).strip()
        else:
            if current:
                pieces.append(current)
            current = sentence
    if current:
        pieces.append(current)
    return pieces


def generate_multi_speaker(
    text_input, file_input, target_language, detection_mode,
    manual_chars_json, el_model_choice, add_tags, add_pauses,
    v0, v1, v2, v3, v4, v5, v6, v7,
    progress=gr.Progress(),
):
    """Generate a multi-voice audiobook MP3 from pasted text or an uploaded file.

    Steps: obtain the text, split it into speaker segments (auto-detected or
    from user-supplied character names), choose a voice per character,
    synthesise every segment with the engine configured for
    ``target_language``, then concatenate and encode to MP3.

    Args:
        text_input: Pasted story text (used when no file is given).
        file_input: Path of an uploaded file, or None.
        target_language: Key of ``LANGUAGES``; selects the TTS engine.
        detection_mode: "Auto-detect" or anything else for manual mode.
        manual_chars_json: JSON like ``{"characters": [...]}`` for manual mode.
        el_model_choice: Key of ``ELEVENLABS_MODELS``.
        add_tags: Inject audio tags (ElevenLabs only; forces the v3 model).
        add_pauses: Insert silence between segments and on speaker changes.
        v0..v7: Voice labels chosen for the first eight characters (may be empty).
        progress: Gradio progress reporter.

    Returns:
        ``(mp3_path, stats_markdown, transcript_markdown)``.

    Raises:
        gr.Error: For missing input/API keys or any failure while synthesising.
    """
    # ---- Input text ---------------------------------------------------------
    if file_input is not None:
        progress(0.01, desc="Extracting text...")
        text = extract_text_from_file(file_input)
    elif text_input and text_input.strip():
        text = text_input.strip()
    else:
        raise gr.Error("Please provide text or upload a file.")
    if len(text) < 20:
        raise gr.Error("Text too short.")

    # ---- Credentials and engine --------------------------------------------
    ds_key = os.environ.get("DASHSCOPE_API_KEY", "")
    el_key = os.environ.get("ELEVENLABS_API_KEY", "")
    engine = LANGUAGES.get(target_language, {}).get("engine", "qwen")

    if engine == "elevenlabs" and not el_key:
        raise gr.Error("ELEVENLABS_API_KEY not set. Add it in Settings > Secrets.")
    # The DashScope client is needed for speaker segmentation on both engines.
    if not ds_key:
        raise gr.Error("DASHSCOPE_API_KEY not set. Add it in Settings > Secrets.")

    client = OpenAI(api_key=ds_key, base_url=DASHSCOPE_BASE_URL)
    # NOTE(review): ``translate`` is only honoured by the Qwen path; ElevenLabs
    # languages such as Swahili currently receive the untranslated text. Confirm intent.
    translate = target_language not in ("English", "English (US)", "English (UK)", "English (AU)")
    tmp_dir = tempfile.mkdtemp(prefix="multispeaker_")
    el_model_id = ELEVENLABS_MODELS.get(el_model_choice, "eleven_multilingual_v2")

    # ---- Speaker segmentation ----------------------------------------------
    if detection_mode == "Auto-detect":
        progress(0.05, desc="Detecting characters...")
        char_list, segments = auto_detect_characters(client, text)
        characters = [c["name"] for c in char_list]
        char_genders = {c["name"]: c.get("gender", "neutral") for c in char_list}
        print(f"[MultiSpeaker] Detected {len(characters)} characters: {characters}")
        print(f"[MultiSpeaker] Genders: {char_genders}")
        print(f"[MultiSpeaker] {len(segments)} segments")
    else:
        characters, char_genders, segments = _segment_with_manual_characters(
            client, text, manual_chars_json, progress
        )

    if not segments:
        raise gr.Error("No segments detected. Try a different text or use manual mode.")

    # ---- Voice assignment ---------------------------------------------------
    voice_assignments = [v0, v1, v2, v3, v4, v5, v6, v7]
    char_voice_map = _build_voice_map(characters, char_genders, voice_assignments, engine)
    print(f"[MultiSpeaker] Voice map: {char_voice_map}")

    try:
        # ---- Synthesis ------------------------------------------------------
        audio_files = []
        all_transcripts = []
        silence_path = os.path.join(tmp_dir, "silence.wav")
        speaker_pause = os.path.join(tmp_dir, "speaker_pause.wav")
        if add_pauses:
            generate_silence(1.0, silence_path)
            generate_silence(0.4, speaker_pause)

        total = len(segments)
        prev_speaker = None

        for i, seg in enumerate(segments):
            frac = 0.10 + 0.80 * (i / total)
            speaker = seg.get("speaker", "Narrator")
            seg_text = seg.get("text", "").strip()
            if not seg_text:
                continue

            # Unknown speakers fall back to the narrator's voice.
            voice_label = char_voice_map.get(speaker, char_voice_map.get("Narrator", QWEN_VOICES[0]))
            progress(frac, desc=f"[{speaker}] Segment {i+1}/{total}...")

            # Extra short pause when the speaker changes.
            if add_pauses and prev_speaker and prev_speaker != speaker:
                audio_files.append(speaker_pause)

            for j, sub_text in enumerate(_split_segment_text(seg_text, MAX_CHARS_PER_SEGMENT)):
                # Unique index per (segment, sub-chunk) for temp file names.
                seg_idx = i * 100 + j

                if engine == "elevenlabs":
                    voice_id = get_el_voice_id(voice_label)
                    final_text = sub_text
                    tts_model = el_model_id
                    if add_tags and client:
                        # Audio tags are an eleven_v3 feature, so tagging overrides the model choice.
                        final_text = inject_audio_tags(client, sub_text)
                        tts_model = "eleven_v3"
                    wav_path, error = tts_elevenlabs(final_text, voice_id, el_key, tts_model, seg_idx, tmp_dir)
                else:
                    voice = get_voice_name(voice_label)
                    wav_path, error = tts_qwen(client, sub_text, voice, target_language, translate, seg_idx, tmp_dir)

                if wav_path:
                    audio_files.append(wav_path)
                else:
                    # Keep the timeline intact: log the failure and insert silence.
                    all_transcripts.append(f"Segment {i+1} failed: {error}")
                    fail = os.path.join(tmp_dir, f"fail_{seg_idx}.wav")
                    generate_silence(1.5, fail)
                    audio_files.append(fail)

            all_transcripts.append(f"**[{speaker}]** {seg_text[:200]}{'...' if len(seg_text) > 200 else ''}")

            # Pause after every segment except the last one.
            if add_pauses and i < total - 1:
                audio_files.append(silence_path)

            prev_speaker = speaker

        if not audio_files:
            raise gr.Error("No audio generated.")

        # ---- Assembly -------------------------------------------------------
        progress(0.92, desc="Assembling audiobook...")
        final_wav = os.path.join(tmp_dir, "audiobook.wav")
        concatenate_wavs(audio_files, final_wav)

        progress(0.96, desc="Converting to MP3...")
        final_mp3 = os.path.join(tmp_dir, "audiobook.mp3")
        subprocess.run(["ffmpeg", "-y", "-i", final_wav, "-codec:a", "libmp3lame",
                        "-b:a", "128k", "-ar", "24000", "-ac", "1", final_mp3],
                       capture_output=True, check=True)

        progress(1.0, desc="Done!")

        size_mb = os.path.getsize(final_mp3) / (1024 * 1024)
        char_summary = "\n".join(f" - **{c}**: {char_voice_map.get(c, 'auto')}" for c in characters[:8])
        stats = (
            f"**Multi-Speaker Audiobook Generated!**\n\n"
            f"- **Language:** {target_language}\n"
            f"- **Characters:** {len(characters)}\n"
            f"- **Segments:** {total}\n"
            f"- **File size:** {size_mb:.1f} MB\n\n"
            f"**Cast:**\n{char_summary}\n"
        )
        transcript = "\n\n".join(all_transcripts) if all_transcripts else ""

        return final_mp3, stats, transcript

    except gr.Error:
        raise
    except Exception as e:
        raise gr.Error(f"Pipeline error: {str(e)}") from e
|
|
|
|
| |
| |
| |
def detect_characters_ui(text_input, file_input, target_language):
    """Detect characters and return info for the UI.

    Args:
        text_input: Pasted story text (used when no file is given).
        file_input: Path of an uploaded file, or None.
        target_language: Key of ``LANGUAGES``; decides which voice list is offered.

    Returns:
        A list: a Markdown summary followed by eight ``gr.update`` objects,
        one per voice dropdown (visible with a gender-matched default for each
        detected character, hidden for unused slots).

    Raises:
        gr.Error: If no text is provided or DASHSCOPE_API_KEY is missing.
    """
    if file_input is not None:
        text = extract_text_from_file(file_input)
    elif text_input and text_input.strip():
        text = text_input.strip()
    else:
        raise gr.Error("Provide text first.")

    ds_key = os.environ.get("DASHSCOPE_API_KEY", "")
    if not ds_key:
        raise gr.Error("DASHSCOPE_API_KEY not set.")

    client = OpenAI(api_key=ds_key, base_url=DASHSCOPE_BASE_URL)
    char_list, segments = auto_detect_characters(client, text)

    engine = LANGUAGES.get(target_language, {}).get("engine", "qwen")

    # ---- Summary shown above the dropdowns ---------------------------------
    summary_lines = [f"**Detected {len(char_list)} characters** in {len(segments)} segments:\n\n"]
    for i, char in enumerate(char_list[:8]):
        name = char["name"]
        gender = char.get("gender", "neutral")
        seg_count = sum(1 for s in segments if s.get("speaker") == name)
        summary_lines.append(f"{i+1}. **{name}** ({gender}) - {seg_count} segments\n")
    results = "".join(summary_lines)

    # ---- Gender pools for ElevenLabs defaults ------------------------------
    # "female" contains "male", so the male pool must exclude female voices;
    # otherwise a male character could default to a female voice.
    el_female = [f"{v['name']} -- {v['desc']}" for v in ELEVENLABS_VOICE_LIST
                 if "female" in v["desc"].lower()]
    el_male = [f"{v['name']} -- {v['desc']}" for v in ELEVENLABS_VOICE_LIST
               if "male" in v["desc"].lower() and "female" not in v["desc"].lower()]

    # ---- One update per dropdown slot --------------------------------------
    male_idx, female_idx = 0, 0
    updates = []
    for i in range(8):
        if i >= len(char_list):
            updates.append(gr.update(visible=False))
            continue

        char = char_list[i]
        name = char["name"]
        gender = char.get("gender", "neutral")

        if engine == "elevenlabs":
            voices = ELEVENLABS_VOICES
            if gender == "male" and el_male:
                default_voice = el_male[male_idx % len(el_male)]
                male_idx += 1
            elif gender == "female" and el_female:
                default_voice = el_female[female_idx % len(el_female)]
                female_idx += 1
            else:
                default_voice = voices[0]
        else:
            voices = QWEN_VOICES
            if name == "Narrator":
                default_voice = QWEN_NARRATOR_VOICES[0]
            elif gender == "male":
                default_voice = QWEN_MALE_VOICES[male_idx % len(QWEN_MALE_VOICES)]
                male_idx += 1
            elif gender == "female":
                default_voice = QWEN_FEMALE_VOICES[female_idx % len(QWEN_FEMALE_VOICES)]
                female_idx += 1
            else:
                default_voice = voices[i % len(voices)]

        updates.append(gr.update(visible=True, label=f"Voice for: {name} ({gender})",
                                 choices=voices, value=default_voice))

    return [results] + updates
|
|
|
|
| |
| |
| |
# Demo story loaded by the "Load Sample Story" button; paragraphs are
# separated by blank lines and the text ends with a single newline.
_SAMPLE_PARAGRAPHS = (
    "Chapter 1: The Lighthouse",
    "The old lighthouse stood at the edge of the world. Each morning, Elena climbed one hundred "
    "and forty-seven iron steps to the lamp room and watched the sun rise from the sea.",
    '"One day," she whispered to the seagulls, "I\'ll follow that sun to wherever it goes."',
    "The gulls said nothing. They merely tilted their heads and launched themselves into the wind.",
    "Her grandfather was a man of few words but many stories.",
    '"Tell me about the ships," Elena would say, curling up in the worn armchair by the fire.',
    'And he would smile that slow, careful smile and begin: "There was a ship once, long ago, '
    "that sailed beyond the edge of every map. Its captain was a woman with eyes like starlight "
    'and a voice that could calm any storm."',
    '"What happened to her?" Elena asked, leaning forward.',
    '"She found what she was looking for," her grandfather said quietly. '
    '"But the price was higher than she imagined."',
    'Elena stared into the fire. "Would you pay it? The price, I mean."',
    'The old man was silent for a long time. "I already did," he finally whispered. "I already did."',
)
SAMPLE_TEXT = "\n\n".join(_SAMPLE_PARAGRAPHS) + "\n"

# Markdown header rendered at the top of the page.
DESCRIPTION = (
    "\n"
    "# Multi-Speaker Audiobook Generator\n"
    "### Different Voices for Every Character\n"
    "\n"
)
|
|
# Language names in dropdown order.
lang_choices = list(LANGUAGES)


# NOTE(review): the nesting below is reconstructed from a source whose
# indentation was lost; confirm component placement (in particular that the
# generate button sits under the tabs rather than inside the Settings tab).
with gr.Blocks(title="Multi-Speaker Audiobook") as demo:

    gr.Markdown(DESCRIPTION)

    with gr.Row():
        # ---- Left column: story, characters and settings ------------------
        with gr.Column(scale=1):
            with gr.Tab("Story"):
                text_input = gr.Textbox(
                    label="English Text",
                    placeholder="Paste your story with dialogue...",
                    lines=10,
                    max_lines=25,
                )
                file_input = gr.File(
                    label="Or Upload (.txt, .md, .pdf, .docx)",
                    file_types=[".txt", ".md", ".pdf", ".docx"],
                    type="filepath",
                )
                sample_btn = gr.Button("Load Sample Story", variant="secondary", size="sm")

            with gr.Tab("Characters"):
                target_lang = gr.Dropdown(
                    choices=lang_choices, value="English", label="Target Language"
                )

                detection_mode = gr.Radio(
                    choices=["Auto-detect", "Manual"],
                    value="Auto-detect",
                    label="Character Detection Mode",
                )

                detect_btn = gr.Button("Detect Characters", variant="secondary")
                detect_result = gr.Markdown(
                    value="Click 'Detect Characters' after entering your text."
                )

                manual_chars = gr.Textbox(
                    label="Manual Characters (JSON)",
                    placeholder='{"characters": ["Narrator", "Elena", "Grandfather"]}',
                    visible=False,
                    lines=3,
                )

                # Eight hidden voice pickers; revealed per detected character.
                voice_dropdowns = [
                    gr.Dropdown(
                        choices=QWEN_VOICES,
                        label=f"Voice {slot + 1}",
                        visible=False,
                        allow_custom_value=True,
                    )
                    for slot in range(8)
                ]

            with gr.Tab("Settings"):
                el_model = gr.Radio(
                    choices=["quality", "balanced", "expressive"],
                    value="quality",
                    label="Voice Quality (English accents only)",
                    info="'expressive' enables emotional audio tags (ElevenLabs v3)",
                )
                add_tags = gr.Checkbox(
                    value=True,
                    label="Auto-inject emotional audio tags",
                    info="AI adds [whispers], [excited], [sighs] etc. for expressive model",
                )
                add_pauses = gr.Checkbox(
                    value=True,
                    label="Add pauses between speakers",
                    info="Short pause when speaker changes, longer between sections",
                )

            generate_btn = gr.Button(
                "Generate Multi-Speaker Audiobook", variant="primary", size="lg"
            )

        # ---- Right column: results ----------------------------------------
        with gr.Column(scale=1):
            audio_output = gr.Audio(label="Generated Audiobook", type="filepath")
            stats_output = gr.Markdown(label="Cast & Stats")
            with gr.Accordion("Full Transcript (by speaker)", open=False):
                transcript_output = gr.Markdown()

    # ---- Event wiring --------------------------------------------------------
    sample_btn.click(fn=lambda: SAMPLE_TEXT, outputs=text_input)

    def toggle_detection_mode(mode):
        """Show the manual JSON box in Manual mode, the detect button otherwise."""
        manual = mode == "Manual"
        return gr.update(visible=manual), gr.update(visible=not manual)

    detection_mode.change(
        fn=toggle_detection_mode,
        inputs=detection_mode,
        outputs=[manual_chars, detect_btn],
    )

    def on_lang_change(lang):
        """Swap every voice dropdown to the voice list of the language's engine."""
        engine = LANGUAGES.get(lang, {}).get("engine", "qwen")
        voices = ELEVENLABS_VOICES if engine == "elevenlabs" else QWEN_VOICES
        updates = []
        for slot in range(8):
            updates.append(gr.update(choices=voices, value=voices[slot % len(voices)]))
        return updates

    target_lang.change(fn=on_lang_change, inputs=target_lang, outputs=voice_dropdowns)

    detect_btn.click(
        fn=detect_characters_ui,
        inputs=[text_input, file_input, target_lang],
        outputs=[detect_result] + voice_dropdowns,
    )

    generate_inputs = [
        text_input, file_input, target_lang, detection_mode,
        manual_chars, el_model, add_tags, add_pauses,
    ] + voice_dropdowns
    generate_btn.click(
        fn=generate_multi_speaker,
        inputs=generate_inputs,
        outputs=[audio_output, stats_output, transcript_output],
    )

    gr.Markdown(
        "---\n"
        "**How it works:** AI reads your story, detects characters and dialogue, assigns a unique voice "
        "to each character, generates audio per segment, and stitches it all together.\n\n"
        "**Tips:** Use stories with clear dialogue tags (said, whispered, asked) for best auto-detection. "
        "The 'expressive' model adds emotional audio tags like [whispers] and [sighs] automatically.\n\n"
    )


if __name__ == "__main__":
    demo.launch()
|
|