""" Multi-Speaker Audiobook Generator Different voices for narrator + characters Engines: Premium AI (Qwen) + Premium English (ElevenLabs) Features: - Auto-detect characters from text OR manual character definition - Assign different voices to narrator and each character - Per-segment TTS with the right voice - Supports 17 languages across two engines """ import os import base64 import json import pathlib import shutil import struct import subprocess import tempfile import time import re import gradio as gr import requests as http_requests from openai import OpenAI try: import pypdf HAS_PYPDF = True except ImportError: HAS_PYPDF = False try: import docx HAS_DOCX = True except ImportError: HAS_DOCX = False # ========================================== # CONFIG # ========================================== OMNI_MODEL = "qwen3.5-omni-plus" DASHSCOPE_BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1" ELEVENLABS_TTS_URL = "https://api.elevenlabs.io/v1/text-to-speech" ELEVENLABS_CLONE_URL = "https://api.elevenlabs.io/v1/voices/add" MAX_CHARS_PER_SEGMENT = 1500 # ========================================== # LANGUAGES # ========================================== LANGUAGES = { # Qwen (11) "English": {"engine": "qwen"}, "Chinese (Mandarin)": {"engine": "qwen"}, "Japanese": {"engine": "qwen"}, "Korean": {"engine": "qwen"}, "German": {"engine": "qwen"}, "French": {"engine": "qwen"}, "Russian": {"engine": "qwen"}, "Portuguese": {"engine": "qwen"}, "Spanish": {"engine": "qwen"}, "Italian": {"engine": "qwen"}, "Arabic": {"engine": "qwen"}, # ElevenLabs (6) "English (US)": {"engine": "elevenlabs"}, "English (UK)": {"engine": "elevenlabs"}, "English (AU)": {"engine": "elevenlabs"}, "Swahili": {"engine": "elevenlabs"}, "Amharic": {"engine": "elevenlabs"}, "Afrikaans": {"engine": "elevenlabs"}, } # ========================================== # VOICES # ========================================== QWEN_VOICES = [ "Jennifer -- Cinematic female", "Serena -- Gentle female", "Katerina -- Mature female", "Bella -- Elegant female", "Vivian -- Professional female", "Mia -- Young female", "Seren -- Calm female", "Dolce -- Sweet female", "Ethan -- Warm male", "Ryan -- Dramatic male", "Kai -- Soothing male", "Neil -- Precise male", "Lenn -- Rational male", "Arthur -- Classic male", "Eldric Sage -- Authoritative male", "Vincent -- Theatrical male", "Andre -- Deep male", "Aiden -- Young male", ] # Separate male/female for smart assignment QWEN_MALE_VOICES = [v for v in QWEN_VOICES if "male" in v and "female" not in v] QWEN_FEMALE_VOICES = [v for v in QWEN_VOICES if "female" in v] QWEN_NARRATOR_VOICES = ["Jennifer -- Cinematic female", "Eldric Sage -- Authoritative male"] ELEVENLABS_VOICE_LIST = [ {"name": "Rachel", "id": "21m00Tcm4TlvDq8ikWAM", "desc": "Calm female (US)"}, {"name": "Drew", "id": "29vD33N1CtxCmqQRPOHJ", "desc": "Rounded male (US)"}, {"name": "Clyde", "id": "2EiwWnXFnvU5JabPnv8n", "desc": "Deep male (US)"}, {"name": "Paul", "id": "5Q0t7uMcjvnagumLfvZi", "desc": "News male (US)"}, {"name": "Antoni", "id": "ErXwobaYiN019PkySvjV", "desc": "Rounded male (US)"}, {"name": "Daniel", "id": "onwK4e9ZLuTAKqWW03F9", "desc": "British male"}, {"name": "George", "id": "JBFqnCBsd6RMkjVDRZzb", "desc": "Warm British male"}, {"name": "Callum", "id": "N2lVS1w4EtoT3dr4eOWO", "desc": "Intense male"}, {"name": "Charlie", "id": "IKne3meq5aSn9XLyUdCD", "desc": "Australian male"}, {"name": "Matilda", "id": "XrExE9yKIg1WjnnlVkGX", "desc": "Australian female"}, {"name": "Freya", "id": "jsCqWAovK2LkecY7zXl4", "desc": "Young female"}, ] ELEVENLABS_VOICES = [f"{v['name']} -- {v['desc']}" for v in ELEVENLABS_VOICE_LIST] ELEVENLABS_MODELS = { "quality": "eleven_multilingual_v2", "balanced": "eleven_flash_v2_5", "expressive": "eleven_v3", } def get_voice_name(label): return label.split("--")[0].strip() def get_el_voice_id(label): name = get_voice_name(label) for v in ELEVENLABS_VOICE_LIST: if v["name"] == name: return v["id"] return ELEVENLABS_VOICE_LIST[0]["id"] # ========================================== # AUDIO HELPERS # ========================================== def base64_to_wav(b64_data, output_path): audio_bytes = base64.b64decode(b64_data) sr, nc, bps = 24000, 1, 16 br = sr * nc * bps // 8 ba = nc * bps // 8 ds = len(audio_bytes) with open(output_path, "wb") as f: f.write(b"RIFF") f.write(struct.pack(" Secrets.") if not ds_key: raise gr.Error("DASHSCOPE_API_KEY not set. Add it in Settings > Secrets.") client = OpenAI(api_key=ds_key, base_url=DASHSCOPE_BASE_URL) translate = target_language not in ("English", "English (US)", "English (UK)", "English (AU)") tmp_dir = tempfile.mkdtemp(prefix="multispeaker_") el_model_id = ELEVENLABS_MODELS.get(el_model_choice, "eleven_multilingual_v2") # -- Character detection -- if detection_mode == "Auto-detect": progress(0.05, desc="Detecting characters...") char_list, segments = auto_detect_characters(client, text) characters = [c["name"] for c in char_list] char_genders = {c["name"]: c.get("gender", "neutral") for c in char_list} print(f"[MultiSpeaker] Detected {len(characters)} characters: {characters}") print(f"[MultiSpeaker] Genders: {char_genders}") print(f"[MultiSpeaker] {len(segments)} segments") else: # Manual mode - parse the manual JSON try: manual_data = json.loads(manual_chars_json) if manual_chars_json else {} characters = manual_data.get("characters", ["Narrator"]) char_genders = {c: "neutral" for c in characters} segments = [{"speaker": "Narrator", "text": text}] progress(0.05, desc="Splitting text with your characters...") response = client.chat.completions.create( model=OMNI_MODEL, modalities=["text"], messages=[ { "role": "system", "content": ( f"Split this text into segments by speaker. The characters are: {', '.join(characters)}. " "Output ONLY valid JSON: " '{{"segments": [{{"speaker": "Name", "text": "..."}}]}}' " Include ALL text. No markdown." ), }, {"role": "user", "content": text[:8000]}, ], ) raw = response.choices[0].message.content.strip() raw = re.sub(r'^```json\s*', '', raw) raw = re.sub(r'\s*```$', '', raw) data = json.loads(raw) segments = data.get("segments", [{"speaker": "Narrator", "text": text}]) except Exception as e: print(f"[Manual] Parse error: {e}") characters = ["Narrator"] char_genders = {"Narrator": "neutral"} segments = [{"speaker": "Narrator", "text": text}] if not segments: raise gr.Error("No segments detected. Try a different text or use manual mode.") # -- Build voice assignments from the UI dropdowns -- voice_assignments = [v0, v1, v2, v3, v4, v5, v6, v7] char_voice_map = {} male_idx, female_idx = 0, 0 for i, char in enumerate(characters[:8]): if i < len(voice_assignments) and voice_assignments[i]: char_voice_map[char] = voice_assignments[i] else: # Gender-aware auto-assignment gender = char_genders.get(char, "neutral") if engine == "elevenlabs": el_male = [f"{v['name']} -- {v['desc']}" for v in ELEVENLABS_VOICE_LIST if "male" in v["desc"].lower() and "female" not in v["desc"].lower()] el_female = [f"{v['name']} -- {v['desc']}" for v in ELEVENLABS_VOICE_LIST if "female" in v["desc"].lower()] if gender == "male" and el_male: char_voice_map[char] = el_male[male_idx % len(el_male)] male_idx += 1 elif gender == "female" and el_female: char_voice_map[char] = el_female[female_idx % len(el_female)] female_idx += 1 else: char_voice_map[char] = ELEVENLABS_VOICES[0] else: if char == "Narrator": char_voice_map[char] = QWEN_NARRATOR_VOICES[0] elif gender == "male": char_voice_map[char] = QWEN_MALE_VOICES[male_idx % len(QWEN_MALE_VOICES)] male_idx += 1 elif gender == "female": char_voice_map[char] = QWEN_FEMALE_VOICES[female_idx % len(QWEN_FEMALE_VOICES)] female_idx += 1 else: char_voice_map[char] = QWEN_VOICES[i % len(QWEN_VOICES)] print(f"[MultiSpeaker] Voice map: {char_voice_map}") try: # -- Generate audio per segment -- audio_files = [] all_transcripts = [] silence_path = os.path.join(tmp_dir, "silence.wav") speaker_pause = os.path.join(tmp_dir, "speaker_pause.wav") if add_pauses: generate_silence(1.0, silence_path) generate_silence(0.4, speaker_pause) total = len(segments) prev_speaker = None for i, seg in enumerate(segments): frac = 0.10 + 0.80 * (i / total) speaker = seg.get("speaker", "Narrator") seg_text = seg.get("text", "").strip() if not seg_text: continue voice_label = char_voice_map.get(speaker, char_voice_map.get("Narrator", QWEN_VOICES[0])) progress(frac, desc=f"[{speaker}] Segment {i+1}/{total}...") # Add small pause between different speakers if add_pauses and prev_speaker and prev_speaker != speaker: audio_files.append(speaker_pause) # Split long segments if len(seg_text) > MAX_CHARS_PER_SEGMENT: sub_texts = [] sentences = re.split(r'(?<=[.!?])\s+', seg_text) current = "" for s in sentences: if len(current) + len(s) + 1 <= MAX_CHARS_PER_SEGMENT: current = (current + " " + s).strip() else: if current: sub_texts.append(current) current = s if current: sub_texts.append(current) else: sub_texts = [seg_text] for j, sub_text in enumerate(sub_texts): seg_idx = i * 100 + j # Unique index if engine == "elevenlabs": voice_id = get_el_voice_id(voice_label) final_text = sub_text # If tags enabled, inject them and force v3 model (only v3 supports audio tags) tts_model = el_model_id if add_tags and client: final_text = inject_audio_tags(client, sub_text) tts_model = "eleven_v3" # audio tags require v3 wav_path, error = tts_elevenlabs(final_text, voice_id, el_key, tts_model, seg_idx, tmp_dir) else: voice = get_voice_name(voice_label) wav_path, error = tts_qwen(client, sub_text, voice, target_language, translate, seg_idx, tmp_dir) if wav_path: audio_files.append(wav_path) else: all_transcripts.append(f"Segment {i+1} failed: {error}") fail = os.path.join(tmp_dir, f"fail_{seg_idx}.wav") generate_silence(1.5, fail) audio_files.append(fail) # Track transcript all_transcripts.append(f"**[{speaker}]** {seg_text[:200]}{'...' if len(seg_text) > 200 else ''}") # Add section pause if add_pauses and i < total - 1: audio_files.append(silence_path) prev_speaker = speaker if not audio_files: raise gr.Error("No audio generated.") # -- Assemble -- progress(0.92, desc="Assembling audiobook...") final_wav = os.path.join(tmp_dir, "audiobook.wav") concatenate_wavs(audio_files, final_wav) progress(0.96, desc="Converting to MP3...") final_mp3 = os.path.join(tmp_dir, "audiobook.mp3") subprocess.run(["ffmpeg", "-y", "-i", final_wav, "-codec:a", "libmp3lame", "-b:a", "128k", "-ar", "24000", "-ac", "1", final_mp3], capture_output=True, check=True) progress(1.0, desc="Done!") size_mb = os.path.getsize(final_mp3) / (1024 * 1024) char_summary = "\n".join(f" - **{c}**: {char_voice_map.get(c, 'auto')}" for c in characters[:8]) stats = ( f"**Multi-Speaker Audiobook Generated!**\n\n" f"- **Language:** {target_language}\n" f"- **Characters:** {len(characters)}\n" f"- **Segments:** {total}\n" f"- **File size:** {size_mb:.1f} MB\n\n" f"**Cast:**\n{char_summary}\n" ) transcript = "\n\n".join(all_transcripts) if all_transcripts else "" # Return characters info for UI update return final_mp3, stats, transcript except gr.Error: raise except Exception as e: raise gr.Error(f"Pipeline error: {str(e)}") # ========================================== # CHARACTER DETECTION (UI step) # ========================================== def detect_characters_ui(text_input, file_input, target_language): """Detect characters and return info for the UI.""" if file_input is not None: text = extract_text_from_file(file_input) elif text_input and text_input.strip(): text = text_input.strip() else: raise gr.Error("Provide text first.") ds_key = os.environ.get("DASHSCOPE_API_KEY", "") if not ds_key: raise gr.Error("DASHSCOPE_API_KEY not set.") client = OpenAI(api_key=ds_key, base_url=DASHSCOPE_BASE_URL) char_list, segments = auto_detect_characters(client, text) engine = LANGUAGES.get(target_language, {}).get("engine", "qwen") # Build results results = f"**Detected {len(char_list)} characters** in {len(segments)} segments:\n\n" for i, char in enumerate(char_list[:8]): name = char["name"] gender = char.get("gender", "neutral") seg_count = sum(1 for s in segments if s.get("speaker") == name) results += f"{i+1}. **{name}** ({gender}) - {seg_count} segments\n" # Gender-aware voice assignment male_idx, female_idx = 0, 0 updates = [] for i in range(8): if i < len(char_list): char = char_list[i] name = char["name"] gender = char.get("gender", "neutral") if engine == "elevenlabs": voices = ELEVENLABS_VOICES # Pick male/female ElevenLabs voice el_male = [v for v in ELEVENLABS_VOICE_LIST if "male" in v["desc"].lower()] el_female = [v for v in ELEVENLABS_VOICE_LIST if "female" in v["desc"].lower()] if gender == "male" and el_male: v = el_male[male_idx % len(el_male)] default_voice = f"{v['name']} -- {v['desc']}" male_idx += 1 elif gender == "female" and el_female: v = el_female[female_idx % len(el_female)] default_voice = f"{v['name']} -- {v['desc']}" female_idx += 1 else: default_voice = voices[0] # Narrator gets first voice else: voices = QWEN_VOICES if name == "Narrator": default_voice = QWEN_NARRATOR_VOICES[0] elif gender == "male": default_voice = QWEN_MALE_VOICES[male_idx % len(QWEN_MALE_VOICES)] male_idx += 1 elif gender == "female": default_voice = QWEN_FEMALE_VOICES[female_idx % len(QWEN_FEMALE_VOICES)] female_idx += 1 else: default_voice = voices[i % len(voices)] updates.append(gr.update(visible=True, label=f"Voice for: {name} ({gender})", choices=voices, value=default_voice)) else: updates.append(gr.update(visible=False)) return [results] + updates # ========================================== # GRADIO UI # ========================================== SAMPLE_TEXT = """Chapter 1: The Lighthouse The old lighthouse stood at the edge of the world. Each morning, Elena climbed one hundred and forty-seven iron steps to the lamp room and watched the sun rise from the sea. "One day," she whispered to the seagulls, "I'll follow that sun to wherever it goes." The gulls said nothing. They merely tilted their heads and launched themselves into the wind. Her grandfather was a man of few words but many stories. "Tell me about the ships," Elena would say, curling up in the worn armchair by the fire. And he would smile that slow, careful smile and begin: "There was a ship once, long ago, that sailed beyond the edge of every map. Its captain was a woman with eyes like starlight and a voice that could calm any storm." "What happened to her?" Elena asked, leaning forward. "She found what she was looking for," her grandfather said quietly. "But the price was higher than she imagined." Elena stared into the fire. "Would you pay it? The price, I mean." The old man was silent for a long time. "I already did," he finally whispered. "I already did." """ DESCRIPTION = """ # Multi-Speaker Audiobook Generator ### Different Voices for Every Character """ lang_choices = list(LANGUAGES.keys()) with gr.Blocks(title="Multi-Speaker Audiobook") as demo: gr.Markdown(DESCRIPTION) with gr.Row(): # -- LEFT -- with gr.Column(scale=1): with gr.Tab("Story"): text_input = gr.Textbox(label="English Text", placeholder="Paste your story with dialogue...", lines=10, max_lines=25) file_input = gr.File(label="Or Upload (.txt, .md, .pdf, .docx)", file_types=[".txt", ".md", ".pdf", ".docx"], type="filepath") sample_btn = gr.Button("Load Sample Story", variant="secondary", size="sm") with gr.Tab("Characters"): target_lang = gr.Dropdown(choices=lang_choices, value="English", label="Target Language") detection_mode = gr.Radio( choices=["Auto-detect", "Manual"], value="Auto-detect", label="Character Detection Mode", ) detect_btn = gr.Button("Detect Characters", variant="secondary") detect_result = gr.Markdown(value="Click 'Detect Characters' after entering your text.") manual_chars = gr.Textbox( label="Manual Characters (JSON)", placeholder='{"characters": ["Narrator", "Elena", "Grandfather"]}', visible=False, lines=3, ) # Up to 8 character voice dropdowns (hidden until detection) voice_dropdowns = [] for i in range(8): dd = gr.Dropdown(choices=QWEN_VOICES, label=f"Voice {i+1}", visible=False, allow_custom_value=True) voice_dropdowns.append(dd) with gr.Tab("Settings"): el_model = gr.Radio( choices=["quality", "balanced", "expressive"], value="quality", label="Voice Quality (English accents only)", info="'expressive' enables emotional audio tags (ElevenLabs v3)", ) add_tags = gr.Checkbox(value=True, label="Auto-inject emotional audio tags", info="AI adds [whispers], [excited], [sighs] etc. for expressive model") add_pauses = gr.Checkbox(value=True, label="Add pauses between speakers", info="Short pause when speaker changes, longer between sections") generate_btn = gr.Button("Generate Multi-Speaker Audiobook", variant="primary", size="lg") # -- RIGHT -- with gr.Column(scale=1): audio_output = gr.Audio(label="Generated Audiobook", type="filepath") stats_output = gr.Markdown(label="Cast & Stats") with gr.Accordion("Full Transcript (by speaker)", open=False): transcript_output = gr.Markdown() # -- Events -- sample_btn.click(fn=lambda: SAMPLE_TEXT, outputs=text_input) def toggle_detection_mode(mode): if mode == "Manual": return gr.update(visible=True), gr.update(visible=False) return gr.update(visible=False), gr.update(visible=True) detection_mode.change( fn=toggle_detection_mode, inputs=detection_mode, outputs=[manual_chars, detect_btn], ) def on_lang_change(lang): engine = LANGUAGES.get(lang, {}).get("engine", "qwen") voices = ELEVENLABS_VOICES if engine == "elevenlabs" else QWEN_VOICES updates = [gr.update(choices=voices, value=voices[i % len(voices)]) for i in range(8)] return updates target_lang.change(fn=on_lang_change, inputs=target_lang, outputs=voice_dropdowns) detect_btn.click( fn=detect_characters_ui, inputs=[text_input, file_input, target_lang], outputs=[detect_result] + voice_dropdowns, ) generate_btn.click( fn=generate_multi_speaker, inputs=[text_input, file_input, target_lang, detection_mode, manual_chars, el_model, add_tags, add_pauses] + voice_dropdowns, outputs=[audio_output, stats_output, transcript_output], ) gr.Markdown( "---\n" "**How it works:** AI reads your story, detects characters and dialogue, assigns a unique voice " "to each character, generates audio per segment, and stitches it all together.\n\n" "**Tips:** Use stories with clear dialogue tags (said, whispered, asked) for best auto-detection. " "The 'expressive' model adds emotional audio tags like [whispers] and [sighs] automatically.\n\n" ) if __name__ == "__main__": demo.launch()