# PlotweaverModel's picture
# Upload app.py
# f5b6b19 verified
"""
Multi-Speaker Audiobook Generator
Different voices for narrator + characters
Engines: Premium AI (Qwen) + Premium English (ElevenLabs)
Features:
- Auto-detect characters from text OR manual character definition
- Assign different voices to narrator and each character
- Per-segment TTS with the right voice
- Supports 17 languages across two engines
"""
import os
import base64
import json
import pathlib
import shutil
import struct
import subprocess
import tempfile
import time
import re
import gradio as gr
import requests as http_requests
from openai import OpenAI
try:
import pypdf
HAS_PYPDF = True
except ImportError:
HAS_PYPDF = False
try:
import docx
HAS_DOCX = True
except ImportError:
HAS_DOCX = False
# ==========================================
# CONFIG
# ==========================================
# Model name passed to every DashScope chat-completions call in this file
# (character detection, tag injection and Qwen speech synthesis).
OMNI_MODEL = "qwen3.5-omni-plus"
DASHSCOPE_BASE_URL = "https://dashscope-intl.aliyuncs.com/compatible-mode/v1"
# ElevenLabs REST endpoints, derived from a single API root.
_ELEVENLABS_API_ROOT = "https://api.elevenlabs.io/v1"
ELEVENLABS_TTS_URL = f"{_ELEVENLABS_API_ROOT}/text-to-speech"
ELEVENLABS_CLONE_URL = f"{_ELEVENLABS_API_ROOT}/voices/add"
# Segments longer than this many characters are split at sentence
# boundaries before being sent to a TTS engine.
MAX_CHARS_PER_SEGMENT = 1500
# ==========================================
# LANGUAGES
# ==========================================
# Target languages grouped by the TTS engine that serves them. Order is
# preserved because the dict keys populate the language dropdown.
_QWEN_LANGUAGES = (
    "English",
    "Chinese (Mandarin)",
    "Japanese",
    "Korean",
    "German",
    "French",
    "Russian",
    "Portuguese",
    "Spanish",
    "Italian",
    "Arabic",
)
_ELEVENLABS_LANGUAGES = (
    "English (US)",
    "English (UK)",
    "English (AU)",
    "Swahili",
    "Amharic",
    "Afrikaans",
)
# Maps each language label to a small config dict naming its engine.
LANGUAGES = {
    # Qwen (11)
    **{label: {"engine": "qwen"} for label in _QWEN_LANGUAGES},
    # ElevenLabs (6)
    **{label: {"engine": "elevenlabs"} for label in _ELEVENLABS_LANGUAGES},
}
# ==========================================
# VOICES
# ==========================================
# Qwen voice labels in "<Name> -- <description>" form; the part before
# "--" is the voice name handed to the API.
QWEN_VOICES = [
    "Jennifer -- Cinematic female",
    "Serena -- Gentle female",
    "Katerina -- Mature female",
    "Bella -- Elegant female",
    "Vivian -- Professional female",
    "Mia -- Young female",
    "Seren -- Calm female",
    "Dolce -- Sweet female",
    "Ethan -- Warm male",
    "Ryan -- Dramatic male",
    "Kai -- Soothing male",
    "Neil -- Precise male",
    "Lenn -- Rational male",
    "Arthur -- Classic male",
    "Eldric Sage -- Authoritative male",
    "Vincent -- Theatrical male",
    "Andre -- Deep male",
    "Aiden -- Young male",
]


def _is_female_label(label):
    """Return True when the voice label mentions "female"."""
    return "female" in label


# Separate male/female pools for gender-aware assignment. "male" is a
# substring of "female", so the male pool must exclude female labels.
QWEN_FEMALE_VOICES = list(filter(_is_female_label, QWEN_VOICES))
QWEN_MALE_VOICES = [
    label for label in QWEN_VOICES
    if "male" in label and not _is_female_label(label)
]
QWEN_NARRATOR_VOICES = [
    "Jennifer -- Cinematic female",
    "Eldric Sage -- Authoritative male",
]
# (name, voice id, description) rows for the ElevenLabs voices offered.
_ELEVENLABS_VOICE_ROWS = (
    ("Rachel", "21m00Tcm4TlvDq8ikWAM", "Calm female (US)"),
    ("Drew", "29vD33N1CtxCmqQRPOHJ", "Rounded male (US)"),
    ("Clyde", "2EiwWnXFnvU5JabPnv8n", "Deep male (US)"),
    ("Paul", "5Q0t7uMcjvnagumLfvZi", "News male (US)"),
    ("Antoni", "ErXwobaYiN019PkySvjV", "Rounded male (US)"),
    ("Daniel", "onwK4e9ZLuTAKqWW03F9", "British male"),
    ("George", "JBFqnCBsd6RMkjVDRZzb", "Warm British male"),
    ("Callum", "N2lVS1w4EtoT3dr4eOWO", "Intense male"),
    ("Charlie", "IKne3meq5aSn9XLyUdCD", "Australian male"),
    ("Matilda", "XrExE9yKIg1WjnnlVkGX", "Australian female"),
    ("Freya", "jsCqWAovK2LkecY7zXl4", "Young female"),
)
ELEVENLABS_VOICE_LIST = [
    {"name": voice_name, "id": voice_id, "desc": voice_desc}
    for voice_name, voice_id, voice_desc in _ELEVENLABS_VOICE_ROWS
]
# Dropdown labels in the same "<Name> -- <description>" form as the Qwen voices.
ELEVENLABS_VOICES = [
    " -- ".join((entry["name"], entry["desc"])) for entry in ELEVENLABS_VOICE_LIST
]
# UI quality choice -> ElevenLabs model id.
ELEVENLABS_MODELS = dict(
    quality="eleven_multilingual_v2",
    balanced="eleven_flash_v2_5",
    expressive="eleven_v3",
)
def get_voice_name(label):
    """Return the voice name from a "<Name> -- <description>" label.

    Everything before the first "--" is returned with surrounding
    whitespace removed; a label without "--" is returned stripped.
    """
    name_part, _, _ = label.partition("--")
    return name_part.strip()
def get_el_voice_id(label):
    """Resolve a voice label to its ElevenLabs voice id.

    The name part of the label is matched against ELEVENLABS_VOICE_LIST;
    when no entry matches, the id of the first listed voice is returned.
    """
    wanted = get_voice_name(label)
    matching_ids = (
        entry["id"] for entry in ELEVENLABS_VOICE_LIST if entry["name"] == wanted
    )
    return next(matching_ids, ELEVENLABS_VOICE_LIST[0]["id"])
# ==========================================
# AUDIO HELPERS
# ==========================================
def base64_to_wav(b64_data, output_path):
    """Decode base64 PCM audio and write it to ``output_path`` as a WAV file.

    The decoded bytes are wrapped in a 44-byte RIFF/WAVE header describing
    uncompressed PCM at 24000 Hz, mono, 16 bits per sample.
    """
    pcm = base64.b64decode(b64_data)
    sample_rate = 24000
    channels = 1
    bits_per_sample = 16
    block_align = channels * bits_per_sample // 8
    byte_rate = sample_rate * block_align
    payload_size = len(pcm)
    header = struct.pack(
        "<4sI4s4sIHHIIHH4sI",
        b"RIFF",
        36 + payload_size,  # RIFF chunk size: rest of header plus payload
        b"WAVE",
        b"fmt ",
        16,                 # size of the fmt sub-chunk
        1,                  # audio format 1 = PCM
        channels,
        sample_rate,
        byte_rate,
        block_align,
        bits_per_sample,
        b"data",
        payload_size,
    )
    with open(output_path, "wb") as wav_file:
        wav_file.write(header)
        wav_file.write(pcm)
def concatenate_wavs(wav_files, output_path):
    """Join WAV files into ``output_path`` using ffmpeg's concat demuxer.

    Args:
        wav_files: Paths of the WAV files, in playback order.
        output_path: Destination path of the combined WAV.

    An empty list is a no-op and a single file is simply copied. For two
    or more files a temporary list file (``output_path + ".txt"``) is
    written for ffmpeg and always removed afterwards, even when ffmpeg
    fails.

    Raises:
        subprocess.CalledProcessError: if ffmpeg exits with an error.
    """
    if not wav_files:
        return
    if len(wav_files) == 1:
        shutil.copy2(wav_files[0], output_path)
        return
    list_file = output_path + ".txt"
    try:
        with open(list_file, "w", encoding="utf-8") as f:
            for w in wav_files:
                # Inside single quotes the concat list syntax has no escape
                # character, so a literal quote is written as '\'' .
                escaped = str(w).replace("'", "'\\''")
                f.write(f"file '{escaped}'\n")
        subprocess.run(["ffmpeg", "-y", "-f", "concat", "-safe", "0",
                        "-i", list_file, "-c", "copy", output_path],
                       capture_output=True, check=True)
    finally:
        # Do not leave the list file behind when ffmpeg fails or is missing.
        if os.path.exists(list_file):
            os.remove(list_file)
def generate_silence(dur, path):
    """Write ``dur`` seconds of silence to ``path`` with ffmpeg.

    The output is 24000 Hz mono 16-bit PCM generated from ffmpeg's
    ``anullsrc`` source. Raises ``subprocess.CalledProcessError`` when
    ffmpeg exits with an error.
    """
    command = [
        "ffmpeg", "-y",
        "-f", "lavfi",
        "-i", "anullsrc=r=24000:cl=mono",
        "-t", str(dur),
        "-acodec", "pcm_s16le",
        path,
    ]
    subprocess.run(command, capture_output=True, check=True)
# ==========================================
# DOCUMENT EXTRACTION
# ==========================================
def extract_text_from_file(filepath):
    """Return the plain text of an uploaded document.

    Args:
        filepath: Path to a PDF, Word or text file. The type is chosen
            from the file extension (case-insensitive).

    Returns:
        For ``.pdf`` and ``.docx``/``.doc``: non-empty pages or paragraphs
        joined by blank lines. For anything else: the file content read as
        UTF-8, with undecodable bytes replaced.

    Raises:
        gr.Error: if the optional parser for the file type is not installed.
    """
    ext = os.path.splitext(filepath)[1].lower()
    if ext == ".pdf":
        if not HAS_PYPDF:
            raise gr.Error("pypdf not installed.")
        reader = pypdf.PdfReader(filepath)
        # Extract each page exactly once (the call is expensive) and drop
        # pages that are empty or whitespace-only, as the docx branch does.
        page_texts = (page.extract_text() for page in reader.pages)
        stripped = (t.strip() for t in page_texts if t)
        return "\n\n".join(t for t in stripped if t)
    if ext in (".docx", ".doc"):
        # NOTE(review): legacy .doc is routed to python-docx here — confirm
        # that library can actually open such files.
        if not HAS_DOCX:
            raise gr.Error("python-docx not installed.")
        doc = docx.Document(filepath)
        return "\n\n".join(p.text.strip() for p in doc.paragraphs if p.text.strip())
    with open(filepath, "r", encoding="utf-8", errors="replace") as f:
        return f.read()
# ==========================================
# CHARACTER DETECTION (via AI)
# ==========================================
def auto_detect_characters(client, text):
    """Use AI to detect characters with genders and split text into speaker segments.

    Args:
        client: OpenAI-compatible client used for the chat completion.
        text: Story text to analyse.

    Returns:
        ``(char_list, segments)`` where ``char_list`` is a list of
        ``{"name", "gender"}`` dicts with the Narrator entry first, and
        ``segments`` is a list of ``{"speaker", "text"}`` dicts in story
        order. If the model reply is unusable (not JSON, wrong shape, no
        valid segments) a single Narrator segment holding the full
        ``text`` is returned instead.

    NOTE(review): only the first 8000 characters are sent to the model, so
    on success the returned segments cover at most that prefix of ``text``.
    """

    def narrator_only():
        # Fresh objects on every call so callers can mutate the result.
        return (
            [{"name": "Narrator", "gender": "neutral"}],
            [{"speaker": "Narrator", "text": text}],
        )

    response = client.chat.completions.create(
        model=OMNI_MODEL, modalities=["text"],
        messages=[
            {
                "role": "system",
                "content": (
                    "You are a script analyst. Analyze the text and identify all speaking characters. "
                    "Then split the text into segments, each tagged with who is speaking.\n\n"
                    "Output ONLY valid JSON with this exact structure:\n"
                    "{\n"
                    ' "characters": [\n'
                    ' {"name": "Narrator", "gender": "neutral"},\n'
                    ' {"name": "Elena", "gender": "female"},\n'
                    ' {"name": "Grandfather", "gender": "male"}\n'
                    " ],\n"
                    ' "segments": [\n'
                    ' {"speaker": "Narrator", "text": "The old lighthouse stood at the edge of the world."},\n'
                    ' {"speaker": "Elena", "text": "One day, I\'ll follow that sun to wherever it goes."},\n'
                    ' {"speaker": "Narrator", "text": "The gulls said nothing."}\n'
                    " ]\n"
                    "}\n\n"
                    "Rules:\n"
                    '1. "Narrator" is always first, gender "neutral"\n'
                    '2. Detect character genders from names, pronouns, and context (he/she/etc)\n'
                    "3. If gender unclear, use 'neutral'\n"
                    "4. Detect character names from dialogue tags (said, whispered, asked, etc.)\n"
                    "5. Keep segments in original order, include ALL text\n"
                    "6. Merge consecutive segments by the same speaker\n"
                    "7. Output ONLY the JSON, no markdown, no backticks"
                ),
            },
            {"role": "user", "content": f"Analyze and split into speaker segments:\n\n{text[:8000]}"},
        ],
    )
    # The content may be None; treat that like an empty (unparseable) reply.
    raw = (response.choices[0].message.content or "").strip()
    # Strip a Markdown code fence, with or without a "json" language tag.
    raw = re.sub(r'^```(?:json)?\s*', '', raw)
    raw = re.sub(r'\s*```$', '', raw)
    try:
        data = json.loads(raw)
    except json.JSONDecodeError as e:
        print(f"[CharDetect] JSON parse failed: {e}")
        print(f"[CharDetect] Raw: {raw[:500]}")
        return narrator_only()
    if not isinstance(data, dict):
        print(f"[CharDetect] Unexpected JSON top-level type: {type(data).__name__}")
        return narrator_only()

    # Keep only segments the pipeline can consume: dicts with string text.
    raw_segments = data.get("segments")
    segments = []
    for seg in raw_segments if isinstance(raw_segments, list) else []:
        if not isinstance(seg, dict) or not isinstance(seg.get("text"), str):
            continue
        speaker = seg.get("speaker")
        if not isinstance(speaker, str) or not speaker.strip():
            # Unattributed text is read by the narrator.
            seg = {**seg, "speaker": "Narrator"}
        segments.append(seg)
    if not segments:
        return narrator_only()

    # Keep only well-formed character entries (dict with a non-empty name).
    raw_chars = data.get("characters")
    char_list = [
        c for c in (raw_chars if isinstance(raw_chars, list) else [])
        if isinstance(c, dict) and isinstance(c.get("name"), str) and c["name"].strip()
    ]
    # Ensure Narrator is first: move an existing entry up, or add one.
    narrators = [c for c in char_list if c["name"] == "Narrator"]
    others = [c for c in char_list if c["name"] != "Narrator"]
    char_list = (narrators or [{"name": "Narrator", "gender": "neutral"}]) + others
    return char_list, segments
def inject_audio_tags(client, text):
    """Inject ElevenLabs v3 audio tags for emotional narration.

    Args:
        client: OpenAI-compatible client, or a falsy value to skip tagging.
        text: Text to annotate with bracketed tags such as ``[whispers]``.

    Returns:
        The tagged text from the model. This step is best-effort: the
        original ``text`` is returned unchanged when no client is given,
        when the request fails, or when the model returns nothing.
    """
    if not client:
        return text
    try:
        response = client.chat.completions.create(
            model=OMNI_MODEL, modalities=["text"],
            messages=[
                {
                    "role": "system",
                    "content": (
                        "Add ElevenLabs v3 audio tags to this text for expressive narration. "
                        "Tags are words in [brackets] like [whispers], [excited], [sighs], [laughs], "
                        "[softly], [firmly], [dramatically]. Place before the phrase they apply to. "
                        "Use sparingly (1 tag per 2-3 sentences). Output ONLY the tagged text."
                    ),
                },
                {"role": "user", "content": text},
            ],
        )
        tagged = (response.choices[0].message.content or "").strip()
    except Exception as e:
        # Tagging is optional; log the reason and narrate the plain text.
        print(f"[AudioTags] Tag injection failed, using untagged text: {e}")
        return text
    # An empty reply must not wipe out the text that is about to be spoken.
    return tagged or text
# ==========================================
# TTS: QWEN
# ==========================================
def tts_qwen(client, text, voice, language, translate, chunk_idx, tmp_dir):
    """Synthesize one text chunk with the Qwen omni model.

    The completion is requested as a stream; base64 audio fragments found
    on the stream deltas are collected and written to
    ``<tmp_dir>/seg_<chunk_idx>.wav`` via ``base64_to_wav``. When
    ``translate`` is true the model is asked to translate into
    ``language`` before narrating.

    Returns:
        ``(wav_path, None)`` on success, ``(None, error_message)`` when no
        audio arrived or any exception was raised.
    """
    wav_target = os.path.join(tmp_dir, f"seg_{chunk_idx:04d}.wav")
    if translate:
        system_msg = f"Translate English to {language} and narrate expressively. ONLY spoken {language}."
        user_msg = f"Translate and narrate:\n\n{text}"
    else:
        system_msg = "Narrate expressively as an audiobook. ONLY narration."
        user_msg = f"Narrate:\n\n{text}"
    request = dict(
        model=OMNI_MODEL,
        messages=[
            {"role": "system", "content": system_msg},
            {"role": "user", "content": user_msg},
        ],
        modalities=["text", "audio"],
        audio={"voice": voice, "format": "wav"},
        stream=True,
        stream_options={"include_usage": True},
    )
    try:
        stream = client.chat.completions.create(**request)
        b64_chunks = []
        for event in stream:
            if not event.choices:
                continue
            audio = getattr(event.choices[0].delta, "audio", None)
            if not audio:
                continue
            # The audio delta may be a plain dict or an attribute-style object.
            if isinstance(audio, dict) and "data" in audio:
                b64_chunks.append(audio["data"])
            elif getattr(audio, "data", None):
                b64_chunks.append(audio.data)
        if not b64_chunks:
            return None, "No audio received"
        base64_to_wav("".join(b64_chunks), wav_target)
        return wav_target, None
    except Exception as exc:
        return None, str(exc)
# ==========================================
# TTS: ELEVENLABS
# ==========================================
def tts_elevenlabs(text, voice_id, api_key, model_id, chunk_idx, tmp_dir):
    """Synthesize one text chunk with the ElevenLabs text-to-speech API.

    The response body is saved as ``<tmp_dir>/el_<chunk_idx>.mp3`` and then
    converted with ffmpeg to 24000 Hz mono 16-bit PCM at
    ``<tmp_dir>/el_<chunk_idx>.wav``.

    Returns:
        ``(wav_path, None)`` on success, ``(None, error_message)`` on a
        non-200 response or any exception.
    """
    stem = os.path.join(tmp_dir, f"el_{chunk_idx:04d}")
    mp3_path = stem + ".mp3"
    wav_path = stem + ".wav"
    request_headers = {"xi-api-key": api_key, "Content-Type": "application/json"}
    request_body = {
        "text": text,
        "model_id": model_id,
        "voice_settings": {"stability": 0.5, "similarity_boost": 0.75, "style": 0.3},
    }
    try:
        resp = http_requests.post(
            f"{ELEVENLABS_TTS_URL}/{voice_id}",
            headers=request_headers,
            json=request_body,
            timeout=120,
        )
        print(f"[EL] Seg {chunk_idx}: status={resp.status_code}, {len(resp.content)} bytes")
        if resp.status_code != 200:
            print(f"[EL] Error: {resp.text[:300]}")
            return None, f"TTS failed ({resp.status_code})"
        with open(mp3_path, "wb") as mp3_file:
            mp3_file.write(resp.content)
        # Normalise to the same sample format as the other WAV segments.
        convert_cmd = [
            "ffmpeg", "-y", "-i", mp3_path,
            "-ar", "24000", "-ac", "1", "-acodec", "pcm_s16le",
            wav_path,
        ]
        subprocess.run(convert_cmd, capture_output=True, check=True)
        return wav_path, None
    except Exception as exc:
        return None, str(exc)
# ==========================================
# MAIN PIPELINE
# ==========================================
def _split_long_segment(seg_text, limit):
    """Split ``seg_text`` into chunks of at most ``limit`` characters where possible.

    Text is broken only after sentence-ending punctuation (., !, ?), so a
    single sentence longer than ``limit`` is kept whole in its own chunk.
    """
    if len(seg_text) <= limit:
        return [seg_text]
    chunks = []
    current = ""
    for sentence in re.split(r'(?<=[.!?])\s+', seg_text):
        if len(current) + len(sentence) + 1 <= limit:
            current = (current + " " + sentence).strip()
        else:
            if current:
                chunks.append(current)
            current = sentence
    if current:
        chunks.append(current)
    return chunks


def generate_multi_speaker(
    text_input, file_input, target_language, detection_mode,
    manual_chars_json, el_model_choice, add_tags, add_pauses,
    # Dynamic voice assignments (up to 8 characters)
    v0, v1, v2, v3, v4, v5, v6, v7,
    progress=gr.Progress(),
):
    """Run the full pipeline: text -> speaker segments -> per-voice audio -> MP3.

    Args:
        text_input: Story text typed in the UI (used when no file is given).
        file_input: Optional path of an uploaded document; takes precedence.
        target_language: Key of ``LANGUAGES``; selects the TTS engine.
        detection_mode: ``"Auto-detect"`` or manual character definition.
        manual_chars_json: JSON such as ``{"characters": [...]}`` (manual mode).
        el_model_choice: Key of ``ELEVENLABS_MODELS``.
        add_tags: Inject emotional audio tags (ElevenLabs only; forces v3).
        add_pauses: Insert silence between speakers and between segments.
        v0..v7: Voice labels chosen in the UI for the first eight characters;
            empty values fall back to gender-aware automatic assignment.
        progress: Gradio progress tracker.

    Returns:
        ``(mp3_path, stats_markdown, transcript_markdown)``.

    Raises:
        gr.Error: on missing input or API keys, or any pipeline failure.
    """
    # -- Resolve text --
    if file_input is not None:
        progress(0.01, desc="Extracting text...")
        text = extract_text_from_file(file_input)
    elif text_input and text_input.strip():
        text = text_input.strip()
    else:
        raise gr.Error("Please provide text or upload a file.")
    if len(text) < 20:
        raise gr.Error("Text too short.")

    # -- API keys --
    ds_key = os.environ.get("DASHSCOPE_API_KEY", "")
    el_key = os.environ.get("ELEVENLABS_API_KEY", "")
    engine = LANGUAGES.get(target_language, {}).get("engine", "qwen")
    if engine == "elevenlabs" and not el_key:
        raise gr.Error("ELEVENLABS_API_KEY not set. Add it in Settings > Secrets.")
    # The DashScope key is needed for both engines: segmentation uses it.
    if not ds_key:
        raise gr.Error("DASHSCOPE_API_KEY not set. Add it in Settings > Secrets.")
    client = OpenAI(api_key=ds_key, base_url=DASHSCOPE_BASE_URL)
    # NOTE(review): ``translate`` is only consumed by the Qwen path below; for
    # ElevenLabs languages such as Swahili the text is sent as-is — confirm
    # whether a translation step is intended there.
    translate = target_language not in ("English", "English (US)", "English (UK)", "English (AU)")
    el_model_id = ELEVENLABS_MODELS.get(el_model_choice, "eleven_multilingual_v2")

    # -- Character detection --
    if detection_mode == "Auto-detect":
        progress(0.05, desc="Detecting characters...")
        char_list, segments = auto_detect_characters(client, text)
        characters = [c["name"] for c in char_list]
        char_genders = {c["name"]: c.get("gender", "neutral") for c in char_list}
        print(f"[MultiSpeaker] Detected {len(characters)} characters: {characters}")
        print(f"[MultiSpeaker] Genders: {char_genders}")
        print(f"[MultiSpeaker] {len(segments)} segments")
    else:
        # Manual mode - parse the manual JSON
        try:
            manual_data = json.loads(manual_chars_json) if manual_chars_json else {}
            characters = manual_data.get("characters", ["Narrator"])
            char_genders = {c: "neutral" for c in characters}
            segments = [{"speaker": "Narrator", "text": text}]
            progress(0.05, desc="Splitting text with your characters...")
            response = client.chat.completions.create(
                model=OMNI_MODEL, modalities=["text"],
                messages=[
                    {
                        "role": "system",
                        "content": (
                            f"Split this text into segments by speaker. The characters are: {', '.join(characters)}. "
                            "Output ONLY valid JSON: "
                            # Plain (non-f) literal: braces must be single here,
                            # otherwise the model is shown "{{...}}" as the format.
                            '{"segments": [{"speaker": "Name", "text": "..."}]}'
                            " Include ALL text. No markdown."
                        ),
                    },
                    {"role": "user", "content": text[:8000]},
                ],
            )
            raw = response.choices[0].message.content.strip()
            raw = re.sub(r'^```json\s*', '', raw)
            raw = re.sub(r'\s*```$', '', raw)
            data = json.loads(raw)
            segments = data.get("segments", [{"speaker": "Narrator", "text": text}])
        except Exception as e:
            # Any problem in manual mode degrades to a single narrator voice.
            print(f"[Manual] Parse error: {e}")
            characters = ["Narrator"]
            char_genders = {"Narrator": "neutral"}
            segments = [{"speaker": "Narrator", "text": text}]
    if not segments:
        raise gr.Error("No segments detected. Try a different text or use manual mode.")

    # -- Build voice assignments from the UI dropdowns --
    voice_assignments = [v0, v1, v2, v3, v4, v5, v6, v7]
    # Gender pools are loop-invariant; "male" is a substring of "female",
    # so the male pool has to exclude female descriptions explicitly.
    el_male = [f"{v['name']} -- {v['desc']}" for v in ELEVENLABS_VOICE_LIST
               if "male" in v["desc"].lower() and "female" not in v["desc"].lower()]
    el_female = [f"{v['name']} -- {v['desc']}" for v in ELEVENLABS_VOICE_LIST
                 if "female" in v["desc"].lower()]
    char_voice_map = {}
    male_idx, female_idx = 0, 0
    for i, char in enumerate(characters[:8]):
        if i < len(voice_assignments) and voice_assignments[i]:
            char_voice_map[char] = voice_assignments[i]
            continue
        # Gender-aware auto-assignment, cycling through each pool.
        gender = char_genders.get(char, "neutral")
        if engine == "elevenlabs":
            if gender == "male" and el_male:
                char_voice_map[char] = el_male[male_idx % len(el_male)]
                male_idx += 1
            elif gender == "female" and el_female:
                char_voice_map[char] = el_female[female_idx % len(el_female)]
                female_idx += 1
            else:
                char_voice_map[char] = ELEVENLABS_VOICES[0]
        else:
            if char == "Narrator":
                char_voice_map[char] = QWEN_NARRATOR_VOICES[0]
            elif gender == "male":
                char_voice_map[char] = QWEN_MALE_VOICES[male_idx % len(QWEN_MALE_VOICES)]
                male_idx += 1
            elif gender == "female":
                char_voice_map[char] = QWEN_FEMALE_VOICES[female_idx % len(QWEN_FEMALE_VOICES)]
                female_idx += 1
            else:
                char_voice_map[char] = QWEN_VOICES[i % len(QWEN_VOICES)]
    print(f"[MultiSpeaker] Voice map: {char_voice_map}")

    # Create the work directory only once audio work actually starts, so
    # earlier failures cannot leave an orphaned directory behind.
    tmp_dir = tempfile.mkdtemp(prefix="multispeaker_")
    try:
        # -- Generate audio per segment --
        audio_files = []
        all_transcripts = []
        silence_path = os.path.join(tmp_dir, "silence.wav")
        speaker_pause = os.path.join(tmp_dir, "speaker_pause.wav")
        if add_pauses:
            generate_silence(1.0, silence_path)
            generate_silence(0.4, speaker_pause)
        total = len(segments)
        prev_speaker = None
        for i, seg in enumerate(segments):
            frac = 0.10 + 0.80 * (i / total)
            speaker = seg.get("speaker", "Narrator")
            seg_text = seg.get("text", "").strip()
            if not seg_text:
                continue
            # Unknown speakers fall back to the narrator's voice.
            voice_label = char_voice_map.get(speaker, char_voice_map.get("Narrator", QWEN_VOICES[0]))
            progress(frac, desc=f"[{speaker}] Segment {i+1}/{total}...")
            # Add small pause between different speakers
            if add_pauses and prev_speaker and prev_speaker != speaker:
                audio_files.append(speaker_pause)
            # Split long segments
            sub_texts = _split_long_segment(seg_text, MAX_CHARS_PER_SEGMENT)
            for j, sub_text in enumerate(sub_texts):
                seg_idx = i * 100 + j  # Unique index
                if engine == "elevenlabs":
                    voice_id = get_el_voice_id(voice_label)
                    final_text = sub_text
                    # If tags enabled, inject them and force v3 model (only v3 supports audio tags)
                    tts_model = el_model_id
                    if add_tags and client:
                        final_text = inject_audio_tags(client, sub_text)
                        tts_model = "eleven_v3"  # audio tags require v3
                    wav_path, error = tts_elevenlabs(final_text, voice_id, el_key, tts_model, seg_idx, tmp_dir)
                else:
                    voice = get_voice_name(voice_label)
                    wav_path, error = tts_qwen(client, sub_text, voice, target_language, translate, seg_idx, tmp_dir)
                if wav_path:
                    audio_files.append(wav_path)
                else:
                    # Keep timing intact: a failed chunk becomes silence.
                    all_transcripts.append(f"Segment {i+1} failed: {error}")
                    fail = os.path.join(tmp_dir, f"fail_{seg_idx}.wav")
                    generate_silence(1.5, fail)
                    audio_files.append(fail)
            # Track transcript
            all_transcripts.append(f"**[{speaker}]** {seg_text[:200]}{'...' if len(seg_text) > 200 else ''}")
            # Add section pause
            if add_pauses and i < total - 1:
                audio_files.append(silence_path)
            prev_speaker = speaker
        if not audio_files:
            raise gr.Error("No audio generated.")

        # -- Assemble --
        progress(0.92, desc="Assembling audiobook...")
        final_wav = os.path.join(tmp_dir, "audiobook.wav")
        concatenate_wavs(audio_files, final_wav)
        progress(0.96, desc="Converting to MP3...")
        final_mp3 = os.path.join(tmp_dir, "audiobook.mp3")
        subprocess.run(["ffmpeg", "-y", "-i", final_wav, "-codec:a", "libmp3lame",
                        "-b:a", "128k", "-ar", "24000", "-ac", "1", final_mp3],
                       capture_output=True, check=True)
        progress(1.0, desc="Done!")
        size_mb = os.path.getsize(final_mp3) / (1024 * 1024)
        char_summary = "\n".join(f" - **{c}**: {char_voice_map.get(c, 'auto')}" for c in characters[:8])
        stats = (
            f"**Multi-Speaker Audiobook Generated!**\n\n"
            f"- **Language:** {target_language}\n"
            f"- **Characters:** {len(characters)}\n"
            f"- **Segments:** {total}\n"
            f"- **File size:** {size_mb:.1f} MB\n\n"
            f"**Cast:**\n{char_summary}\n"
        )
        transcript = "\n\n".join(all_transcripts) if all_transcripts else ""
        # Only the final MP3 is returned; drop the intermediate audio so the
        # work directory does not keep every segment WAV around.
        for leftover in os.listdir(tmp_dir):
            leftover_path = os.path.join(tmp_dir, leftover)
            if leftover_path != final_mp3:
                try:
                    os.remove(leftover_path)
                except OSError:
                    pass  # best-effort cleanup; the result is already built
        return final_mp3, stats, transcript
    except gr.Error:
        shutil.rmtree(tmp_dir, ignore_errors=True)
        raise
    except Exception as e:
        shutil.rmtree(tmp_dir, ignore_errors=True)
        raise gr.Error(f"Pipeline error: {str(e)}") from e
# ==========================================
# CHARACTER DETECTION (UI step)
# ==========================================
def detect_characters_ui(text_input, file_input, target_language):
    """Detect characters and return info for the UI.

    Args:
        text_input: Story text typed in the UI (used when no file is given).
        file_input: Optional path of an uploaded document; takes precedence.
        target_language: Key of ``LANGUAGES``; decides which voice list the
            dropdowns offer.

    Returns:
        A list whose first item is a Markdown summary of the detected cast,
        followed by eight ``gr.update`` objects, one per voice dropdown:
        visible with a gender-matched default voice for each of the first
        eight characters, hidden for unused slots.

    Raises:
        gr.Error: if no text was provided or DASHSCOPE_API_KEY is unset.
    """
    if file_input is not None:
        text = extract_text_from_file(file_input)
    elif text_input and text_input.strip():
        text = text_input.strip()
    else:
        raise gr.Error("Provide text first.")
    ds_key = os.environ.get("DASHSCOPE_API_KEY", "")
    if not ds_key:
        raise gr.Error("DASHSCOPE_API_KEY not set.")
    client = OpenAI(api_key=ds_key, base_url=DASHSCOPE_BASE_URL)
    char_list, segments = auto_detect_characters(client, text)
    engine = LANGUAGES.get(target_language, {}).get("engine", "qwen")

    # Build results
    results = f"**Detected {len(char_list)} characters** in {len(segments)} segments:\n\n"
    for i, char in enumerate(char_list[:8]):
        name = char["name"]
        gender = char.get("gender", "neutral")
        seg_count = sum(1 for s in segments if s.get("speaker") == name)
        results += f"{i+1}. **{name}** ({gender}) - {seg_count} segments\n"

    # ElevenLabs gender pools. "male" is a substring of "female", so the
    # male pool must exclude female descriptions; otherwise the first male
    # character would default to a female voice.
    el_male = [f"{v['name']} -- {v['desc']}" for v in ELEVENLABS_VOICE_LIST
               if "male" in v["desc"].lower() and "female" not in v["desc"].lower()]
    el_female = [f"{v['name']} -- {v['desc']}" for v in ELEVENLABS_VOICE_LIST
                 if "female" in v["desc"].lower()]

    # Gender-aware voice assignment
    male_idx, female_idx = 0, 0
    updates = []
    for i in range(8):
        if i >= len(char_list):
            # Unused slot: keep the dropdown hidden.
            updates.append(gr.update(visible=False))
            continue
        char = char_list[i]
        name = char["name"]
        gender = char.get("gender", "neutral")
        if engine == "elevenlabs":
            voices = ELEVENLABS_VOICES
            if gender == "male" and el_male:
                default_voice = el_male[male_idx % len(el_male)]
                male_idx += 1
            elif gender == "female" and el_female:
                default_voice = el_female[female_idx % len(el_female)]
                female_idx += 1
            else:
                default_voice = voices[0]  # Narrator gets first voice
        else:
            voices = QWEN_VOICES
            if name == "Narrator":
                default_voice = QWEN_NARRATOR_VOICES[0]
            elif gender == "male":
                default_voice = QWEN_MALE_VOICES[male_idx % len(QWEN_MALE_VOICES)]
                male_idx += 1
            elif gender == "female":
                default_voice = QWEN_FEMALE_VOICES[female_idx % len(QWEN_FEMALE_VOICES)]
                female_idx += 1
            else:
                default_voice = voices[i % len(voices)]
        updates.append(gr.update(visible=True, label=f"Voice for: {name} ({gender})",
                                 choices=voices, value=default_voice))
    return [results] + updates
# ==========================================
# GRADIO UI
# ==========================================
# Demo story loaded into the text box by the "Load Sample Story" button.
SAMPLE_TEXT = """Chapter 1: The Lighthouse
The old lighthouse stood at the edge of the world. Each morning, Elena climbed one hundred and forty-seven iron steps to the lamp room and watched the sun rise from the sea.
"One day," she whispered to the seagulls, "I'll follow that sun to wherever it goes."
The gulls said nothing. They merely tilted their heads and launched themselves into the wind.
Her grandfather was a man of few words but many stories.
"Tell me about the ships," Elena would say, curling up in the worn armchair by the fire.
And he would smile that slow, careful smile and begin: "There was a ship once, long ago, that sailed beyond the edge of every map. Its captain was a woman with eyes like starlight and a voice that could calm any storm."
"What happened to her?" Elena asked, leaning forward.
"She found what she was looking for," her grandfather said quietly. "But the price was higher than she imagined."
Elena stared into the fire. "Would you pay it? The price, I mean."
The old man was silent for a long time. "I already did," he finally whispered. "I already did."
"""
# Markdown header rendered at the top of the page.
DESCRIPTION = """
# Multi-Speaker Audiobook Generator
### Different Voices for Every Character
"""
# Choices for the target-language dropdown, in LANGUAGES declaration order.
lang_choices = list(LANGUAGES.keys())
# NOTE(review): the original indentation was lost in extraction; the layout
# nesting below (three tabs in the left column, generate button beneath them)
# is a reconstruction — confirm against the deployed app.
with gr.Blocks(title="Multi-Speaker Audiobook") as demo:
    gr.Markdown(DESCRIPTION)

    with gr.Row():
        # -- LEFT --
        with gr.Column(scale=1):
            with gr.Tab("Story"):
                text_input = gr.Textbox(
                    label="English Text",
                    placeholder="Paste your story with dialogue...",
                    lines=10,
                    max_lines=25,
                )
                file_input = gr.File(
                    label="Or Upload (.txt, .md, .pdf, .docx)",
                    file_types=[".txt", ".md", ".pdf", ".docx"],
                    type="filepath",
                )
                sample_btn = gr.Button("Load Sample Story", variant="secondary", size="sm")

            with gr.Tab("Characters"):
                target_lang = gr.Dropdown(
                    choices=lang_choices, value="English", label="Target Language",
                )
                detection_mode = gr.Radio(
                    choices=["Auto-detect", "Manual"],
                    value="Auto-detect",
                    label="Character Detection Mode",
                )
                detect_btn = gr.Button("Detect Characters", variant="secondary")
                detect_result = gr.Markdown(
                    value="Click 'Detect Characters' after entering your text.",
                )
                manual_chars = gr.Textbox(
                    label="Manual Characters (JSON)",
                    placeholder='{"characters": ["Narrator", "Elena", "Grandfather"]}',
                    visible=False,
                    lines=3,
                )
                # Up to 8 character voice dropdowns (hidden until detection)
                voice_dropdowns = [
                    gr.Dropdown(
                        choices=QWEN_VOICES,
                        label=f"Voice {slot + 1}",
                        visible=False,
                        allow_custom_value=True,
                    )
                    for slot in range(8)
                ]

            with gr.Tab("Settings"):
                el_model = gr.Radio(
                    choices=["quality", "balanced", "expressive"],
                    value="quality",
                    label="Voice Quality (English accents only)",
                    info="'expressive' enables emotional audio tags (ElevenLabs v3)",
                )
                add_tags = gr.Checkbox(
                    value=True,
                    label="Auto-inject emotional audio tags",
                    info="AI adds [whispers], [excited], [sighs] etc. for expressive model",
                )
                add_pauses = gr.Checkbox(
                    value=True,
                    label="Add pauses between speakers",
                    info="Short pause when speaker changes, longer between sections",
                )

            generate_btn = gr.Button(
                "Generate Multi-Speaker Audiobook", variant="primary", size="lg",
            )

        # -- RIGHT --
        with gr.Column(scale=1):
            audio_output = gr.Audio(label="Generated Audiobook", type="filepath")
            stats_output = gr.Markdown(label="Cast & Stats")
            with gr.Accordion("Full Transcript (by speaker)", open=False):
                transcript_output = gr.Markdown()

    # -- Events --
    sample_btn.click(fn=lambda: SAMPLE_TEXT, outputs=text_input)

    def toggle_detection_mode(mode):
        """Show the manual JSON box in Manual mode, the detect button otherwise."""
        manual_selected = mode == "Manual"
        return gr.update(visible=manual_selected), gr.update(visible=not manual_selected)

    detection_mode.change(
        fn=toggle_detection_mode,
        inputs=detection_mode,
        outputs=[manual_chars, detect_btn],
    )

    def on_lang_change(lang):
        """Swap every voice dropdown to the voice list of the language's engine."""
        if LANGUAGES.get(lang, {}).get("engine", "qwen") == "elevenlabs":
            voices = ELEVENLABS_VOICES
        else:
            voices = QWEN_VOICES
        # Each slot gets a different default by cycling through the list.
        return [
            gr.update(choices=voices, value=voices[slot % len(voices)])
            for slot in range(8)
        ]

    target_lang.change(fn=on_lang_change, inputs=target_lang, outputs=voice_dropdowns)

    detect_btn.click(
        fn=detect_characters_ui,
        inputs=[text_input, file_input, target_lang],
        outputs=[detect_result] + voice_dropdowns,
    )

    generate_btn.click(
        fn=generate_multi_speaker,
        inputs=[
            text_input, file_input, target_lang, detection_mode,
            manual_chars, el_model, add_tags, add_pauses,
        ] + voice_dropdowns,
        outputs=[audio_output, stats_output, transcript_output],
    )

    gr.Markdown(
        "---\n"
        "**How it works:** AI reads your story, detects characters and dialogue, assigns a unique voice "
        "to each character, generates audio per segment, and stitches it all together.\n\n"
        "**Tips:** Use stories with clear dialogue tags (said, whispered, asked) for best auto-detection. "
        "The 'expressive' model adds emotional audio tags like [whispers] and [sighs] automatically.\n\n"
    )

if __name__ == "__main__":
    demo.launch()