| | """Set of default text cleaners"""
|
| |
|
| |
|
| | import re
|
| |
|
| |
|
| | _whitespace_re = re.compile(r"\s+")
|
| |
|
| | rep_map = {
|
| | ":": ",",
|
| | ";": ",",
|
| | ",": ",",
|
| | "。": ".",
|
| | "!": "!",
|
| | "?": "?",
|
| | "\n": ".",
|
| | "·": ",",
|
| | "、": ",",
|
| | "...": ".",
|
| | "…": ".",
|
| | "$": ".",
|
| | "“": "'",
|
| | "”": "'",
|
| | "‘": "'",
|
| | "’": "'",
|
| | "(": "'",
|
| | ")": "'",
|
| | "(": "'",
|
| | ")": "'",
|
| | "《": "'",
|
| | "》": "'",
|
| | "【": "'",
|
| | "】": "'",
|
| | "[": "'",
|
| | "]": "'",
|
| | "—": "",
|
| | "~": "-",
|
| | "~": "-",
|
| | "「": "'",
|
| | "」": "'",
|
| | }
|
| |
|
def replace_punctuation(text, mapping=None):
    """Replace every occurrence of a mapped key in ``text`` with its value.

    Args:
        text: Input string.
        mapping: Optional dict of ``{source: replacement}`` strings. When
            omitted, the module-level ``rep_map`` is used.

    Returns:
        ``text`` with each matched key substituted. The text is returned
        unchanged when the mapping is empty.
    """
    if mapping is None:
        mapping = rep_map
    if not mapping:
        # An empty alternation would match the empty string at every
        # position and then fail the lookup in the substitution callback.
        return text
    # Regex alternation accepts the first branch that matches, so longer keys
    # go first; otherwise a key that is a prefix of a longer key would shadow
    # it. ``sorted`` is stable, so equal-length keys keep their dict order.
    keys = sorted(mapping, key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(key) for key in keys))
    return pattern.sub(lambda match: mapping[match.group()], text)
|
| |
|
def lowercase(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered
|
| |
|
| |
|
def collapse_whitespace(text):
    """Squeeze each whitespace run to one space and trim both ends."""
    squeezed = _whitespace_re.sub(" ", text)
    return squeezed.strip()
|
| |
|
def remove_punctuation_at_begin(text):
    """Drop a leading run of ``,``, ``.``, ``!`` or ``?`` from ``text``.

    Only characters at the very start are removed; anything after the first
    other character (including whitespace) is left untouched.
    """
    leading_punctuation = re.compile(r'^[,.!?]+')
    return leading_punctuation.sub('', text)
|
| |
|
def remove_aux_symbols(text):
    """Delete angle brackets, parentheses, square brackets, guillemets and
    single/double quote characters from ``text``."""
    return re.sub(r"[\<\>\(\)\[\]\"\«\»\']+", "", text)
|
| |
|
| |
|
def replace_symbols(text, lang="en"):
    """Replace symbols based on the language tag.

    ``;`` and ``:`` become ``,``. ``-`` becomes a space, except for Catalan
    where it is removed. ``&`` is spelled out as the conjunction of the given
    language, padded with spaces. Apostrophes are removed for Catalan and
    Spanish.

    Args:
        text: Input text.
        lang: Language identifier, e.g. "en", "fr", "pt", "ca", "es". Any
            other value leaves ``&`` and apostrophes untouched.

    Returns:
        The modified text.

    Example:
        >>> replace_symbols("si l'avi cau, diguem-ho", lang="ca")
        'si lavi cau, diguemho'
    """
    ampersand_words = {
        "en": " and ",
        "fr": " et ",
        "pt": " e ",
        "ca": " i ",
        # Padded like the other languages so "a&b" does not fuse into "ayb".
        "es": " y ",
    }
    apostrophe_free_langs = ("ca", "es")

    text = text.replace(";", ",")
    text = text.replace("-", "" if lang == "ca" else " ")
    text = text.replace(":", ",")
    if lang in ampersand_words:
        text = text.replace("&", ampersand_words[lang])
    if lang in apostrophe_free_langs:
        text = text.replace("'", "")
    return text
|
| |
|
def unicleaners(text, cased=False, lang='en'):
    """Basic multilingual text-cleaning pipeline.

    Abbreviations and numbers are not expanded here; per the original
    author's note, the phonemizer already does that.

    Steps, in order: optional lowercasing, punctuation normalisation,
    language-specific symbol replacement, removal of brackets/quotes,
    removal of leading punctuation, whitespace collapsing, and finally
    appending a ``.`` when the text does not already end in one of
    ``. , ! ? - …``.

    Args:
        text: Input text.
        cased: When false (the default) the text is lowercased first.
        lang: Language identifier forwarded to ``replace_symbols``.

    Returns:
        The cleaned text. An empty result stays empty (no ``.`` is added).
    """
    if not cased:
        text = lowercase(text)
    text = replace_punctuation(text)
    text = replace_symbols(text, lang=lang)
    text = remove_aux_symbols(text)
    # Collapse first: leading whitespace (e.g. left behind by a removed
    # bracket or a replaced "-") would otherwise shield leading punctuation
    # from the start-anchored strip below.
    text = collapse_whitespace(text)
    text = remove_punctuation_at_begin(text)
    # Collapse again to trim the space that followed the removed punctuation.
    text = collapse_whitespace(text)
    # Terminate the utterance with "." unless it already ends in punctuation.
    text = re.sub(r'([^\.,!\?\-…])$', r'\1.', text)
    return text
|
| |
|
| |
|