# Identifier string exposed as a module-level constant.
# NOTE(review): how this value is consumed is not visible in this file — confirm with callers.
Unique_ID = "ultradata_math_parser_id_internal"


# XPath expressions matching <div>/<p> elements whose id or class carries a
# paywall-style marker (paywall, premium, paid-content, obfuscated, blurred,
# restricted, overlay). Whitespace inside the expression is insignificant to
# XPath, so the line breaks exist purely for readability.
PAYWALL_DISCARD_XPATH = [
    """.//*[(self::div or self::p)][
    contains(@id, "paywall") or contains(@id, "premium") or
    contains(@class, "paid-content") or contains(@class, "paidcontent") or
    contains(@class, "obfuscated") or contains(@class, "blurred") or
    contains(@class, "restricted") or contains(@class, "overlay")
    ]""",
]


# XPath expressions matching elements treated as page boilerplate.
# The three entries cover, in order:
#   1. div/item/ul/p/section/span elements whose id/class/role hints at
#      navigation, footers, sharing widgets, ads, paywalls and similar chrome;
#   2. any element flagged as hidden or excluded from print;
#   3. any element belonging to comment-form or reply scaffolding.
# translate(@x, "F", "f") lower-cases only the listed letter, so e.g. both
# "footer" and "Footer" match while other casings do not.
OVERALL_DISCARD_XPATH = [
    """.//*[(self::div or self::item or self::ul
    or self::p or self::section or self::span)][
    contains(translate(@id, "F","f"), "footer") or contains(translate(@class, "F","f"), "footer")
    or contains(@id, "related") or contains(translate(@class, "R", "r"), "related") or
    contains(@id, "viral") or contains(@class, "viral") or
    starts-with(@id, "shar") or starts-with(@class, "shar") or
    contains(@class, "share-") or
    contains(translate(@id, "S", "s"), "share") or
    contains(@id, "social") or contains(@class, "social") or contains(@class, "sociable") or
    contains(@id, "syndication") or contains(@class, "syndication") or
    starts-with(@id, "jp-") or starts-with(@id, "dpsp-content") or
    contains(@class, "embedded") or contains(@class, "embed")
    or contains(@id, "newsletter") or contains(@class, "newsletter")
    or contains(@class, "subnav") or
    contains(@id, "cookie") or contains(@class, "cookie") or contains(@id, "tags")
    or contains(@class, "tags") or contains(@id, "sidebar") or
    contains(@class, "sidebar") or contains(@id, "banner") or contains(@class, "banner")
    or contains(@class, "meta") or
    contains(@id, "menu") or contains(@class, "menu") or
    contains(translate(@id, "N", "n"), "nav") or contains(translate(@role, "N", "n"), "nav")
    or starts-with(@class, "nav") or contains(translate(@class, "N", "n"), "navigation") or
    contains(@class, "navbar") or contains(@class, "navbox") or starts-with(@class, "post-nav")
    or contains(@id, "breadcrumb") or contains(@class, "breadcrumb") or
    contains(@id, "bread-crumb") or contains(@class, "bread-crumb") or
    contains(@id, "author") or contains(@class, "author") or
    contains(@id, "button") or contains(@class, "button")
    or contains(translate(@class, "B", "b"), "byline")
    or contains(@class, "rating") or starts-with(@class, "widget") or
    contains(@class, "attachment") or contains(@class, "timestamp") or
    contains(@class, "user-info") or contains(@class, "user-profile") or
    contains(@class, "-ad-") or contains(@class, "-icon")
    or contains(@class, "article-infos") or
    contains(translate(@class, "I", "i"), "infoline")
    or contains(@data-component, "MostPopularStories")
    or contains(@class, "outbrain") or contains(@class, "taboola")
    or contains(@class, "criteo") or contains(@class, "options")
    or contains(@class, "consent") or contains(@class, "modal-content")
    or contains(@class, "paid-content") or contains(@class, "paidcontent")
    or contains(@id, "premium-") or contains(@id, "paywall")
    or contains(@class, "obfuscated") or contains(@class, "blurred")
    or contains(@class, " ad ")
    or contains(@class, "next-post")
    or contains(@class, "yin") or contains(@class, "zlylin") or
    contains(@class, "xg1") or contains(@id, "bmdh")
    or @data-lp-replacement-content]""",
    """.//*[starts-with(@class, "hide-") or contains(@class, "hide-print") or contains(@id, "hidden")
    or contains(@style, "hidden") or contains(@hidden, "hidden") or contains(@class, "noprint")
    or contains(@style, "display:none") or contains(@class, " hidden") or @aria-hidden="true"
    or contains(@class, "notloaded")]""",
    """.//*[@class="comments-title" or contains(@class, "comments-title") or
    contains(@class, "nocomments") or starts-with(@id, "reply-") or starts-with(@class, "reply-") or
    contains(@class, "-reply-") or contains(@class, "message") or contains(@id, "message_container")
    or contains(@id, "akismet") or contains(@class, "akismet")] """,
]


# XPath matching div/item/ul/p/section/span elements whose id or class
# contains "teaser" (also "Teaser": only the letter T is case-folded).
TEASER_DISCARD_XPATH = [
    """.//*[(self::div or self::item or self::ul
    or self::p or self::section or self::span)][
    contains(translate(@id, "T", "t"), "teaser") or contains(translate(@class, "T", "t"), "teaser")
    ]""",
]


# Additional discard expressions: every <header> element, plus
# div/item/ul/p/section/span elements whose id/class contains "bottom" or
# "link", or whose inline style mentions "border".
# NOTE(review): the name suggests these apply only in a precision-favouring
# mode — confirm against the consumer.
PRECISION_DISCARD_XPATH = [
    ".//header",
    """.//*[(self::div or self::item or self::ul
    or self::p or self::section or self::span)][
    contains(@id, "bottom") or contains(@class, "bottom") or
    contains(@id, "link") or contains(@class, "link")
    or contains(@style, "border")
    ]""",
]


# XPath matching div/item/ul/p/section/span elements whose id or class
# contains "caption".
DISCARD_IMAGE_ELEMENTS = [
    """.//*[(self::div or self::item or self::ul
    or self::p or self::section or self::span)][
    contains(@id, "caption") or contains(@class, "caption")
    ]
    """
]


# XPath matching div/ul/section elements that look like a comment section:
# id/class/name starting with "comment" (or "Comment"), well-known comment
# container classes, and Disqus-style ids.
REMOVE_COMMENTS_XPATH = [
    """.//*[(self::div or self::ul or self::section)][
    starts-with(translate(@id, "C","c"), 'comment') or
    starts-with(translate(@class, "C","c"), 'comment') or starts-with(translate(@name, "C","c"), 'comment') or
    contains(@class, 'article-comments') or contains(@class, 'post-comments')
    or starts-with(@id, 'comol') or starts-with(@id, 'disqus_thread')
    or starts-with(@id, 'dsq-comments')
    ]"""
]


# Document-wide (absolute "//") XPath expressions for <div> noise: blocks
# whose class/name/id starts with "advert", and blocks hidden via inline
# style. Both spellings of the display rule (with and without a space after
# the colon) are listed because contains() is a literal substring match.
CONTENT_EXTRACTOR_NOISE_XPATHS = [
    '//div[starts-with(@class, "advert") or starts-with(@name, "advert") or starts-with(@id, "advert")]',
    '//div[contains(@style, "display: none")]',
    '//div[contains(@style, "display:none")]',
]


# HTML tag names grouped under "cleaned".
# NOTE(review): presumably these elements are removed together with their
# content — confirm against the code that consumes this list.
# NOTE(review): "ins" is also listed in MANUALLY_STRIPPED; confirm which
# treatment is intended for it.
MANUALLY_CLEANED = [
    "aside",
    "embed",
    "footer",
    "head",
    "iframe",
    "menu",
    "object",
    "script",
    "applet",
    "canvas",
    "map",
    "svg",
    "area",
    "blink",
    "button",
    "datalist",
    "dialog",
    "frame",
    "frameset",
    "fieldset",
    "hr",
    "link",
    "input",
    "ins",
    "label",
    "legend",
    "marquee",
    "menuitem",
    "nav",
    "noscript",
    "optgroup",
    "option",
    "output",
    "param",
    "progress",
    "rp",
    "rt",
    "rtc",
    "select",
    "style",
    "track",
    "textarea",
    "time",
    "use",
]


# HTML tag names grouped under "stripped".
# NOTE(review): presumably the tag wrapper is dropped while its text is
# kept — confirm against the code that consumes this list.
MANUALLY_STRIPPED = [
    "abbr",
    "acronym",
    "address",
    "bdi",
    "bdo",
    "big",
    "cite",
    "data",
    "dfn",
    "font",
    "hgroup",
    "ins",
    "mark",
    "meta",
    "ruby",
    "small",
    "tbody",
    "template",
    "tfoot",
    "thead",
]


# Set of HTML tag names (block containers, headings and inline emphasis tags).
# NOTE(review): the name suggests these elements are pruned when empty —
# confirm against the consumer.
CUT_EMPTY_ELEMS = {
    "article",
    "b",
    "blockquote",
    "dd",
    "div",
    "dt",
    "em",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "i",
    "li",
    "main",
    "p",
    "pre",
    "q",
    "section",
    "span",
    "strong",
}


# Keyword fragments associated with non-content page regions.
# NOTE(review): presumably matched against element id/class attribute
# values — confirm against the consumer.
USELESS_ATTR = [
    "share",
    "contribution",
    "copyright",
    "copy-right",
    "disclaimer",
    "recommend",
    "related",
    "footer",
    "social",
    "submeta",
    "report-infor",
]


# XPath expressions locating the main content container, listed from the
# most specific class/id conventions (post/entry/article body markers) down
# to generic fallbacks ("content", "main", <main>).
# NOTE(review): the first entry applies [1] without the outer parentheses the
# later entries use, so it selects the first matching child per parent rather
# than the first match in document order. Left as-is because the consumer's
# expectations are not visible here.
BODY_XPATH = [
    """.//*[(self::article or self::div or self::main or self::section)][
    @class="post" or @class="entry" or
    contains(@class, "post-text") or contains(@class, "post_text") or
    contains(@class, "post-body") or contains(@class, "post-entry") or contains(@class, "postentry") or
    contains(@class, "post-content") or contains(@class, "post_content") or
    contains(@class, "postcontent") or contains(@class, "postContent") or
    contains(@class, "article-text") or contains(@class, "articletext") or contains(@class, "articleText")
    or contains(@id, "entry-content") or
    contains(@class, "entry-content") or contains(@id, "article-content") or
    contains(@class, "article-content") or contains(@id, "article__content") or
    contains(@class, "article__content") or contains(@id, "article-body") or
    contains(@class, "article-body") or contains(@id, "article__body") or
    contains(@class, "article__body") or @itemprop="articleBody" or
    contains(translate(@id, "B", "b"), "articlebody") or contains(translate(@class, "B", "b"), "articlebody")
    or @id="articleContent" or contains(@class, "ArticleContent") or
    contains(@class, "page-content") or contains(@class, "text-content") or
    contains(@id, "body-text") or contains(@class, "body-text") or contains(@class, "body-content") or contains(translate(@class, "B", "b"), "textbody") or
    contains(@class, "article__container") or contains(@id, "art-content") or contains(@class, "art-content")][1]""",
    "(.//article)[1]",
    """(.//*[(self::article or self::div or self::main or self::section)][
    contains(@class, 'post-bodycopy') or
    contains(@class, 'storycontent') or contains(@class, 'story-content') or
    @class='postarea' or @class='art-postcontent' or
    contains(@class, 'theme-content') or contains(@class, 'blog-content') or
    contains(@class, 'section-content') or contains(@class, 'single-content') or
    contains(@class, 'single-post') or
    contains(@class, 'main-column') or contains(@class, 'wpb_text_column') or
    starts-with(@id, 'primary') or starts-with(@class, 'article ') or @class="text" or
    @id="article" or @class="cell" or @id="story" or @class="story" or
    contains(@class, "story-body") or contains(@class, "field-body") or
    contains(translate(@class, "FULTEX","fultex"), "fulltext")
    or @role='article'])[1]""",
    """(.//*[(self::article or self::div or self::main or self::section)][
    contains(@id, "content-main") or contains(@class, "content-main") or contains(@class, "content_main") or
    contains(@id, "content-body") or contains(@class, "content-body") or contains(@id, "contentBody")
    or contains(@class, "content__body") or contains(translate(@id, "CM","cm"), "main-content") or contains(translate(@class, "CM","cm"), "main-content")
    or contains(translate(@class, "CP","cp"), "page-content") or
    @id="content" or @class="content"])[1]""",
    # The "|" below is an XPath union: also accept the first <main> element.
    '(.//*[(self::article or self::div or self::section)][starts-with(@class, "main") or starts-with(@id, "main") or starts-with(@role, "main")])[1]|(.//main)[1]',
]


# XPath expressions for forum / Q&A style pages. In order they match
# containers whose id/class mentions: question, answer, comment, message
# containers, post/reply body markers, and ids of the form "post-"/"post_".
# Each predicate appears once per OR chain; repeating one has no effect on
# the result.
Forum_XPATH = [
    """.//*[(self::article or self::div or self::main or self::section or self::li or self::tr)][
    contains(@id, 'question') or contains(@class, 'question')]""",
    """.//*[(self::article or self::div or self::main or self::section or self::li or self::tr)][
    contains(@id, 'answer') or contains(@class, 'answer')]""",
    """.//*[(self::article or self::div or self::main or self::section or self::li or self::tr)][
    contains(@id, 'comment') or contains(@class, 'comment') or contains(@class, 'Comment')]""",
    """.//*[(self::article or self::div or self::main or self::section or self::li or self::tr)][contains(@class, "message-container") or contains(@id, "message_container") or contains(@class, "Messages_container")]""",
    """.//*[(self::article or self::div or self::main or self::section or self::p or self::span or self::li or self::tr)][
    contains(@id, 'comment-content') or contains(@class, 'comment-content') or contains(@class, 'comment-body') or
    contains(@class, "post-reply") or contains(@class, "reply_content") or contains(@class, "reply-content") or
    contains(@class, "reply_post") or contains(@id, "reply") or
    contains(@class, "post-text") or contains(@class, "post_text") or
    contains(@class, "post-body") or contains(@class, "postbody") or contains(@class, "post-entry") or contains(@class, "postentry") or contains(@component, 'post') or
    contains(@class, "post-content") or contains(@class, "post_content") or contains(@class, "p_content") or contains(@class, "Post_content") or contains(@class, "message-post") or contains(@class, "js-post")]""",
    """.//*[(self::article or self::div or self::main or self::section or self::p or self::span or self::li or self::tr)][contains(@id, 'post-') or contains(@id, 'post_')]""",
]


# XPath expressions reading the @content of <meta> tags whose property or
# name starts with a title-like key (og:title, title, page:title).
METAS = [
    '//meta[starts-with(@property, "og:title")]/@content',
    '//meta[starts-with(@name, "og:title")]/@content',
    '//meta[starts-with(@property, "title")]/@content',
    '//meta[starts-with(@name, "title")]/@content',
    '//meta[starts-with(@property, "page:title")]/@content',
    '//meta[starts-with(@name, "page:title")]/@content',
]
# Mapping that is currently empty.
# NOTE(review): presumably URL pattern -> HTML type, going by the name — confirm with callers.
URL_PATTERNS_TO_HTML_TYPE = {}


# Per-site extraction rules keyed by domain. Each rule holds:
#   "clean":   XPath expressions listed for cleanup,
#   "title":   {"mode", "value"} describing how to locate the title,
#   "content": {"mode", "value"} describing how to locate the main content.
# The " | " inside the values is an XPath union of alternative selectors.
BUILTIN_SITE_RULES = {
    "answers.com": {
        "clean": [
            "//script",
            "//style",
        ],
        "title": {
            "mode": "xpath",
            "value": "//h1[@property='name']//text() | //h1[contains(@class, 'headline1')]//text()",
        },
        "content": {
            "mode": "xpath",
            "value": "//div[@property='content'] | //div[contains(@class, 'markdownStyles')]",
        },
    },
}


# Weight per scoring factor. Only "content_length" currently carries weight
# (1.0); every other factor is set to 0.0.
SCORING_WEIGHTS = {
    "content_length": 1.0,
    "paragraph_quality": 0.0,
    "link_density": 0.0,
    "text_density": 0.0,
    "punctuation_density": 0.0,
    "structure_completeness": 0.0,
    "xpath_confidence": 0.0,
    "noise_elements": 0.0,
    "code_block_quality": 0.0,
    "list_structure": 0.0,
}


# Numeric thresholds used alongside scoring.
# NOTE(review): exact semantics (units, comparison direction) are not
# visible in this file — confirm against the scoring code.
SCORE_THRESHOLDS = {
    "min_acceptable_score": 3.0,
    "similar_threshold": 0.5,
}