diff options
| author | Omar Roth <omarroth@hotmail.com> | 2018-03-03 15:59:21 -0600 |
|---|---|---|
| committer | Omar Roth <omarroth@hotmail.com> | 2018-03-03 15:59:21 -0600 |
| commit | d573461a671a0ccbf1b06c5e16cbc7c5cff52139 (patch) | |
| tree | 6a03672df7e2d922ec37b40ae7e1be9b9b57ebc7 /src | |
| parent | b8fe82a7f7d05f4a2af0d7445e2633dea5169395 (diff) | |
| download | invidious-d573461a671a0ccbf1b06c5e16cbc7c5cff52139.tar.gz invidious-d573461a671a0ccbf1b06c5e16cbc7c5cff52139.tar.bz2 invidious-d573461a671a0ccbf1b06c5e16cbc7c5cff52139.zip | |
Parse HTML properly instead of relying on regexes
Diffstat (limited to 'src')
| -rw-r--r-- | src/helpers.cr | 29 |
1 file changed, 26 insertions, 3 deletions
diff --git a/src/helpers.cr b/src/helpers.cr index 45204132..094d7dc1 100644 --- a/src/helpers.cr +++ b/src/helpers.cr @@ -132,8 +132,19 @@ def fetch_video(id, client) dislikes = dislikes ? dislikes.content.delete(",").to_i : 0 description = html.xpath_node(%q(//p[@id="eow-description"])) + if description + description.xpath_nodes(%q(//a/@href)).each do |match| + uri = URI.parse(match.content) + + if uri.host =~ /(www\.)?youtube.com/ + uri = uri.full_path + puts uri + end + + match.content = uri.to_s + end + end description = description ? description.to_xml : "" - description = description.gsub(/(https:\/\/)|(http:\/\/)?(www\.)?(youtube\.com)/, "") wilson_score = ci_lower_bound(likes, likes + dislikes) @@ -278,6 +289,20 @@ def template_comments(root) author = child["data"]["author"] score = child["data"]["score"] body_html = HTML.unescape(child["data"]["body_html"].as_s) + body_html = XML.parse_html(body_html) + + body_html.xpath_nodes(%q(//a/@href)).each do |match| + uri = URI.parse(match.content) + + if uri.host =~ /(www\.)?youtube.com/ + uri = uri.full_path + puts uri + end + + match.content = uri.to_s + end + + body_html = body_html.to_s replies_html = "" if child["data"]["replies"] != "" @@ -317,8 +342,6 @@ def template_comments(root) end end - html = html.gsub(/(https:\/\/)|(http:\/\/)?(www\.)?(youtube\.com)/, "") - return html end |
