From 49ddf8b6bdb98c7a9678cbec800c45350a54a786 Mon Sep 17 00:00:00 2001 From: techmetx11 Date: Thu, 23 Mar 2023 05:10:21 +0000 Subject: Added attributed description support --- src/invidious/videos/description.cr | 81 +++++++++++++++++++++++++++++++++++++ src/invidious/videos/parser.cr | 6 ++- 2 files changed, 85 insertions(+), 2 deletions(-) create mode 100644 src/invidious/videos/description.cr (limited to 'src') diff --git a/src/invidious/videos/description.cr b/src/invidious/videos/description.cr new file mode 100644 index 00000000..d4c60a84 --- /dev/null +++ b/src/invidious/videos/description.cr @@ -0,0 +1,81 @@ +require "json" +require "uri" + +def parse_command(command : JSON::Any?, string : String) : String? + on_tap = command.dig?("onTap", "innertubeCommand") + + # 3rd party URL, extract original URL from YouTube tracking URL + if url_endpoint = on_tap.try &.["urlEndpoint"]? + youtube_url = URI.parse url_endpoint["url"].as_s + + original_url = youtube_url.query_params["q"]? + if original_url.nil? + return "" + else + return "#{original_url}" + end + # 1st party watch URL + elsif watch_endpoint = on_tap.try &.["watchEndpoint"]? + video_id = watch_endpoint["videoId"].as_s + time = watch_endpoint["startTimeSeconds"].as_i + + url = "/watch?v=#{video_id}&t=#{time}s" + + # if text is a timestamp, use the string instead + if /(?:\d{2}:){1,2}\d{2}/ =~ string + return "#{string}" + else + return "#{url}" + end + # hashtag/other browse URLs + elsif browse_endpoint = on_tap.try &.dig?("commandMetadata", "webCommandMetadata") + url = browse_endpoint["url"].try &.as_s + + # remove unnecessary character in a channel name + if browse_endpoint["webPageType"]?.try &.as_s == "WEB_PAGE_TYPE_CHANNEL" + name = string.match(/@[\w\d]+/) + if name.try &.[0]? + return "#{name.try &.[0]}" + end + end + + return "#{string}" + end + + return "(unknown YouTube desc command)" +end + +def parse_description(desc : JSON::Any?) : String? + if desc.nil? + return "" + end + + content = desc["content"].as_s + if content.empty? + return "" + end + + if commands = desc["commandRuns"]?.try &.as_a + description = String.build do |str| + index = 0 + commands.each do |command| + start_index = command["startIndex"].as_i + length = command["length"].as_i + + if start_index > 0 && start_index - index > 0 + str << content[index..(start_index - 1)] + index += start_index - index + end + + str << parse_command(command, content[start_index, length]) + index += length + end + if index < content.size + str << content[index..content.size] + end + end + return description + end + + return content +end diff --git a/src/invidious/videos/parser.cr b/src/invidious/videos/parser.cr index 608ae99d..3a342a95 100644 --- a/src/invidious/videos/parser.cr +++ b/src/invidious/videos/parser.cr @@ -284,8 +284,10 @@ def parse_video_info(video_id : String, player_response : Hash(String, JSON::Any description = microformat.dig?("description", "simpleText").try &.as_s || "" short_description = player_response.dig?("videoDetails", "shortDescription") - description_html = video_secondary_renderer.try &.dig?("description", "runs") - .try &.as_a.try { |t| content_to_comment_html(t, video_id) } + # description_html = video_secondary_renderer.try &.dig?("description", "runs") + # .try &.as_a.try { |t| content_to_comment_html(t, video_id) } + + description_html = parse_description(video_secondary_renderer.try &.dig?("attributedDescription")) # Video metadata -- cgit v1.2.3 From 7755ed4ac8812377da04cff951324ab31d2e621c Mon Sep 17 00:00:00 2001 From: techmetx11 Date: Thu, 23 Mar 2023 20:12:54 +0000 Subject: Fix regexs --- src/invidious/videos/description.cr | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'src') diff --git a/src/invidious/videos/description.cr b/src/invidious/videos/description.cr index d4c60a84..3d25197b 100644 --- a/src/invidious/videos/description.cr +++ b/src/invidious/videos/description.cr @@ -21,8 +21,9 @@ def parse_command(command : JSON::Any?, string : String) : String? url = "/watch?v=#{video_id}&t=#{time}s" - # if text is a timestamp, use the string instead - if /(?:\d{2}:){1,2}\d{2}/ =~ string + # if string is a timestamp, use the string instead + # this is a lazy regex for validating timestamps + if /(?:\d{1,2}:){1,2}\d{2}/ =~ string return "#{string}" else return "#{url}" @@ -33,7 +34,7 @@ def parse_command(command : JSON::Any?, string : String) : String? # remove unnecessary character in a channel name if browse_endpoint["webPageType"]?.try &.as_s == "WEB_PAGE_TYPE_CHANNEL" - name = string.match(/@[\w\d]+/) + name = string.match(/@[\w\d.-]+/) if name.try &.[0]? return "#{name.try &.[0]}" end -- cgit v1.2.3 From 73d2ed6f77308dd300e68f3ea059c6aa2c10b1ce Mon Sep 17 00:00:00 2001 From: techmetx11 Date: Wed, 29 Mar 2023 23:33:23 +0000 Subject: Optimize some redundant stuff --- src/invidious/videos/description.cr | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'src') diff --git a/src/invidious/videos/description.cr b/src/invidious/videos/description.cr index 3d25197b..b1d851d3 100644 --- a/src/invidious/videos/description.cr +++ b/src/invidious/videos/description.cr @@ -64,8 +64,8 @@ def parse_description(desc : JSON::Any?) : String? length = command["length"].as_i if start_index > 0 && start_index - index > 0 - str << content[index..(start_index - 1)] - index += start_index - index + str << content[index...start_index] + index = start_index end str << parse_command(command, content[start_index, length]) -- cgit v1.2.3 From b3c0afef02ee13c7f291fd26a5d64b4aee059906 Mon Sep 17 00:00:00 2001 From: Samantaz Fox Date: Wed, 5 Apr 2023 23:43:41 +0200 Subject: Videos: fix description text offset when emojis are present --- src/invidious/videos/description.cr | 71 ++++++++++++++++++++++++------------- 1 file changed, 47 insertions(+), 24 deletions(-) (limited to 'src') diff --git a/src/invidious/videos/description.cr b/src/invidious/videos/description.cr index b1d851d3..2017955d 100644 --- a/src/invidious/videos/description.cr +++ b/src/invidious/videos/description.cr @@ -46,37 +46,60 @@ def parse_command(command : JSON::Any?, string : String) : String? return "(unknown YouTube desc command)" end -def parse_description(desc : JSON::Any?) : String? - if desc.nil? - return "" +private def copy_string(str : String::Builder, iter : Iterator, count : Int) : Int + copied = 0 + while copied < count + cp = iter.next + break if cp.is_a?(Iterator::Stop) + + str << cp.chr + + # A codepoint from the SMP counts twice + copied += 1 if cp > 0xFFFF + copied += 1 end + return copied +end + +def parse_description(desc : JSON::Any?) : String? + return "" if desc.nil? + content = desc["content"].as_s - if content.empty? - return "" - end + return "" if content.empty? - if commands = desc["commandRuns"]?.try &.as_a - description = String.build do |str| - index = 0 - commands.each do |command| - start_index = command["startIndex"].as_i - length = command["length"].as_i + commands = desc["commandRuns"]?.try &.as_a + return content if commands.nil? - if start_index > 0 && start_index - index > 0 - str << content[index...start_index] - index = start_index - end + # Not everything is stored in UTF-8 on youtube's side. The SMP codepoints + # (0x10000 and above) are encoded as UTF-16 surrogate pairs, which are + # automatically decoded by the JSON parser. It means that we need to count + # copied byte in a special manner, preventing the use of regular string copy. + iter = content.each_codepoint - str << parse_command(command, content[start_index, length]) - index += length - end - if index < content.size - str << content[index..content.size] + index = 0 + + return String.build do |str| + commands.each do |command| + cmd_start = command["startIndex"].as_i + cmd_length = command["length"].as_i + + # Copy the text chunk between this command and the previous if needed. + length = cmd_start - index + index += copy_string(str, iter, length) + + # We need to copy the command's text using the iterator + # and the special function defined above. + cmd_content = String.build(cmd_length) do |str2| + copy_string(str2, iter, cmd_length) end + + str << parse_command(command, cmd_content) + index += cmd_length end - return description - end - return content + # Copy the end of the string (past the last command). + remaining_length = content.size - index + copy_string(str, iter, remaining_length) if remaining_length > 0 + end end -- cgit v1.2.3