Videos: fix description text offset when emojis are present

author: Samantaz Fox <coding@samantaz.fr> 2023-04-05 23:43:41 +0200
committer: Samantaz Fox <coding@samantaz.fr> 2023-04-05 23:43:41 +0200
commit: b3c0afef02ee13c7f291fd26a5d64b4aee059906 (patch)
tree: f3d4bbd367611b6eb82e7bca231b6b54e41916d9
parent: 73d2ed6f77308dd300e68f3ea059c6aa2c10b1ce (diff)
download: invidious-b3c0afef02ee13c7f291fd26a5d64b4aee059906.tar.gz
invidious-b3c0afef02ee13c7f291fd26a5d64b4aee059906.tar.bz2
invidious-b3c0afef02ee13c7f291fd26a5d64b4aee059906.zip
1 files changed, 47 insertions, 24 deletions
diff --git a/src/invidious/videos/description.cr b/src/invidious/videos/description.cr
index b1d851d3..2017955d 100644
--- a/src/invidious/videos/description.cr
+++ b/src/invidious/videos/description.cr
@@ -46,37 +46,60 @@ def parse_command(command : JSON::Any?, string : String) : String?
   return "(unknown YouTube desc command)"
 end
 
-def parse_description(desc : JSON::Any?) : String?
-  if desc.nil?
-    return ""
+private def copy_string(str : String::Builder, iter : Iterator, count : Int) : Int
+  copied = 0
+  while copied < count
+    cp = iter.next
+    break if cp.is_a?(Iterator::Stop)
+
+    str << cp.chr
+
+    # A codepoint from the SMP counts twice
+    copied += 1 if cp > 0xFFFF
+    copied += 1
   end
 
+  return copied
+end
+
+def parse_description(desc : JSON::Any?) : String?
+  return "" if desc.nil?
+
   content = desc["content"].as_s
-  if content.empty?
-    return ""
-  end
+  return "" if content.empty?
 
-  if commands = desc["commandRuns"]?.try &.as_a
-    description = String.build do |str|
-      index = 0
-      commands.each do |command|
-        start_index = command["startIndex"].as_i
-        length = command["length"].as_i
+  commands = desc["commandRuns"]?.try &.as_a
+  return content if commands.nil?
 
-        if start_index > 0 && start_index - index > 0
-          str << content[index...start_index]
-          index = start_index
-        end
+  # Not everything is stored in UTF-8 on YouTube's side. The SMP codepoints
+  # (0x10000 and above) are encoded as UTF-16 surrogate pairs, which are
+  # automatically decoded by the JSON parser. This means that we need to count
+  # copied text in UTF-16 code units, preventing the use of regular string copy.
+  iter = content.each_codepoint
 
-        str << parse_command(command, content[start_index, length])
-        index += length
-      end
-      if index < content.size
-        str << content[index..content.size]
+  index = 0
+
+  return String.build do |str|
+    commands.each do |command|
+      cmd_start = command["startIndex"].as_i
+      cmd_length = command["length"].as_i
+
+      # Copy the text chunk between this command and the previous if needed.
+      length = cmd_start - index
+      index += copy_string(str, iter, length)
+
+      # We need to copy the command's text using the iterator
+      # and the special function defined above.
+      cmd_content = String.build(cmd_length) do |str2|
+        copy_string(str2, iter, cmd_length)
       end
+
+      str << parse_command(command, cmd_content)
+      index += cmd_length
     end
-    return description
-  end
 
-  return content
+    # Copy the end of the string (past the last command).
+    remaining_length = content.size - index
+    copy_string(str, iter, remaining_length) if remaining_length > 0
+  end
 end
author	Samantaz Fox <coding@samantaz.fr>	2023-04-05 23:43:41 +0200
committer	Samantaz Fox <coding@samantaz.fr>	2023-04-05 23:43:41 +0200
commit	b3c0afef02ee13c7f291fd26a5d64b4aee059906 (patch)
tree	f3d4bbd367611b6eb82e7bca231b6b54e41916d9
parent	73d2ed6f77308dd300e68f3ea059c6aa2c10b1ce (diff)
download	invidious-b3c0afef02ee13c7f291fd26a5d64b4aee059906.tar.gz invidious-b3c0afef02ee13c7f291fd26a5d64b4aee059906.tar.bz2 invidious-b3c0afef02ee13c7f291fd26a5d64b4aee059906.zip