summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorSamantaz Fox <coding@samantaz.fr>2023-04-05 23:43:41 +0200
committerSamantaz Fox <coding@samantaz.fr>2023-04-05 23:43:41 +0200
commitb3c0afef02ee13c7f291fd26a5d64b4aee059906 (patch)
treef3d4bbd367611b6eb82e7bca231b6b54e41916d9
parent73d2ed6f77308dd300e68f3ea059c6aa2c10b1ce (diff)
downloadinvidious-b3c0afef02ee13c7f291fd26a5d64b4aee059906.tar.gz
invidious-b3c0afef02ee13c7f291fd26a5d64b4aee059906.tar.bz2
invidious-b3c0afef02ee13c7f291fd26a5d64b4aee059906.zip
Videos: fix description text offset when emojis are present
-rw-r--r--src/invidious/videos/description.cr71
1 files changed, 47 insertions, 24 deletions
diff --git a/src/invidious/videos/description.cr b/src/invidious/videos/description.cr
index b1d851d3..2017955d 100644
--- a/src/invidious/videos/description.cr
+++ b/src/invidious/videos/description.cr
@@ -46,37 +46,60 @@ def parse_command(command : JSON::Any?, string : String) : String?
return "(unknown YouTube desc command)"
end
-def parse_description(desc : JSON::Any?) : String?
- if desc.nil?
- return ""
+private def copy_string(str : String::Builder, iter : Iterator, count : Int) : Int
+ copied = 0
+ while copied < count
+ cp = iter.next
+ break if cp.is_a?(Iterator::Stop)
+
+ str << cp.chr
+
+ # A codepoint from the SMP counts twice
+ copied += 1 if cp > 0xFFFF
+ copied += 1
end
+ return copied
+end
+
+def parse_description(desc : JSON::Any?) : String?
+ return "" if desc.nil?
+
content = desc["content"].as_s
- if content.empty?
- return ""
- end
+ return "" if content.empty?
- if commands = desc["commandRuns"]?.try &.as_a
- description = String.build do |str|
- index = 0
- commands.each do |command|
- start_index = command["startIndex"].as_i
- length = command["length"].as_i
+ commands = desc["commandRuns"]?.try &.as_a
+ return content if commands.nil?
- if start_index > 0 && start_index - index > 0
- str << content[index...start_index]
- index = start_index
- end
+ # Not everything is stored in UTF-8 on youtube's side. The SMP codepoints
+ # (0x10000 and above) are encoded as UTF-16 surrogate pairs, which are
+ # automatically decoded by the JSON parser. It means that we need to count
+ # copied byte in a special manner, preventing the use of regular string copy.
+ iter = content.each_codepoint
- str << parse_command(command, content[start_index, length])
- index += length
- end
- if index < content.size
- str << content[index..content.size]
+ index = 0
+
+ return String.build do |str|
+ commands.each do |command|
+ cmd_start = command["startIndex"].as_i
+ cmd_length = command["length"].as_i
+
+ # Copy the text chunk between this command and the previous if needed.
+ length = cmd_start - index
+ index += copy_string(str, iter, length)
+
+ # We need to copy the command's text using the iterator
+ # and the special function defined above.
+ cmd_content = String.build(cmd_length) do |str2|
+ copy_string(str2, iter, cmd_length)
end
+
+ str << parse_command(command, cmd_content)
+ index += cmd_length
end
- return description
- end
- return content
+ # Copy the end of the string (past the last command).
+ remaining_length = content.size - index
+ copy_string(str, iter, remaining_length) if remaining_length > 0
+ end
end