summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorsyeopite <syeopite@syeopite.dev>2024-06-11 17:57:33 -0700
committersyeopite <syeopite@syeopite.dev>2024-06-11 18:23:01 -0700
commit0224162ad22dc19d58a73202d796eb3e99f0a71c (patch)
treee3a99f9b8e8ee13dbf1af9d660b5f6fc386870e9
parenteda7444ca46dbc3941205316baba8030fe0b2989 (diff)
downloadinvidious-0224162ad22dc19d58a73202d796eb3e99f0a71c.tar.gz
invidious-0224162ad22dc19d58a73202d796eb3e99f0a71c.tar.bz2
invidious-0224162ad22dc19d58a73202d796eb3e99f0a71c.zip
Rewrite transcript logic to be more generic
The transcript logic in Invidious was written specifically as a workaround for captions, and not transcripts as a feature. This commit genericises the logic a bit as so it can be used for implementing transcripts within Invidious' API and UI as well. The most notable change is the added parsing of section headings when it was previously skipped over in favor of regular lines.
-rw-r--r--src/invidious/routes/api/v1/videos.cr9
-rw-r--r--src/invidious/videos/transcript.cr86
2 files changed, 61 insertions, 34 deletions
diff --git a/src/invidious/routes/api/v1/videos.cr b/src/invidious/routes/api/v1/videos.cr
index 9281f4dd..faff2f59 100644
--- a/src/invidious/routes/api/v1/videos.cr
+++ b/src/invidious/routes/api/v1/videos.cr
@@ -89,9 +89,14 @@ module Invidious::Routes::API::V1::Videos
if CONFIG.use_innertube_for_captions
params = Invidious::Videos::Transcript.generate_param(id, caption.language_code, caption.auto_generated)
- initial_data = YoutubeAPI.get_transcript(params)
- webvtt = Invidious::Videos::Transcript.convert_transcripts_to_vtt(initial_data, caption.language_code)
+ transcript = Invidious::Videos::Transcript.from_raw(
+ YoutubeAPI.get_transcript(params),
+ caption.language_code,
+ caption.auto_generated
+ )
+
+ webvtt = transcript.to_vtt
else
# Timedtext API handling
url = URI.parse("#{caption.base_url}&tlang=#{tlang}").request_target
diff --git a/src/invidious/videos/transcript.cr b/src/invidious/videos/transcript.cr
index dac00eea..42f29f46 100644
--- a/src/invidious/videos/transcript.cr
+++ b/src/invidious/videos/transcript.cr
@@ -1,8 +1,21 @@
module Invidious::Videos
- # Namespace for methods primarily relating to Transcripts
- module Transcript
- record TranscriptLine, start_ms : Time::Span, end_ms : Time::Span, line : String
+ # A `Transcripts` struct encapsulates a sequence of lines that together forms the whole transcript for a given YouTube video.
+ # These lines can be categorized into two types: section headings and regular lines representing content from the video.
+ struct Transcript
+ # Types
+ record HeadingLine, start_ms : Time::Span, end_ms : Time::Span, line : String
+ record RegularLine, start_ms : Time::Span, end_ms : Time::Span, line : String
+ alias TranscriptLine = HeadingLine | RegularLine
+
+ property lines : Array(TranscriptLine)
+ property language_code : String
+ property auto_generated : Bool
+
+ # Initializes a new Transcript struct with the contents and associated metadata describing it
+ def initialize(@lines : Array(TranscriptLine), @language_code : String, @auto_generated : Bool)
+ end
+ # Generates a protobuf string to fetch the requested transcript from YouTube
def self.generate_param(video_id : String, language_code : String, auto_generated : Bool) : String
kind = auto_generated ? "asr" : ""
@@ -30,48 +43,57 @@ module Invidious::Videos
return params
end
- def self.convert_transcripts_to_vtt(initial_data : Hash(String, JSON::Any), target_language : String) : String
- # Convert into array of TranscriptLine
- lines = self.parse(initial_data)
-
- settings_field = {
- "Kind" => "captions",
- "Language" => target_language,
- }
-
- # Taken from Invidious::Videos::Captions::Metadata.timedtext_to_vtt()
- vtt = WebVTT.build(settings_field) do |vtt|
- lines.each do |line|
- vtt.cue(line.start_ms, line.end_ms, line.line)
- end
- end
-
- return vtt
- end
-
- private def self.parse(initial_data : Hash(String, JSON::Any))
+ # Constructs a Transcripts struct from the initial YouTube response
+ def self.from_raw(initial_data : Hash(String, JSON::Any), language_code : String, auto_generated : Bool)
body = initial_data.dig("actions", 0, "updateEngagementPanelAction", "content", "transcriptRenderer",
"content", "transcriptSearchPanelRenderer", "body", "transcriptSegmentListRenderer",
"initialSegments").as_a
lines = [] of TranscriptLine
+
body.each do |line|
- # Transcript section headers. They are not apart of the captions and as such we can safely skip them.
- if line.as_h.has_key?("transcriptSectionHeaderRenderer")
- next
+ if unpacked_line = line["transcriptSectionHeaderRenderer"]?
+ line_type = HeadingLine
+ else
+ unpacked_line = line["transcriptSegmentRenderer"]
+ line_type = RegularLine
end
- line = line["transcriptSegmentRenderer"]
+ start_ms = unpacked_line["startMs"].as_s.to_i.millisecond
+ end_ms = unpacked_line["endMs"].as_s.to_i.millisecond
+ text = extract_text(unpacked_line["snippet"]) || ""
- start_ms = line["startMs"].as_s.to_i.millisecond
- end_ms = line["endMs"].as_s.to_i.millisecond
+ lines << line_type.new(start_ms, end_ms, text)
+ end
- text = extract_text(line["snippet"]) || ""
+ return Transcript.new(
+ lines: lines,
+ language_code: language_code,
+ auto_generated: auto_generated,
+ )
+ end
- lines << TranscriptLine.new(start_ms, end_ms, text)
+ # Converts transcript lines to a WebVTT file
+ #
+ # This is used within Invidious to replace subtitles
+ # as to workaround YouTube's rate-limited timedtext endpoint.
+ def to_vtt
+ settings_field = {
+ "Kind" => "captions",
+ "Language" => @language_code,
+ }
+
+ vtt = WebVTT.build(settings_field) do |vtt|
+ @lines.each do |line|
+ # Section headers are excluded from the VTT conversion as to
+ # match the regular captions returned from YouTube as much as possible
+ next if line.is_a? HeadingLine
+
+ vtt.cue(line.start_ms, line.end_ms, line.line)
+ end
end
- return lines
+ return vtt
end
end
end