diff options
| author | Samantaz Fox <coding@samantaz.fr> | 2023-09-18 23:31:56 +0200 |
|---|---|---|
| committer | Samantaz Fox <coding@samantaz.fr> | 2023-09-18 23:31:56 +0200 |
| commit | 842e9fade58414c606b8d97b5050c5f9c9ab77dd (patch) | |
| tree | 3a3ebe103d7e03ecf0e64d8feea18d19861920aa /src | |
| parent | cc03610325f8c49b14216f2913a8a3148b48e3a2 (diff) | |
| parent | eabcea6f4a16a47555d945460ac824588ff546e5 (diff) | |
| download | invidious-842e9fade58414c606b8d97b5050c5f9c9ab77dd.tar.gz invidious-842e9fade58414c606b8d97b5050c5f9c9ab77dd.tar.bz2 invidious-842e9fade58414c606b8d97b5050c5f9c9ab77dd.zip | |
Captions: Add ability to use Innertube's transcripts API (#4001)
Diffstat (limited to 'src')
| -rw-r--r-- | src/invidious/config.cr | 3 | ||||
| -rw-r--r-- | src/invidious/frontend/watch_page.cr | 2 | ||||
| -rw-r--r-- | src/invidious/routes/api/v1/videos.cr | 114 | ||||
| -rw-r--r-- | src/invidious/videos.cr | 6 | ||||
| -rw-r--r-- | src/invidious/videos/caption.cr | 142 | ||||
| -rw-r--r-- | src/invidious/videos/transcript.cr | 103 | ||||
| -rw-r--r-- | src/invidious/views/user/preferences.ecr | 2 | ||||
| -rw-r--r-- | src/invidious/yt_backend/youtube_api.cr | 24 |
8 files changed, 270 insertions, 126 deletions
diff --git a/src/invidious/config.cr b/src/invidious/config.cr index cee33ce1..429d9246 100644 --- a/src/invidious/config.cr +++ b/src/invidious/config.cr @@ -127,6 +127,9 @@ class Config # Pool size for HTTP requests to youtube.com and ytimg.com (each domain has a separate pool of `pool_size`) property pool_size : Int32 = 100 + # Use Innertube's transcripts API instead of timedtext for closed captions + property use_innertube_for_captions : Bool = false + # Saved cookies in "name1=value1; name2=value2..." format @[YAML::Field(converter: Preferences::StringToCookies)] property cookies : HTTP::Cookies = HTTP::Cookies.new diff --git a/src/invidious/frontend/watch_page.cr b/src/invidious/frontend/watch_page.cr index e3214469..5fd81168 100644 --- a/src/invidious/frontend/watch_page.cr +++ b/src/invidious/frontend/watch_page.cr @@ -7,7 +7,7 @@ module Invidious::Frontend::WatchPage getter full_videos : Array(Hash(String, JSON::Any)) getter video_streams : Array(Hash(String, JSON::Any)) getter audio_streams : Array(Hash(String, JSON::Any)) - getter captions : Array(Invidious::Videos::Caption) + getter captions : Array(Invidious::Videos::Captions::Metadata) def initialize( @full_videos, diff --git a/src/invidious/routes/api/v1/videos.cr b/src/invidious/routes/api/v1/videos.cr index af4fc806..25e766d2 100644 --- a/src/invidious/routes/api/v1/videos.cr +++ b/src/invidious/routes/api/v1/videos.cr @@ -87,70 +87,78 @@ module Invidious::Routes::API::V1::Videos caption = caption[0] end - url = URI.parse("#{caption.base_url}&tlang=#{tlang}").request_target - - # Auto-generated captions often have cues that aren't aligned properly with the video, - # as well as some other markup that makes it cumbersome, so we try to fix that here - if caption.name.includes? "auto-generated" - caption_xml = YT_POOL.client &.get(url).body - - if caption_xml.starts_with?("<?xml") - webvtt = caption.timedtext_to_vtt(caption_xml, tlang) - else - caption_xml = XML.parse(caption_xml) - - webvtt = String.build do |str| - str << <<-END_VTT - WEBVTT - Kind: captions - Language: #{tlang || caption.language_code} + if CONFIG.use_innertube_for_captions + params = Invidious::Videos::Transcript.generate_param(id, caption.language_code, caption.auto_generated) + initial_data = YoutubeAPI.get_transcript(params) + webvtt = Invidious::Videos::Transcript.convert_transcripts_to_vtt(initial_data, caption.language_code) + else + # Timedtext API handling + url = URI.parse("#{caption.base_url}&tlang=#{tlang}").request_target + + # Auto-generated captions often have cues that aren't aligned properly with the video, + # as well as some other markup that makes it cumbersome, so we try to fix that here + if caption.name.includes? "auto-generated" + caption_xml = YT_POOL.client &.get(url).body + + if caption_xml.starts_with?("<?xml") + webvtt = caption.timedtext_to_vtt(caption_xml, tlang) + else + caption_xml = XML.parse(caption_xml) + + webvtt = String.build do |str| + str << <<-END_VTT + WEBVTT + Kind: captions + Language: #{tlang || caption.language_code} + + + END_VTT + + caption_nodes = caption_xml.xpath_nodes("//transcript/text") + caption_nodes.each_with_index do |node, i| + start_time = node["start"].to_f.seconds + duration = node["dur"]?.try &.to_f.seconds + duration ||= start_time + + if caption_nodes.size > i + 1 + end_time = caption_nodes[i + 1]["start"].to_f.seconds + else + end_time = start_time + duration + end - END_VTT + start_time = "#{start_time.hours.to_s.rjust(2, '0')}:#{start_time.minutes.to_s.rjust(2, '0')}:#{start_time.seconds.to_s.rjust(2, '0')}.#{start_time.milliseconds.to_s.rjust(3, '0')}" + end_time = "#{end_time.hours.to_s.rjust(2, '0')}:#{end_time.minutes.to_s.rjust(2, '0')}:#{end_time.seconds.to_s.rjust(2, '0')}.#{end_time.milliseconds.to_s.rjust(3, '0')}" - caption_nodes = caption_xml.xpath_nodes("//transcript/text") - caption_nodes.each_with_index do |node, i| - start_time = node["start"].to_f.seconds - duration = node["dur"]?.try &.to_f.seconds - duration ||= start_time + text = HTML.unescape(node.content) + text = text.gsub(/<font color="#[a-fA-F0-9]{6}">/, "") + text = text.gsub(/<\/font>/, "") + if md = text.match(/(?<name>.*) : (?<text>.*)/) + text = "<v #{md["name"]}>#{md["text"]}</v>" + end - if caption_nodes.size > i + 1 - end_time = caption_nodes[i + 1]["start"].to_f.seconds - else - end_time = start_time + duration - end + str << <<-END_CUE + #{start_time} --> #{end_time} + #{text} - start_time = "#{start_time.hours.to_s.rjust(2, '0')}:#{start_time.minutes.to_s.rjust(2, '0')}:#{start_time.seconds.to_s.rjust(2, '0')}.#{start_time.milliseconds.to_s.rjust(3, '0')}" - end_time = "#{end_time.hours.to_s.rjust(2, '0')}:#{end_time.minutes.to_s.rjust(2, '0')}:#{end_time.seconds.to_s.rjust(2, '0')}.#{end_time.milliseconds.to_s.rjust(3, '0')}" - text = HTML.unescape(node.content) - text = text.gsub(/<font color="#[a-fA-F0-9]{6}">/, "") - text = text.gsub(/<\/font>/, "") - if md = text.match(/(?<name>.*) : (?<text>.*)/) - text = "<v #{md["name"]}>#{md["text"]}</v>" + END_CUE end - - str << <<-END_CUE - #{start_time} --> #{end_time} - #{text} - - - END_CUE end end - end - else - # Some captions have "align:[start/end]" and "position:[num]%" - # attributes. Those are causing issues with VideoJS, which is unable - # to properly align the captions on the video, so we remove them. - # - # See: https://github.com/iv-org/invidious/issues/2391 - webvtt = YT_POOL.client &.get("#{url}&format=vtt").body - if webvtt.starts_with?("<?xml") - webvtt = caption.timedtext_to_vtt(webvtt) else + # Some captions have "align:[start/end]" and "position:[num]%" + # attributes. Those are causing issues with VideoJS, which is unable + # to properly align the captions on the video, so we remove them. + # + # See: https://github.com/iv-org/invidious/issues/2391 webvtt = YT_POOL.client &.get("#{url}&format=vtt").body - .gsub(/([0-9:.]{12} --> [0-9:.]{12}).+/, "\\1") + if webvtt.starts_with?("<?xml") + webvtt = caption.timedtext_to_vtt(webvtt) + else + webvtt = YT_POOL.client &.get("#{url}&format=vtt").body + .gsub(/([0-9:.]{12} --> [0-9:.]{12}).+/, "\\1") + end end end diff --git a/src/invidious/videos.cr b/src/invidious/videos.cr index f38b33e5..9fbd1374 100644 --- a/src/invidious/videos.cr +++ b/src/invidious/videos.cr @@ -24,7 +24,7 @@ struct Video property updated : Time @[DB::Field(ignore: true)] - @captions = [] of Invidious::Videos::Caption + @captions = [] of Invidious::Videos::Captions::Metadata @[DB::Field(ignore: true)] property adaptive_fmts : Array(Hash(String, JSON::Any))? @@ -215,9 +215,9 @@ struct Video keywords.includes? "YouTube Red" end - def captions : Array(Invidious::Videos::Caption) + def captions : Array(Invidious::Videos::Captions::Metadata) if @captions.empty? && @info.has_key?("captions") - @captions = Invidious::Videos::Caption.from_yt_json(info["captions"]) + @captions = Invidious::Videos::Captions::Metadata.from_yt_json(info["captions"]) end return @captions diff --git a/src/invidious/videos/caption.cr b/src/invidious/videos/caption.cr index 13f81a31..256dfcc0 100644 --- a/src/invidious/videos/caption.cr +++ b/src/invidious/videos/caption.cr @@ -1,100 +1,106 @@ require "json" module Invidious::Videos - struct Caption - property name : String - property language_code : String - property base_url : String + module Captions + struct Metadata + property name : String + property language_code : String + property base_url : String - def initialize(@name, @language_code, @base_url) - end + property auto_generated : Bool - # Parse the JSON structure from Youtube - def self.from_yt_json(container : JSON::Any) : Array(Caption) - caption_tracks = container - .dig?("playerCaptionsTracklistRenderer", "captionTracks") - .try &.as_a + def initialize(@name, @language_code, @base_url, @auto_generated) + end - captions_list = [] of Caption - return captions_list if caption_tracks.nil? + # Parse the JSON structure from Youtube + def self.from_yt_json(container : JSON::Any) : Array(Captions::Metadata) + caption_tracks = container + .dig?("playerCaptionsTracklistRenderer", "captionTracks") + .try &.as_a - caption_tracks.each do |caption| - name = caption["name"]["simpleText"]? || caption["name"]["runs"][0]["text"] - name = name.to_s.split(" - ")[0] + captions_list = [] of Captions::Metadata + return captions_list if caption_tracks.nil? - language_code = caption["languageCode"].to_s - base_url = caption["baseUrl"].to_s + caption_tracks.each do |caption| + name = caption["name"]["simpleText"]? || caption["name"]["runs"][0]["text"] + name = name.to_s.split(" - ")[0] - captions_list << Caption.new(name, language_code, base_url) - end + language_code = caption["languageCode"].to_s + base_url = caption["baseUrl"].to_s - return captions_list - end + auto_generated = (caption["kind"]? == "asr") + + captions_list << Captions::Metadata.new(name, language_code, base_url, auto_generated) + end - def timedtext_to_vtt(timedtext : String, tlang = nil) : String - # In the future, we could just directly work with the url. This is more of a POC - cues = [] of XML::Node - tree = XML.parse(timedtext) - tree = tree.children.first - - tree.children.each do |item| - if item.name == "body" - item.children.each do |cue| - if cue.name == "p" && !(cue.children.size == 1 && cue.children[0].content == "\n") - cues << cue + return captions_list + end + + def timedtext_to_vtt(timedtext : String, tlang = nil) : String + # In the future, we could just directly work with the url. This is more of a POC + cues = [] of XML::Node + tree = XML.parse(timedtext) + tree = tree.children.first + + tree.children.each do |item| + if item.name == "body" + item.children.each do |cue| + if cue.name == "p" && !(cue.children.size == 1 && cue.children[0].content == "\n") + cues << cue + end end + break end - break end - end - result = String.build do |result| - result << <<-END_VTT - WEBVTT - Kind: captions - Language: #{tlang || @language_code} + result = String.build do |result| + result << <<-END_VTT + WEBVTT + Kind: captions + Language: #{tlang || @language_code} - END_VTT + END_VTT - result << "\n\n" + result << "\n\n" - cues.each_with_index do |node, i| - start_time = node["t"].to_f.milliseconds + cues.each_with_index do |node, i| + start_time = node["t"].to_f.milliseconds - duration = node["d"]?.try &.to_f.milliseconds + duration = node["d"]?.try &.to_f.milliseconds - duration ||= start_time + duration ||= start_time - if cues.size > i + 1 - end_time = cues[i + 1]["t"].to_f.milliseconds - else - end_time = start_time + duration - end + if cues.size > i + 1 + end_time = cues[i + 1]["t"].to_f.milliseconds + else + end_time = start_time + duration + end - # start_time - result << start_time.hours.to_s.rjust(2, '0') - result << ':' << start_time.minutes.to_s.rjust(2, '0') - result << ':' << start_time.seconds.to_s.rjust(2, '0') - result << '.' << start_time.milliseconds.to_s.rjust(3, '0') + # start_time + result << start_time.hours.to_s.rjust(2, '0') + result << ':' << start_time.minutes.to_s.rjust(2, '0') + result << ':' << start_time.seconds.to_s.rjust(2, '0') + result << '.' << start_time.milliseconds.to_s.rjust(3, '0') - result << " --> " + result << " --> " - # end_time - result << end_time.hours.to_s.rjust(2, '0') - result << ':' << end_time.minutes.to_s.rjust(2, '0') - result << ':' << end_time.seconds.to_s.rjust(2, '0') - result << '.' << end_time.milliseconds.to_s.rjust(3, '0') + # end_time + result << end_time.hours.to_s.rjust(2, '0') + result << ':' << end_time.minutes.to_s.rjust(2, '0') + result << ':' << end_time.seconds.to_s.rjust(2, '0') + result << '.' << end_time.milliseconds.to_s.rjust(3, '0') - result << "\n" + result << "\n" - node.children.each do |s| - result << s.content + node.children.each do |s| + result << s.content + end + result << "\n" + result << "\n" end - result << "\n" - result << "\n" end + return result end - return result end # List of all caption languages available on Youtube. diff --git a/src/invidious/videos/transcript.cr b/src/invidious/videos/transcript.cr new file mode 100644 index 00000000..f3360a52 --- /dev/null +++ b/src/invidious/videos/transcript.cr @@ -0,0 +1,103 @@ +module Invidious::Videos + # Namespace for methods primarily relating to Transcripts + module Transcript + record TranscriptLine, start_ms : Time::Span, end_ms : Time::Span, line : String + + def self.generate_param(video_id : String, language_code : String, auto_generated : Bool) : String + kind = auto_generated ? "asr" : "" + + object = { + "1:0:string" => video_id, + + "2:base64" => { + "1:string" => kind, + "2:string" => language_code, + "3:string" => "", + }, + + "3:varint" => 1_i64, + "5:string" => "engagement-panel-searchable-transcript-search-panel", + "6:varint" => 1_i64, + "7:varint" => 1_i64, + "8:varint" => 1_i64, + } + + params = object.try { |i| Protodec::Any.cast_json(i) } + .try { |i| Protodec::Any.from_json(i) } + .try { |i| Base64.urlsafe_encode(i) } + .try { |i| URI.encode_www_form(i) } + + return params + end + + def self.convert_transcripts_to_vtt(initial_data : Hash(String, JSON::Any), target_language : String) : String + # Convert into array of TranscriptLine + lines = self.parse(initial_data) + + # Taken from Invidious::Videos::Captions::Metadata.timedtext_to_vtt() + vtt = String.build do |vtt| + vtt << <<-END_VTT + WEBVTT + Kind: captions + Language: #{target_language} + + + END_VTT + + vtt << "\n\n" + + lines.each do |line| + start_time = line.start_ms + end_time = line.end_ms + + # start_time + vtt << start_time.hours.to_s.rjust(2, '0') + vtt << ':' << start_time.minutes.to_s.rjust(2, '0') + vtt << ':' << start_time.seconds.to_s.rjust(2, '0') + vtt << '.' << start_time.milliseconds.to_s.rjust(3, '0') + + vtt << " --> " + + # end_time + vtt << end_time.hours.to_s.rjust(2, '0') + vtt << ':' << end_time.minutes.to_s.rjust(2, '0') + vtt << ':' << end_time.seconds.to_s.rjust(2, '0') + vtt << '.' << end_time.milliseconds.to_s.rjust(3, '0') + + vtt << "\n" + vtt << line.line + + vtt << "\n" + vtt << "\n" + end + end + + return vtt + end + + private def self.parse(initial_data : Hash(String, JSON::Any)) + body = initial_data.dig("actions", 0, "updateEngagementPanelAction", "content", "transcriptRenderer", + "content", "transcriptSearchPanelRenderer", "body", "transcriptSegmentListRenderer", + "initialSegments").as_a + + lines = [] of TranscriptLine + body.each do |line| + # Transcript section headers. They are not apart of the captions and as such we can safely skip them. + if line.as_h.has_key?("transcriptSectionHeaderRenderer") + next + end + + line = line["transcriptSegmentRenderer"] + + start_ms = line["startMs"].as_s.to_i.millisecond + end_ms = line["endMs"].as_s.to_i.millisecond + + text = extract_text(line["snippet"]) || "" + + lines << TranscriptLine.new(start_ms, end_ms, text) + end + + return lines + end + end +end diff --git a/src/invidious/views/user/preferences.ecr b/src/invidious/views/user/preferences.ecr index dfda1434..55349c5a 100644 --- a/src/invidious/views/user/preferences.ecr +++ b/src/invidious/views/user/preferences.ecr @@ -89,7 +89,7 @@ <label for="captions[0]"><%= translate(locale, "preferences_captions_label") %></label> <% preferences.captions.each_with_index do |caption, index| %> <select class="pure-u-1-6" name="captions[<%= index %>]" id="captions[<%= index %>]"> - <% Invidious::Videos::Caption::LANGUAGES.each do |option| %> + <% Invidious::Videos::Captions::LANGUAGES.each do |option| %> <option value="<%= option %>" <% if preferences.captions[index] == option %> selected <% end %>><%= translate(locale, option.blank? ? "none" : option) %></option> <% end %> </select> diff --git a/src/invidious/yt_backend/youtube_api.cr b/src/invidious/yt_backend/youtube_api.cr index aef9ddd9..a5e621f2 100644 --- a/src/invidious/yt_backend/youtube_api.cr +++ b/src/invidious/yt_backend/youtube_api.cr @@ -558,6 +558,30 @@ module YoutubeAPI end #################################################################### + # get_transcript(params, client_config?) + # + # Requests the youtubei/v1/get_transcript endpoint with the required headers + # and POST data in order to get a JSON reply. + # + # The requested data is a specially encoded protobuf string that denotes the specific language requested. + # + # An optional ClientConfig parameter can be passed, too (see + # `struct ClientConfig` above for more details). + # + + def get_transcript( + params : String, + client_config : ClientConfig | Nil = nil + ) : Hash(String, JSON::Any) + data = { + "context" => self.make_context(client_config), + "params" => params, + } + + return self._post_json("/youtubei/v1/get_transcript", data, client_config) + end + + #################################################################### # _post_json(endpoint, data, client_config?) # # Internal function that does the actual request to youtube servers |
