Fix extractor bugs (#2454)

* Add debug/trace logging to extract_items * Handle invalid timestamps for livestreams extraction * Make use of author_fallback in playlist extractor * Don't use extract_text for video length extraction The extract_text function attempts to extract from both the simpleText and the runs route. This is typically what we'd want for text extraction, as the text could appear in either location. However, while this still holds true, the thumbnailOverlayTimeStatusRenderer writes a numerical length (when present on the video) to the simpleText route and uses runs for a text overlay like "LIVE" or "PREMIERE". Therefore, when a video has a text overlay instead of a numerical one, Invidious still passes it on to decode_length_seconds, which obviously raises since the text cannot be converted into integers. In the future, if more routes require one text route over the other, we should go ahead and add an argument to extract_text itself. For now, though, this is sufficient. * Handle unsupported "special" categories
author: syeopite <70992037+syeopite@users.noreply.github.com> 2021-10-07 21:39:21 +0000
committer: GitHub <noreply@github.com> 2021-10-07 23:39:21 +0200
commit: 21e29411af3529200cf8bbb880b5a919c1d32f8f (patch)
tree: 1660cd9d070e42f8be0119f2254f664d76c35e7f /src
parent: f85930700b5e273f4236aa26fc96c9e8f054d652 (diff)
download: invidious-21e29411af3529200cf8bbb880b5a919c1d32f8f.tar.gz
invidious-21e29411af3529200cf8bbb880b5a919c1d32f8f.tar.bz2
invidious-21e29411af3529200cf8bbb880b5a919c1d32f8f.zip
1 files changed, 67 insertions, 10 deletions
diff --git a/src/invidious/helpers/extractors.cr b/src/invidious/helpers/extractors.cr
index 850c93ec..c8a6cd4a 100644
--- a/src/invidious/helpers/extractors.cr
+++ b/src/invidious/helpers/extractors.cr
@@ -74,7 +74,15 @@ private module Parsers
       if length_container = item_contents["lengthText"]?
         length_seconds = decode_length_seconds(length_container["simpleText"].as_s)
       elsif length_container = item_contents["thumbnailOverlays"]?.try &.as_a.find(&.["thumbnailOverlayTimeStatusRenderer"]?)
-        length_seconds = extract_text(length_container["thumbnailOverlayTimeStatusRenderer"]["text"]).try { |t| decode_length_seconds(t) } || 0
+        # This needs to only go down the `simpleText` path (if possible). If more situations came up that requires
+        # a specific pathway then we should add an argument to extract_text that'll make this possible
+        length_seconds = length_container.dig?("thumbnailOverlayTimeStatusRenderer", "text", "simpleText")
+
+        if length_seconds
+          length_seconds = decode_length_seconds(length_seconds.as_s)
+        else
+          length_seconds = 0
+        end
       else
         length_seconds = 0
       end
@@ -113,6 +121,10 @@ private module Parsers
         premiere_timestamp: premiere_timestamp,
       })
     end
+
+    def self.parser_name
+      return {{@type.name}}
+    end
   end
 
   # Parses a InnerTube channelRenderer into a SearchChannel. Returns nil when the given object isn't a channelRenderer
@@ -159,6 +171,10 @@ private module Parsers
         auto_generated:   auto_generated,
       })
     end
+
+    def self.parser_name
+      return {{@type.name}}
+    end
   end
 
   # Parses a InnerTube gridPlaylistRenderer into a SearchPlaylist. Returns nil when the given object isn't a gridPlaylistRenderer
@@ -194,6 +210,10 @@ private module Parsers
         thumbnail:   playlist_thumbnail,
       })
     end
+
+    def self.parser_name
+      return {{@type.name}}
+    end
   end
 
   # Parses a InnerTube playlistRenderer into a SearchPlaylist. Returns nil when the given object isn't a playlistRenderer
@@ -207,20 +227,20 @@ private module Parsers
   module PlaylistRendererParser
     def self.process(item : JSON::Any, author_fallback : AuthorFallback)
       if item_contents = item["playlistRenderer"]?
-        return self.parse(item_contents)
+        return self.parse(item_contents, author_fallback)
       end
     end
 
-    private def self.parse(item_contents)
+    private def self.parse(item_contents, author_fallback)
       title = item_contents["title"]["simpleText"]?.try &.as_s || ""
       plid = item_contents["playlistId"]?.try &.as_s || ""
 
       video_count = HelperExtractors.get_video_count(item_contents)
       playlist_thumbnail = HelperExtractors.get_thumbnails_plural(item_contents)
 
-      author_info = item_contents.dig("shortBylineText", "runs", 0)
-      author = author_info["text"].as_s
-      author_id = HelperExtractors.get_browse_id(author_info)
+      author_info = item_contents.dig?("shortBylineText", "runs", 0)
+      author = author_info.try &.["text"].as_s || author_fallback.name
+      author_id = author_info.try { |x| HelperExtractors.get_browse_id(x) } || author_fallback.id
 
       videos = item_contents["videos"]?.try &.as_a.map do |v|
         v = v["childVideoRenderer"]
@@ -246,6 +266,10 @@ private module Parsers
         thumbnail:   playlist_thumbnail,
       })
     end
+
+    def self.parser_name
+      return {{@type.name}}
+    end
   end
 
   # Parses a InnerTube shelfRenderer into a Category. Returns nil when the given object isn't a shelfRenderer
@@ -283,11 +307,17 @@ private module Parsers
       # Content parsing
       contents = [] of SearchItem
 
-      # Content could be in three locations.
-      if content_container = item_contents["content"]["horizontalListRenderer"]?
-      elsif content_container = item_contents["content"]["expandedShelfContentsRenderer"]?
-      elsif content_container = item_contents["content"]["verticalListRenderer"]?
+      # InnerTube recognizes some "special" categories, which are organized differently.
+      if special_category_container = item_contents["content"]?
+        if content_container = special_category_container["horizontalListRenderer"]?
+        elsif content_container = special_category_container["expandedShelfContentsRenderer"]?
+        elsif content_container = special_category_container["verticalListRenderer"]?
+        else
+          # Anything else, such as `horizontalMovieListRenderer` is currently unsupported.
+          return
+        end
       else
+        # "Normal" category.
         content_container = item_contents["contents"]
       end
 
@@ -307,6 +337,10 @@ private module Parsers
         badges:           badges,
       })
     end
+
+    def self.parser_name
+      return {{@type.name}}
+    end
   end
 end
 
@@ -372,6 +406,10 @@ private module Extractors
 
       return raw_items
     end
+
+    def self.extractor_name
+      return {{@type.name}}
+    end
   end
 
   # Extracts items from the InnerTube response for search results
@@ -409,6 +447,10 @@ private module Extractors
 
       return raw_items.flatten
     end
+
+    def self.extractor_name
+      return {{@type.name}}
+    end
   end
 
   # Extracts continuation items from a InnerTube response
@@ -440,6 +482,10 @@ private module Extractors
 
       return raw_items
     end
+
+    def self.extractor_name
+      return {{@type.name}}
+    end
   end
 end
 
@@ -529,8 +575,14 @@ def extract_item(item : JSON::Any, author_fallback : String? = "",
   # Each parser automatically validates the data given to see if the data is
   # applicable to itself. If not nil is returned and the next parser is attemped.
   ITEM_PARSERS.each do |parser|
+    LOGGER.trace("extract_item: Attempting to parse item using \"#{parser.parser_name}\" (cycling...)")
+
     if result = parser.process(item, author_fallback)
+      LOGGER.debug("extract_item: Successfully parsed via #{parser.parser_name}")
+
       return result
+    else
+      LOGGER.trace("extract_item: Parser \"#{parser.parser_name}\" does not apply. Cycling to the next one...")
     end
   end
 end
@@ -550,7 +602,10 @@ def extract_items(initial_data : Hash(String, JSON::Any), author_fallback : Stri
 
   # This is identical to the parser cycling of extract_item().
   ITEM_CONTAINER_EXTRACTOR.each do |extractor|
+    LOGGER.trace("extract_items: Attempting to extract item container using \"#{extractor.extractor_name}\" (cycling...)")
+
     if container = extractor.process(unpackaged_data)
+      LOGGER.debug("extract_items: Successfully unpacked container with \"#{extractor.extractor_name}\"")
       # Extract items in container
       container.each do |item|
         if parsed_result = extract_item(item, author_fallback, author_id_fallback)
@@ -559,6 +614,8 @@ def extract_items(initial_data : Hash(String, JSON::Any), author_fallback : Stri
       end
 
       break
+    else
+      LOGGER.trace("extract_items: Extractor \"#{extractor.extractor_name}\" does not apply. Cycling to the next one...")
     end
   end
author	syeopite <70992037+syeopite@users.noreply.github.com>	2021-10-07 21:39:21 +0000
committer	GitHub <noreply@github.com>	2021-10-07 23:39:21 +0200
commit	21e29411af3529200cf8bbb880b5a919c1d32f8f (patch)
tree	1660cd9d070e42f8be0119f2254f664d76c35e7f /src
parent	f85930700b5e273f4236aa26fc96c9e8f054d652 (diff)
download	invidious-21e29411af3529200cf8bbb880b5a919c1d32f8f.tar.gz invidious-21e29411af3529200cf8bbb880b5a919c1d32f8f.tar.bz2 invidious-21e29411af3529200cf8bbb880b5a919c1d32f8f.zip