summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorSamantaz Fox <coding@samantaz.fr>2024-08-24 19:44:59 +0200
committerSamantaz Fox <coding@samantaz.fr>2024-08-24 19:44:59 +0200
commit3c6a662aaffe05b534da9c6cb2ed1b36e120d71a (patch)
tree4619b3f9c07d0abd0cb978420205d35dac5aa630
parent9e55799269dba8a71babd4f99ead62bd9f43044e (diff)
parent85deea5aca4877507bb8850e5e3e168d968328ad (diff)
downloadinvidious-3c6a662aaffe05b534da9c6cb2ed1b36e120d71a.tar.gz
invidious-3c6a662aaffe05b534da9c6cb2ed1b36e120d71a.tar.bz2
invidious-3c6a662aaffe05b534da9c6cb2ed1b36e120d71a.zip
Search: Add support for Youtube URLs (#4146)
Closes issue 3300
-rw-r--r--src/invidious/routes/search.cr6
-rw-r--r--src/invidious/search/query.cr38
-rw-r--r--src/invidious/yt_backend/url_sanitizer.cr121
3 files changed, 160 insertions, 5 deletions
diff --git a/src/invidious/routes/search.cr b/src/invidious/routes/search.cr
index 5be33533..85aa1c7e 100644
--- a/src/invidious/routes/search.cr
+++ b/src/invidious/routes/search.cr
@@ -51,6 +51,12 @@ module Invidious::Routes::Search
else
user = env.get? "user"
+ # A URL was copy/pasted in the search box.
+ # Redirect the user to the appropriate page.
+ if query.is_url?
+ return env.redirect UrlSanitizer.process(query.text).to_s
+ end
+
begin
items = query.process
rescue ex : ChannelSearchException
diff --git a/src/invidious/search/query.cr b/src/invidious/search/query.cr
index e38845d9..a93bb3f9 100644
--- a/src/invidious/search/query.cr
+++ b/src/invidious/search/query.cr
@@ -20,6 +20,9 @@ module Invidious::Search
property region : String?
property channel : String = ""
+ # Flag that indicates if the smart search features have been disabled.
+ @inhibit_ssf : Bool = false
+
# Return true if @raw_query is either `nil` or empty
private def empty_raw_query?
return @raw_query.empty?
@@ -48,10 +51,18 @@ module Invidious::Search
)
# Get the raw search query string (common to all search types). In
# Regular search mode, also look for the `search_query` URL parameter
- if @type.regular?
- @raw_query = params["q"]? || params["search_query"]? || ""
- else
- @raw_query = params["q"]? || ""
+ _raw_query = params["q"]?
+ _raw_query ||= params["search_query"]? if @type.regular?
+ _raw_query ||= ""
+
+ # Remove surrounding whitespace. Mostly useful for copy/pasted URLs.
+ @raw_query = _raw_query.strip
+
+ # Check for smart features (ex: URL search) inhibitor (backslash).
+ # If inhibitor is present, remove it.
+ if @raw_query.starts_with?('\\')
+ @inhibit_ssf = true
+ @raw_query = @raw_query[1..]
end
# Get the page number (also common to all search types)
@@ -85,7 +96,7 @@ module Invidious::Search
@filters = Filters.from_iv_params(params)
@channel = params["channel"]? || ""
- if @filters.default? && @raw_query.includes?(':')
+ if @filters.default? && @raw_query.index(/\w:\w/)
# Parse legacy filters from query
@filters, @channel, @query, subs = Filters.from_legacy_filters(@raw_query)
else
@@ -136,5 +147,22 @@ module Invidious::Search
return params
end
+
+    # Tells whether the query looks like a standalone YouTube URL
+    # (as opposed to a regular text search).
+    def is_url? : Bool
+      # URL detection only applies when the smart search features have not
+      # been inhibited, the search is a regular one, and no filters are set.
+      eligible = !@inhibit_ssf && @type.regular? && @filters.default?
+      return false if !eligible
+
+      # Simple heuristics: the query has to start with a YouTube domain name
+      # (optional scheme, optional "www." and "m." prefixes) followed by '/'.
+      youtube_domain = /(https?:\/\/)?(www\.)?(m\.)?youtu(\.be|be\.com)\//
+      return @raw_query.starts_with?(youtube_domain)
+    end
end
end
diff --git a/src/invidious/yt_backend/url_sanitizer.cr b/src/invidious/yt_backend/url_sanitizer.cr
new file mode 100644
index 00000000..725382ee
--- /dev/null
+++ b/src/invidious/yt_backend/url_sanitizer.cr
@@ -0,0 +1,121 @@
+require "uri"
+
+# Turns user-supplied YouTube URLs into relative URIs (path + query only)
+# made exclusively of vetted path components and query parameters.
+module UrlSanitizer
+  extend self
+
+  # Query parameters that are kept, grouped by the kind of page the URL
+  # points to. Any parameter not listed here is dropped by `copy_params`.
+  ALLOWED_QUERY_PARAMS = {
+    channel:  ["u", "user", "lb"],
+    playlist: ["list"],
+    search:   ["q", "search_query", "sp"],
+    watch:    [
+      "v",                                  # Video ID
+      "list", "index",                      # Playlist-related
+      "playlist",                           # Unnamed playlist (id,id,id,...) (embed-only?)
+      "t", "time_continue", "start", "end", # Timestamp
+      "lc",                                 # Highlighted comment (watch page only)
+    ],
+  }
+
+  # Returns whether the given string is an ASCII word. This is the same as
+  # running the following regex in US-ASCII locale: /^[\w-]+$/
+  # An empty string is considered to be a word by this method.
+  private def ascii_word?(str : String) : Bool
+    # A string containing multi-byte characters can't be pure ASCII.
+    return false if str.bytesize != str.size
+
+    str.each_byte do |byte|
+      next if 'a'.ord <= byte <= 'z'.ord
+      next if 'A'.ord <= byte <= 'Z'.ord
+      next if '0'.ord <= byte <= '9'.ord
+      next if byte == '-'.ord || byte == '_'.ord
+
+      return false
+    end
+
+    return true
+  end
+
+  # Return which kind of parameters are allowed based on the
+  # first path component (breadcrumb 0). The returned symbol is a key of
+  # `ALLOWED_QUERY_PARAMS`; `nil` means that no query parameter is kept.
+  private def determine_allowed(path_root : String)
+    case path_root
+    when "watch", "w", "v", "embed", "e", "shorts", "clip"
+      return :watch
+    when .starts_with?("@"), "c", "channel", "user", "profile", "attribution_link"
+      return :channel
+    when "playlist", "mix"
+      return :playlist
+    when "results", "search"
+      return :search
+    else # hashtag, post, trending, brand URLs, etc..
+      return nil
+    end
+  end
+
+  # Create a new URI::Params containing only the parameters allowed for
+  # the given page kind (a key of `ALLOWED_QUERY_PARAMS`).
+  private def copy_params(unsafe_params : URI::Params, allowed_type) : URI::Params
+    new_params = URI::Params.new
+
+    ALLOWED_QUERY_PARAMS[allowed_type].each do |name|
+      if unsafe_params[name]?
+        # Only copy the last parameter, in case there is more than one
+        new_params[name] = unsafe_params.fetch_all(name)[-1]
+      end
+    end
+
+    return new_params
+  end
+
+  # Transform any user-supplied youtube URL into something we can trust
+  # and use across the code.
+  #
+  # The returned URI only has a path and (optionally) query parameters.
+  # Anything that can't be recognized (missing host, unsupported host,
+  # nothing left of the path after filtering) results in a plain "/".
+  def process(str : String) : URI
+    # Because URI follows RFC3986 specifications, URL without a scheme
+    # will be parsed as a relative path. So we have to add a scheme ourselves.
+    str = "https://#{str}" if !str.starts_with?(/https?:\/\//)
+
+    unsafe_uri = URI.parse(str)
+    unsafe_host = unsafe_uri.host
+    unsafe_path = unsafe_uri.path
+
+    new_uri = URI.new(path: "/")
+
+    # Redirect to homepage for bogus URLs
+    return new_uri if (unsafe_host.nil? || unsafe_path.nil?)
+
+    breadcrumbs = unsafe_path
+      .split('/', remove_empty: true)
+      .compact_map do |bc|
+        # Exclude attempts at path traversal
+        next if bc == "." || bc == ".."
+
+        # Non-alnum characters are unlikely in a genuine URL
+        next if !ascii_word?(bc)
+
+        bc
+      end
+
+    # If nothing remains, it's either a legit URL to the homepage
+    # (who does that!?) or because we filtered some junk earlier.
+    return new_uri if breadcrumbs.empty?
+
+    # Replace the original query parameters with the sanitized ones.
+    # Hosts are matched on a domain label boundary, so that look-alike
+    # domains (e.g "notyoutube.com") are not mistaken for YouTube.
+    case unsafe_host
+    when "youtube.com", .ends_with?(".youtube.com")
+      # Use our sanitized path (not forgetting the leading '/')
+      new_uri.path = "/#{breadcrumbs.join('/')}"
+
+      # Then determine which params are allowed, and copy them over
+      if allowed = determine_allowed(breadcrumbs[0])
+        new_uri.query_params = copy_params(unsafe_uri.query_params, allowed)
+      end
+    when "youtu.be", .ends_with?(".youtu.be")
+      # Always redirect to the watch page
+      new_uri.path = "/watch"
+
+      # The video ID is the first path component of a short link. It has
+      # to be passed as "v", the video ID parameter of the watch page
+      # (see ALLOWED_QUERY_PARAMS), and overrides any "v" query parameter.
+      new_params = copy_params(unsafe_uri.query_params, :watch)
+      new_params["v"] = breadcrumbs[0]
+
+      new_uri.query_params = new_params
+    end
+
+    return new_uri
+  end
+end