// Copyright 2024 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "components/url_deduplication/docs_url_strip_handler.h"

#include <string>
#include <vector>

#include "base/containers/fixed_flat_set.h"
#include "base/containers/lru_cache.h"
#include "base/no_destructor.h"
#include "base/strings/escape.h"
#include "base/strings/string_util.h"
#include "components/url_formatter/url_formatter.h"
#include "net/base/url_util.h"
#include "third_party/re2/src/re2/re2.h"
#include "url/gurl.h"
// TODO(crbug.com/353966074) There is a plan to avoid/consolidate any
// duplicated code as this borrows from:
// components/omnibox/browser/document_provider.cc
namespace {
// Verify if the host could possibly be for a valid doc URL. This is a more
// lightweight check than `ExtractDocIdFromUrl()`. It can be done before
// unescaping the URL as valid hosts don't contain escapable chars; unescaping
// is relatively expensive. E.g., 'docs.google.com' isn't a valid doc URL, but
// its host looks like it could be, so return true. On the other hand,
// 'google.com' is definitely not a doc URL, so return false.
bool ValidHostPrefix(const std::string& host) {
// There are 66 (6*11) valid hosts, e.g. 'docs5.google.com', so rather than
// check all 66, we just check the 6 prefixes. Keep these prefixes consistent
// with those in `ExtractDocIdFromUrl()`.
constexpr auto kValidHostPrefixes = base::MakeFixedFlatSet<std::string_view>({
"spreadsheets",
"docs",
"drive",
"script",
"sites",
"jamboard",
});
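// The prefix check is intentionally loose (e.g. 'docsfoo.google.com' also
// passes); the regex in `ExtractDocIdFromUrl()` still requires an exact
// '<prefix>[0-9]?.google.com'-style host.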
for (const auto& valid_host_prefix : kValidHostPrefixes) {
if (base::StartsWith(host, valid_host_prefix,
base::CompareCase::INSENSITIVE_ASCII)) {
return true;
}
}
return false;
}

// Derived from google3/apps/share/util/docs_url_extractor.cc.
std::string ExtractDocIdFromUrl(const std::string& url) {
static const base::NoDestructor<RE2> docs_url_pattern(
"\\b(" // The first groups matches the whole URL.
// Domain.
"(?:https?://)?(?:"
// Keep the hosts consistent with `ValidHostPrefix()`.
"spreadsheets|docs|drive|script|sites|jamboard"
")[0-9]?\\.google\\.com"
"(?::[0-9]+)?\\/" // Port.
"(?:\\S*)" // Non-whitespace chars.
"(?:"
// Doc url prefix to match /d/{id}. (?:e/)? deviates from google3.
"(?:/d/(?:e/)?(?P<path_docid>[0-9a-zA-Z\\-\\_]+))"
"|"
// Docs id expr to match a valid id parameter.
"(?:(?:\\?|&|&amp;)"
"(?:id|docid|key|docID|DocId)=(?P<query_docid>[0-9a-zA-Z\\-\\_]+))"
"|"
// Folder url prefix to match /folders/{folder_id}.
"(?:/folders/(?P<folder_docid>[0-9a-zA-Z\\-\\_]+))"
"|"
// Sites url prefix.
"(?:/?s/)(?P<sites_docid>[0-9a-zA-Z\\-\\_]+)"
"(?:/p/[0-9a-zA-Z\\-\\_]+)?/edit"
"|"
// Jam url.
"(?:d/)(?P<jam_docid>[0-9a-zA-Z\\-\\_]+)/(?:edit|viewer)"
")"
// Other valid chars.
"(?:[0-9a-zA-Z$\\-\\_\\.\\+\\!\\*\'\\,;:@&=/\\?]*)"
// Summarization details.
"(?:summarizationDetails=[0-9a-zA-Z$\\-\\_\\.\\+\\!\\*\'\\,;:@&=/"
"\\?(?:%5B)(?:%5D)]*)?"
// Other valid chars.
"(?:[0-9a-zA-Z$\\-\\_\\.\\+\\!\\*\'\\,;:@&=/\\?]*)"
"(?:(#[0-9a-zA-Z$\\-\\_\\.\\+\\!\\*\'\\,;:@&=/\\?]+)?)" // Fragment
")");
std::vector<std::string_view> matched_doc_ids(
docs_url_pattern->NumberOfCapturingGroups() + 1);
// ANCHOR_START deviates from google3, which uses UNANCHORED. Using
// ANCHOR_START prevents incorrectly matching non-drive URLs that merely
// contain a drive URL; e.g.,
// url-parser.com/?url=https://docs.google.com/document/d/(id)/edit.
if (!docs_url_pattern->Match(url, 0, url.size(), RE2::ANCHOR_START,
matched_doc_ids.data(),
matched_doc_ids.size())) {
return std::string();
}
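// Only one alternative of the pattern participates in a match, so return
// whichever named group (path_docid, query_docid, folder_docid, sites_docid,
// or jam_docid) captured text.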
for (const auto& doc_id_group : docs_url_pattern->NamedCapturingGroups()) {
std::string_view identified_doc_id = matched_doc_ids[doc_id_group.second];
if (!identified_doc_id.empty()) {
return std::string(identified_doc_id);
}
}
return std::string();
}

}  // namespace

namespace url_deduplication {

GURL DocsURLStripHandler::StripExtraParams(GURL url) {
if (!url.is_valid()) {
return GURL();
}
// A memoization cache. Only updated if `ExtractDocIdFromUrl()` was attempted.
// That's the most expensive part of this algorithm, and memoizing the earlier
// trivial checks would worsen performance by pushing out more useful cache
// entries.
static base::NoDestructor<base::LRUCache<GURL, GURL>> cache(10);
const auto& cached = cache->Get(url);
if (cached != cache->end()) {
return cached->second;
}
// Early exit to avoid unnecessary and more involved checks. Don't update the
// cache for trivial cases to avoid pushing out a more useful entry.
if (!url.DomainIs("google.com")) {
return GURL();
}
// We aim to prevent duplicate Drive URLs from appearing between the Drive
// document search provider and history/bookmark entries.
// All URLs are canonicalized to a GURL form only used for deduplication and
// not guaranteed to be usable for navigation.
// Drive redirects are already handled by the regex in `ExtractDocIdFromUrl()`.
// The logic below handles google.com redirects, e.g. google.com/url?q=<url>.
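// `url_str` is the URL that may contain a doc ID and `url_str_host` is its
// host: either the redirect target pulled from the query, or `url` itself.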
std::string url_str;
std::string url_str_host;
if (url.host() == "www.google.com" && url.path() == "/url") {
if ((!net::GetValueForKeyInQuery(url, "q", &url_str) || url_str.empty()) &&
(!net::GetValueForKeyInQuery(url, "url", &url_str) ||
url_str.empty())) {
return GURL();
}
url_str_host = GURL(url_str).host();
} else {
url_str = url.spec();
url_str_host = url.host();
}
// Recheck the domain, since a google URL could redirect to a non-google URL.
if (!base::EndsWith(url_str_host, "google.com",
base::CompareCase::INSENSITIVE_ASCII)) {
return GURL();
}
// Filter out non-doc hosts. Do this before unescaping the URL below, as
// unescaping can be expensive and valid hosts don't contain escapable chars.
// Do this after simplifying the google.com redirect above, as that changes
// the host.
if (!ValidHostPrefix(url_str_host)) {
return GURL();
}
// Unescape `url_str`.
url_str = base::UnescapeURLComponent(
url_str,
base::UnescapeRule::PATH_SEPARATORS |
base::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS);
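// E.g. '%2F' becomes '/', so a percent-escaped doc URL can still match the
// regex in `ExtractDocIdFromUrl()`.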
const std::string id = ExtractDocIdFromUrl(url_str);
// Canonicalize to the /open form without any extra args.
// This is similar to what we expect from the server.
GURL deduping_url =
id.empty() ? GURL() : GURL("https://drive.google.com/open?id=" + id);
cache->Put(url, deduping_url);
return deduping_url;
}

}  // namespace url_deduplication