// blob: 5d33c5b84c410652a1631086d79e36a35592dada [file] [log] [blame]
// Copyright 2018 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "components/url_matcher/url_util.h"
#include <memory>
#include <string>
#include "base/files/file_path.h"
#include "base/logging.h"
#include "base/no_destructor.h"
#include "base/strings/escape.h"
#include "base/strings/string_number_conversions.h"
#include "base/strings/string_util.h"
#include "components/google/core/common/google_util.h"
#include "components/url_formatter/url_fixer.h"
#include "components/url_matcher/url_matcher.h"
#include "net/base/filename_util.h"
#include "net/base/url_util.h"
#include "third_party/re2/src/re2/re2.h"
#include "url/gurl.h"
using url_matcher::URLMatcher;
using url_matcher::URLMatcherCondition;
using url_matcher::URLMatcherConditionFactory;
using url_matcher::URLMatcherConditionSet;
using url_matcher::URLMatcherPortFilter;
using url_matcher::URLMatcherSchemeFilter;
using url_matcher::URLQueryElementMatcherCondition;
namespace url_matcher {
namespace util {
namespace {
// Host/regex pattern for Google AMP Cache URLs.
// See https://developers.google.com/amp/cache/overview#amp-cache-url-format
// for a definition of the format of AMP Cache URLs.
const char kGoogleAmpCacheHost[] = "cdn.ampproject.org";
// Capture group 1 is the optional "s/" marker; capture group 2 is the rest of
// the path, which is treated as the embedded host and path.
const char kGoogleAmpCachePathPattern[] = "/[a-z]/(s/)?(.*)";
// Regex pattern for the path of Google AMP Viewer URLs. Uses the same two
// capture groups as the AMP Cache pattern: optional "s/" and the remainder.
const char kGoogleAmpViewerPathPattern[] = "/amp/(s/)?(.*)";
// Host, path prefix, and query regex pattern for Google web cache URLs.
const char kGoogleWebCacheHost[] = "webcache.googleusercontent.com";
const char kGoogleWebCachePathPrefix[] = "/search";
// Capture groups, in order: an optional 12-character fingerprint followed by
// ':', an optional "http://" or "https://" scheme, the embedded URL (no spaces
// or colons), and an optional trailing part that starts with a space.
const char kGoogleWebCacheQueryPattern[] =
"cache:(.{12}:)?(https?://)?([^ :]*)( [^:]*)?";
// Host prefix that is stripped before checking whether the remainder of a
// translate URL's host is a Google domain.
const char kGoogleTranslateSubdomain[] = "translate.";
// Exact host that is also treated as a Google translate host.
const char kAlternateGoogleTranslateHost[] = "translate.googleusercontent.com";
// Maximum filters allowed. Filters over this index are ignored.
const size_t kMaxFiltersAllowed = 1000;
// Assembles "<scheme>://<host_and_path>" into a GURL, where the scheme is
// "https" when |is_https| is true and "http" otherwise.
GURL BuildURL(bool is_https, const std::string& host_and_path) {
  std::string spec = is_https ? url::kHttpsScheme : url::kHttpScheme;
  spec += "://";
  spec += host_and_path;
  return GURL(spec);
}
// Parses the key/value pairs of |query| and inserts one
// URLQueryElementMatcherCondition per pair into |query_conditions|.
//
// |allow| selects how repeated keys are treated: a block-list filter blocks a
// URL if any occurrence of the key/value pair is present, whereas an
// allow-list filter only allows a URL if every occurrence of the key carries
// the value. This prevents a user from appending an allowed parameter to the
// end of the query while a different, earlier value is the one actually
// honoured (some web servers, like youtube's, ignore the last parameter).
//
// A trailing '*' on the value (or on the key, when there is no value) is
// stripped and turns the condition into a prefix match.
void ProcessQueryToConditions(
    url_matcher::URLMatcherConditionFactory* condition_factory,
    const std::string& query,
    bool allow,
    std::set<URLQueryElementMatcherCondition>* query_conditions) {
  using Condition = URLQueryElementMatcherCondition;

  const Condition::Type match_type =
      allow ? Condition::MATCH_ALL : Condition::MATCH_ANY;

  url::Component remaining = url::MakeRange(0, query.length());
  url::Component key;
  url::Component value;
  while (ExtractQueryKeyValue(query.data(), &remaining, &key, &value)) {
    // The element type is decided before any trailing '*' is trimmed, so a
    // value consisting solely of "*" still counts as a key/value element.
    const Condition::QueryElementType element_type =
        value.len ? Condition::ELEMENT_TYPE_KEY_VALUE
                  : Condition::ELEMENT_TYPE_KEY;

    // Exact matching unless a trailing wildcard is found below.
    Condition::QueryValueMatchType value_match_type =
        Condition::QUERY_VALUE_MATCH_EXACT;
    if (!value.len && key.len && query[key.end() - 1] == '*') {
      --key.len;
      value_match_type = Condition::QUERY_VALUE_MATCH_PREFIX;
    } else if (value.len && query[value.end() - 1] == '*') {
      --value.len;
      value_match_type = Condition::QUERY_VALUE_MATCH_PREFIX;
    }

    query_conditions->insert(
        Condition(query.substr(key.begin, key.len),
                  query.substr(value.begin, value.len), value_match_type,
                  element_type, match_type, condition_factory));
  }
}
// Helper class for testing the URL against precompiled regexes. This is a
// singleton so the cached regexes are only created once.
class EmbeddedURLExtractor {
public:
EmbeddedURLExtractor(const EmbeddedURLExtractor&) = delete;
EmbeddedURLExtractor& operator=(const EmbeddedURLExtractor&) = delete;
// Returns the single shared instance, which is created as a function-local
// static on first use.
static EmbeddedURLExtractor* GetInstance() {
static base::NoDestructor<EmbeddedURLExtractor> instance;
return instance.get();
}
// Implements url_matcher::util::GetEmbeddedURL().
// Tries each supported wrapper format in turn (AMP cache, AMP viewer, web
// cache, translate) and returns the URL embedded in |url| for the first one
// that matches. Returns an empty GURL if none match.
GURL GetEmbeddedURL(const GURL& url) {
// Check for "*.cdn.ampproject.org" URLs.
if (url.DomainIs(kGoogleAmpCacheHost)) {
// |s| receives the optional "s/" path marker; a non-empty |s| selects https.
std::string s;
std::string embedded;
if (re2::RE2::FullMatch(url.path(), google_amp_cache_path_regex_, &s,
&embedded)) {
// Carry the outer URL's query over to the embedded URL.
if (url.has_query())
embedded += "?" + url.query();
return BuildURL(!s.empty(), embedded);
}
}
// Check for "www.google.TLD/amp/" URLs.
if (google_util::IsGoogleDomainUrl(
url, google_util::DISALLOW_SUBDOMAIN,
google_util::DISALLOW_NON_STANDARD_PORTS)) {
// As above, a non-empty |s| (the "s/" marker) selects https.
std::string s;
std::string embedded;
if (re2::RE2::FullMatch(url.path(), google_amp_viewer_path_regex_, &s,
&embedded)) {
// The embedded URL may be percent-encoded. Undo that.
embedded = base::UnescapeBinaryURLComponent(embedded);
return BuildURL(!s.empty(), embedded);
}
}
// Check for Google web cache URLs
// ("webcache.googleusercontent.com/search?q=cache:...").
std::string query;
if (url.host_piece() == kGoogleWebCacheHost &&
base::StartsWith(url.path_piece(), kGoogleWebCachePathPrefix) &&
net::GetValueForKeyInQuery(url, "q", &query)) {
// |fingerprint| is captured but not used; only |scheme| and |embedded|
// contribute to the result.
std::string fingerprint;
std::string scheme;
std::string embedded;
if (re2::RE2::FullMatch(query, google_web_cache_query_regex_,
&fingerprint, &scheme, &embedded)) {
// Any scheme other than "https://" (including none) yields http.
return BuildURL(scheme == "https://", embedded);
}
}
// Check for Google translate URLs ("translate.google.TLD/...?...&u=URL" or
// "translate.googleusercontent.com/...?...&u=URL").
bool is_translate = false;
if (base::StartsWith(url.host_piece(), kGoogleTranslateSubdomain)) {
// Remove the "translate." prefix.
GURL::Replacements replace;
replace.SetHostStr(
url.host_piece().substr(strlen(kGoogleTranslateSubdomain)));
GURL trimmed = url.ReplaceComponents(replace);
// Check that the remainder is a Google URL. Note: IsGoogleDomainUrl
// checks for [www.]google.TLD, but we don't want the "www.", so
// explicitly exclude that.
// TODO(treib,pam): Instead of excluding "www." manually, teach
// IsGoogleDomainUrl a mode that doesn't allow it.
is_translate = google_util::IsGoogleDomainUrl(
trimmed, google_util::DISALLOW_SUBDOMAIN,
google_util::DISALLOW_NON_STANDARD_PORTS) &&
!base::StartsWith(trimmed.host_piece(), "www.");
}
bool is_alternate_translate =
url.host_piece() == kAlternateGoogleTranslateHost;
if (is_translate || is_alternate_translate) {
std::string embedded;
if (net::GetValueForKeyInQuery(url, "u", &embedded)) {
// The embedded URL may or may not include a scheme. Fix it if
// necessary.
return url_formatter::FixupURL(embedded, /*desired_tld=*/std::string());
}
}
// No supported wrapper format matched.
return GURL();
}
private:
friend class base::NoDestructor<EmbeddedURLExtractor>;
// Compiles the three regexes once and DCHECKs that each compiled
// successfully.
EmbeddedURLExtractor()
: google_amp_cache_path_regex_(kGoogleAmpCachePathPattern),
google_amp_viewer_path_regex_(kGoogleAmpViewerPathPattern),
google_web_cache_query_regex_(kGoogleWebCacheQueryPattern) {
DCHECK(google_amp_cache_path_regex_.ok());
DCHECK(google_amp_viewer_path_regex_.ok());
DCHECK(google_web_cache_query_regex_.ok());
}
~EmbeddedURLExtractor() = default;
// Precompiled regexes, one per pattern constant passed in the constructor.
const re2::RE2 google_amp_cache_path_regex_;
const re2::RE2 google_amp_viewer_path_regex_;
const re2::RE2 google_web_cache_query_regex_;
};
} // namespace
// Appends every element of the list |value| to |out| as a string. Returns
// false if |value| is not a list or if a non-string element is encountered;
// in the latter case the strings appended before that element stay in |out|.
bool GetAsStringVector(const base::Value* value,
                       std::vector<std::string>* out) {
  if (!value->is_list()) {
    return false;
  }

  for (const base::Value& entry : value->GetListDeprecated()) {
    if (!entry.is_string()) {
      return false;
    }
    out->push_back(entry.GetString());
  }
  return true;
}
// Returns a copy of |url| with the username, password, query, and ref
// components cleared. All other components are left as they are in |url|.
GURL Normalize(const GURL& url) {
  GURL::Replacements replacements;
  // Strip username, password, query, and ref.
  replacements.ClearUsername();
  replacements.ClearPassword();
  replacements.ClearQuery();
  replacements.ClearRef();
  return url.ReplaceComponents(replacements);
}
// Returns the URL embedded in |url| as determined by the shared
// EmbeddedURLExtractor instance, or an empty GURL if it finds none.
GURL GetEmbeddedURL(const GURL& url) {
  EmbeddedURLExtractor* const extractor = EmbeddedURLExtractor::GetInstance();
  return extractor->GetEmbeddedURL(url);
}
// Exposes the file-local cap on how many filter patterns are processed by a
// single AddFilters() call.
size_t GetMaxFiltersAllowed() {
  const size_t max_filters = kMaxFiltersAllowed;
  return max_filters;
}
// Out-of-line definitions of FilterComponents' defaulted default constructor,
// destructor, and move constructor.
FilterComponents::FilterComponents() = default;
FilterComponents::~FilterComponents() = default;
FilterComponents::FilterComponents(FilterComponents&&) = default;
// Returns true when the filter places no restriction on any component: host,
// scheme, path, and query are all empty, no port is set, there are no URL
// matching conditions, and subdomain matching is enabled.
bool FilterComponents::IsWildcard() const {
  const bool no_url_parts =
      host.empty() && scheme.empty() && path.empty() && query.empty();
  const bool no_port_or_conditions =
      port == 0 && number_of_url_matching_conditions == 0;
  return no_url_parts && no_port_or_conditions && match_subdomains;
}
// Builds the URLMatcherConditionSet identified by |id| from already-parsed
// filter components:
//  - |host| and |path| become a single host/path-prefix condition. The host is
//    matched as a suffix when |match_subdomains| is true and exactly
//    otherwise.
//  - A non-empty |query| is turned into query element conditions; |allow|
//    selects how those conditions treat repeated keys.
//  - A non-empty |scheme| adds a scheme filter.
//  - A non-zero |port| adds a port filter for that single port.
scoped_refptr<URLMatcherConditionSet> CreateConditionSet(
    URLMatcher* url_matcher,
    base::MatcherStringPattern::ID id,
    const std::string& scheme,
    const std::string& host,
    bool match_subdomains,
    uint16_t port,
    const std::string& path,
    const std::string& query,
    bool allow) {
  URLMatcherConditionFactory* const factory = url_matcher->condition_factory();

  std::set<URLMatcherCondition> url_conditions;
  if (match_subdomains) {
    url_conditions.insert(
        factory->CreateHostSuffixPathPrefixCondition(host, path));
  } else {
    url_conditions.insert(
        factory->CreateHostEqualsPathPrefixCondition(host, path));
  }

  std::set<URLQueryElementMatcherCondition> query_element_conditions;
  if (!query.empty()) {
    ProcessQueryToConditions(factory, query, allow, &query_element_conditions);
  }

  // A null filter means "no restriction" for the scheme and port.
  std::unique_ptr<URLMatcherSchemeFilter> scheme_filter;
  if (!scheme.empty()) {
    scheme_filter = std::make_unique<URLMatcherSchemeFilter>(scheme);
  }

  std::unique_ptr<URLMatcherPortFilter> port_filter;
  if (port != 0) {
    std::vector<URLMatcherPortFilter::Range> port_ranges;
    port_ranges.push_back(URLMatcherPortFilter::CreateRange(port));
    port_filter = std::make_unique<URLMatcherPortFilter>(port_ranges);
  }

  return base::MakeRefCounted<URLMatcherConditionSet>(
      id, url_conditions, query_element_conditions, std::move(scheme_filter),
      std::move(port_filter));
}
// Splits the URL filter pattern |filter| into its components.
//
// On success returns true and sets every output:
//   |scheme|            - the filter's scheme, or empty to match any scheme.
//   |host|              - the lower-cased host to match, or empty to match any
//                         host. When subdomains are matched, a leading '.' is
//                         prepended so matches only occur at domain component
//                         boundaries.
//   |match_subdomains|  - whether subdomains of |host| also match.
//   |port|              - the port to match, or 0 to match any port.
//   |path|              - the path prefix to match, possibly empty.
//   |query|             - the query part of the filter, possibly empty.
//
// Returns false if |filter| is not a valid pattern; in that case the outputs
// may have been partially modified and must not be relied upon.
bool FilterToComponents(const std::string& filter,
                        std::string* scheme,
                        std::string* host,
                        bool* match_subdomains,
                        uint16_t* port,
                        std::string* path,
                        std::string* query) {
  DCHECK(scheme);
  DCHECK(host);
  DCHECK(match_subdomains);
  DCHECK(port);
  DCHECK(path);
  DCHECK(query);
  url::Parsed parsed;
  const std::string lc_filter = base::ToLowerASCII(filter);
  const std::string url_scheme = url_formatter::SegmentURL(filter, &parsed);

  // Check if it's a scheme wildcard pattern. We support both versions
  // (scheme:* and scheme://*), the latter being consistent with old filter
  // definitions.
  if (lc_filter == url_scheme + ":*" || lc_filter == url_scheme + "://*") {
    scheme->assign(url_scheme);
    host->clear();
    *match_subdomains = true;
    *port = 0;
    path->clear();
    query->clear();
    return true;
  }

  if (url_scheme == url::kFileScheme) {
    base::FilePath file_path;
    if (!net::FileURLToFilePath(GURL(filter), &file_path))
      return false;

    *scheme = url::kFileScheme;
    host->clear();
    *match_subdomains = true;
    *port = 0;
    *path = file_path.AsUTF8Unsafe();
    // File filters carry no query. Clear it explicitly so that a stale value
    // in the caller's string cannot leak into the result, matching the other
    // success paths which always set every output.
    query->clear();
#if defined(FILE_PATH_USES_WIN_SEPARATORS)
    // Separators have to be canonicalized on Windows.
    std::replace(path->begin(), path->end(), '\\', '/');
    *path = "/" + *path;
#endif
    return true;
  }

  // According to documentation host can't be empty.
  if (!parsed.host.is_nonempty())
    return false;

  if (parsed.scheme.is_nonempty())
    scheme->assign(url_scheme);
  else
    scheme->clear();

  host->assign(filter, parsed.host.begin, parsed.host.len);
  *host = base::ToLowerASCII(*host);
  // Special '*' host, matches all hosts.
  if (*host == "*") {
    host->clear();
    *match_subdomains = true;
  } else if (host->at(0) == '.') {
    // A leading dot in the pattern syntax means that we don't want to match
    // subdomains.
    host->erase(0, 1);
    *match_subdomains = false;
  } else {
    url::RawCanonOutputT<char> output;
    url::CanonHostInfo host_info;
    url::CanonicalizeHostVerbose(filter.c_str(), parsed.host, &output,
                                 &host_info);
    if (host_info.family == url::CanonHostInfo::NEUTRAL) {
      // We want to match subdomains. Add a dot in front to make sure we only
      // match at domain component boundaries.
      *host = "." + *host;
      *match_subdomains = true;
    } else {
      // Any other host family is matched exactly, without subdomains.
      *match_subdomains = false;
    }
  }

  if (parsed.port.is_nonempty()) {
    int int_port;
    if (!base::StringToInt(filter.substr(parsed.port.begin, parsed.port.len),
                           &int_port)) {
      return false;
    }
    // 0 is reserved to mean "any port", so an explicit port must be in
    // [1, 65535].
    if (int_port <= 0 || int_port > std::numeric_limits<uint16_t>::max())
      return false;
    *port = int_port;
  } else {
    // Match any port.
    *port = 0;
  }

  if (parsed.path.is_nonempty())
    path->assign(filter, parsed.path.begin, parsed.path.len);
  else
    path->clear();

  if (parsed.query.is_nonempty())
    query->assign(filter, parsed.query.begin, parsed.query.len);
  else
    query->clear();

  return true;
}
// Parses up to kMaxFiltersAllowed entries of |patterns| as URL filters and
// registers one condition set per valid pattern with |matcher|.
//
// |allow| marks the filters as allow-list (true) or block-list (false)
// entries. |id| is pre-incremented for every valid pattern and the new value
// is used as that pattern's condition set id, so on return it holds the last
// id handed out. Patterns that fail to parse are logged and skipped without
// consuming an id. If |filters| is non-null, the parsed components of each
// valid pattern are stored in it under the pattern's id.
//
// Every entry of |patterns| is expected to be a string.
void AddFilters(URLMatcher* matcher,
                bool allow,
                base::MatcherStringPattern::ID* id,
                const base::Value::List& patterns,
                std::map<base::MatcherStringPattern::ID,
                         url_matcher::util::FilterComponents>* filters) {
  URLMatcherConditionSet::Vector all_conditions;
  const size_t size = std::min(kMaxFiltersAllowed, patterns.size());
  for (size_t i = 0; i < size; ++i) {
    DCHECK(patterns[i].is_string());
    // Bind by reference: |patterns| outlives the loop, so there is no need to
    // copy each pattern string.
    const std::string& pattern = patterns[i].GetString();
    FilterComponents components;
    components.allow = allow;
    if (!FilterToComponents(pattern, &components.scheme, &components.host,
                            &components.match_subdomains, &components.port,
                            &components.path, &components.query)) {
      LOG(ERROR) << "Invalid pattern " << pattern;
      continue;
    }

    scoped_refptr<URLMatcherConditionSet> condition_set =
        CreateConditionSet(matcher, ++(*id), components.scheme, components.host,
                           components.match_subdomains, components.port,
                           components.path, components.query, allow);
    if (filters) {
      components.number_of_url_matching_conditions =
          condition_set->query_conditions().size();
      (*filters)[*id] = std::move(components);
    }
    all_conditions.push_back(std::move(condition_set));
  }
  // Register everything in one batch.
  matcher->AddConditionSets(all_conditions);
}
// Parses up to kMaxFiltersAllowed entries of |patterns| as URL filters and
// registers one condition set per valid pattern with |matcher|.
//
// |allow| marks the filters as allow-list (true) or block-list (false)
// entries. |id| is pre-incremented for every valid pattern and the new value
// is used as that pattern's condition set id. Patterns that fail to parse are
// logged and skipped without consuming an id. If |filters| is non-null, the
// parsed components of each valid pattern are stored in it under the
// pattern's id.
void AddFilters(URLMatcher* matcher,
                bool allow,
                base::MatcherStringPattern::ID* id,
                const std::vector<std::string>& patterns,
                std::map<base::MatcherStringPattern::ID,
                         url_matcher::util::FilterComponents>* filters) {
  URLMatcherConditionSet::Vector condition_sets;
  const size_t limit = std::min(kMaxFiltersAllowed, patterns.size());
  for (size_t index = 0; index < limit; ++index) {
    const std::string& pattern = patterns[index];

    FilterComponents components;
    components.allow = allow;
    const bool is_valid = FilterToComponents(
        pattern, &components.scheme, &components.host,
        &components.match_subdomains, &components.port, &components.path,
        &components.query);
    if (!is_valid) {
      LOG(ERROR) << "Invalid pattern " << pattern;
      continue;
    }

    scoped_refptr<URLMatcherConditionSet> condition_set =
        CreateConditionSet(matcher, ++(*id), components.scheme, components.host,
                           components.match_subdomains, components.port,
                           components.path, components.query, allow);
    if (filters) {
      components.number_of_url_matching_conditions =
          condition_set->query_conditions().size();
      (*filters)[*id] = std::move(components);
    }
    condition_sets.push_back(std::move(condition_set));
  }
  // Register everything in one batch.
  matcher->AddConditionSets(condition_sets);
}
// Registers |patterns| with |matcher| as allow-list filters. Ids are handed
// out from a counter that starts at 0.
void AddAllowFilters(url_matcher::URLMatcher* matcher,
                     const base::Value::List& patterns) {
  base::MatcherStringPattern::ID next_id(0);
  AddFilters(matcher, /*allow=*/true, &next_id, patterns);
}
// Registers |patterns| with |matcher| as allow-list filters. Ids are handed
// out from a counter that starts at 0.
void AddAllowFilters(url_matcher::URLMatcher* matcher,
                     const std::vector<std::string>& patterns) {
  base::MatcherStringPattern::ID next_id(0);
  AddFilters(matcher, /*allow=*/true, &next_id, patterns);
}
} // namespace util
} // namespace url_matcher