1 // Copyright 2012 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "net/dns/dns_hosts.h"
6
7 #include <string>
8 #include <utility>
9
10 #include "base/check.h"
11 #include "base/files/file_path.h"
12 #include "base/files/file_util.h"
13 #include "base/metrics/histogram_functions.h"
14 #include "base/strings/string_piece.h"
15 #include "base/strings/string_util.h"
16 #include "base/trace_event/memory_usage_estimator.h"
17 #include "build/build_config.h"
18 #include "net/base/cronet_buildflags.h"
19 #include "net/base/url_util.h"
20 #include "net/dns/dns_util.h"
21 #include "url/url_canon.h"
22
23 using base::StringPiece;
24
25 namespace net {
26
27 namespace {
28
29 // Parses the contents of a hosts file. Returns one token (IP or hostname) at
30 // a time. Doesn't copy anything; accepts the file as a StringPiece and
31 // returns tokens as StringPieces.
32 class HostsParser {
33 public:
HostsParser(const StringPiece & text,ParseHostsCommaMode comma_mode)34 explicit HostsParser(const StringPiece& text, ParseHostsCommaMode comma_mode)
35 : text_(text),
36 data_(text.data()),
37 end_(text.size()),
38 comma_mode_(comma_mode) {}
39
40 HostsParser(const HostsParser&) = delete;
41 HostsParser& operator=(const HostsParser&) = delete;
42
43 // Advances to the next token (IP or hostname). Returns whether another
44 // token was available. |token_is_ip| and |token| can be used to find out
45 // the type and text of the token.
Advance()46 bool Advance() {
47 bool next_is_ip = (pos_ == 0);
48 while (pos_ < end_ && pos_ != std::string::npos) {
49 switch (text_[pos_]) {
50 case ' ':
51 case '\t':
52 SkipWhitespace();
53 break;
54
55 case '\r':
56 case '\n':
57 next_is_ip = true;
58 pos_++;
59 break;
60
61 case '#':
62 SkipRestOfLine();
63 break;
64
65 case ',':
66 if (comma_mode_ == PARSE_HOSTS_COMMA_IS_WHITESPACE) {
67 SkipWhitespace();
68 break;
69 }
70
71 // If comma_mode_ is COMMA_IS_TOKEN, fall through:
72 [[fallthrough]];
73
74 default: {
75 size_t token_start = pos_;
76 SkipToken();
77 size_t token_end = (pos_ == std::string::npos) ? end_ : pos_;
78
79 token_ = StringPiece(data_ + token_start, token_end - token_start);
80 token_is_ip_ = next_is_ip;
81
82 return true;
83 }
84 }
85 }
86
87 return false;
88 }
89
90 // Fast-forwards the parser to the next line. Should be called if an IP
91 // address doesn't parse, to avoid wasting time tokenizing hostnames that
92 // will be ignored.
SkipRestOfLine()93 void SkipRestOfLine() { pos_ = text_.find("\n", pos_); }
94
95 // Returns whether the last-parsed token is an IP address (true) or a
96 // hostname (false).
token_is_ip()97 bool token_is_ip() { return token_is_ip_; }
98
99 // Returns the text of the last-parsed token as a StringPiece referencing
100 // the same underlying memory as the StringPiece passed to the constructor.
101 // Returns an empty StringPiece if no token has been parsed or the end of
102 // the input string has been reached.
token()103 const StringPiece& token() { return token_; }
104
105 private:
SkipToken()106 void SkipToken() {
107 switch (comma_mode_) {
108 case PARSE_HOSTS_COMMA_IS_TOKEN:
109 pos_ = text_.find_first_of(" \t\n\r#", pos_);
110 break;
111 case PARSE_HOSTS_COMMA_IS_WHITESPACE:
112 pos_ = text_.find_first_of(" ,\t\n\r#", pos_);
113 break;
114 }
115 }
116
SkipWhitespace()117 void SkipWhitespace() {
118 switch (comma_mode_) {
119 case PARSE_HOSTS_COMMA_IS_TOKEN:
120 pos_ = text_.find_first_not_of(" \t", pos_);
121 break;
122 case PARSE_HOSTS_COMMA_IS_WHITESPACE:
123 pos_ = text_.find_first_not_of(" ,\t", pos_);
124 break;
125 }
126 }
127
128 const StringPiece text_;
129 const char* data_;
130 const size_t end_;
131
132 size_t pos_ = 0;
133 StringPiece token_;
134 bool token_is_ip_ = false;
135
136 const ParseHostsCommaMode comma_mode_;
137 };
138
ParseHostsWithCommaMode(const std::string & contents,DnsHosts * dns_hosts,ParseHostsCommaMode comma_mode)139 void ParseHostsWithCommaMode(const std::string& contents,
140 DnsHosts* dns_hosts,
141 ParseHostsCommaMode comma_mode) {
142 CHECK(dns_hosts);
143
144 StringPiece ip_text;
145 IPAddress ip;
146 AddressFamily family = ADDRESS_FAMILY_IPV4;
147 HostsParser parser(contents, comma_mode);
148 while (parser.Advance()) {
149 if (parser.token_is_ip()) {
150 StringPiece new_ip_text = parser.token();
151 // Some ad-blocking hosts files contain thousands of entries pointing to
152 // the same IP address (usually 127.0.0.1). Don't bother parsing the IP
153 // again if it's the same as the one above it.
154 if (new_ip_text != ip_text) {
155 IPAddress new_ip;
156 if (new_ip.AssignFromIPLiteral(parser.token())) {
157 ip_text = new_ip_text;
158 ip = new_ip;
159 family = (ip.IsIPv4()) ? ADDRESS_FAMILY_IPV4 : ADDRESS_FAMILY_IPV6;
160 } else {
161 parser.SkipRestOfLine();
162 }
163 }
164 } else {
165 url::CanonHostInfo canonicalization_info;
166 std::string canonicalized_host =
167 CanonicalizeHost(parser.token(), &canonicalization_info);
168
169 // Skip if token is invalid for host canonicalization, or if it
170 // canonicalizes as an IP address.
171 if (canonicalization_info.family != url::CanonHostInfo::NEUTRAL)
172 continue;
173
174 DnsHostsKey key(std::move(canonicalized_host), family);
175 if (!IsCanonicalizedHostCompliant(key.first))
176 continue;
177 IPAddress* mapped_ip = &(*dns_hosts)[key];
178 if (mapped_ip->empty())
179 *mapped_ip = ip;
180 // else ignore this entry (first hit counts)
181 }
182 }
183 }
184
185 } // namespace
186
ParseHostsWithCommaModeForTesting(const std::string & contents,DnsHosts * dns_hosts,ParseHostsCommaMode comma_mode)187 void ParseHostsWithCommaModeForTesting(const std::string& contents,
188 DnsHosts* dns_hosts,
189 ParseHostsCommaMode comma_mode) {
190 ParseHostsWithCommaMode(contents, dns_hosts, comma_mode);
191 }
192
ParseHosts(const std::string & contents,DnsHosts * dns_hosts)193 void ParseHosts(const std::string& contents, DnsHosts* dns_hosts) {
194 ParseHostsCommaMode comma_mode;
195 #if BUILDFLAG(IS_APPLE)
196 // Mac OS X allows commas to separate hostnames.
197 comma_mode = PARSE_HOSTS_COMMA_IS_WHITESPACE;
198 #else
199 // Linux allows commas in hostnames.
200 comma_mode = PARSE_HOSTS_COMMA_IS_TOKEN;
201 #endif
202
203 ParseHostsWithCommaMode(contents, dns_hosts, comma_mode);
204
205 // TODO(crbug.com/1377305): Remove this when we have enough data.
206 base::UmaHistogramCounts100000("Net.DNS.DnsHosts.Count", dns_hosts->size());
207
208 #if !BUILDFLAG(CRONET_BUILD)
209 // Cronet disables tracing and doesn't provide an implementation of
210 // base::trace_event::EstimateMemoryUsage for DnsHosts. Having this
211 // conditional is preferred over a fake implementation to avoid reporting fake
212 // metrics.
213 base::UmaHistogramMemoryKB(
214 "Net.DNS.DnsHosts.EstimateMemoryUsage",
215 base::trace_event::EstimateMemoryUsage(*dns_hosts));
216 #endif // !BUILDFLAG(CRONET_BUILD)
217 }
218
219 DnsHostsParser::~DnsHostsParser() = default;
220
DnsHostsFileParser(base::FilePath hosts_file_path)221 DnsHostsFileParser::DnsHostsFileParser(base::FilePath hosts_file_path)
222 : hosts_file_path_(std::move(hosts_file_path)) {}
223
224 DnsHostsFileParser::~DnsHostsFileParser() = default;
225
ParseHosts(DnsHosts * dns_hosts) const226 bool DnsHostsFileParser::ParseHosts(DnsHosts* dns_hosts) const {
227 dns_hosts->clear();
228 // Missing file indicates empty HOSTS.
229 if (!base::PathExists(hosts_file_path_))
230 return true;
231
232 int64_t size;
233 if (!base::GetFileSize(hosts_file_path_, &size))
234 return false;
235
236 // Reject HOSTS files larger than |kMaxHostsSize| bytes.
237 const int64_t kMaxHostsSize = 1 << 25; // 32MB
238
239 // TODO(crbug.com/1377305): Remove this when we have enough data.
240 base::UmaHistogramCustomCounts("Net.DNS.DnsHosts.FileSize", size, 1,
241 kMaxHostsSize * 2, 50);
242 if (size > kMaxHostsSize)
243 return false;
244
245 std::string contents;
246 if (!base::ReadFileToString(hosts_file_path_, &contents))
247 return false;
248
249 net::ParseHosts(contents, dns_hosts);
250 return true;
251 }
252
253 } // namespace net
254