xref: /aosp_15_r20/external/cronet/net/dns/dns_hosts.cc (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1 // Copyright 2012 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "net/dns/dns_hosts.h"
6 
7 #include <string>
8 #include <utility>
9 
10 #include "base/check.h"
11 #include "base/files/file_path.h"
12 #include "base/files/file_util.h"
13 #include "base/metrics/histogram_functions.h"
14 #include "base/strings/string_piece.h"
15 #include "base/strings/string_util.h"
16 #include "base/trace_event/memory_usage_estimator.h"
17 #include "build/build_config.h"
18 #include "net/base/cronet_buildflags.h"
19 #include "net/base/url_util.h"
20 #include "net/dns/dns_util.h"
21 #include "url/url_canon.h"
22 
23 using base::StringPiece;
24 
25 namespace net {
26 
27 namespace {
28 
29 // Parses the contents of a hosts file.  Returns one token (IP or hostname) at
30 // a time.  Doesn't copy anything; accepts the file as a StringPiece and
31 // returns tokens as StringPieces.
32 class HostsParser {
33  public:
HostsParser(const StringPiece & text,ParseHostsCommaMode comma_mode)34   explicit HostsParser(const StringPiece& text, ParseHostsCommaMode comma_mode)
35       : text_(text),
36         data_(text.data()),
37         end_(text.size()),
38         comma_mode_(comma_mode) {}
39 
40   HostsParser(const HostsParser&) = delete;
41   HostsParser& operator=(const HostsParser&) = delete;
42 
43   // Advances to the next token (IP or hostname).  Returns whether another
44   // token was available.  |token_is_ip| and |token| can be used to find out
45   // the type and text of the token.
Advance()46   bool Advance() {
47     bool next_is_ip = (pos_ == 0);
48     while (pos_ < end_ && pos_ != std::string::npos) {
49       switch (text_[pos_]) {
50         case ' ':
51         case '\t':
52           SkipWhitespace();
53           break;
54 
55         case '\r':
56         case '\n':
57           next_is_ip = true;
58           pos_++;
59           break;
60 
61         case '#':
62           SkipRestOfLine();
63           break;
64 
65         case ',':
66           if (comma_mode_ == PARSE_HOSTS_COMMA_IS_WHITESPACE) {
67             SkipWhitespace();
68             break;
69           }
70 
71           // If comma_mode_ is COMMA_IS_TOKEN, fall through:
72           [[fallthrough]];
73 
74         default: {
75           size_t token_start = pos_;
76           SkipToken();
77           size_t token_end = (pos_ == std::string::npos) ? end_ : pos_;
78 
79           token_ = StringPiece(data_ + token_start, token_end - token_start);
80           token_is_ip_ = next_is_ip;
81 
82           return true;
83         }
84       }
85     }
86 
87     return false;
88   }
89 
90   // Fast-forwards the parser to the next line.  Should be called if an IP
91   // address doesn't parse, to avoid wasting time tokenizing hostnames that
92   // will be ignored.
SkipRestOfLine()93   void SkipRestOfLine() { pos_ = text_.find("\n", pos_); }
94 
95   // Returns whether the last-parsed token is an IP address (true) or a
96   // hostname (false).
token_is_ip()97   bool token_is_ip() { return token_is_ip_; }
98 
99   // Returns the text of the last-parsed token as a StringPiece referencing
100   // the same underlying memory as the StringPiece passed to the constructor.
101   // Returns an empty StringPiece if no token has been parsed or the end of
102   // the input string has been reached.
token()103   const StringPiece& token() { return token_; }
104 
105  private:
SkipToken()106   void SkipToken() {
107     switch (comma_mode_) {
108       case PARSE_HOSTS_COMMA_IS_TOKEN:
109         pos_ = text_.find_first_of(" \t\n\r#", pos_);
110         break;
111       case PARSE_HOSTS_COMMA_IS_WHITESPACE:
112         pos_ = text_.find_first_of(" ,\t\n\r#", pos_);
113         break;
114     }
115   }
116 
SkipWhitespace()117   void SkipWhitespace() {
118     switch (comma_mode_) {
119       case PARSE_HOSTS_COMMA_IS_TOKEN:
120         pos_ = text_.find_first_not_of(" \t", pos_);
121         break;
122       case PARSE_HOSTS_COMMA_IS_WHITESPACE:
123         pos_ = text_.find_first_not_of(" ,\t", pos_);
124         break;
125     }
126   }
127 
128   const StringPiece text_;
129   const char* data_;
130   const size_t end_;
131 
132   size_t pos_ = 0;
133   StringPiece token_;
134   bool token_is_ip_ = false;
135 
136   const ParseHostsCommaMode comma_mode_;
137 };
138 
ParseHostsWithCommaMode(const std::string & contents,DnsHosts * dns_hosts,ParseHostsCommaMode comma_mode)139 void ParseHostsWithCommaMode(const std::string& contents,
140                              DnsHosts* dns_hosts,
141                              ParseHostsCommaMode comma_mode) {
142   CHECK(dns_hosts);
143 
144   StringPiece ip_text;
145   IPAddress ip;
146   AddressFamily family = ADDRESS_FAMILY_IPV4;
147   HostsParser parser(contents, comma_mode);
148   while (parser.Advance()) {
149     if (parser.token_is_ip()) {
150       StringPiece new_ip_text = parser.token();
151       // Some ad-blocking hosts files contain thousands of entries pointing to
152       // the same IP address (usually 127.0.0.1).  Don't bother parsing the IP
153       // again if it's the same as the one above it.
154       if (new_ip_text != ip_text) {
155         IPAddress new_ip;
156         if (new_ip.AssignFromIPLiteral(parser.token())) {
157           ip_text = new_ip_text;
158           ip = new_ip;
159           family = (ip.IsIPv4()) ? ADDRESS_FAMILY_IPV4 : ADDRESS_FAMILY_IPV6;
160         } else {
161           parser.SkipRestOfLine();
162         }
163       }
164     } else {
165       url::CanonHostInfo canonicalization_info;
166       std::string canonicalized_host =
167           CanonicalizeHost(parser.token(), &canonicalization_info);
168 
169       // Skip if token is invalid for host canonicalization, or if it
170       // canonicalizes as an IP address.
171       if (canonicalization_info.family != url::CanonHostInfo::NEUTRAL)
172         continue;
173 
174       DnsHostsKey key(std::move(canonicalized_host), family);
175       if (!IsCanonicalizedHostCompliant(key.first))
176         continue;
177       IPAddress* mapped_ip = &(*dns_hosts)[key];
178       if (mapped_ip->empty())
179         *mapped_ip = ip;
180       // else ignore this entry (first hit counts)
181     }
182   }
183 }
184 
185 }  // namespace
186 
// Test-only entry point: forwards its arguments unchanged to the file-local
// ParseHostsWithCommaMode() so that callers outside this file can choose the
// comma mode explicitly.
void ParseHostsWithCommaModeForTesting(const std::string& contents,
                                       DnsHosts* dns_hosts,
                                       ParseHostsCommaMode comma_mode) {
  ParseHostsWithCommaMode(contents, dns_hosts, comma_mode);
}
192 
ParseHosts(const std::string & contents,DnsHosts * dns_hosts)193 void ParseHosts(const std::string& contents, DnsHosts* dns_hosts) {
194   ParseHostsCommaMode comma_mode;
195 #if BUILDFLAG(IS_APPLE)
196   // Mac OS X allows commas to separate hostnames.
197   comma_mode = PARSE_HOSTS_COMMA_IS_WHITESPACE;
198 #else
199   // Linux allows commas in hostnames.
200   comma_mode = PARSE_HOSTS_COMMA_IS_TOKEN;
201 #endif
202 
203   ParseHostsWithCommaMode(contents, dns_hosts, comma_mode);
204 
205   // TODO(crbug.com/1377305): Remove this when we have enough data.
206   base::UmaHistogramCounts100000("Net.DNS.DnsHosts.Count", dns_hosts->size());
207 
208 #if !BUILDFLAG(CRONET_BUILD)
209   // Cronet disables tracing and doesn't provide an implementation of
210   // base::trace_event::EstimateMemoryUsage for DnsHosts. Having this
211   // conditional is preferred over a fake implementation to avoid reporting fake
212   // metrics.
213   base::UmaHistogramMemoryKB(
214       "Net.DNS.DnsHosts.EstimateMemoryUsage",
215       base::trace_event::EstimateMemoryUsage(*dns_hosts));
216 #endif  // !BUILDFLAG(CRONET_BUILD)
217 }
218 
// Defaulted destructor, defined out of line in this translation unit.
DnsHostsParser::~DnsHostsParser() = default;
220 
DnsHostsFileParser(base::FilePath hosts_file_path)221 DnsHostsFileParser::DnsHostsFileParser(base::FilePath hosts_file_path)
222     : hosts_file_path_(std::move(hosts_file_path)) {}
223 
// Defaulted destructor, defined out of line in this translation unit.
DnsHostsFileParser::~DnsHostsFileParser() = default;
225 
ParseHosts(DnsHosts * dns_hosts) const226 bool DnsHostsFileParser::ParseHosts(DnsHosts* dns_hosts) const {
227   dns_hosts->clear();
228   // Missing file indicates empty HOSTS.
229   if (!base::PathExists(hosts_file_path_))
230     return true;
231 
232   int64_t size;
233   if (!base::GetFileSize(hosts_file_path_, &size))
234     return false;
235 
236   // Reject HOSTS files larger than |kMaxHostsSize| bytes.
237   const int64_t kMaxHostsSize = 1 << 25;  // 32MB
238 
239   // TODO(crbug.com/1377305): Remove this when we have enough data.
240   base::UmaHistogramCustomCounts("Net.DNS.DnsHosts.FileSize", size, 1,
241                                  kMaxHostsSize * 2, 50);
242   if (size > kMaxHostsSize)
243     return false;
244 
245   std::string contents;
246   if (!base::ReadFileToString(hosts_file_path_, &contents))
247     return false;
248 
249   net::ParseHosts(contents, dns_hosts);
250   return true;
251 }
252 
253 }  // namespace net
254