xref: /aosp_15_r20/external/toybox/toys/net/wget.c (revision cf5a6c84e2b8763fc1a7db14496fd4742913b199)
1 /* wget.c - Simple downloader to get a resource file from an HTTP server
2  *
3  * Copyright 2016 Lipi C.H. Lee <[email protected]>
4  * Copyright 2021 Eric Molitor <[email protected]>
5  *
6  * Relevant sources of information
7  * -------------------------------
8  * HTTP 1.1: https://www.rfc-editor.org/rfc/rfc7230
9  * Chunked Encoding: https://www.rfc-editor.org/rfc/rfc7230#section-4.1
10  * UTF-8 Encoded Header Values: https://www.rfc-editor.org/rfc/rfc5987
11  *
12  * Test URLs
13  * ---------
14  * Chunked Encoding: https://jigsaw.w3.org/HTTP/ChunkedScript
15  * Redirect 301: https://jigsaw.w3.org/HTTP/300/301.html
16  * Redirect 302: https://jigsaw.w3.org/HTTP/300/302.html
17  * TLS 1.0: https://tls-v1-0.badssl.com:1010/
18  * TLS 1.1: https://tls-v1-1.badssl.com:1011/
19  * TLS 1.2: https://tls-v1-2.badssl.com:1012/
20  * TLS 1.3: https://tls13.1d.pw/
21  * Transfer Encoding [gzip|deflate]: https://jigsaw.w3.org/HTTP/TE/bar.txt
22  *
23  *
24  * TODO: Add support for configurable TLS versions
25  * TODO: Add support for ftp
26  * TODO: Add support for Transfer Encoding (gzip|deflate)
27  * TODO: Add support for RFC5987
28 
29 USE_WGET(NEWTOY(wget, "<1>1(max-redirect)#<0=20d(debug)O(output-document):p(post-data):", TOYFLAG_USR|TOYFLAG_BIN))
30 
31 config WGET
32   bool "wget"
33   default y
34   help
35     usage: wget [OPTIONS]... [URL]
36         --max-redirect          maximum redirections allowed
37     -d, --debug                 print lots of debugging information
38     -O, --output-document=FILE  specify output filename
39     -p, --post-data=DATA        send data in body of POST request
40 
41     examples:
42       wget http://www.example.com
43 
44 config WGET_LIBTLS
45   bool "Enable HTTPS support for wget via LibTLS"
46   default n
47   depends on WGET && !TOYBOX_LIBCRYPTO
48   help
49     Enable HTTPS support for wget by linking to LibTLS.
50     Supports using libtls, libretls or libtls-bearssl.
51 
52     Use TOYBOX_LIBCRYPTO to enable HTTPS support via OpenSSL.
53 */
54 
55 #define FOR_wget
56 #include "toys.h"
57 
58 #if CFG_WGET_LIBTLS
59 #define WGET_SSL 1
60 #include <tls.h>
61 #elif CFG_TOYBOX_LIBCRYPTO
62 #define WGET_SSL 1
63 #include <openssl/crypto.h>
64 #include <openssl/ssl.h>
65 #include <openssl/err.h>
66 #else
67 #define WGET_SSL 0
68 #endif
69 #define HTTPS (WGET_SSL && TT.https)
70 
71 
72 GLOBALS(
73   char *p, *O;        // -p data sent as the POST body, -O output filename
74   long max_redirect;  // requests still allowed; decremented before each one
75 
76   int sock, https;    // connected socket fd; nonzero when the URL is https
77   char *url;          // heap copy of the URL currently being fetched
78 #if CFG_WGET_LIBTLS
79   struct tls *tls;    // libtls client context
80 #elif CFG_TOYBOX_LIBCRYPTO
81   struct ssl_ctx_st *ctx;  // OpenSSL context created in wget_connect()
82   struct ssl_st *ssl;      // OpenSSL session bound to sock
83 #endif
84 )
85 
86 // get http info in URL
wget_info(char * url,char ** host,char ** port,char ** path)87 static void wget_info(char *url, char **host, char **port, char **path)
88 {
89   char *ss = url;
90 
91   // Must start with case insensitive http:// or https://
92   if (strncasecmp(url, "http", 4)) url = 0;
93   else {
94     url += 4;
95     if ((TT.https = WGET_SSL && toupper(*url=='s'))) url++;
96     if (!strstart(&url, "://")) url = 0;
97   }
98   if (!url) error_exit("unsupported protocol: %s", ss);
99   if ((*path = strchr(*host = url, '/'))) *((*path)++) = 0;
100   else *path = "";
101 
102   // Get port number and trim literal IPv6 addresses
103   if (**host=='[' && (ss = strchr(++*host, ']'))) {
104     *ss++ = 0;
105     *port = (*ss==':') ? ++ss : 0;
106   } else if ((*port = strchr(*host, ':'))) *((*port)++) = 0;
107   if (!*port) *port = HTTPS ? "443" : "80";
108 }
109 
// Open the connection to host:port.
//
// Plain HTTP stores the connected socket in TT.sock. For HTTPS the libtls
// build lets tls_connect() make the connection and keeps the context in
// TT.tls; the OpenSSL build connects TT.sock itself and runs the handshake
// over it, keeping state in TT.ctx/TT.ssl. TLS setup failures exit via
// error_exit().
static void wget_connect(char *host, char *port)
{
  if (!HTTPS)
    TT.sock = xconnectany(xgetaddrinfo(host, port, AF_UNSPEC, SOCK_STREAM, 0, 0));
  else {
#if CFG_WGET_LIBTLS
    struct tls_config *cfg = NULL;
    uint32_t protocols;

    // A failed constructor returns NULL, so there is no object to query for
    // an error string: report only the function name.
    if (!(TT.tls = tls_client())) error_exit("tls_client");
    if (!(cfg = tls_config_new())) error_exit("tls_config_new");
    if (tls_config_parse_protocols(&protocols, "tlsv1.2"))
      error_exit("tls_config_parse_protocols");
    if (tls_config_set_protocols(cfg, protocols))
      error_exit("tls_config_set_protocols: %s", tls_config_error(cfg));
    if (tls_configure(TT.tls, cfg))
      error_exit("tls_configure: %s", tls_error(TT.tls));
    tls_config_free(cfg);

    if (tls_connect(TT.tls, host, port))
      error_exit("tls_connect: %s", tls_error(TT.tls));
#elif CFG_TOYBOX_LIBCRYPTO
    SSL_library_init();
    OpenSSL_add_all_algorithms();
    SSL_load_error_strings();
    ERR_load_crypto_strings();

    TT.ctx = SSL_CTX_new(TLS_client_method());
    if (!TT.ctx) error_exit("SSL_CTX_new");

    TT.sock = xconnectany(xgetaddrinfo(host, port, AF_UNSPEC, SOCK_STREAM, 0, 0));

    TT.ssl = SSL_new(TT.ctx);
    if (!TT.ssl)
      error_exit("SSL_new: %s", ERR_error_string(ERR_get_error(), NULL));

    // Send the host name in the handshake (SNI).
    if (!SSL_set_tlsext_host_name(TT.ssl, host))
      error_exit("SSL_set_tlsext_host_name: %s",
                 ERR_error_string(ERR_get_error(), NULL));

    if (!SSL_set_fd(TT.ssl, TT.sock))
      error_exit("SSL_set_fd: %s", ERR_error_string(ERR_get_error(), NULL));

    // Only a return of 1 means the handshake completed; both 0 and negative
    // values are failures.
    if (SSL_connect(TT.ssl) != 1)
      error_exit("SSL_connect: %s", ERR_error_string(ERR_get_error(), NULL));

    if (FLAG(d)) printf("TLS: %s\n", SSL_get_cipher(TT.ssl));
#endif
  }
}
159 
wget_read(void * buf,size_t len)160 static size_t wget_read(void *buf, size_t len)
161 {
162   if (!HTTPS) return xread(TT.sock, buf, len);
163   else {
164     char *err = 0;
165     int ret;
166 
167 #if CFG_WGET_LIBTLS
168     if ((ret = tls_read(TT.tls, buf, len))<0) err = tls_error(TT.tls);
169 #elif CFG_TOYBOX_LIBCRYPTO
170     if ((ret = SSL_read(TT.ssl, buf, len))<0)
171       err = ERR_error_string(ERR_get_error(), 0);
172 #endif
173     if (err) error_exit("https read: %s", err);
174 
175     return ret;
176   }
177 }
178 
wget_write(void * buf,size_t len)179 static void wget_write(void *buf, size_t len)
180 {
181   if (!HTTPS) xwrite(TT.sock, buf, len);
182   else {
183     char *err = 0;
184 
185 #if CFG_WGET_LIBTLS
186     if (len != tls_write(TT.tls, buf, len)) err = tls_error(TT.tls);
187 #elif CFG_TOYBOX_LIBCRYPTO
188     if (len != SSL_write(TT.ssl, buf, len))
189       err = ERR_error_string(ERR_get_error(), 0);
190 #endif
191     if (err) error_exit("https write: %s", err);
192   }
193 }
194 
wget_close()195 static void wget_close()
196 {
197   if (TT.sock) {
198       xclose(TT.sock);
199       TT.sock = 0;
200   }
201 
202 #if CFG_WGET_LIBTLS
203   if (TT.tls) {
204     tls_close(TT.tls);
205     tls_free(TT.tls);
206     TT.tls = 0;
207   }
208 #elif CFG_TOYBOX_LIBCRYPTO
209   if (TT.ssl) {
210     SSL_shutdown(TT.ssl);
211     SSL_free(TT.ssl);
212     TT.ssl = 0;
213   }
214 
215   if (TT.ctx) {
216     SSL_CTX_free(TT.ctx);
217     TT.ctx = 0;
218   }
219 #endif
220 }
221 
// Find the header line that starts with val (compared case insensitively)
// in the NUL terminated text at header.
//
// Returns a newly allocated copy of whatever follows val on that line (up to
// but not including the line ending), or 0 when no line starts with val.
// The caller owns the returned string.
static char *wget_find_header(char *header, char *val)
{
  char *start = header;
  size_t vlen = strlen(val);

  while ((header = strcasestr(header, val))) {
    // Only accept a match at the beginning of a line, so that looking for
    // "Location: " is not satisfied by "Content-Location: ".
    if (header == start || header[-1] == '\n') {
      header += vlen;

      return xstrndup(header, strcspn(header, "\r\n"));
    }
    header++;
  }

  return 0;
}
229 
wget_main(void)230 void wget_main(void)
231 {
232   long status = 0;
233   size_t len, c_len = 0;
234   int fd = 0, ii;
235   char *body, *index, *host, *port, *path = 0, *chunked, *ss;
236   char agent[] = "toybox wget/" TOYBOX_VERSION;
237 
238   TT.url = escape_url(*toys.optargs, 0);
239 
240   // Ask server for URL, following redirects until success
241   while (status != 200) {
242     if (!TT.max_redirect--) error_exit("Too many redirects");
243 
244     // Connect and write request
245     wget_info(TT.url, &host, &port, &path);
246     if (TT.p) sprintf(toybuf, "Content-Length: %ld\r\n", (long)strlen(TT.p));
247     ss = xmprintf("%s /%s HTTP/1.1\r\nHost: %s\r\nUser-Agent: %s\r\n"
248                   "Connection: close\r\n%s\r\n%s", FLAG(p) ? "POST" : "GET",
249                   path, host, agent, TT.p ? toybuf : "", TT.p ? : "");
250     if (FLAG(d)) printf("--- Request\n%s", ss);
251     wget_connect(host, port);
252     wget_write(ss, strlen(ss));
253     free(ss);
254 
255     // Read HTTP response into toybuf (probably with some body at end)
256     for (index = toybuf;
257       (len = wget_read(index, sizeof(toybuf)-(index-toybuf)))>0; index += len);
258 
259     // Split response into header and body, and null terminate header.
260     // (RFC7230 says header cannot contain NUL.)
261     if (!(body = memmem(ss = toybuf, index-toybuf, "\r\n\r\n", 4)))
262       error_exit("response header too large");
263     *body = 0;
264     body += 4;
265     len = index-body;
266     if (FLAG(d)) printf("--- Response\n%s\n\n", toybuf);
267 
268     status = strstart(&ss, "HTTP/1.1 ") ? strtol(ss, 0, 10) : 0;
269     if ((status == 301) || (status == 302)) {
270       if (!(ss = wget_find_header(toybuf, "Location: ")))
271         error_exit("bad redirect");
272       free(TT.url);
273       TT.url = ss;
274       wget_close();
275     } else if (status != 200) error_exit("response %ld", status);
276   }
277 
278   // Open output file
279   if (TT.O && !strcmp(TT.O, "-")) fd = 1;
280   else if (!TT.O) {
281     ss = wget_find_header(toybuf, "Content-Disposition: attachment; filename=");
282     if (ss) {
283       unescape_url(ss, 1);
284       for (ii = strlen(ss); ii; ii--) {
285         if (ss[ii]=='/') memmove(ss, ss+ii, strlen(ss+ii));
286         break;
287       }
288       if (!*ss) {
289         free(ss);
290         ss = 0;
291       }
292     }
293     if (!ss) {
294       path = 0;
295       for (ii = 0, ss = *toys.optargs; *ss && *ss!='?' && *ss!='#'; ss++)
296         if (*ss=='/' && ++ii>2) path = ss+1;
297       ss = (path && ss>path) ? xstrndup(path, ss-path) : 0;
298       // TODO: handle %20 style escapes
299     }
300     if (!ss) ss = "index.html";
301     if (!access((TT.O = ss), F_OK)) error_exit("%s already exists", TT.O);
302   }
303   // TODO: don't allow header/basename to write to stdout
304   if (!fd) fd = xcreate(TT.O, (O_WRONLY|O_CREAT|O_TRUNC), 0644);
305 
306   // If chunked we offset the first buffer by 2 character, meaning it is
307   // pointing at half of the header boundary, aka '\r\n'. This simplifies
308   // parsing of the first c_len length by allowing the do while loop to fall
309   // through on the first iteration and parse the first c_len size.
310   chunked = wget_find_header(toybuf, "transfer-encoding: chunked");
311   if (chunked) memmove(toybuf, body-2, len += 2);
312   else memmove(toybuf, body, len);
313 
314   // len is the size remaining in toybuf
315   // c_len is the size of the remaining bytes in the current chunk
316   do {
317     if (chunked) {
318       if (c_len > 0) { // We have an incomplete c_len to write
319         if (len <= c_len) { // Buffer is less than the c_len so full write
320           xwrite(fd, toybuf, len);
321           c_len = c_len - len;
322           len = 0;
323         } else { // Buffer is larger than the c_len so partial write
324           xwrite(fd, toybuf, c_len);
325           len = len - c_len;
326           memmove(toybuf, toybuf + c_len, len);
327           c_len = 0;
328         }
329       }
330 
331       // If len is less than 2 we can't validate the chunk boundary so fall
332       // through and go read more into toybuf.
333       if (!c_len && (len > 2)) {
334         char *c;
335         if (strncmp(toybuf, "\r\n", 2) != 0) error_exit("chunk boundary");
336 
337         // If we can't find the end of the new chunk signature fall through and
338         // read more into toybuf.
339         c = memmem(toybuf + 2, len - 2, "\r\n",2);
340         if (c) {
341           c_len = strtol(toybuf + 2, NULL, 16);
342           if (!c_len) break; // A c_len of zero means we are complete
343           len = len - (c - toybuf) - 2;
344           memmove(toybuf, c + 2, len);
345         }
346       }
347 
348       if (len == sizeof(toybuf)) error_exit("chunk overflow");
349     } else {
350       xwrite(fd, toybuf, len);
351       len = 0;
352     }
353   } while ((len += wget_read(toybuf + len, sizeof(toybuf) - len)) > 0);
354 
355   wget_close();
356   free(TT.url);
357 }
358