xref: /freebsd/lib/libfetch/http.c (revision 71fe318b852b8dfb3e799cb12ef184750f7f8eac)
1 /*-
2  * Copyright (c) 2000 Dag-Erling Coïdan Smørgrav
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer
10  *    in this position and unchanged.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  * 3. The name of the author may not be used to endorse or promote products
15  *    derived from this software without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 /*
33  * The following copyright applies to the base64 code:
34  *
35  *-
36  * Copyright 1997 Massachusetts Institute of Technology
37  *
38  * Permission to use, copy, modify, and distribute this software and
39  * its documentation for any purpose and without fee is hereby
40  * granted, provided that both the above copyright notice and this
41  * permission notice appear in all copies, that both the above
42  * copyright notice and this permission notice appear in all
43  * supporting documentation, and that the name of M.I.T. not be used
44  * in advertising or publicity pertaining to distribution of the
45  * software without specific, written prior permission.  M.I.T. makes
46  * no representations about the suitability of this software for any
47  * purpose.  It is provided "as is" without express or implied
48  * warranty.
49  *
50  * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''.  M.I.T. DISCLAIMS
51  * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE,
52  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
53  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT
54  * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
55  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
56  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
57  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
58  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
59  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
60  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
61  * SUCH DAMAGE.
62  */
63 
64 #include <sys/param.h>
65 #include <sys/socket.h>
66 
67 #include <ctype.h>
68 #include <err.h>
69 #include <errno.h>
70 #include <locale.h>
71 #include <netdb.h>
72 #include <stdarg.h>
73 #include <stdio.h>
74 #include <stdlib.h>
75 #include <string.h>
76 #include <time.h>
77 #include <unistd.h>
78 
79 #include "fetch.h"
80 #include "common.h"
81 #include "httperr.h"
82 
/* Maximum number of redirects to follow */
#define MAX_REDIRECT 5

/* Symbolic names for reply codes we care about */
#define HTTP_OK			200
#define HTTP_PARTIAL		206
#define HTTP_MOVED_PERM		301
#define HTTP_MOVED_TEMP		302
#define HTTP_SEE_OTHER		303
#define HTTP_NEED_AUTH		401
#define HTTP_NEED_PROXY_AUTH	407
/* not a real status: returned when the status line cannot be parsed */
#define HTTP_PROTOCOL_ERROR	999

/* True if xyz is one of the redirect codes listed above */
#define HTTP_REDIRECT(xyz) ((xyz) == HTTP_MOVED_PERM \
			    || (xyz) == HTTP_MOVED_TEMP \
			    || (xyz) == HTTP_SEE_OTHER)

/*
 * True for every 4xx and 5xx status.  The range is inclusive on both
 * ends so that 400 (Bad Request) and 599 are classified as errors too.
 */
#define HTTP_ERROR(xyz) ((xyz) >= 400 && (xyz) <= 599)
101 
102 
103 /*****************************************************************************
104  * I/O functions for decoding chunked streams
105  */
106 
107 struct httpio
108 {
109 	conn_t		*conn;		/* connection */
110 	int		 chunked;	/* chunked mode */
111 	char		*buf;		/* chunk buffer */
112 	size_t		 bufsize;	/* size of chunk buffer */
113 	ssize_t		 buflen;	/* amount of data currently in buffer */
114 	int		 bufpos;	/* current read offset in buffer */
115 	int		 eof;		/* end-of-file flag */
116 	int		 error;		/* error flag */
117 	size_t		 chunksize;	/* remaining size of current chunk */
118 #ifndef NDEBUG
119 	size_t		 total;
120 #endif
121 };
122 
123 /*
124  * Get next chunk header
125  */
126 static int
127 _http_new_chunk(struct httpio *io)
128 {
129 	char *p;
130 
131 	if (_fetch_getln(io->conn) == -1)
132 		return (-1);
133 
134 	if (io->conn->buflen < 2 || !ishexnumber(*io->conn->buf))
135 		return (-1);
136 
137 	for (p = io->conn->buf; *p && !isspace(*p); ++p) {
138 		if (*p == ';')
139 			break;
140 		if (!ishexnumber(*p))
141 			return (-1);
142 		if (isdigit(*p)) {
143 			io->chunksize = io->chunksize * 16 +
144 			    *p - '0';
145 		} else {
146 			io->chunksize = io->chunksize * 16 +
147 			    10 + tolower(*p) - 'a';
148 		}
149 	}
150 
151 #ifndef NDEBUG
152 	if (fetchDebug) {
153 		io->total += io->chunksize;
154 		if (io->chunksize == 0)
155 			fprintf(stderr, "_http_fillbuf(): "
156 			    "end of last chunk\n");
157 		else
158 			fprintf(stderr, "_http_fillbuf(): "
159 			    "new chunk: %lu (%lu)\n",
160 			    (unsigned long)io->chunksize, (unsigned long)io->total);
161 	}
162 #endif
163 
164 	return (io->chunksize);
165 }
166 
167 /*
168  * Grow the input buffer to at least len bytes
169  */
170 static inline int
171 _http_growbuf(struct httpio *io, size_t len)
172 {
173 	char *tmp;
174 
175 	if (io->bufsize >= len)
176 		return (0);
177 
178 	if ((tmp = realloc(io->buf, len)) == NULL)
179 		return (-1);
180 	io->buf = tmp;
181 	io->bufsize = len;
182 	return (0);
183 }
184 
185 /*
186  * Fill the input buffer, do chunk decoding on the fly
187  */
188 static int
189 _http_fillbuf(struct httpio *io, size_t len)
190 {
191 	if (io->error)
192 		return (-1);
193 	if (io->eof)
194 		return (0);
195 
196 	if (io->chunked == 0) {
197 		if (_http_growbuf(io, len) == -1)
198 			return (-1);
199 		if ((io->buflen = _fetch_read(io->conn, io->buf, len)) == -1)
200 			return (-1);
201 		io->bufpos = 0;
202 		return (io->buflen);
203 	}
204 
205 	if (io->chunksize == 0) {
206 		switch (_http_new_chunk(io)) {
207 		case -1:
208 			io->error = 1;
209 			return (-1);
210 		case 0:
211 			io->eof = 1;
212 			return (0);
213 		}
214 	}
215 
216 	if (len > io->chunksize)
217 		len = io->chunksize;
218 	if (_http_growbuf(io, len) == -1)
219 		return (-1);
220 	if ((io->buflen = _fetch_read(io->conn, io->buf, len)) == -1)
221 		return (-1);
222 	io->chunksize -= io->buflen;
223 
224 	if (io->chunksize == 0) {
225 		char endl[2];
226 
227 		if (_fetch_read(io->conn, endl, 2) != 2 ||
228 		    endl[0] != '\r' || endl[1] != '\n')
229 			return (-1);
230 	}
231 
232 	io->bufpos = 0;
233 
234 	return (io->buflen);
235 }
236 
237 /*
238  * Read function
239  */
240 static int
241 _http_readfn(void *v, char *buf, int len)
242 {
243 	struct httpio *io = (struct httpio *)v;
244 	int l, pos;
245 
246 	if (io->error)
247 		return (-1);
248 	if (io->eof)
249 		return (0);
250 
251 	for (pos = 0; len > 0; pos += l, len -= l) {
252 		/* empty buffer */
253 		if (!io->buf || io->bufpos == io->buflen)
254 			if (_http_fillbuf(io, len) < 1)
255 				break;
256 		l = io->buflen - io->bufpos;
257 		if (len < l)
258 			l = len;
259 		bcopy(io->buf + io->bufpos, buf + pos, l);
260 		io->bufpos += l;
261 	}
262 
263 	if (!pos && io->error)
264 		return (-1);
265 	return (pos);
266 }
267 
268 /*
269  * Write function
270  */
271 static int
272 _http_writefn(void *v, const char *buf, int len)
273 {
274 	struct httpio *io = (struct httpio *)v;
275 
276 	return (_fetch_write(io->conn, buf, len));
277 }
278 
279 /*
280  * Close function
281  */
282 static int
283 _http_closefn(void *v)
284 {
285 	struct httpio *io = (struct httpio *)v;
286 	int r;
287 
288 	r = _fetch_close(io->conn);
289 	if (io->buf)
290 		free(io->buf);
291 	free(io);
292 	return (r);
293 }
294 
295 /*
296  * Wrap a file descriptor up
297  */
298 static FILE *
299 _http_funopen(conn_t *conn, int chunked)
300 {
301 	struct httpio *io;
302 	FILE *f;
303 
304 	if ((io = calloc(1, sizeof *io)) == NULL) {
305 		_fetch_syserr();
306 		return (NULL);
307 	}
308 	io->conn = conn;
309 	io->chunked = chunked;
310 	f = funopen(io, _http_readfn, _http_writefn, NULL, _http_closefn);
311 	if (f == NULL) {
312 		_fetch_syserr();
313 		free(io);
314 		return (NULL);
315 	}
316 	return (f);
317 }
318 
319 
320 /*****************************************************************************
321  * Helper functions for talking to the server and parsing its replies
322  */
323 
/*
 * Header types: the result of classifying one reply header line.
 * Values below hdr_end are failures, values above it are headers.
 */
typedef enum {
	hdr_syserror		= -2,	/* reading the line failed */
	hdr_error		= -1,	/* reported as a protocol error */
	hdr_end			=  0,	/* empty line: end of the headers */
	hdr_unknown		=  1,	/* a header we have no use for */
	hdr_content_length	=  2,
	hdr_content_range	=  3,
	hdr_last_modified	=  4,
	hdr_location		=  5,
	hdr_transfer_encoding	=  6,
	hdr_www_authenticate	=  7
} hdr_t;
337 
338 /* Names of interesting headers */
339 static struct {
340 	hdr_t		 num;
341 	const char	*name;
342 } hdr_names[] = {
343 	{ hdr_content_length,		"Content-Length" },
344 	{ hdr_content_range,		"Content-Range" },
345 	{ hdr_last_modified,		"Last-Modified" },
346 	{ hdr_location,			"Location" },
347 	{ hdr_transfer_encoding,	"Transfer-Encoding" },
348 	{ hdr_www_authenticate,		"WWW-Authenticate" },
349 	{ hdr_unknown,			NULL },
350 };
351 
352 /*
353  * Send a formatted line; optionally echo to terminal
354  */
355 static int
356 _http_cmd(conn_t *conn, const char *fmt, ...)
357 {
358 	va_list ap;
359 	size_t len;
360 	char *msg;
361 	int r;
362 
363 	va_start(ap, fmt);
364 	len = vasprintf(&msg, fmt, ap);
365 	va_end(ap);
366 
367 	if (msg == NULL) {
368 		errno = ENOMEM;
369 		_fetch_syserr();
370 		return (-1);
371 	}
372 
373 	r = _fetch_putln(conn, msg, len);
374 	free(msg);
375 
376 	if (r == -1) {
377 		_fetch_syserr();
378 		return (-1);
379 	}
380 
381 	return (0);
382 }
383 
384 /*
385  * Get and parse status line
386  */
387 static int
388 _http_get_reply(conn_t *conn)
389 {
390 	char *p;
391 
392 	if (_fetch_getln(conn) == -1)
393 		return (-1);
394 	/*
395 	 * A valid status line looks like "HTTP/m.n xyz reason" where m
396 	 * and n are the major and minor protocol version numbers and xyz
397 	 * is the reply code.
398 	 * Unfortunately, there are servers out there (NCSA 1.5.1, to name
399 	 * just one) that do not send a version number, so we can't rely
400 	 * on finding one, but if we do, insist on it being 1.0 or 1.1.
401 	 * We don't care about the reason phrase.
402 	 */
403 	if (strncmp(conn->buf, "HTTP", 4) != 0)
404 		return (HTTP_PROTOCOL_ERROR);
405 	p = conn->buf + 4;
406 	if (*p == '/') {
407 		if (p[1] != '1' || p[2] != '.' || (p[3] != '0' && p[3] != '1'))
408 			return (HTTP_PROTOCOL_ERROR);
409 		p += 4;
410 	}
411 	if (*p != ' ' || !isdigit(p[1]) || !isdigit(p[2]) || !isdigit(p[3]))
412 		return (HTTP_PROTOCOL_ERROR);
413 
414 	conn->err = (p[1] - '0') * 100 + (p[2] - '0') * 10 + (p[3] - '0');
415 	return (conn->err);
416 }
417 
/*
 * Check a header; if the type matches the given string, return a pointer
 * to the beginning of the value.
 *
 * str is the header name to look for (without the colon), hdr the
 * complete header line.  The name is compared case-insensitively and
 * must be followed immediately by a colon.  Returns a pointer to the
 * first non-whitespace character after the colon (possibly the
 * terminating NUL), or NULL if the line is a different header.
 */
static const char *
_http_match(const char *str, const char *hdr)
{
	/*
	 * Advance only past characters that actually match; stepping
	 * over a mismatching final character would make "Locatiox:"
	 * look like "Location:".
	 */
	while (*str != '\0' && *hdr != '\0' &&
	    tolower((unsigned char)*str) == tolower((unsigned char)*hdr)) {
		++str;
		++hdr;
	}
	if (*str != '\0' || *hdr != ':')
		return (NULL);
	/* skip the colon and any whitespace that follows it */
	do {
		++hdr;
	} while (*hdr != '\0' && isspace((unsigned char)*hdr));
	return (hdr);
}
433 
434 /*
435  * Get the next header and return the appropriate symbolic code.
436  */
437 static hdr_t
438 _http_next_header(conn_t *conn, const char **p)
439 {
440 	int i;
441 
442 	if (_fetch_getln(conn) == -1)
443 		return (hdr_syserror);
444 	while (conn->buflen && isspace(conn->buf[conn->buflen - 1]))
445 		conn->buflen--;
446 	conn->buf[conn->buflen] = '\0';
447 	if (conn->buflen == 0)
448 		return (hdr_end);
449 	/*
450 	 * We could check for malformed headers but we don't really care.
451 	 * A valid header starts with a token immediately followed by a
452 	 * colon; a token is any sequence of non-control, non-whitespace
453 	 * characters except "()<>@,;:\\\"{}".
454 	 */
455 	for (i = 0; hdr_names[i].num != hdr_unknown; i++)
456 		if ((*p = _http_match(hdr_names[i].name, conn->buf)) != NULL)
457 			return (hdr_names[i].num);
458 	return (hdr_unknown);
459 }
460 
/*
 * Parse a last-modified header
 *
 * Only the "Sun, 06 Nov 1994 08:49:37 GMT" form is understood.  The
 * LC_TIME locale is switched to "C" while parsing so that day and month
 * names are matched in English, and restored afterwards.
 *
 * Returns 0 and stores the time in *mtime on success, -1 if the value
 * cannot be parsed.
 */
static int
_http_parse_mtime(const char *p, time_t *mtime)
{
	char locale[64], *r;
	const char *cur;
	struct tm tm;

	/* snprintf(), unlike strncpy(), always terminates the copy */
	cur = setlocale(LC_TIME, NULL);
	snprintf(locale, sizeof locale, "%s", cur != NULL ? cur : "C");
	setlocale(LC_TIME, "C");
	/* give the fields strptime() does not set a defined value */
	memset(&tm, 0, sizeof tm);
	r = strptime(p, "%a, %d %b %Y %H:%M:%S GMT", &tm);
	/* XXX should add support for date-2 and date-3 */
	setlocale(LC_TIME, locale);
	if (r == NULL)
		return (-1);
	DEBUG(fprintf(stderr, "last modified: [%04d-%02d-%02d "
		  "%02d:%02d:%02d]\n",
		  tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday,
		  tm.tm_hour, tm.tm_min, tm.tm_sec));
	*mtime = timegm(&tm);
	return (0);
}
484 
/*
 * Parse a content-length header
 *
 * The value must consist of one or more decimal digits and nothing
 * else.  Returns 0 and stores the length in *length on success; returns
 * -1 and leaves *length untouched if the value is empty, contains other
 * characters, or does not fit in an off_t.
 */
static int
_http_parse_length(const char *p, off_t *length)
{
	unsigned long long acc;
	unsigned int digit;
	off_t len;

	/* an empty value is malformed, not a length of zero */
	if (!isdigit((unsigned char)*p))
		return (-1);
	/* accumulate unsigned so that overflow can be detected safely */
	for (acc = 0; isdigit((unsigned char)*p); ++p) {
		digit = (unsigned int)(*p - '0');
		if (acc > (~0ULL - digit) / 10)
			return (-1);
		acc = acc * 10 + digit;
	}
	if (*p)
		return (-1);
	/* reject values that do not survive the conversion to off_t */
	len = (off_t)acc;
	if (len < 0 || (unsigned long long)len != acc)
		return (-1);
	DEBUG(fprintf(stderr, "content length: [%lld]\n",
	    (long long)len));
	*length = len;
	return (0);
}
502 
/*
 * Parse a non-empty run of decimal digits starting at p.
 *
 * Returns a pointer to the first character after the digits and stores
 * the value in *val, or returns NULL if there is no digit at p or the
 * value does not fit in an off_t.
 */
static const char *
_http_parse_offset(const char *p, off_t *val)
{
	unsigned long long acc;
	unsigned int digit;
	off_t v;

	if (!isdigit((unsigned char)*p))
		return (NULL);
	/* accumulate unsigned so that overflow can be detected safely */
	for (acc = 0; isdigit((unsigned char)*p); ++p) {
		digit = (unsigned int)(*p - '0');
		if (acc > (~0ULL - digit) / 10)
			return (NULL);
		acc = acc * 10 + digit;
	}
	v = (off_t)acc;
	if (v < 0 || (unsigned long long)v != acc)
		return (NULL);
	*val = v;
	return (p);
}

/*
 * Parse a content-range header
 *
 * Accepts "bytes first-last/total" where all three numbers are present,
 * first <= last and total is at least the number of bytes in the range.
 * On success returns 0 and stores first in *offset, the number of bytes
 * in the range in *length and total in *size; otherwise returns -1 and
 * stores nothing.
 */
static int
_http_parse_range(const char *p, off_t *offset, off_t *length, off_t *size)
{
	off_t first, last, len;

	if (strncasecmp(p, "bytes ", 6) != 0)
		return (-1);
	if ((p = _http_parse_offset(p + 6, &first)) == NULL || *p != '-')
		return (-1);
	if ((p = _http_parse_offset(p + 1, &last)) == NULL ||
	    first > last || *p != '/')
		return (-1);
	if ((p = _http_parse_offset(p + 1, &len)) == NULL || *p != '\0')
		return (-1);
	/*
	 * Same test as "len < last - first + 1", written so that the
	 * addition cannot overflow.
	 */
	if (len <= last - first)
		return (-1);
	DEBUG(fprintf(stderr, "content range: [%lld-%lld/%lld]\n",
	    (long long)first, (long long)last, (long long)len));
	*offset = first;
	*length = last - first + 1;
	*size = len;
	return (0);
}
532 
533 
534 /*****************************************************************************
535  * Helper functions for authorization
536  */
537 
/*
 * Base64 encoding
 *
 * Encodes the NUL-terminated string src.  Returns a newly allocated,
 * NUL-terminated string that the caller must free(), or NULL if the
 * allocation fails.
 */
static char *
_http_base64(const char *src)
{
	static const char base64[] =
	    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
	    "abcdefghijklmnopqrstuvwxyz"
	    "0123456789+/";
	/* read the input as unsigned bytes so values >= 0x80 encode right */
	const unsigned char *in = (const unsigned char *)src;
	char *str, *dst;
	size_t l;
	unsigned int t;

	l = strlen(src);
	/* four characters per (partial) group of three bytes, plus the NUL */
	if ((str = malloc(((l + 2) / 3) * 4 + 1)) == NULL)
		return (NULL);
	dst = str;

	while (l >= 3) {
		t = ((unsigned int)in[0] << 16) |
		    ((unsigned int)in[1] << 8) | in[2];
		dst[0] = base64[(t >> 18) & 0x3f];
		dst[1] = base64[(t >> 12) & 0x3f];
		dst[2] = base64[(t >> 6) & 0x3f];
		dst[3] = base64[(t >> 0) & 0x3f];
		in += 3; l -= 3;
		dst += 4;
	}

	/* encode the remaining one or two bytes, padded with '=' */
	switch (l) {
	case 2:
		t = ((unsigned int)in[0] << 16) | ((unsigned int)in[1] << 8);
		dst[0] = base64[(t >> 18) & 0x3f];
		dst[1] = base64[(t >> 12) & 0x3f];
		dst[2] = base64[(t >> 6) & 0x3f];
		dst[3] = '=';
		dst += 4;
		break;
	case 1:
		t = (unsigned int)in[0] << 16;
		dst[0] = base64[(t >> 18) & 0x3f];
		dst[1] = base64[(t >> 12) & 0x3f];
		dst[2] = dst[3] = '=';
		dst += 4;
		break;
	default:
		break;
	}

	*dst = 0;
	return (str);
}
593 
594 /*
595  * Encode username and password
596  */
597 static int
598 _http_basic_auth(conn_t *conn, const char *hdr, const char *usr, const char *pwd)
599 {
600 	char *upw, *auth;
601 	int r;
602 
603 	DEBUG(fprintf(stderr, "usr: [%s]\n", usr));
604 	DEBUG(fprintf(stderr, "pwd: [%s]\n", pwd));
605 	if (asprintf(&upw, "%s:%s", usr, pwd) == -1)
606 		return (-1);
607 	auth = _http_base64(upw);
608 	free(upw);
609 	if (auth == NULL)
610 		return (-1);
611 	r = _http_cmd(conn, "%s: Basic %s", hdr, auth);
612 	free(auth);
613 	return (r);
614 }
615 
616 /*
617  * Send an authorization header
618  */
619 static int
620 _http_authorize(conn_t *conn, const char *hdr, const char *p)
621 {
622 	/* basic authorization */
623 	if (strncasecmp(p, "basic:", 6) == 0) {
624 		char *user, *pwd, *str;
625 		int r;
626 
627 		/* skip realm */
628 		for (p += 6; *p && *p != ':'; ++p)
629 			/* nothing */ ;
630 		if (!*p || strchr(++p, ':') == NULL)
631 			return (-1);
632 		if ((str = strdup(p)) == NULL)
633 			return (-1); /* XXX */
634 		user = str;
635 		pwd = strchr(str, ':');
636 		*pwd++ = '\0';
637 		r = _http_basic_auth(conn, hdr, user, pwd);
638 		free(str);
639 		return (r);
640 	}
641 	return (-1);
642 }
643 
644 
645 /*****************************************************************************
646  * Helper functions for connecting to a server or proxy
647  */
648 
649 /*
650  * Connect to the correct HTTP server or proxy.
651  */
652 static conn_t *
653 _http_connect(struct url *URL, struct url *purl, const char *flags)
654 {
655 	conn_t *conn;
656 	int verbose;
657 	int af;
658 
659 #ifdef INET6
660 	af = AF_UNSPEC;
661 #else
662 	af = AF_INET;
663 #endif
664 
665 	verbose = CHECK_FLAG('v');
666 	if (CHECK_FLAG('4'))
667 		af = AF_INET;
668 #ifdef INET6
669 	else if (CHECK_FLAG('6'))
670 		af = AF_INET6;
671 #endif
672 
673 	if (purl && strcasecmp(URL->scheme, SCHEME_HTTPS) != 0) {
674 		URL = purl;
675 	} else if (strcasecmp(URL->scheme, SCHEME_FTP) == 0) {
676 		/* can't talk http to an ftp server */
677 		/* XXX should set an error code */
678 		return (NULL);
679 	}
680 
681 	if ((conn = _fetch_connect(URL->host, URL->port, af, verbose)) == NULL)
682 		/* _fetch_connect() has already set an error code */
683 		return (NULL);
684 	if (strcasecmp(URL->scheme, SCHEME_HTTPS) == 0 &&
685 	    _fetch_ssl(conn, verbose) == -1) {
686 		_fetch_close(conn);
687 		/* grrr */
688 		errno = EAUTH;
689 		_fetch_syserr();
690 		return (NULL);
691 	}
692 	return (conn);
693 }
694 
695 static struct url *
696 _http_get_proxy(void)
697 {
698 	struct url *purl;
699 	char *p;
700 
701 	if (((p = getenv("HTTP_PROXY")) || (p = getenv("http_proxy"))) &&
702 	    (purl = fetchParseURL(p))) {
703 		if (!*purl->scheme)
704 			strcpy(purl->scheme, SCHEME_HTTP);
705 		if (!purl->port)
706 			purl->port = _fetch_default_proxy_port(purl->scheme);
707 		if (strcasecmp(purl->scheme, SCHEME_HTTP) == 0)
708 			return (purl);
709 		fetchFreeURL(purl);
710 	}
711 	return (NULL);
712 }
713 
/*
 * Copy an HTML document from in to out with tags and comments removed.
 *
 * The input is processed line by line; trailing whitespace is dropped
 * and every input line produces one output line.  Whether we are inside
 * a tag or a comment is remembered across lines.
 */
static void
_http_print_html(FILE *out, FILE *in)
{
	size_t len;
	char *line, *p, *q;
	int comment, tag;

	comment = tag = 0;
	while ((line = fgetln(in, &len)) != NULL) {
		while (len && isspace((unsigned char)line[len - 1]))
			--len;
		/* p: start of text not yet written, q: scan position */
		for (p = q = line; q < line + len; ++q) {
			if (comment && *q == '-') {
				/*
				 * The line from fgetln() is not NUL-terminated
				 * and the marker is rarely the last thing on
				 * it, so compare exactly three characters.
				 */
				if (q + 2 < line + len &&
				    strncmp(q, "-->", 3) == 0) {
					tag = comment = 0;
					q += 2;
					/* text resumes after the comment */
					p = q + 1;
				}
			} else if (tag && !comment && *q == '>') {
				p = q + 1;
				tag = 0;
			} else if (!tag && *q == '<') {
				if (q > p)
					fwrite(p, q - p, 1, out);
				tag = 1;
				if (q + 3 < line + len &&
				    strncmp(q, "<!--", 4) == 0) {
					comment = 1;
					q += 3;
				}
			}
		}
		if (!tag && q > p)
			fwrite(p, q - p, 1, out);
		fputc('\n', out);
	}
}
751 
752 
753 /*****************************************************************************
754  * Core
755  */
756 
757 /*
758  * Send a request and process the reply
759  *
760  * XXX This function is way too long, the do..while loop should be split
761  * XXX off into a separate function.
762  */
763 FILE *
764 _http_request(struct url *URL, const char *op, struct url_stat *us,
765     struct url *purl, const char *flags)
766 {
767 	conn_t *conn;
768 	struct url *url, *new;
769 	int chunked, direct, need_auth, noredirect, verbose;
770 	int e, i, n;
771 	off_t offset, clength, length, size;
772 	time_t mtime;
773 	const char *p;
774 	FILE *f;
775 	hdr_t h;
776 	char *host;
777 #ifdef INET6
778 	char hbuf[MAXHOSTNAMELEN + 1];
779 #endif
780 
781 	direct = CHECK_FLAG('d');
782 	noredirect = CHECK_FLAG('A');
783 	verbose = CHECK_FLAG('v');
784 
785 	if (direct && purl) {
786 		fetchFreeURL(purl);
787 		purl = NULL;
788 	}
789 
790 	/* try the provided URL first */
791 	url = URL;
792 
793 	/* if the A flag is set, we only get one try */
794 	n = noredirect ? 1 : MAX_REDIRECT;
795 	i = 0;
796 
797 	e = HTTP_PROTOCOL_ERROR;
798 	need_auth = 0;
799 	do {
800 		new = NULL;
801 		chunked = 0;
802 		offset = 0;
803 		clength = -1;
804 		length = -1;
805 		size = -1;
806 		mtime = 0;
807 
808 		/* check port */
809 		if (!url->port)
810 			url->port = _fetch_default_port(url->scheme);
811 
812 		/* were we redirected to an FTP URL? */
813 		if (purl == NULL && strcmp(url->scheme, SCHEME_FTP) == 0) {
814 			if (strcmp(op, "GET") == 0)
815 				return (_ftp_request(url, "RETR", us, purl, flags));
816 			else if (strcmp(op, "HEAD") == 0)
817 				return (_ftp_request(url, "STAT", us, purl, flags));
818 		}
819 
820 		/* connect to server or proxy */
821 		if ((conn = _http_connect(url, purl, flags)) == NULL)
822 			goto ouch;
823 
824 		host = url->host;
825 #ifdef INET6
826 		if (strchr(url->host, ':')) {
827 			snprintf(hbuf, sizeof(hbuf), "[%s]", url->host);
828 			host = hbuf;
829 		}
830 #endif
831 
832 		/* send request */
833 		if (verbose)
834 			_fetch_info("requesting %s://%s:%d%s",
835 			    url->scheme, host, url->port, url->doc);
836 		if (purl) {
837 			_http_cmd(conn, "%s %s://%s:%d%s HTTP/1.1",
838 			    op, url->scheme, host, url->port, url->doc);
839 		} else {
840 			_http_cmd(conn, "%s %s HTTP/1.1",
841 			    op, url->doc);
842 		}
843 
844 		/* virtual host */
845 		if (url->port == _fetch_default_port(url->scheme))
846 			_http_cmd(conn, "Host: %s", host);
847 		else
848 			_http_cmd(conn, "Host: %s:%d", host, url->port);
849 
850 		/* proxy authorization */
851 		if (purl) {
852 			if (*purl->user || *purl->pwd)
853 				_http_basic_auth(conn, "Proxy-Authorization",
854 				    purl->user, purl->pwd);
855 			else if ((p = getenv("HTTP_PROXY_AUTH")) != NULL && *p != '\0')
856 				_http_authorize(conn, "Proxy-Authorization", p);
857 		}
858 
859 		/* server authorization */
860 		if (need_auth || *url->user || *url->pwd) {
861 			if (*url->user || *url->pwd)
862 				_http_basic_auth(conn, "Authorization", url->user, url->pwd);
863 			else if ((p = getenv("HTTP_AUTH")) != NULL && *p != '\0')
864 				_http_authorize(conn, "Authorization", p);
865 			else if (fetchAuthMethod && fetchAuthMethod(url) == 0) {
866 				_http_basic_auth(conn, "Authorization", url->user, url->pwd);
867 			} else {
868 				_http_seterr(HTTP_NEED_AUTH);
869 				goto ouch;
870 			}
871 		}
872 
873 		/* other headers */
874 		if ((p = getenv("HTTP_USER_AGENT")) != NULL && *p != '\0')
875 			_http_cmd(conn, "User-Agent: %s", p);
876 		else
877 			_http_cmd(conn, "User-Agent: %s " _LIBFETCH_VER, getprogname());
878 		if (url->offset)
879 			_http_cmd(conn, "Range: bytes=%lld-", (long long)url->offset);
880 		_http_cmd(conn, "Connection: close");
881 		_http_cmd(conn, "");
882 
883 		/* get reply */
884 		switch (_http_get_reply(conn)) {
885 		case HTTP_OK:
886 		case HTTP_PARTIAL:
887 			/* fine */
888 			break;
889 		case HTTP_MOVED_PERM:
890 		case HTTP_MOVED_TEMP:
891 		case HTTP_SEE_OTHER:
892 			/*
893 			 * Not so fine, but we still have to read the headers to
894 			 * get the new location.
895 			 */
896 			break;
897 		case HTTP_NEED_AUTH:
898 			if (need_auth) {
899 				/*
900 				 * We already sent out authorization code, so there's
901 				 * nothing more we can do.
902 				 */
903 				_http_seterr(conn->err);
904 				goto ouch;
905 			}
906 			/* try again, but send the password this time */
907 			if (verbose)
908 				_fetch_info("server requires authorization");
909 			break;
910 		case HTTP_NEED_PROXY_AUTH:
911 			/*
912 			 * If we're talking to a proxy, we already sent our proxy
913 			 * authorization code, so there's nothing more we can do.
914 			 */
915 			_http_seterr(conn->err);
916 			goto ouch;
917 		case HTTP_PROTOCOL_ERROR:
918 			/* fall through */
919 		case -1:
920 			_fetch_syserr();
921 			goto ouch;
922 		default:
923 			_http_seterr(conn->err);
924 			if (!verbose)
925 				goto ouch;
926 			/* fall through so we can get the full error message */
927 		}
928 
929 		/* get headers */
930 		do {
931 			switch ((h = _http_next_header(conn, &p))) {
932 			case hdr_syserror:
933 				_fetch_syserr();
934 				goto ouch;
935 			case hdr_error:
936 				_http_seterr(HTTP_PROTOCOL_ERROR);
937 				goto ouch;
938 			case hdr_content_length:
939 				_http_parse_length(p, &clength);
940 				break;
941 			case hdr_content_range:
942 				_http_parse_range(p, &offset, &length, &size);
943 				break;
944 			case hdr_last_modified:
945 				_http_parse_mtime(p, &mtime);
946 				break;
947 			case hdr_location:
948 				if (!HTTP_REDIRECT(conn->err))
949 					break;
950 				if (new)
951 					free(new);
952 				if (verbose)
953 					_fetch_info("%d redirect to %s", conn->err, p);
954 				if (*p == '/')
955 					/* absolute path */
956 					new = fetchMakeURL(url->scheme, url->host, url->port, p,
957 					    url->user, url->pwd);
958 				else
959 					new = fetchParseURL(p);
960 				if (new == NULL) {
961 					/* XXX should set an error code */
962 					DEBUG(fprintf(stderr, "failed to parse new URL\n"));
963 					goto ouch;
964 				}
965 				if (!*new->user && !*new->pwd) {
966 					strcpy(new->user, url->user);
967 					strcpy(new->pwd, url->pwd);
968 				}
969 				new->offset = url->offset;
970 				new->length = url->length;
971 				break;
972 			case hdr_transfer_encoding:
973 				/* XXX weak test*/
974 				chunked = (strcasecmp(p, "chunked") == 0);
975 				break;
976 			case hdr_www_authenticate:
977 				if (conn->err != HTTP_NEED_AUTH)
978 					break;
979 				/* if we were smarter, we'd check the method and realm */
980 				break;
981 			case hdr_end:
982 				/* fall through */
983 			case hdr_unknown:
984 				/* ignore */
985 				break;
986 			}
987 		} while (h > hdr_end);
988 
989 		/* we need to provide authentication */
990 		if (conn->err == HTTP_NEED_AUTH) {
991 			e = conn->err;
992 			need_auth = 1;
993 			_fetch_close(conn);
994 			conn = NULL;
995 			continue;
996 		}
997 
998 		/* we have a hit or an error */
999 		if (conn->err == HTTP_OK || conn->err == HTTP_PARTIAL || HTTP_ERROR(conn->err))
1000 			break;
1001 
1002 		/* all other cases: we got a redirect */
1003 		e = conn->err;
1004 		need_auth = 0;
1005 		_fetch_close(conn);
1006 		conn = NULL;
1007 		if (!new) {
1008 			DEBUG(fprintf(stderr, "redirect with no new location\n"));
1009 			break;
1010 		}
1011 		if (url != URL)
1012 			fetchFreeURL(url);
1013 		url = new;
1014 	} while (++i < n);
1015 
1016 	/* we failed, or ran out of retries */
1017 	if (conn == NULL) {
1018 		_http_seterr(e);
1019 		goto ouch;
1020 	}
1021 
1022 	DEBUG(fprintf(stderr, "offset %lld, length %lld,"
1023 		  " size %lld, clength %lld\n",
1024 		  (long long)offset, (long long)length,
1025 		  (long long)size, (long long)clength));
1026 
1027 	/* check for inconsistencies */
1028 	if (clength != -1 && length != -1 && clength != length) {
1029 		_http_seterr(HTTP_PROTOCOL_ERROR);
1030 		goto ouch;
1031 	}
1032 	if (clength == -1)
1033 		clength = length;
1034 	if (clength != -1)
1035 		length = offset + clength;
1036 	if (length != -1 && size != -1 && length != size) {
1037 		_http_seterr(HTTP_PROTOCOL_ERROR);
1038 		goto ouch;
1039 	}
1040 	if (size == -1)
1041 		size = length;
1042 
1043 	/* fill in stats */
1044 	if (us) {
1045 		us->size = size;
1046 		us->atime = us->mtime = mtime;
1047 	}
1048 
1049 	/* too far? */
1050 	if (offset > URL->offset) {
1051 		_http_seterr(HTTP_PROTOCOL_ERROR);
1052 		goto ouch;
1053 	}
1054 
1055 	/* report back real offset and size */
1056 	URL->offset = offset;
1057 	URL->length = clength;
1058 
1059 	/* wrap it up in a FILE */
1060 	if ((f = _http_funopen(conn, chunked)) == NULL) {
1061 		_fetch_syserr();
1062 		goto ouch;
1063 	}
1064 
1065 	if (url != URL)
1066 		fetchFreeURL(url);
1067 	if (purl)
1068 		fetchFreeURL(purl);
1069 
1070 	if (HTTP_ERROR(conn->err)) {
1071 		_http_print_html(stderr, f);
1072 		fclose(f);
1073 		f = NULL;
1074 	}
1075 
1076 	return (f);
1077 
1078 ouch:
1079 	if (url != URL)
1080 		fetchFreeURL(url);
1081 	if (purl)
1082 		fetchFreeURL(purl);
1083 	if (conn != NULL)
1084 		_fetch_close(conn);
1085 	return (NULL);
1086 }
1087 
1088 
1089 /*****************************************************************************
1090  * Entry points
1091  */
1092 
/*
 * Retrieve a file by HTTP with a GET request, using the proxy found in
 * the environment (if any) and filling in *us when us is given.
 */
FILE *
fetchXGetHTTP(struct url *URL, struct url_stat *us, const char *flags)
{
	struct url *purl = _http_get_proxy();

	return (_http_request(URL, "GET", us, purl, flags));
}
1101 
/*
 * Retrieve a file by HTTP without collecting any metadata.
 */
FILE *
fetchGetHTTP(struct url *URL, const char *flags)
{
	FILE *f;

	f = fetchXGetHTTP(URL, NULL, flags);
	return (f);
}
1110 
1111 /*
1112  * Store a file by HTTP
1113  */
1114 FILE *
1115 fetchPutHTTP(struct url *URL __unused, const char *flags __unused)
1116 {
1117 	warnx("fetchPutHTTP(): not implemented");
1118 	return (NULL);
1119 }
1120 
/*
 * Get an HTTP document's metadata by sending a HEAD request.
 *
 * Returns 0 if the request succeeded (the stream it yields is closed
 * right away) and -1 otherwise.
 */
int
fetchStatHTTP(struct url *URL, struct url_stat *us, const char *flags)
{
	FILE *f = _http_request(URL, "HEAD", us, _http_get_proxy(), flags);

	if (f != NULL) {
		fclose(f);
		return (0);
	}
	return (-1);
}
1134 
1135 /*
1136  * List a directory
1137  */
1138 struct url_ent *
1139 fetchListHTTP(struct url *url __unused, const char *flags __unused)
1140 {
1141 	warnx("fetchListHTTP(): not implemented");
1142 	return (NULL);
1143 }
1144