rpm  5.4.15
html-parse.c
Go to the documentation of this file.
1 /* HTML parser for Wget.
2  Copyright (C) 1998, 2000, 2003 Free Software Foundation, Inc.
3 
4 This file is part of GNU Wget.
5 
6 GNU Wget is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
10 
11 GNU Wget is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15 
16 You should have received a copy of the GNU General Public License
17 along with Wget; if not, write to the Free Software
18 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 
20 In addition, as a special exception, the Free Software Foundation
21 gives permission to link the code of its release of Wget with the
22 OpenSSL project's "OpenSSL" library (or with modified versions of it
23 that use the same license as the "OpenSSL" library), and distribute
24 the linked executables. You must obey the GNU General Public License
25 in all respects for all of the code used other than "OpenSSL". If you
26 modify this file, you may extend this exception to your version of the
27 file, but you are not obligated to do so. If you do not wish to do
28 so, delete this exception statement from your version. */
29 
30 /* The only entry point to this module is map_html_tags(), which see. */
31 
32 /* TODO:
33 
34  - Allow hooks for callers to process contents outside tags. This
35  is needed to implement handling <style> and <script>. The
36  taginfo structure already carries the information about where the
37  tags are, but this is not enough, because one would also want to
38  skip the comments. (The funny thing is that for <style> and
39  <script> you *don't* want to skip comments!)
40 
41  - Create a test suite for regression testing. */
42 
43 /* HISTORY:
44 
45  This is the third HTML parser written for Wget. The first one was
46  written some time during the Geturl 1.0 beta cycle, and was very
47  inefficient and buggy. It also contained some very complex code to
48  remember a list of parser states, because it was supposed to be
49  reentrant.
50 
51  The second HTML parser was written for Wget 1.4 (the first version
52  by the name `Wget'), and was a complete rewrite. Although the new
53  parser behaved much better and made no claims of reentrancy, it
54  still shared many of the fundamental flaws of the old version -- it
55  only regarded HTML in terms of tag-attribute pairs, where the
56  attribute's value was a URL to be returned. Any other property of
57  HTML, such as <base href=...>, or strange way to specify a URL,
58  such as <meta http-equiv=Refresh content="0; URL=..."> had to be
59  crudely hacked in -- and the caller had to be aware of these hacks.
60  Like its predecessor, this parser did not support HTML comments.
61 
62  After Wget 1.5.1 was released, I set out to write a third HTML
63  parser. The objectives of the new parser were to: (1) provide a
64  clean way to analyze HTML lexically, (2) separate interpretation of
65  the markup from the parsing process, (3) be as correct as possible,
66  e.g. correctly skipping comments and other SGML declarations, (4)
67  understand the most common errors in markup and skip them or be
68  relaxed towards them, and (5) be reasonably efficient (no regexps,
69  minimum copying and minimum or no heap allocation).
70 
71  I believe this parser meets all of the above goals. It is
72  reasonably well structured, and could be relatively easily
73  separated from Wget and used elsewhere. While some of its
74  intrinsic properties limit its value as a general-purpose HTML
75  parser, I believe that, with minimum modifications, it could serve
76  as a backend for one.
77 
78  Due to time and other constraints, this parser was not integrated
79  into Wget until the version 1.7. */
80 
81 /* DESCRIPTION:
82 
83  The single entry point of this parser is map_html_tags(), which
84  works by calling a function you specify for each tag. The function
85  gets called with the pointer to a structure describing the tag and
86  its attributes. */
87 
88 /* To test as standalone, compile with `-DSTANDALONE -I.'. You'll
89  still need Wget headers to compile. */
90 
91 #include "system.h"
92 
93 #include <assert.h>
94 
95 /* XXX uncouple from wget.h baggage. */
96 
/* PARAMS wraps the parameter list of a function declaration: it
   expands to the list itself when PROTOTYPES is defined and to an
   empty `()' otherwise.  It is only defined here if no earlier header
   has already provided it. */
#ifndef PARAMS
# ifdef PROTOTYPES
#  define PARAMS(args) args
# else
#  define PARAMS(args) ()
# endif
#endif
104 
/* Copy the bytes in the range [BEG, END) to alloca-allocated storage
   and zero-terminate the copy.  PLACE must be a `char *' lvalue; it
   receives the address of the copy.  Arguments are evaluated only
   once, in the order BEG, END, PLACE.

   The storage comes from alloca, so it must not be freed and must not
   be used after the calling function returns. */
#define BOUNDED_TO_ALLOCA(beg, end, place) do {         \
  const char *BTA_beg = (beg);                          \
  int BTA_len = (int) ((end) - BTA_beg);                \
  char **BTA_dest = &(place);                           \
  *BTA_dest = (char *) alloca (BTA_len + 1);            \
  memcpy (*BTA_dest, BTA_beg, BTA_len);                 \
  (*BTA_dest)[BTA_len] = '\0';                          \
} while (0)
116 
/* Convert an ASCII hex digit to the corresponding number between 0
   and 15.  X should be a hexadecimal digit that satisfies isxdigit;
   otherwise, the result is undefined.

   X is evaluated more than once, so it must not have side effects.
   Characters below 'A' are treated as decimal digits; anything else
   is upcased and measured from 'A'. */
#define XDIGIT_TO_NUM(x) ((x) < 'A' ? (x) - '0' : TOUPPER (x) - 'A' + 10)
121 
/* Returns the number of elements in an array with fixed
   initialization.  For example:

     static char a[] = "foo";     -- countof(a) == 4 (for terminating \0)

     int a[5] = {1, 2};           -- countof(a) == 5

     char *a[] = {                -- countof(a) == 3
       "foo", "bar", "baz"
     };

   ARRAY must be a real array object; applied to a pointer (including
   an array-typed function parameter) the division is meaningless. */
#define countof(array) (sizeof (array) / sizeof (*(array)))
133 
134 #include "html-parse.h"
135 
136 #ifdef STANDALONE
137 # undef xmalloc
138 # undef xrealloc
139 # undef xfree
140 # define xmalloc malloc
141 # define xrealloc realloc
142 # define xfree free
143 
144 # undef ISSPACE
145 # undef ISDIGIT
146 # undef ISXDIGIT
147 # undef ISALPHA
148 # undef ISALNUM
149 # undef TOLOWER
150 # undef TOUPPER
151 
152 # define ISSPACE(x) isspace (x)
153 # define ISDIGIT(x) isdigit (x)
154 # define ISXDIGIT(x) isxdigit (x)
155 # define ISALPHA(x) isalpha (x)
156 # define ISALNUM(x) isalnum (x)
157 # define TOLOWER(x) tolower (x)
158 # define TOUPPER(x) toupper (x)
159 
160 struct hash_table {
161  int dummy;
162 };
163 static void *
164 hash_table_get (const struct hash_table *ht, void *ptr)
165 {
166  return ptr;
167 }
168 #else /* not STANDALONE */
169 # include "hash.h"
170 #endif
171 
/* Pool support.  A pool is a resizable chunk of memory.  It is first
   allocated on the stack, and moved to the heap if it needs to be
   larger than originally expected.  map_html_tags() uses it to store
   the zero-terminated names and values of tags and attributes.

   Thus taginfo->name, and attr->name and attr->value for each
   attribute, do not point into separately allocated areas, but into
   different parts of the pool, separated only by terminating zeros.
   This ensures minimum amount of allocation and, for most tags, no
   allocation because the entire pool is kept on the stack. */

struct pool {
  char *contents;               /* pointer to the contents. */
  int size;                     /* size of the pool. */
  int tail;                     /* next available position index. */
  int resized;                  /* whether the pool has been resized
                                   using malloc. */

  char *orig_contents;          /* original pool contents, usually
                                   stack-allocated.  used by POOL_FREE
                                   to restore the pool to the initial
                                   state. */
  int orig_size;                /* size of orig_contents; set by
                                   POOL_INIT and restored into `size'
                                   by POOL_FREE. */
};
196 
/* Initialize the pool P to use INITIAL_STORAGE, a caller-provided
   buffer of INITIAL_SIZE bytes.  The pool starts out empty and not
   heap-allocated; the initial buffer and size are remembered so that
   POOL_FREE can restore them later. */

#define POOL_INIT(p, initial_storage, initial_size) do {        \
  struct pool *P = (p);                                         \
  P->contents = (initial_storage);                              \
  P->size = (initial_size);                                     \
  P->tail = 0;                                                  \
  P->resized = 0;                                               \
  P->orig_contents = P->contents;                               \
  P->orig_size = P->size;                                       \
} while (0)
208 
/* Grow the pool P so that INCREASE more bytes fit after the current
   tail.  If the pool already has room for them, this is a no-op.
   P is evaluated several times, so it must not have side effects. */

#define POOL_GROW(p, increase)                                  \
  GROW_ARRAY ((p)->contents, (p)->size, (p)->tail + (increase), \
              (p)->resized, char)
215 
/* Append text in the range [BEG, END) to the pool P, growing the pool
   first if necessary.  No zero-termination is done.  Each argument is
   evaluated exactly once. */

#define POOL_APPEND(p, beg, end) do {                           \
  struct pool *PA_pool = (p);                                   \
  const char *PA_beg = (beg);                                   \
  int PA_size = (int) ((end) - PA_beg);                         \
  POOL_GROW (PA_pool, PA_size);                                 \
  memcpy (PA_pool->contents + PA_pool->tail, PA_beg, PA_size);  \
  PA_pool->tail += PA_size;                                     \
} while (0)
226 
/* Append the single character CH to the pool P, growing the pool
   first if necessary.  Can be used to zero-terminate pool strings.
   Each argument is evaluated exactly once. */

#define POOL_APPEND_CHR(p, ch) do {                     \
  struct pool *PAC_pool = (p);                          \
  char PAC_char = (ch);                                 \
  POOL_GROW (PAC_pool, 1);                              \
  PAC_pool->contents[PAC_pool->tail++] = PAC_char;      \
} while (0)
235 
/* Forget old pool contents by resetting the tail index to zero.  The
   allocated memory is not freed and `contents'/`size' are untouched. */
#define POOL_REWIND(p) ((p)->tail = 0)
238 
/* Free heap-allocated memory for contents of POOL.  This calls
   xfree() if the memory was allocated through malloc (i.e. if the
   `resized' flag is set).  It also restores `contents' and `size' to
   their original, pre-malloc values and empties the pool.  That way
   after POOL_FREE, the pool is fully usable, just as if it were
   freshly initialized with POOL_INIT. */

#define POOL_FREE(p) do {                       \
  struct pool *P = (p);                         \
  if (P->resized)                               \
    xfree (P->contents);                        \
  P->contents = P->orig_contents;               \
  P->size = P->orig_size;                       \
  P->tail = 0;                                  \
  P->resized = 0;                               \
} while (0)
254 
/* Used for small stack-allocated memory chunks that might grow.  Like
   DO_REALLOC, this macro grows BASEVAR as necessary to take
   NEEDED_SIZE items of TYPE.  SIZEVAR holds the current capacity (in
   items) and is updated; the capacity is doubled until it is large
   enough.

   The difference is that on the first resize, it will use
   malloc+memcpy rather than realloc.  That way you can stack-allocate
   the initial chunk, and only resort to heap allocation if you
   stumble upon large data.  RESIZED is the flag lvalue that records
   whether BASEVAR already lives on the heap.

   After the first resize, subsequent ones are performed with realloc,
   just like DO_REALLOC.

   BASEVAR, SIZEVAR and RESIZED are evaluated several times and must
   be side-effect-free lvalues.  A current capacity of zero is bumped
   to one before doubling, so the sizing loop always terminates. */

#define GROW_ARRAY(basevar, sizevar, needed_size, resized, type) do {           \
  long ga_needed_size = (needed_size);                                          \
  long ga_newsize = (sizevar);                                                  \
  while (ga_newsize < ga_needed_size)                                           \
    ga_newsize = ga_newsize > 0 ? ga_newsize << 1 : 1;                          \
  if (ga_newsize != (sizevar))                                                  \
    {                                                                           \
      if (resized)                                                              \
        (basevar) = (type *) xrealloc ((basevar), ga_newsize * sizeof (type));  \
      else                                                                      \
        {                                                                       \
          void *ga_new = xmalloc (ga_newsize * sizeof (type));                  \
          memcpy (ga_new, (basevar), (sizevar) * sizeof (type));                \
          (basevar) = (type *) ga_new;                                          \
          (resized) = 1;                                                        \
        }                                                                       \
      (sizevar) = ga_newsize;                                                   \
    }                                                                           \
} while (0)
286 
/* Option bits for the FLAGS argument of convert_and_copy().  Each is
   a distinct bit, so they may be OR-ed together. */
#define AP_DOWNCASE             1       /* downcase the copied text */
#define AP_PROCESS_ENTITIES     2       /* decode SGML entities */
#define AP_TRIM_BLANKS          4       /* strip leading/trailing blanks */
290 
291 /* Copy the text in the range [BEG, END) to POOL, optionally
292  performing operations specified by FLAGS. FLAGS may be any
293  combination of AP_DOWNCASE, AP_PROCESS_ENTITIES and AP_TRIM_BLANKS
294  with the following meaning:
295 
296  * AP_DOWNCASE -- downcase all the letters;
297 
298  * AP_PROCESS_ENTITIES -- process the SGML entities and write out
299  the decoded string. Recognized entities are &lt, &gt, &amp, &quot,
300  &nbsp and the numerical entities.
301 
302  * AP_TRIM_BLANKS -- ignore blanks at the beginning and at the end
303  of text. */
304 
305 static void
306 convert_and_copy (struct pool *pool, const char *beg, const char *end, int flags)
307 {
308  int old_tail = pool->tail;
309  int size;
310 
311  /* First, skip blanks if required. We must do this before entities
312  are processed, so that blanks can still be inserted as, for
313  instance, `&#32;'. */
314  if (flags & AP_TRIM_BLANKS)
315  {
316  while (beg < end && ISSPACE (*beg))
317  ++beg;
318  while (end > beg && ISSPACE (end[-1]))
319  --end;
320  }
321  size = end - beg;
322 
323  if (flags & AP_PROCESS_ENTITIES)
324  {
325  /* Grow the pool, then copy the text to the pool character by
326  character, processing the encountered entities as we go
327  along.
328 
329  It's safe (and necessary) to grow the pool in advance because
330  processing the entities can only *shorten* the string, it can
331  never lengthen it. */
332  const char *from = beg;
333  char *to;
334 
335  POOL_GROW (pool, end - beg);
336  to = pool->contents + pool->tail;
337 
338  while (from < end)
339  {
340  if (*from != '&')
341  *to++ = *from++;
342  else
343  {
344  const char *save = from;
345  int remain;
346 
347  if (++from == end)
348  goto lose;
349  remain = end - from;
350 
351  /* Process numeric entities "&#DDD;" and "&#xHH;". */
352  if (*from == '#')
353  {
354  int numeric = 0, digits = 0;
355  ++from;
356  if (*from == 'x')
357  {
358  ++from;
359  for (; from < end && ISXDIGIT (*from); from++, digits++)
360  numeric = (numeric << 4) + XDIGIT_TO_NUM (*from);
361  }
362  else
363  {
364  for (; from < end && ISDIGIT (*from); from++, digits++)
365  numeric = (numeric * 10) + (*from - '0');
366  }
367  if (!digits)
368  goto lose;
369  numeric &= 0xff;
370  *to++ = numeric;
371  }
372 #define FROB(x) (remain >= (sizeof (x) - 1) \
373  && 0 == memcmp (from, x, sizeof (x) - 1) \
374  && (*(from + sizeof (x) - 1) == ';' \
375  || remain == sizeof (x) - 1 \
376  || !ISALNUM (*(from + sizeof (x) - 1))))
377  else if (FROB ("lt"))
378  *to++ = '<', from += 2;
379  else if (FROB ("gt"))
380  *to++ = '>', from += 2;
381  else if (FROB ("amp"))
382  *to++ = '&', from += 3;
383  else if (FROB ("quot"))
384  *to++ = '\"', from += 4;
385  /* We don't implement the proposed "Added Latin 1"
386  entities (except for nbsp), because it is unnecessary
387  in the context of Wget, and would require hashing to
388  work efficiently. */
389  else if (FROB ("nbsp"))
390  *to++ = 160, from += 4;
391  else
392  goto lose;
393 #undef FROB
394  /* If the entity was followed by `;', we step over the
395  `;'. Otherwise, it was followed by either a
396  non-alphanumeric or EOB, in which case we do nothing. */
397  if (from < end && *from == ';')
398  ++from;
399  continue;
400 
401  lose:
402  /* This was not an entity after all. Back out. */
403  from = save;
404  *to++ = *from++;
405  }
406  }
407  /* Verify that we haven't exceeded the original size. (It
408  shouldn't happen, hence the assert.) */
409  assert (to - (pool->contents + pool->tail) <= end - beg);
410 
411  /* Make POOL's tail point to the position following the string
412  we've written. */
413  pool->tail = to - pool->contents;
414  POOL_APPEND_CHR (pool, '\0');
415  }
416  else
417  {
418  /* Just copy the text to the pool. */
419  POOL_APPEND (pool, beg, end);
420  POOL_APPEND_CHR (pool, '\0');
421  }
422 
423  if (flags & AP_DOWNCASE)
424  {
425  char *p = pool->contents + old_tail;
426  for (; *p; p++)
427  *p = TOLOWER (*p);
428  }
429 }
430 
/* Originally we used to adhere to rfc 1866 here, and allowed only
   letters, digits, periods, and hyphens as names (of tags or
   attributes).  However, this broke too many pages which used
   proprietary or strange attributes, e.g. <img src="a.gif"
   v:shapes="whatever">.

   So now we allow any character except:
     * whitespace
     * 8-bit and control chars
     * characters that clearly cannot be part of name:
       '=', '>', '/'.

   This only affects attribute and tag names; attribute values allow
   an even greater variety of characters.

   X is evaluated several times, so it must not have side effects.  */

#define NAME_CHAR_P(x) ((x) > 32 && (x) < 127                           \
                        && (x) != '=' && (x) != '>' && (x) != '/')

#ifdef STANDALONE
/* Number of times advance_declaration() gave up on a declaration.  */
static int comment_backout_count;
#endif

/* Advance over an SGML declaration, such as <!DOCTYPE ...>.  In
   strict comments mode, this is used for skipping over comments as
   well.

   To recap: any SGML declaration may have comments associated with
   it, e.g.
       <!MY-DECL -- isn't this fun? -- foo bar>

   An HTML comment is merely an empty declaration (<!>) with a comment
   attached, like this:
       <!-- some stuff here -->

   Several comments may be embedded in one comment declaration:
       <!-- have -- -- fun -->

   Whitespace is allowed between and after the comments, but not
   before the first comment.  Additionally, this function attempts to
   handle double quotes in SGML declarations correctly.

   BEG must point at the `!' that follows the opening `<'; END is one
   past the last readable character.  On success the return value
   points just past the closing `>' (which may be END itself).  If the
   text is not a well-formed declaration, or the input ends before the
   closing `>', BEG + 1 is returned so the caller resumes scanning
   right after the `!'.  If BEG == END, BEG is returned.  */

static const char *
advance_declaration (const char *beg, const char *end)
{
  const char *p = beg;
  char quote_char = '\0';       /* shut up, gcc! */
  char ch;

  enum {
    AC_S_DONE,
    AC_S_BACKOUT,
    AC_S_BANG,
    AC_S_DEFAULT,
    AC_S_DCLNAME,
    AC_S_DASH1,
    AC_S_DASH2,
    AC_S_COMMENT,
    AC_S_DASH3,
    AC_S_DASH4,
    AC_S_QUOTE1,
    AC_S_IN_QUOTE,
    AC_S_QUOTE2
  } state = AC_S_BANG;

  if (beg == end)
    return beg;
  ch = *p++;

  /* It looked like a good idea to write this as a state machine, but
     now I wonder...

     Invariant: CH is the character currently being examined and P
     points at the one after it.  */

  while (state != AC_S_DONE && state != AC_S_BACKOUT)
    {
      if (p == end)
        {
          /* CH is the last character of the input and nothing may be
             read after it.  The declaration is complete only if that
             character is the closing `>' seen outside of comments and
             quotes; in every other situation more input would be
             needed, so back out.  */
          if (ch == '>' && (state == AC_S_DEFAULT || state == AC_S_DCLNAME))
            state = AC_S_DONE;
          else
            state = AC_S_BACKOUT;
          break;
        }
      switch (state)
        {
        case AC_S_DONE:
        case AC_S_BACKOUT:
          break;
        case AC_S_BANG:
          if (ch == '!')
            {
              ch = *p++;
              state = AC_S_DEFAULT;
            }
          else
            state = AC_S_BACKOUT;
          break;
        case AC_S_DEFAULT:
          switch (ch)
            {
            case '-':
              state = AC_S_DASH1;
              break;
            case ' ':
            case '\t':
            case '\r':
            case '\n':
              ch = *p++;
              break;
            case '>':
              state = AC_S_DONE;
              break;
            case '\'':
            case '\"':
              state = AC_S_QUOTE1;
              break;
            default:
              if (NAME_CHAR_P (ch))
                state = AC_S_DCLNAME;
              else
                state = AC_S_BACKOUT;
              break;
            }
          break;
        case AC_S_DCLNAME:
          if (ch == '-')
            state = AC_S_DASH1;
          else if (NAME_CHAR_P (ch))
            ch = *p++;
          else
            state = AC_S_DEFAULT;
          break;
        case AC_S_QUOTE1:
          /* We must use 0x22 because broken assert macros choke on
             '"' and '\"'.  */
          assert (ch == '\'' || ch == 0x22);
          quote_char = ch;      /* cheating -- I really don't feel like
                                   introducing more different states for
                                   different quote characters. */
          ch = *p++;
          state = AC_S_IN_QUOTE;
          break;
        case AC_S_IN_QUOTE:
          if (ch == quote_char)
            state = AC_S_QUOTE2;
          else
            ch = *p++;
          break;
        case AC_S_QUOTE2:
          assert (ch == quote_char);
          ch = *p++;
          state = AC_S_DEFAULT;
          break;
        case AC_S_DASH1:
          assert (ch == '-');
          ch = *p++;
          state = AC_S_DASH2;
          break;
        case AC_S_DASH2:
          switch (ch)
            {
            case '-':
              ch = *p++;
              state = AC_S_COMMENT;
              break;
            default:
              /* A single dash cannot open a comment.  */
              state = AC_S_BACKOUT;
            }
          break;
        case AC_S_COMMENT:
          switch (ch)
            {
            case '-':
              state = AC_S_DASH3;
              break;
            default:
              ch = *p++;
              break;
            }
          break;
        case AC_S_DASH3:
          assert (ch == '-');
          ch = *p++;
          state = AC_S_DASH4;
          break;
        case AC_S_DASH4:
          switch (ch)
            {
            case '-':
              /* "--" closes the comment.  */
              ch = *p++;
              state = AC_S_DEFAULT;
              break;
            default:
              /* Lone dash inside a comment; keep scanning it.  */
              state = AC_S_COMMENT;
              break;
            }
          break;
        }
    }

  if (state == AC_S_BACKOUT)
    {
#ifdef STANDALONE
      ++comment_backout_count;
#endif
      return beg + 1;
    }
  return p;
}
632 
/* Find the first occurrence of the substring "-->" in [BEG, END) and
   return the pointer to the character after the substring.  If the
   substring is not found, return NULL.  */

static const char *
find_comment_end (const char *beg, const char *end)
{
  /* Look for each '>' with memchr() and check whether the two
     characters before it are dashes.  The earliest '>' that can end a
     match is at BEG + 2, so the search starts there; this keeps every
     pointer formed and every byte read inside [BEG, END).  */
  const char *gt;

  if (end - beg < 3)
    return NULL;

  gt = beg + 2;
  while ((gt = memchr (gt, '>', end - gt)) != NULL)
    {
      if (gt[-1] == '-' && gt[-2] == '-')
        return gt + 1;
      ++gt;
    }
  return NULL;
}
681 
/* Return non-zero if the string inside [B, E) is present in hash
   table HT.  A NULL HT means "everything is allowed" and always
   yields non-zero.

   The name is copied into a zero-terminated buffer for the lookup.
   Its length comes straight from the document being parsed, so only
   short names use stack storage; longer ones are copied to the heap
   instead of risking an oversized stack allocation.  */

static int
name_allowed (const struct hash_table *ht, const char *b, const char *e)
{
  char stack_copy[128];
  char *copy = stack_copy;
  size_t len;
  int found;

  if (!ht)
    return 1;

  len = (size_t) (e - b);
  if (len >= sizeof stack_copy)
    copy = (char *) xmalloc (len + 1);
  memcpy (copy, b, len);
  copy[len] = '\0';

  found = hash_table_get (ht, copy) != NULL;

  if (copy != stack_copy)
    xfree (copy);
  return found;
}
694 
/* Advance P (a char pointer), with the explicit intent of being able
   to read the next character.  If this is not possible, go to finish.

   The expansion relies on the enclosing function providing a variable
   named `end' (one past the last readable character) and a label
   named `finish'.  P is evaluated twice.  */

#define ADVANCE(p) do {                         \
  ++(p);                                        \
  if ((p) >= end)                               \
    goto finish;                                \
} while (0)
703 
/* Skip whitespace, if any: advance P while it points at a character
   for which ISSPACE is true.  Stepping is done with ADVANCE, so
   control jumps to `finish' if the end of input is reached.  */

#define SKIP_WS(p) do {                         \
  while (ISSPACE (*(p))) {                      \
    ADVANCE (p);                                \
  }                                             \
} while (0)
711 
/* Skip non-whitespace, if any: advance P while it points at a
   character for which ISSPACE is false.  Stepping is done with
   ADVANCE, so control jumps to `finish' if the end of input is
   reached.  */

#define SKIP_NON_WS(p) do {                     \
  while (!ISSPACE (*(p))) {                     \
    ADVANCE (p);                                \
  }                                             \
} while (0)
719 
720 #ifdef STANDALONE
721 static int tag_backout_count;
722 #endif
723 
724 /* Map MAPFUN over HTML tags in TEXT, which is SIZE characters long.
725  MAPFUN will be called with two arguments: pointer to an initialized
726  struct taginfo, and MAPARG.
727 
728  ALLOWED_TAG_NAMES should be a NULL-terminated array of tag names to
729  be processed by this function. If it is NULL, all the tags are
730  allowed. The same goes for attributes and ALLOWED_ATTRIBUTE_NAMES.
731 
732  (Obviously, the caller can filter out unwanted tags and attributes
733  just as well, but this is just an optimization designed to avoid
734  unnecessary copying for tags/attributes which the caller doesn't
735  want to know about. These lists are searched linearly; therefore,
736  if you're interested in a large number of tags or attributes, you'd
737  better set these to NULL and filter them out yourself with a
738  hashing process most appropriate for your application.) */
739 
740 void
741 map_html_tags (const char *text, int size,
742  void (*mapfun) (struct taginfo *, void *), void *maparg,
743  int flags,
744  const struct hash_table *allowed_tags,
745  const struct hash_table *allowed_attributes)
746 {
747  /* storage for strings passed to MAPFUN callback; if 256 bytes is
748  too little, POOL_APPEND allocates more with malloc. */
749  char pool_initial_storage[256];
750  struct pool pool;
751 
752  const char *p = text;
753  const char *end = text + size;
754 
755  struct attr_pair attr_pair_initial_storage[8];
756  int attr_pair_size = countof (attr_pair_initial_storage);
757  int attr_pair_resized = 0;
758  struct attr_pair *pairs = attr_pair_initial_storage;
759 
760  if (!size)
761  return;
762 
763  POOL_INIT (&pool, pool_initial_storage, countof (pool_initial_storage));
764 
765  {
766  int nattrs, end_tag;
767  const char *tag_name_begin, *tag_name_end;
768  const char *tag_start_position;
769  int uninteresting_tag;
770 
771  look_for_tag:
772  POOL_REWIND (&pool);
773 
774  nattrs = 0;
775  end_tag = 0;
776 
777  /* Find beginning of tag. We use memchr() instead of the usual
778  looping with ADVANCE() for speed. */
779  p = memchr (p, '<', end - p);
780  if (!p)
781  goto finish;
782 
783  tag_start_position = p;
784  ADVANCE (p);
785 
786  /* Establish the type of the tag (start-tag, end-tag or
787  declaration). */
788  if (*p == '!')
789  {
790  if (!(flags & MHT_STRICT_COMMENTS)
791  && p < end + 3 && p[1] == '-' && p[2] == '-')
792  {
793  /* If strict comments are not enforced and if we know
794  we're looking at a comment, simply look for the
795  terminating "-->". Non-strict is the default because
796  it works in other browsers and most HTML writers can't
797  be bothered with getting the comments right. */
798  const char *comment_end = find_comment_end (p + 3, end);
799  if (comment_end)
800  p = comment_end;
801  }
802  else
803  {
804  /* Either in strict comment mode or looking at a non-empty
805  declaration. Real declarations are much less likely to
806  be misused the way comments are, so advance over them
807  properly regardless of strictness. */
808  p = advance_declaration (p, end);
809  }
810  if (p == end)
811  goto finish;
812  goto look_for_tag;
813  }
814  else if (*p == '/')
815  {
816  end_tag = 1;
817  ADVANCE (p);
818  }
819  tag_name_begin = p;
820  while (NAME_CHAR_P (*p))
821  ADVANCE (p);
822  if (p == tag_name_begin)
823  goto look_for_tag;
824  tag_name_end = p;
825  SKIP_WS (p);
826  if (end_tag && *p != '>')
827  goto backout_tag;
828 
829  if (!name_allowed (allowed_tags, tag_name_begin, tag_name_end))
830  /* We can't just say "goto look_for_tag" here because we need
831  the loop below to properly advance over the tag's attributes. */
832  uninteresting_tag = 1;
833  else
834  {
835  uninteresting_tag = 0;
836  convert_and_copy (&pool, tag_name_begin, tag_name_end, AP_DOWNCASE);
837  }
838 
839  /* Find the attributes. */
840  while (1)
841  {
842  const char *attr_name_begin, *attr_name_end;
843  const char *attr_value_begin, *attr_value_end;
844  const char *attr_raw_value_begin, *attr_raw_value_end;
845  int operation = AP_DOWNCASE; /* stupid compiler. */
846 
847  SKIP_WS (p);
848 
849  if (*p == '/')
850  {
851  /* A slash at this point means the tag is about to be
852  closed. This is legal in XML and has been popularized
853  in HTML via XHTML. */
854  /* <foo a=b c=d /> */
855  /* ^ */
856  ADVANCE (p);
857  SKIP_WS (p);
858  if (*p != '>')
859  goto backout_tag;
860  }
861 
862  /* Check for end of tag definition. */
863  if (*p == '>')
864  break;
865 
866  /* Establish bounds of attribute name. */
867  attr_name_begin = p; /* <foo bar ...> */
868  /* ^ */
869  while (NAME_CHAR_P (*p))
870  ADVANCE (p);
871  attr_name_end = p; /* <foo bar ...> */
872  /* ^ */
873  if (attr_name_begin == attr_name_end)
874  goto backout_tag;
875 
876  /* Establish bounds of attribute value. */
877  SKIP_WS (p);
878  if (NAME_CHAR_P (*p) || *p == '/' || *p == '>')
879  {
880  /* Minimized attribute syntax allows `=' to be omitted.
881  For example, <UL COMPACT> is a valid shorthand for <UL
882  COMPACT="compact">. Even if such attributes are not
883  useful to Wget, we need to support them, so that the
884  tags containing them can be parsed correctly. */
885  attr_raw_value_begin = attr_value_begin = attr_name_begin;
886  attr_raw_value_end = attr_value_end = attr_name_end;
887  }
888  else if (*p == '=')
889  {
890  ADVANCE (p);
891  SKIP_WS (p);
892  if (*p == '\"' || *p == '\'')
893  {
894  int newline_seen = 0;
895  char quote_char = *p;
896  attr_raw_value_begin = p;
897  ADVANCE (p);
898  attr_value_begin = p; /* <foo bar="baz"> */
899  /* ^ */
900  while (*p != quote_char)
901  {
902  if (!newline_seen && *p == '\n')
903  {
904  /* If a newline is seen within the quotes, it
905  is most likely that someone forgot to close
906  the quote. In that case, we back out to
907  the value beginning, and terminate the tag
908  at either `>' or the delimiter, whichever
909  comes first. Such a tag terminated at `>'
910  is discarded. */
911  p = attr_value_begin;
912  newline_seen = 1;
913  continue;
914  }
915  else if (newline_seen && *p == '>')
916  break;
917  ADVANCE (p);
918  }
919  attr_value_end = p; /* <foo bar="baz"> */
920  /* ^ */
921  if (*p == quote_char)
922  ADVANCE (p);
923  else
924  goto look_for_tag;
925  attr_raw_value_end = p; /* <foo bar="baz"> */
926  /* ^ */
927  operation = AP_PROCESS_ENTITIES;
928  if (flags & MHT_TRIM_VALUES)
929  operation |= AP_TRIM_BLANKS;
930  }
931  else
932  {
933  attr_value_begin = p; /* <foo bar=baz> */
934  /* ^ */
935  /* According to SGML, a name token should consist only
936  of alphanumerics, . and -. However, this is often
937  violated by, for instance, `%' in `width=75%'.
938  We'll be liberal and allow just about anything as
939  an attribute value. */
940  while (!ISSPACE (*p) && *p != '>')
941  ADVANCE (p);
942  attr_value_end = p; /* <foo bar=baz qux=quix> */
943  /* ^ */
944  if (attr_value_begin == attr_value_end)
945  /* <foo bar=> */
946  /* ^ */
947  goto backout_tag;
948  attr_raw_value_begin = attr_value_begin;
949  attr_raw_value_end = attr_value_end;
950  operation = AP_PROCESS_ENTITIES;
951  }
952  }
953  else
954  {
955  /* We skipped the whitespace and found something that is
956  neither `=' nor the beginning of the next attribute's
957  name. Back out. */
958  goto backout_tag; /* <foo bar [... */
959  /* ^ */
960  }
961 
962  /* If we're not interested in the tag, don't bother with any
963  of the attributes. */
964  if (uninteresting_tag)
965  continue;
966 
967  /* If we aren't interested in the attribute, skip it. We
968  cannot do this test any sooner, because our text pointer
969  needs to correctly advance over the attribute. */
970  if (!name_allowed (allowed_attributes, attr_name_begin, attr_name_end))
971  continue;
972 
973  GROW_ARRAY (pairs, attr_pair_size, nattrs + 1, attr_pair_resized,
974  struct attr_pair);
975 
976  pairs[nattrs].name_pool_index = pool.tail;
977  convert_and_copy (&pool, attr_name_begin, attr_name_end, AP_DOWNCASE);
978 
979  pairs[nattrs].value_pool_index = pool.tail;
980  convert_and_copy (&pool, attr_value_begin, attr_value_end, operation);
981  pairs[nattrs].value_raw_beginning = attr_raw_value_begin;
982  pairs[nattrs].value_raw_size = (attr_raw_value_end
983  - attr_raw_value_begin);
984  ++nattrs;
985  }
986 
987  if (uninteresting_tag)
988  {
989  ADVANCE (p);
990  goto look_for_tag;
991  }
992 
993  /* By now, we have a valid tag with a name and zero or more
994  attributes. Fill in the data and call the mapper function. */
995  {
996  int i;
997  struct taginfo taginfo;
998 
999  taginfo.name = pool.contents;
1000  taginfo.end_tag_p = end_tag;
1001  taginfo.nattrs = nattrs;
1002  /* We fill in the char pointers only now, when pool can no
1003  longer get realloc'ed. If we did that above, we could get
1004  hosed by reallocation. Obviously, after this point, the pool
1005  may no longer be grown. */
1006  for (i = 0; i < nattrs; i++)
1007  {
1008  pairs[i].name = pool.contents + pairs[i].name_pool_index;
1009  pairs[i].value = pool.contents + pairs[i].value_pool_index;
1010  }
1011  taginfo.attrs = pairs;
1012  taginfo.start_position = tag_start_position;
1013  taginfo.end_position = p + 1;
1014  /* Ta-dam! */
1015  (*mapfun) (&taginfo, maparg);
1016  ADVANCE (p);
1017  }
1018  goto look_for_tag;
1019 
1020  backout_tag:
1021 #ifdef STANDALONE
1022  ++tag_backout_count;
1023 #endif
1024  /* The tag wasn't really a tag. Treat its contents as ordinary
1025  data characters. */
1026  p = tag_start_position + 1;
1027  goto look_for_tag;
1028  }
1029 
1030  finish:
1031  POOL_FREE (&pool);
1032  if (attr_pair_resized)
1033  xfree (pairs);
1034 }
1035 
1036 #undef ADVANCE
1037 #undef SKIP_WS
1038 #undef SKIP_NON_WS
1039 
1040 #ifdef STANDALONE
1041 
1042 #include "rpmio.h"
1043 
1044 extern int _rpmio_debug;
1045 extern int _dav_debug;
1046 extern int _ftp_debug;
1047 
/* URL fetched by the standalone test driver.  Define HTMLPATH on the
   compiler command line to test against a different location.
   Alternatives that were used previously:
     "http://download.fedora.redhat.com/pub/fedora/linux/core/3/i386/os/Fedora/RPMS/"
     "http://localhost/rawhide/test/"  */
#ifndef HTMLPATH
# define HTMLPATH "http://localhost/rawhide/"
#endif
static const char * htmlpath = HTMLPATH;
1055 
1056 static void
1057 test_mapper (struct taginfo *taginfo, void *arg)
1058 {
1059  int i;
1060 
1061  printf ("%s%s", taginfo->end_tag_p ? "/" : "", taginfo->name);
1062  for (i = 0; i < taginfo->nattrs; i++)
1063  printf (" %s=%s", taginfo->attrs[i].name, taginfo->attrs[i].value);
1064  putchar ('\n');
1065  ++*(int *)arg;
1066 }
1067 
1068 int main ()
1069 {
1070  int size = 256;
1071  char *x = (char *)xmalloc (size);
1072  int length = 0;
1073  int read_count;
1074  int tag_counter = 0;
1075  int flags = MHT_TRIM_VALUES;
1076  struct hash_table *interesting_tags = (struct hash_table *)1;
1077  struct hash_table *interesting_attributes = (struct hash_table *)1;
1078  FD_t fd;
1079 
1080 _rpmio_debug = 0;
1081 _dav_debug = 0;
1082  fd = Fopen(htmlpath, "r");
1083  while ((read_count = Fread (x + length, 1, size - length, fd)))
1084  {
1085  if (read_count <= 0)
1086  break;
1087  length += read_count;
1088  size <<= 1;
1089  x = (char *)xrealloc (x, size);
1090  }
1091  (void) Fclose(fd);
1092  x[length] = '\0';
1093 #if 0
1094 fprintf(stderr, "============== %p[%d]\n%s\n", x, length, x);
1095 #endif
1096 
1097  map_html_tags (x, length, test_mapper, &tag_counter,
1098  flags, interesting_tags, interesting_attributes);
1099  printf ("TAGS: %d\n", tag_counter);
1100  printf ("Tag backouts: %d\n", tag_backout_count);
1101  printf ("Comment backouts: %d\n", comment_backout_count);
1102  return 0;
1103 }
1104 #endif /* STANDALONE */
void map_html_tags(const char *text, int size, void(*mapfun)(struct taginfo *, void *), void *maparg, int flags, const struct hash_table *allowed_tags, const struct hash_table *allowed_attributes)
Definition: html-parse.c:741
const bson * b
Definition: bson.h:280
#define AP_TRIM_BLANKS
Definition: html-parse.c:289
#define GROW_ARRAY(basevar, sizevar, needed_size, resized, type)
Definition: html-parse.c:267
#define POOL_GROW(p, increase)
Definition: html-parse.c:212
int _ftp_debug
Definition: rpmio.c:190
static const char * find_comment_end(const char *beg, const char *end)
Definition: html-parse.c:638
char * name
Definition: html-parse.h:34
FD_t Fopen(const char *path, const char *_fmode)
fopen(3) clone.
Definition: rpmio.c:2840
#define POOL_INIT(p, initial_storage, initial_size)
Definition: html-parse.c:199
int value_pool_index
Definition: html-parse.h:43
#define ISXDIGIT(c)
Definition: fnmatch.c:84
static void convert_and_copy(struct pool *pool, const char *beg, const char *end, int flags)
Definition: html-parse.c:306
int main(int argc, const char **argv, char **envp)
Definition: rpmqv.c:453
#define BOUNDED_TO_ALLOCA(beg, end, place)
Definition: html-parse.c:108
int end_tag_p
Definition: html-parse.h:48
char * name
Definition: html-parse.h:47
#define ISDIGIT(c)
Definition: fnmatch.c:76
static const char * advance_declaration(const char *beg, const char *end)
Definition: html-parse.c:473
#define ISSPACE(c)
Definition: fnmatch.c:82
#define NAME_CHAR_P(x)
Definition: html-parse.c:446
#define AP_DOWNCASE
Definition: html-parse.c:287
int resized
Definition: html-parse.c:187
const char * end_position
Definition: html-parse.h:53
#define FROB(x)
#define MHT_TRIM_VALUES
Definition: html-parse.h:60
struct attr_pair * attrs
Definition: html-parse.h:50
char * value
Definition: html-parse.h:35
char * memchr()
int _dav_debug
Definition: rpmio.c:195
int tail
Definition: html-parse.c:186
The FD_t File Handle data structure.
char * orig_contents
Definition: html-parse.c:190
#define POOL_APPEND(p, beg, end)
Definition: html-parse.c:219
#define XDIGIT_TO_NUM(x)
Definition: html-parse.c:120
size_t Fread(void *buf, size_t size, size_t nmemb, FD_t fd)
fread(3) clone.
Definition: rpmio.c:2412
const char const char int arg
Definition: mongo.h:777
int Fclose(FD_t fd)
fclose(3) clone.
Definition: rpmio.c:2534
const char const bson int mongo_write_concern int flags
Definition: mongo.h:485
int size
Definition: html-parse.c:185
char * contents
Definition: html-parse.c:184
const char const int i
Definition: bson.h:778
const char * value_raw_beginning
Definition: html-parse.h:39
int _rpmio_debug
Definition: rpmio.c:180
#define POOL_FREE(p)
Definition: html-parse.c:245
#define AP_PROCESS_ENTITIES
Definition: html-parse.c:288
int nattrs
Definition: html-parse.h:49
const char const char size_t size
Definition: bson.h:895
#define POOL_APPEND_CHR(p, ch)
Definition: html-parse.c:230
#define ADVANCE(p)
Definition: html-parse.c:698
static int name_allowed(const struct hash_table *ht, const char *b, const char *e)
Definition: html-parse.c:686
#define SKIP_WS(p)
Definition: html-parse.c:706
int orig_size
Definition: html-parse.c:194
const char * start_position
Definition: html-parse.h:52
#define xfree
Definition: system.h:38
#define xmalloc
Definition: system.h:32
int name_pool_index
Definition: html-parse.h:43
#define MHT_STRICT_COMMENTS
Definition: html-parse.h:59
#define xrealloc
Definition: system.h:35
#define countof(array)
Definition: html-parse.c:132
#define POOL_REWIND(p)
Definition: html-parse.c:237
int value_raw_size
Definition: html-parse.h:40