Gumbo  0.9.2
A C library for parsing HTML.
gumbo.h
1 // Copyright 2010 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 // Author: jdtang@google.com (Jonathan Tang)
16 //
17 // We use Gumbo as a prefix for types, gumbo_ as a prefix for functions, and
18 // GUMBO_ as a prefix for enum constants (static constants get the Google-style
19 // kGumbo prefix).
20 
42 #ifndef GUMBO_GUMBO_H_
43 #define GUMBO_GUMBO_H_
44 
// MSVC-only shims. _CRT_SECURE_NO_WARNINGS is the macro MSVC checks to
// silence its deprecation warnings for CRT functions it considers unsafe;
// fileno is remapped to the underscore-prefixed spelling MSVC provides.
// NOTE(review): both macros become visible to every file that includes this
// header, and _CRT_SECURE_NO_WARNINGS presumably only has an effect when it is
// defined before the CRT headers are first included - confirm include order.
45 #ifdef _MSC_VER
46 #define _CRT_SECURE_NO_WARNINGS
47 #define fileno _fileno
48 #endif
49 
50 #include <stdbool.h>
51 #include <stddef.h>
52 
53 #ifdef __cplusplus
54 extern "C" {
55 #endif
56 
67 typedef struct {
68  unsigned int line;
69  unsigned int column;
70  unsigned int offset;
72 
// Constant GumboSourcePosition defined in another translation unit. By its
// name it is presumably an "empty"/unset position; its field values are not
// visible from this header.
77 extern const GumboSourcePosition kGumboEmptySourcePosition;
78 
88 typedef struct {
90  const char* data;
91 
93  size_t length;
95 
// Constant GumboStringPiece defined in another translation unit. By its name
// it is presumably a zero-length piece; its field values are not visible from
// this header.
97 extern const GumboStringPiece kGumboEmptyString;
98 
// Compares the two string pieces `str1` and `str2` and reports the result as
// a bool. Neither piece is modified (both are pointers-to-const).
// Presumably true means the pieces hold the same bytes (same length, same
// data), compared case-sensitively in contrast to the _ignore_case variant -
// confirm against the implementation.
103 bool gumbo_string_equals(
104  const GumboStringPiece* str1, const GumboStringPiece* str2);
105 
// Case-insensitive counterpart of gumbo_string_equals: compares `str1` and
// `str2` and reports the result as a bool without modifying either piece.
// NOTE(review): which characters are case-folded (ASCII only vs. locale or
// Unicode aware) is not visible from this header - confirm before relying on
// it for non-ASCII input.
110 bool gumbo_string_equals_ignore_case(
111  const GumboStringPiece* str1, const GumboStringPiece* str2);
112 
// Array of untyped (void*) element pointers with a separate length and
// capacity. Other declarations in this header annotate each use with the
// intended element type, e.g. `GumboVector /* GumboNode* */ children`.
122 typedef struct {
    // Pointer to the array of element pointers.
126  void** data;
127 
    // Presumably the number of entries of `data` currently in use - confirm
    // against the vector implementation.
129  unsigned int length;
130 
    // Presumably the number of entries allocated in `data` - confirm against
    // the vector implementation.
132  unsigned int capacity;
133 } GumboVector;
134 
// Constant GumboVector defined in another translation unit. By its name it is
// presumably a vector with no elements; its field values are not visible from
// this header.
136 extern const GumboVector kGumboEmptyVector;
137 
// Looks up `element` (compared as a pointer value, presumably) in `vector`
// and returns an int position.
// NOTE(review): the signed return type suggests a negative value signals "not
// found" - confirm. `vector` is taken as a non-const pointer even though a
// lookup would not be expected to modify it.
142 int gumbo_vector_index_of(GumboVector* vector, const void* element);
143 
// Enumeration of tag identifiers. The named tag constants are not written
// here: they are pulled in from the generated header tag_enum.h, which is
// included inside the enum body. Two extra enumerators follow them, so
// GUMBO_TAG_UNKNOWN and GUMBO_TAG_LAST always have the two highest values.
156 typedef enum {
157 // Load all the tags from an external source, generated from tag.in.
158 #include "tag_enum.h"
159  // Used for all tags that don't have special handling in HTML. Add new tags
160  // to the end of tag.in so as to preserve backwards-compatibility.
161  GUMBO_TAG_UNKNOWN,
162  // A marker value to indicate the end of the enum, for iterating over it.
163  // Also used as the terminator for varargs functions that take tags.
164  GUMBO_TAG_LAST,
165 } GumboTag;
166 
// Maps a GumboTag value to a C string holding its tag name.
// NOTE(review): the returned pointer is to const char, so callers must not
// modify it; presumably it refers to static storage and must not be freed.
// The result for GUMBO_TAG_UNKNOWN / GUMBO_TAG_LAST is not visible here.
172 const char* gumbo_normalized_tagname(GumboTag tag);
173 
// Operates on `text` in place: the piece is passed as a mutable pointer and
// nothing is returned, so the result is presumably written back into `*text`
// (by the name, narrowing a piece of original tag markup down to just the tag
// name) - confirm against the implementation.
184 void gumbo_tag_from_original_text(GumboStringPiece* text);
185 
// Returns a C string derived from the SVG tag name in `tagname`, which is not
// modified.
// NOTE(review): presumably returns the canonical mixed-case SVG spelling for
// a recognized name and NULL otherwise - confirm; callers should check the
// result before use.
198 const char* gumbo_normalize_svg_tagname(const GumboStringPiece* tagname);
199 
// Convert a tag name string to its GumboTag value. gumbo_tag_enum takes only
// a pointer, so `tagname` is presumably expected to be NUL-terminated;
// gumbo_tagn_enum takes an explicit `length` for names that are not.
// NOTE(review): presumably GUMBO_TAG_UNKNOWN is returned for names with no
// dedicated enumerator - confirm.
204 GumboTag gumbo_tag_enum(const char* tagname);
205 GumboTag gumbo_tagn_enum(const char* tagname, unsigned int length);
206 
// Namespace tag for an attribute; this is the type of
// GumboAttribute::attr_namespace. Enumerators take the implicit values 0..3
// in declaration order, so GUMBO_ATTR_NAMESPACE_NONE is 0.
212 typedef enum {
213  GUMBO_ATTR_NAMESPACE_NONE,
214  GUMBO_ATTR_NAMESPACE_XLINK,
215  GUMBO_ATTR_NAMESPACE_XML,
216  GUMBO_ATTR_NAMESPACE_XMLNS,
217 } GumboAttributeNamespaceEnum;
218 
224 typedef struct {
231  GumboAttributeNamespaceEnum attr_namespace;
232 
237  const char* name;
238 
244 
251  const char* value;
252 
262 
265 
272 
275 
279 
// Looks up an attribute by `name` in `attrs` and returns a pointer to it.
// `attrs` is presumably a vector of GumboAttribute* such as
// GumboElement::attributes.
// NOTE(review): presumably returns NULL when no attribute matches, and whether
// the name comparison is case-sensitive is not visible here - confirm both.
285 GumboAttribute* gumbo_get_attribute(const GumboVector* attrs, const char* name);
286 
// Discriminator for the kind of a node (the type of the node's `type` field).
// Per the comments on the node's `v` union in this header, GUMBO_NODE_DOCUMENT
// selects the `document` member, GUMBO_NODE_ELEMENT selects `element`, and
// every other type is documented as using `text`.
// NOTE(review): that wording places GUMBO_NODE_TEMPLATE under "everything
// else"; confirm whether template nodes carry text data or element data.
291 typedef enum {
293  GUMBO_NODE_DOCUMENT,
295  GUMBO_NODE_ELEMENT,
297  GUMBO_NODE_TEXT,
299  GUMBO_NODE_CDATA,
301  GUMBO_NODE_COMMENT,
303  GUMBO_NODE_WHITESPACE,
309  GUMBO_NODE_TEMPLATE
310 } GumboNodeType;
311 
// Forward declaration: introduces GumboNode as an alias for
// struct GumboInternalNode before that struct is defined, so declarations in
// this header can use GumboNode* (e.g. a node's `parent` pointer).
316 typedef struct GumboInternalNode GumboNode;
317 
// Quirks-mode classification of a document; this is the type of
// GumboDocument::doc_type_quirks_mode. Enumerators take the implicit values
// 0..2 in declaration order.
321 typedef enum {
322  GUMBO_DOCTYPE_NO_QUIRKS,
323  GUMBO_DOCTYPE_QUIRKS,
324  GUMBO_DOCTYPE_LIMITED_QUIRKS
325 } GumboQuirksModeEnum;
326 
// Namespace of an element: HTML, SVG or MathML. Used in this header as the
// type of GumboElement::tag_namespace and GumboOptions::fragment_namespace.
// Enumerators take the implicit values 0..2, so GUMBO_NAMESPACE_HTML is 0.
334 typedef enum {
335  GUMBO_NAMESPACE_HTML,
336  GUMBO_NAMESPACE_SVG,
337  GUMBO_NAMESPACE_MATHML
338 } GumboNamespaceEnum;
339 
// Bit flags recorded on a node (the type of the node's `parse_flags` field).
// GUMBO_INSERTION_NORMAL is 0 (no bits set); every other enumerator is a
// distinct single bit, so several flags can be represented in one value by
// OR-ing them together. Bit 2 is intentionally unassigned (see the comment
// in the body), which is why the shifts jump from 1 to 3.
// NOTE(review): the individual flag meanings are only suggested by their
// names (parser-inserted nodes, implied tags, adoption-agency and
// foster-parenting steps of the HTML parsing algorithm) - confirm against the
// parser before relying on any of them.
348 typedef enum {
353  GUMBO_INSERTION_NORMAL = 0,
354 
361  GUMBO_INSERTION_BY_PARSER = 1 << 0,
362 
374  GUMBO_INSERTION_IMPLICIT_END_TAG = 1 << 1,
375 
376  // Value 1 << 2 was for a flag that has since been removed.
377 
382  GUMBO_INSERTION_IMPLIED = 1 << 3,
383 
390  GUMBO_INSERTION_CONVERTED_FROM_END_TAG = 1 << 4,
391 
393  GUMBO_INSERTION_FROM_ISINDEX = 1 << 5,
394 
396  GUMBO_INSERTION_FROM_IMAGE = 1 << 6,
397 
403  GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT = 1 << 7,
404 
406  GUMBO_INSERTION_ADOPTION_AGENCY_CLONED = 1 << 8,
407 
409  GUMBO_INSERTION_ADOPTION_AGENCY_MOVED = 1 << 9,
410 
415  GUMBO_INSERTION_FOSTER_PARENTED = 1 << 10,
416 } GumboParseFlags;
417 
// Document-level data: the `document` member of a node's `v` union, which the
// union's comment assigns to nodes of type GUMBO_NODE_DOCUMENT.
421 typedef struct {
    // Child nodes of the document; elements are GumboNode* per the inline
    // type annotation.
427  GumboVector /* GumboNode* */ children;
428 
429  // True if there was an explicit doctype token as opposed to it being omitted.
430  bool has_doctype;
431 
432  // Fields from the doctype token, copied verbatim.
    // NOTE(review): their values when has_doctype is false are not visible
    // from this header - check before dereferencing.
433  const char* name;
434  const char* public_identifier;
435  const char* system_identifier;
436 
    // Quirks-mode classification for this document (see GumboQuirksModeEnum).
441  GumboQuirksModeEnum doc_type_quirks_mode;
442 } GumboDocument;
443 
// Text-like node data: the `text` member of a node's `v` union, which the
// union's comment assigns to every node type other than document and element.
448 typedef struct {
    // The node's text as a C string.
    // NOTE(review): presumably NUL-terminated and owned by the parse output -
    // confirm before storing the pointer beyond the output's lifetime.
453  const char* text;
454 
460 
466 } GumboText;
467 
// Element data: the `element` member of a node's `v` union, which the union's
// comment assigns to nodes of type GUMBO_NODE_ELEMENT.
472 typedef struct {
    // Child nodes of this element; elements are GumboNode* per the inline
    // type annotation.
477  GumboVector /* GumboNode* */ children;
478 
    // Tag identifier for this element (see GumboTag).
480  GumboTag tag;
481 
    // Namespace of this element: HTML, SVG or MathML (see GumboNamespaceEnum).
483  GumboNamespaceEnum tag_namespace;
484 
492 
499 
502 
505 
    // Attributes of this element; elements are GumboAttribute* per the inline
    // type annotation.
510  GumboVector /* GumboAttribute* */ attributes;
511 } GumboElement;
512 
519  GumboNodeType type;
520 
522  GumboNode* parent;
523 
526 
532  GumboParseFlags parse_flags;
533 
535  union {
536  GumboDocument document; // For GUMBO_NODE_DOCUMENT.
537  GumboElement element; // For GUMBO_NODE_ELEMENT.
538  GumboText text; // For everything else.
539  } v;
540 };
541 
// Signature of a caller-supplied allocation callback (the type of
// GumboOptions::allocator): receives an opaque `userdata` pointer and the
// requested `size` in bytes, and returns a pointer to the allocated memory.
// NOTE(review): the TODO below indicates out-of-memory is not checked
// throughout the codebase, so a callback that returns NULL may not be handled
// gracefully.
548 // TODO(jdtang): Add checks throughout the codebase for out-of-memory condition.
549 typedef void* (*GumboAllocatorFunction)(void* userdata, size_t size);
550 
// Signature of a caller-supplied deallocation callback (the type of
// GumboOptions::deallocator): receives an opaque `userdata` pointer and the
// pointer `ptr` to release. Presumably `ptr` is always a value previously
// returned by the matching allocator callback - confirm.
555 typedef void (*GumboDeallocatorFunction)(void* userdata, void* ptr);
556 
// Parser configuration, passed by pointer to gumbo_parse_with_options and
// gumbo_destroy_output. A predefined instance is declared in this header as
// kGumboDefaultOptions.
563 typedef struct GumboInternalOptions {
    // Callback used to allocate memory (see GumboAllocatorFunction).
565  GumboAllocatorFunction allocator;
566 
    // Callback used to release memory (see GumboDeallocatorFunction).
568  GumboDeallocatorFunction deallocator;
569 
    // Opaque caller pointer; presumably forwarded as the `userdata` argument
    // of the allocator and deallocator callbacks - confirm.
574  void* userdata;
575 
    // Presumably the tab width used when computing column numbers in source
    // positions - TODO confirm; valid range is not visible here.
580  int tab_stop;
581 
587 
596 
611 
    // Namespace associated with fragment parsing (see GumboNamespaceEnum).
    // NOTE(review): exact role not visible here - confirm against the parser.
618  GumboNamespaceEnum fragment_namespace;
619 } GumboOptions;
620 
// Constant GumboOptions instance defined in another translation unit;
// by its name, the default configuration. Its field values are not visible
// from this header.
622 extern const GumboOptions kGumboDefaultOptions;
623 
// Result of a parse: returned by pointer from gumbo_parse and
// gumbo_parse_with_options, and released with gumbo_destroy_output.
625 typedef struct GumboInternalOutput {
    // The document node. Presumably a node of type GUMBO_NODE_DOCUMENT at the
    // top of the tree - confirm.
630  GumboNode* document;
631 
    // Presumably the root element of the document (a descendant of
    // `document`, kept here for convenience) - confirm.
636  GumboNode* root;
637 
    // Errors collected during parsing; elements are GumboError per the inline
    // type annotation (GumboError is not declared in this header).
645  GumboVector /* GumboError */ errors;
646 } GumboOutput;
647 
// Parses the HTML text in `buffer` and returns a newly produced GumboOutput.
// No length or options are taken, so `buffer` is presumably expected to be
// NUL-terminated and parsed with kGumboDefaultOptions - confirm.
// The result should be released with gumbo_destroy_output.
// NOTE(review): whether the output keeps pointers into `buffer` (requiring it
// to outlive the output) is not visible from this header.
655 GumboOutput* gumbo_parse(const char* buffer);
656 
// Parses `buffer_length` bytes of HTML text starting at `buffer`, using the
// configuration in `options`, and returns a newly produced GumboOutput.
// The result should be released with gumbo_destroy_output.
// NOTE(review): whether the output keeps pointers into `buffer` (requiring it
// to outlive the output) is not visible from this header.
661 GumboOutput* gumbo_parse_with_options(
662  const GumboOptions* options, const char* buffer, size_t buffer_length);
663 
// Releases a GumboOutput obtained from gumbo_parse or
// gumbo_parse_with_options.
// NOTE(review): `options` is presumably consulted for its deallocator and
// userdata, so it should match the options the output was parsed with
// (kGumboDefaultOptions for gumbo_parse) - confirm. `output` must not be used
// after this call.
665 void gumbo_destroy_output(const GumboOptions* options, GumboOutput* output);
666 
667 #ifdef __cplusplus
668 }
669 #endif
670 
671 #endif // GUMBO_GUMBO_H_
GumboInternalNode::parse_flags
GumboParseFlags parse_flags
Definition: gumbo.h:532
GumboOptions::max_errors
int max_errors
Definition: gumbo.h:595
GumboAttribute::name
const char * name
Definition: gumbo.h:237
GumboElement::original_end_tag
GumboStringPiece original_end_tag
Definition: gumbo.h:498
GumboElement::tag_namespace
GumboNamespaceEnum tag_namespace
Definition: gumbo.h:483
GumboVector::capacity
unsigned int capacity
Definition: gumbo.h:132
GumboOptions::allocator
GumboAllocatorFunction allocator
Definition: gumbo.h:565
GumboDocument
Definition: gumbo.h:421
GumboOptions::fragment_context
GumboTag fragment_context
Definition: gumbo.h:610
GumboElement::attributes
GumboVector attributes
Definition: gumbo.h:510
GumboText
Definition: gumbo.h:448
GumboText::start_pos
GumboSourcePosition start_pos
Definition: gumbo.h:465
GumboInternalNode
Definition: gumbo.h:517
GumboInternalNode::type
GumboNodeType type
Definition: gumbo.h:519
GumboOutput
Definition: gumbo.h:625
GumboAttribute
Definition: gumbo.h:224
GumboAttribute::value_end
GumboSourcePosition value_end
Definition: gumbo.h:277
GumboAttribute::name_end
GumboSourcePosition name_end
Definition: gumbo.h:271
GumboDocument::doc_type_quirks_mode
GumboQuirksModeEnum doc_type_quirks_mode
Definition: gumbo.h:441
GumboVector::data
void ** data
Definition: gumbo.h:126
GumboAttribute::original_value
GumboStringPiece original_value
Definition: gumbo.h:261
GumboOptions::fragment_namespace
GumboNamespaceEnum fragment_namespace
Definition: gumbo.h:618
GumboInternalNode::v
union GumboInternalNode::@0 v
GumboElement
Definition: gumbo.h:472
GumboOutput::root
GumboNode * root
Definition: gumbo.h:636
GumboElement::start_pos
GumboSourcePosition start_pos
Definition: gumbo.h:501
GumboStringPiece::data
const char * data
Definition: gumbo.h:90
GumboAttribute::value
const char * value
Definition: gumbo.h:251
GumboDocument::children
GumboVector children
Definition: gumbo.h:427
GumboVector
Definition: gumbo.h:122
GumboOptions::deallocator
GumboDeallocatorFunction deallocator
Definition: gumbo.h:568
GumboElement::children
GumboVector children
Definition: gumbo.h:477
GumboVector::length
unsigned int length
Definition: gumbo.h:129
GumboOptions::tab_stop
int tab_stop
Definition: gumbo.h:580
GumboInternalNode::parent
GumboNode * parent
Definition: gumbo.h:522
GumboOptions
Definition: gumbo.h:563
GumboStringPiece::length
size_t length
Definition: gumbo.h:93
GumboInternalNode::index_within_parent
size_t index_within_parent
Definition: gumbo.h:525
GumboOutput::document
GumboNode * document
Definition: gumbo.h:630
GumboOptions::stop_on_first_error
bool stop_on_first_error
Definition: gumbo.h:586
GumboAttribute::attr_namespace
GumboAttributeNamespaceEnum attr_namespace
Definition: gumbo.h:231
GumboText::text
const char * text
Definition: gumbo.h:453
GumboElement::tag
GumboTag tag
Definition: gumbo.h:480
GumboElement::original_tag
GumboStringPiece original_tag
Definition: gumbo.h:491
GumboSourcePosition
Definition: gumbo.h:67
GumboStringPiece
Definition: gumbo.h:88
GumboOutput::errors
GumboVector errors
Definition: gumbo.h:645
GumboAttribute::value_start
GumboSourcePosition value_start
Definition: gumbo.h:274
GumboOptions::userdata
void * userdata
Definition: gumbo.h:574
GumboElement::end_pos
GumboSourcePosition end_pos
Definition: gumbo.h:504
GumboAttribute::name_start
GumboSourcePosition name_start
Definition: gumbo.h:264
GumboAttribute::original_name
GumboStringPiece original_name
Definition: gumbo.h:243
GumboText::original_text
GumboStringPiece original_text
Definition: gumbo.h:459