class LibXML::XML::HTMLParser::Context
The XML::HTMLParser::Context class provides in-depth control over how a document is parsed.
Public Class Methods
XML::HTMLParser::Context.file(file) → XML::HTMLParser::Context
click to toggle source
Creates a new parser context based on the specified file or uri.
Parameters:
file - A filename or URI. options - (optional) An or'ed together list of LibXML::XML::HTMLParser::Options values.
/*
 * call-seq:
 *    XML::HTMLParser::Context.file(file, options = nil) -> XML::HTMLParser::Context
 *
 * Creates a new HTML parser context for the given file name or uri.
 *
 * Parameters:
 *
 *  file    - A filename or uri.
 *  options - Optional. An or'ed together list of
 *            LibXML::XML::HTMLParser::Options values; nil means 0.
 *
 * Raises via rxml_raise when libxml cannot create the context.
 */
static VALUE rxml_html_parser_context_file(int argc, VALUE* argv, VALUE klass)
{
  VALUE file, options;
  const char *path;
  int html_options;
  htmlParserCtxtPtr ctxt;

  rb_scan_args(argc, argv, "11", &file, &options);

  /* StringValueCStr rejects strings with embedded NUL bytes, which would
     otherwise silently truncate the path handed to libxml. */
  path = StringValueCStr(file);

  /* Convert the options before allocating anything: NUM2INT raises on a bad
     argument, and raising after the context exists would leak it. */
  html_options = NIL_P(options) ? 0 : NUM2INT(options);

  ctxt = htmlCreateFileParserCtxt(path, NULL);
  if (!ctxt)
    rxml_raise(xmlGetLastError());

  /* This is annoying, but xmlInitParserCtxt (called indirectly above) and
     xmlCtxtUseOptionsInternal (called below) initialize slightly different
     context options, in particular XML_PARSE_NODICT which xmlInitParserCtxt
     sets to 0 and xmlCtxtUseOptionsInternal sets to 1.  So we have to call both. */
  htmlCtxtUseOptions(ctxt, html_options);

  return rxml_html_parser_context_wrap(ctxt);
}
XML::HTMLParser::Context.io(io) → XML::HTMLParser::Context
click to toggle source
Creates a new parser context based on the specified io object.
Parameters:
io - A Ruby IO object. options - (optional) An or'ed together list of LibXML::XML::HTMLParser::Options values.
/*
 * call-seq:
 *    XML::HTMLParser::Context.io(io, options = nil) -> XML::HTMLParser::Context
 *
 * Creates a new HTML parser context that reads from the given io object.
 *
 * Parameters:
 *
 *  io      - A ruby IO object. Passing nil raises TypeError.
 *  options - Optional. An or'ed together list of
 *            LibXML::XML::HTMLParser::Options values; nil means 0.
 *
 * Raises via rxml_raise when libxml cannot create the input buffer, the
 * parser context or the input stream.
 */
static VALUE rxml_html_parser_context_io(int argc, VALUE* argv, VALUE klass)
{
  VALUE io, options;
  VALUE result;
  int html_options;
  htmlParserCtxtPtr ctxt;
  xmlParserInputBufferPtr input;
  xmlParserInputPtr stream;

  rb_scan_args(argc, argv, "11", &io, &options);

  if (NIL_P(io))
    rb_raise(rb_eTypeError, "Must pass in an IO object");

  /* Convert the options before allocating anything: NUM2INT raises on a bad
     argument, and raising later would leak the input buffer and the context. */
  html_options = NIL_P(options) ? 0 : NUM2INT(options);

  input = xmlParserInputBufferCreateIO((xmlInputReadCallback) rxml_read_callback, NULL,
                                       (void*)io, XML_CHAR_ENCODING_NONE);
  if (!input)
    rxml_raise(xmlGetLastError());

  ctxt = htmlNewParserCtxt();
  if (!ctxt)
  {
    xmlFreeParserInputBuffer(input);
    rxml_raise(xmlGetLastError());
  }

  /* This is annoying, but xmlInitParserCtxt (called indirectly above) and
     xmlCtxtUseOptionsInternal (called below) initialize slightly different
     context options, in particular XML_PARSE_NODICT which xmlInitParserCtxt
     sets to 0 and xmlCtxtUseOptionsInternal sets to 1.  So we have to call both. */
  htmlCtxtUseOptions(ctxt, html_options);

  stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
  if (!stream)
  {
    xmlFreeParserInputBuffer(input);
    xmlFreeParserCtxt(ctxt);
    rxml_raise(xmlGetLastError());
  }
  inputPush(ctxt, stream);

  result = rxml_html_parser_context_wrap(ctxt);

  /* Attach io object to parser so it won't get freed.*/
  rb_ivar_set(result, IO_ATTR, io);

  return result;
}
XML::HTMLParser::Context.string(string) → XML::HTMLParser::Context
click to toggle source
Creates a new parser context based on the specified string.
Parameters:
string - A string that contains the data to parse. options - (optional) An or'ed together list of LibXML::XML::HTMLParser::Options values.
/*
 * call-seq:
 *    XML::HTMLParser::Context.string(string, options = nil) -> XML::HTMLParser::Context
 *
 * Creates a new HTML parser context that parses the given string.
 *
 * Parameters:
 *
 *  string  - A non-empty String containing the data to parse. A non-String
 *            raises TypeError; an empty String raises ArgumentError.
 *  options - Optional. An or'ed together list of
 *            LibXML::XML::HTMLParser::Options values; nil means 0.
 *
 * Raises via rxml_raise when libxml cannot create the context.
 */
static VALUE rxml_html_parser_context_string(int argc, VALUE* argv, VALUE klass)
{
  VALUE string, options;
  int html_options;
  htmlParserCtxtPtr ctxt;

  rb_scan_args(argc, argv, "11", &string, &options);

  Check_Type(string, T_STRING);

  if (RSTRING_LEN(string) == 0)
    rb_raise(rb_eArgError, "Must specify a string with one or more characters");

  /* Convert the options before allocating anything: NUM2INT raises on a bad
     argument, and raising after the context exists would leak it. */
  html_options = NIL_P(options) ? 0 : NUM2INT(options);

  ctxt = xmlCreateMemoryParserCtxt(StringValuePtr(string), (int)RSTRING_LEN(string));
  if (!ctxt)
    rxml_raise(xmlGetLastError());

  /* This is annoying, but xmlInitParserCtxt (called indirectly above) and
     xmlCtxtUseOptionsInternal (called below) initialize slightly different
     context options, in particular XML_PARSE_NODICT which xmlInitParserCtxt
     sets to 0 and xmlCtxtUseOptionsInternal sets to 1.  So we have to call both. */
  htmlCtxtUseOptions(ctxt, html_options);

  /* Setup sax handler.
     TODO - there must be a better way?  The sax handler is initialized for
     XML, but we want to use HTML, so wipe it and re-initialize it with the
     HTML defaults. */
  memset(ctxt->sax, 0, sizeof(xmlSAXHandler));
  xmlSAX2InitHtmlDefaultSAXHandler(ctxt->sax);

  return rxml_html_parser_context_wrap(ctxt);
}
Public Instance Methods
close → nil
click to toggle source
Closes the underlying input streams. This is useful when parsing a large number of files and you want to close the files without relying on Ruby’s garbage collector to run.
/*
 * call-seq:
 *    context.close -> nil
 *
 * Pops every input stream off the parser context and frees it, so the
 * underlying inputs are released without waiting for Ruby's garbage
 * collector.  Always returns nil.
 */
static VALUE rxml_html_parser_context_close(VALUE self)
{
  htmlParserCtxtPtr ctxt;
  xmlParserInputPtr current;

  Data_Get_Struct(self, htmlParserCtxt, ctxt);

  /* Drain the input stack; inputPop yields NULL once it is empty. */
  for (current = inputPop(ctxt); current != NULL; current = inputPop(ctxt))
  {
    xmlFreeInputStream(current);
  }

  return Qnil;
}
disable_cdata = (true|false)
click to toggle source
Controls whether CDATA nodes will be created in this context.
/*
 * call-seq:
 *    context.disable_cdata = (true|false)
 *
 * Control whether CDATA nodes will be created in this context.  A truthy
 * value removes the SAX cdataBlock callback; false or nil restores the
 * default xmlSAX2CDataBlock callback.
 *
 * Raises RuntimeError when the context has no SAX handler yet.
 * Returns the value that was passed in.
 */
static VALUE rxml_html_parser_context_disable_cdata_set(VALUE self, VALUE value)
{
  htmlParserCtxtPtr ctxt;
  Data_Get_Struct(self, htmlParserCtxt, ctxt);

  if (ctxt->sax == NULL)
    rb_raise(rb_eRuntimeError, "Sax handler is not yet set");

  /* LibXML controls this internally with the default SAX handler.
     Use RTEST rather than testing the raw VALUE: nil is a non-zero VALUE,
     so a plain `if (value)` would treat nil as true. */
  if (RTEST(value))
    ctxt->sax->cdataBlock = NULL;
  else
    ctxt->sax->cdataBlock = xmlSAX2CDataBlock;

  return value;
}
options = XML::Parser::Options::NOENT |
click to toggle source
XML::Parser::Options::NOCDATA
Provides control over the execution of a parser. Valid values are the constants defined on XML::Parser::Options. Multiple options can be combined by using Bitwise OR (|).
/*
 * call-seq:
 *    context.options = options
 *
 * Applies the given parser options to this context.  Multiple options can
 * be combined with bitwise OR (|).  The argument must be a Fixnum.
 * Returns self.
 */
static VALUE rxml_html_parser_context_options_set(VALUE self, VALUE options)
{
  htmlParserCtxtPtr ctxt;
  int xml_options;

  xml_options = NUM2INT(options);
  Check_Type(options, T_FIXNUM);
  Data_Get_Struct(self, htmlParserCtxt, ctxt);

  htmlCtxtUseOptions(ctxt, xml_options);

#if LIBXML_VERSION >= 20707
  /* Big hack here, but htmlCtxtUseOptions doesn't support
     HTML_PARSE_NOIMPLIED.  So set the flag on the context ourselves.
     There must be a better way??? */
  if ((xml_options & HTML_PARSE_NOIMPLIED) != 0)
    ctxt->options |= HTML_PARSE_NOIMPLIED;
#endif

  return self;
}