class LibXML::XML::HTMLParser::Context
The XML::HTMLParser::Context class provides in-depth control over how a document is parsed.
Public Class Methods
XML::HTMLParser::Context.file(file) → XML::HTMLParser::Context
click to toggle source
Creates a new parser context based on the specified file or uri.
Parameters:
file - A filename or URI. options - (optional) An or'ed-together list of LibXML::XML::HTMLParser::Options values.
/*
 * Builds an HTML parser context for the given file name or URI.
 *
 * argv[0] - file name or URI (String)
 * argv[1] - optional Integer of or'ed parser options; nil or omitted means 0
 *
 * When libxml2 cannot create the context, the last libxml2 error is handed
 * to rxml_raise.
 */
static VALUE rxml_html_parser_context_file(int argc, VALUE* argv, VALUE klass)
{
  VALUE file, options;
  int html_options;
  htmlParserCtxtPtr ctxt;

  rb_scan_args(argc, argv, "11", &file, &options);

  /* Convert the options before anything is allocated: NUM2INT raises on a
     non-numeric value, and raising once the context exists would leak it. */
  html_options = NIL_P(options) ? 0 : NUM2INT(options);

  /* StringValueCStr guarantees a NUL-terminated buffer (and rejects strings
     with embedded NUL bytes), which is what a C path argument requires. */
  ctxt = htmlCreateFileParserCtxt(StringValueCStr(file), NULL);
  if (!ctxt)
    rxml_raise(xmlGetLastError());

  /* This is annoying, but xmlInitParserCtxt (called indirectly above) and
     xmlCtxtUseOptionsInternal (called below) initialize slightly different
     context options, in particular XML_PARSE_NODICT which xmlInitParserCtxt
     sets to 0 and xmlCtxtUseOptionsInternal sets to 1. So we have to call both. */
  htmlCtxtUseOptions(ctxt, html_options);

  return rxml_html_parser_context_wrap(ctxt);
}
XML::HTMLParser::Context.io(io) → XML::HTMLParser::Context
click to toggle source
Creates a new parser context based on the specified io object.
Parameters:
io - A Ruby IO object. options - (optional) An or'ed-together list of LibXML::XML::HTMLParser::Options values.
/*
 * Builds an HTML parser context that reads its input from a Ruby IO object
 * through rxml_read_callback.
 *
 * argv[0] - the IO object; nil is rejected with a TypeError
 * argv[1] - optional Integer of or'ed parser options; nil or omitted means 0
 *
 * The IO object is stored on the returned wrapper (IO_ATTR) so that it stays
 * referenced for as long as the context does.
 */
static VALUE rxml_html_parser_context_io(int argc, VALUE* argv, VALUE klass)
{
  VALUE io, options;
  VALUE result;
  int html_options;
  htmlParserCtxtPtr ctxt;
  xmlParserInputBufferPtr input;
  xmlParserInputPtr stream;

  rb_scan_args(argc, argv, "11", &io, &options);

  if (NIL_P(io))
    rb_raise(rb_eTypeError, "Must pass in an IO object");

  /* Convert the options before any libxml2 object is allocated: NUM2INT
     raises on a non-numeric value, and raising later would leak both the
     input buffer and the parser context. */
  html_options = NIL_P(options) ? 0 : NUM2INT(options);

  input = xmlParserInputBufferCreateIO((xmlInputReadCallback) rxml_read_callback, NULL,
                                       (void*)io, XML_CHAR_ENCODING_NONE);

  ctxt = htmlNewParserCtxt();
  if (!ctxt)
  {
    xmlFreeParserInputBuffer(input);
    rxml_raise(xmlGetLastError());
  }

  /* This is annoying, but xmlInitParserCtxt (called indirectly above) and
     xmlCtxtUseOptionsInternal (called below) initialize slightly different
     context options, in particular XML_PARSE_NODICT which xmlInitParserCtxt
     sets to 0 and xmlCtxtUseOptionsInternal sets to 1. So we have to call both. */
  htmlCtxtUseOptions(ctxt, html_options);

  stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
  if (!stream)
  {
    xmlFreeParserInputBuffer(input);
    xmlFreeParserCtxt(ctxt);
    rxml_raise(xmlGetLastError());
  }
  inputPush(ctxt, stream);

  result = rxml_html_parser_context_wrap(ctxt);

  /* Attach io object to parser so it won't get freed.*/
  rb_ivar_set(result, IO_ATTR, io);

  return result;
}
XML::HTMLParser::Context.string(string) → XML::HTMLParser::Context
click to toggle source
Creates a new parser context based on the specified string.
Parameters:
string - A string that contains the data to parse. options - (optional) An or'ed-together list of LibXML::XML::HTMLParser::Options values.
/*
 * Builds an HTML parser context over the bytes of a Ruby String.
 *
 * argv[0] - the String to parse; must be a String and must not be empty
 * argv[1] - optional Integer of or'ed parser options; nil or omitted means 0
 *
 * Raises TypeError for a non-String, ArgumentError for an empty String, and
 * hands the last libxml2 error to rxml_raise when the context cannot be
 * created.
 */
static VALUE rxml_html_parser_context_string(int argc, VALUE* argv, VALUE klass)
{
  VALUE string, options;
  int html_options;
  htmlParserCtxtPtr ctxt;

  rb_scan_args(argc, argv, "11", &string, &options);

  Check_Type(string, T_STRING);

  if (RSTRING_LEN(string) == 0)
    rb_raise(rb_eArgError, "Must specify a string with one or more characters");

  /* Convert the options before the context is allocated: NUM2INT raises on a
     non-numeric value, and raising once the context exists would leak it. */
  html_options = NIL_P(options) ? 0 : NUM2INT(options);

  ctxt = xmlCreateMemoryParserCtxt(StringValuePtr(string),
                                   (int)RSTRING_LEN(string));
  if (!ctxt)
    rxml_raise(xmlGetLastError());

  /* This is annoying, but xmlInitParserCtxt (called indirectly above) and
     xmlCtxtUseOptionsInternal (called below) initialize slightly different
     context options, in particular XML_PARSE_NODICT which xmlInitParserCtxt
     sets to 0 and xmlCtxtUseOptionsInternal sets to 1. So we have to call both. */
  htmlCtxtUseOptions(ctxt, html_options);

  // Setup sax handler
  // TODO - there must be a better way? The sax handler is initialized for XML, but we want
  // to use HTML
  memset(ctxt->sax, 0, sizeof(xmlSAXHandler));
  xmlSAX2InitHtmlDefaultSAXHandler(ctxt->sax);

  return rxml_html_parser_context_wrap(ctxt);
}
Public Instance Methods
close → nil
click to toggle source
Closes the underlying input streams. This is useful when parsing a large number of files and you want to close the files without relying on Ruby’s garbage collector to run.
/*
 * Pops every input stream off the parser context wrapped by self and frees
 * each one. Always returns nil.
 */
static VALUE rxml_html_parser_context_close(VALUE self)
{
  htmlParserCtxtPtr parser;
  xmlParserInputPtr stream;

  Data_Get_Struct(self, htmlParserCtxt, parser);

  /* inputPop yields NULL once the context has no inputs left. */
  for (stream = inputPop(parser); stream != NULL; stream = inputPop(parser))
    xmlFreeInputStream(stream);

  return Qnil;
}
disable_cdata = (true|false)
click to toggle source
Control whether the CDATA nodes will be created in this context.
/*
 * Turns CDATA node creation off (truthy value) or back on (false or nil) by
 * clearing or restoring the cdataBlock callback of the context's SAX handler.
 *
 * Raises RuntimeError when the context has no SAX handler. Returns the value
 * that was passed in.
 */
static VALUE rxml_html_parser_context_disable_cdata_set(VALUE self, VALUE value)
{
  htmlParserCtxtPtr ctxt;
  Data_Get_Struct(self, htmlParserCtxt, ctxt);

  if (ctxt->sax == NULL)
    rb_raise(rb_eRuntimeError, "Sax handler is not yet set");

  /* LibXML controls this internally with the default SAX handler.
     RTEST applies Ruby truthiness: a plain C test of the VALUE would treat
     nil (a non-zero VALUE) as true and wrongly disable CDATA nodes. */
  if (RTEST(value))
    ctxt->sax->cdataBlock = NULL;
  else
    ctxt->sax->cdataBlock = xmlSAX2CDataBlock;

  return value;
}
options = XML::Parser::Options::NOENT |
click to toggle source
XML::Parser::Options::NOCDATA
Provides control over the execution of a parser. Valid values are the constants defined on XML::Parser::Options. Multiple options can be combined by using Bitwise OR (|).
/*
 * Applies a set of or'ed parser options to the context wrapped by self.
 *
 * options - an Integer (Fixnum) bit mask; any other type raises TypeError
 *
 * Returns self.
 */
static VALUE rxml_html_parser_context_options_set(VALUE self, VALUE options)
{
  int xml_options;
  htmlParserCtxtPtr ctxt;

  /* Validate the argument before converting it, so a value of the wrong
     type is rejected without first being coerced by NUM2INT. */
  Check_Type(options, T_FIXNUM);
  xml_options = NUM2INT(options);

  Data_Get_Struct(self, htmlParserCtxt, ctxt);
  htmlCtxtUseOptions(ctxt, xml_options);

#if LIBXML_VERSION >= 20707
  /* Big hack here, but htmlCtxtUseOptions doesn't support HTML_PARSE_NOIMPLIED.
     So do it ourselves. There must be a better way??? */
  if (xml_options & HTML_PARSE_NOIMPLIED)
  {
    ctxt->options |= HTML_PARSE_NOIMPLIED;
  }
#endif

  return self;
}