class LibXML::XML::HTMLParser::Context

The XML::HTMLParser::Context class provides in-depth control over how a document is parsed.

Public Class Methods

XML::HTMLParser::Context.file(file, options = nil) → XML::HTMLParser::Context

Creates a new parser context based on the specified file or uri.

Parameters:

file - A filename or uri
options - (optional) A bitwise OR'ed combination of LibXML::XML::HTMLParser::Options values
/*
 * call-seq:
 *    XML::HTMLParser::Context.file(file, options = nil) -> XML::HTMLParser::Context
 *
 * Creates a new HTML parser context that reads from the given file name
 * or uri.
 *
 * Parameters:
 *
 *  file    - A filename or uri (required).
 *  options - Optional OR'ed combination of HTMLParser option flags;
 *            nil or omitted is treated as 0.
 *
 * Reports the last libxml error through rxml_raise if libxml cannot
 * create the context.
 */
static VALUE rxml_html_parser_context_file(int argc, VALUE* argv, VALUE klass)
{
  VALUE file, options;
  rb_scan_args(argc, argv, "11", &file, &options);

  /* Convert the options before allocating anything: NUM2INT raises on a
     non-numeric argument, and raising after the context exists (but before
     it is wrapped in a Ruby object) would leak the context. */
  int html_options = options == Qnil ? 0 : NUM2INT(options);

  htmlParserCtxtPtr ctxt = htmlCreateFileParserCtxt(StringValuePtr(file), NULL);
  if (!ctxt)
    rxml_raise(xmlGetLastError());

  /* This is annoying, but xmlInitParserCtxt (called indirectly above) and 
     xmlCtxtUseOptionsInternal (called below) initialize slightly different
     context options, in particular XML_PARSE_NODICT which xmlInitParserCtxt
     sets to 0 and xmlCtxtUseOptionsInternal sets to 1.  So we have to call both. */
  htmlCtxtUseOptions(ctxt, html_options);

  return rxml_html_parser_context_wrap(ctxt);
}
XML::HTMLParser::Context.io(io, options = nil) → XML::HTMLParser::Context

Creates a new parser context based on the specified io object.

Parameters:

io - A ruby IO object
options - (optional) A bitwise OR'ed combination of LibXML::XML::HTMLParser::Options values
/*
 * call-seq:
 *    XML::HTMLParser::Context.io(io, options = nil) -> XML::HTMLParser::Context
 *
 * Creates a new HTML parser context that pulls its data from a Ruby IO
 * object through rxml_read_callback.
 *
 * Parameters:
 *
 *  io      - A ruby IO object (required, must not be nil).
 *  options - Optional OR'ed combination of HTMLParser option flags;
 *            nil or omitted is treated as 0.
 *
 * Raises TypeError when io is nil. Reports the last libxml error through
 * rxml_raise if any libxml object cannot be created.
 */
static VALUE rxml_html_parser_context_io(int argc, VALUE* argv, VALUE klass)
{
  VALUE io, options;
  rb_scan_args(argc, argv, "11", &io, &options);

  VALUE result;
  htmlParserCtxtPtr ctxt;
  xmlParserInputBufferPtr input;
  xmlParserInputPtr stream;
  int html_options;

  if (NIL_P(io))
    rb_raise(rb_eTypeError, "Must pass in an IO object");

  /* Convert the options before allocating anything: NUM2INT raises on a
     non-numeric argument, and raising once the input buffer and context
     exist would leak both of them. */
  html_options = options == Qnil ? 0 : NUM2INT(options);

  input = xmlParserInputBufferCreateIO((xmlInputReadCallback) rxml_read_callback, NULL,
                                     (void*)io, XML_CHAR_ENCODING_NONE);
  /* Fail early instead of building a context around a missing buffer. */
  if (!input)
    rxml_raise(xmlGetLastError());

  ctxt = htmlNewParserCtxt();
  if (!ctxt)
  {
    xmlFreeParserInputBuffer(input);
    rxml_raise(xmlGetLastError());
  }

  /* This is annoying, but xmlInitParserCtxt (called indirectly above) and 
     xmlCtxtUseOptionsInternal (called below) initialize slightly different
     context options, in particular XML_PARSE_NODICT which xmlInitParserCtxt
     sets to 0 and xmlCtxtUseOptionsInternal sets to 1.  So we have to call both. */
  htmlCtxtUseOptions(ctxt, html_options);

  stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);

  if (!stream)
  {
    xmlFreeParserInputBuffer(input);
    xmlFreeParserCtxt(ctxt);
    rxml_raise(xmlGetLastError());
  }
  inputPush(ctxt, stream);
  result = rxml_html_parser_context_wrap(ctxt);

  /* Attach io object to parser so it won't get freed.*/
  rb_ivar_set(result, IO_ATTR, io);

  return result;
}
XML::HTMLParser::Context.string(string, options = nil) → XML::HTMLParser::Context

Creates a new parser context based on the specified string.

Parameters:

string - A string that contains the data to parse
options - (optional) A bitwise OR'ed combination of LibXML::XML::HTMLParser::Options values
/*
 * call-seq:
 *    XML::HTMLParser::Context.string(string, options = nil) -> XML::HTMLParser::Context
 *
 * Creates a new HTML parser context that parses the contents of a Ruby
 * string.
 *
 * Parameters:
 *
 *  string  - A non-empty String holding the data to parse (required).
 *  options - Optional OR'ed combination of HTMLParser option flags;
 *            nil or omitted is treated as 0.
 *
 * Raises TypeError if string is not a String and ArgumentError if it is
 * empty. Reports the last libxml error through rxml_raise if libxml
 * cannot create the context.
 */
static VALUE rxml_html_parser_context_string(int argc, VALUE* argv, VALUE klass)
{
  VALUE string, options;
  rb_scan_args(argc, argv, "11", &string, &options);

  Check_Type(string, T_STRING);

  if (RSTRING_LEN(string) == 0)
    rb_raise(rb_eArgError, "Must specify a string with one or more characters");

  /* Convert the options before allocating anything: NUM2INT raises on a
     non-numeric argument, and raising after the context exists (but before
     it is wrapped in a Ruby object) would leak the context. */
  int html_options = options == Qnil ? 0 : NUM2INT(options);

  htmlParserCtxtPtr ctxt = xmlCreateMemoryParserCtxt(StringValuePtr(string),
                                   (int)RSTRING_LEN(string));
  if (!ctxt)
    rxml_raise(xmlGetLastError());

  /* This is annoying, but xmlInitParserCtxt (called indirectly above) and 
     xmlCtxtUseOptionsInternal (called below) initialize slightly different
     context options, in particular XML_PARSE_NODICT which xmlInitParserCtxt
     sets to 0 and xmlCtxtUseOptionsInternal sets to 1.  So we have to call both. */
  htmlCtxtUseOptions(ctxt, html_options);

  // Setup sax handler
  // TODO - there must be a better way? The sax handler is initialized for XML, but we want
  // to use HTML
  // Wipe the XML handler table, then fill it with the HTML defaults.
  memset(ctxt->sax, 0, sizeof(xmlSAXHandler));
  xmlSAX2InitHtmlDefaultSAXHandler(ctxt->sax);
  
  return rxml_html_parser_context_wrap(ctxt);
}

Public Instance Methods

close → nil click to toggle source

Closes the underlying input streams. This is useful when parsing a large number of files and you want to close them promptly instead of waiting for Ruby’s garbage collector to run.

/*
 * call-seq:
 *    context.close -> nil
 *
 * Pops every input stream off the parser context and frees each one, so
 * the underlying inputs are released without waiting for garbage
 * collection. Always returns nil.
 */
static VALUE rxml_html_parser_context_close(VALUE self)
{
  htmlParserCtxtPtr ctxt;
  Data_Get_Struct(self, htmlParserCtxt, ctxt);

  /* Drain the input stack; inputPop yields NULL once it is empty. */
  for (;;)
  {
    xmlParserInputPtr popped = inputPop(ctxt);
    if (popped == NULL)
      break;

    xmlFreeInputStream(popped);
  }

  return Qnil;
}
disable_cdata = (true|false) click to toggle source

Control whether the CDATA nodes will be created in this context.

/*
 * call-seq:
 *    context.disable_cdata = (true|false)
 *
 * Controls whether CDATA nodes are created in this context by installing
 * or removing the SAX cdataBlock callback.
 *
 * Parameters:
 *
 *  value - Any Ruby object; truthy disables CDATA handling, false or nil
 *          restores the default xmlSAX2CDataBlock callback.
 *
 * Raises RuntimeError if the context has no SAX handler. Returns value.
 */
static VALUE rxml_html_parser_context_disable_cdata_set(VALUE self, VALUE value)
{
  htmlParserCtxtPtr ctxt;
  Data_Get_Struct(self, htmlParserCtxt, ctxt);

  if (ctxt->sax == NULL)
    rb_raise(rb_eRuntimeError, "Sax handler is not yet set");

  /* LibXML controls this internally with the default SAX handler.
     Use RTEST rather than testing the raw VALUE: Ruby nil is a non-zero
     VALUE, so a plain C truth test would treat nil as true. */
  if (RTEST(value))
    ctxt->sax->cdataBlock = NULL;
  else
    ctxt->sax->cdataBlock = xmlSAX2CDataBlock;

  return value;
}
options = XML::Parser::Options::NOENT | click to toggle source
XML::Parser::Options::NOCDATA

Provides control over the execution of a parser. Valid values are the constants defined on XML::HTMLParser::Options. Multiple options can be combined using bitwise OR (|).

/*
 * call-seq:
 *    context.options = flags
 *
 * Applies the given parser option flags to this context through
 * htmlCtxtUseOptions. Multiple flags are combined with bitwise OR (|).
 *
 * Parameters:
 *
 *  options - A Fixnum holding the OR'ed option flags.
 *
 * Raises TypeError if options is not a Fixnum. Returns self.
 */
static VALUE rxml_html_parser_context_options_set(VALUE self, VALUE options)
{
  int xml_options;
  htmlParserCtxtPtr ctxt;

  /* Validate the argument before converting it, so the type check actually
     guards the conversion instead of running after it. */
  Check_Type(options, T_FIXNUM);
  xml_options = NUM2INT(options);

  Data_Get_Struct(self, htmlParserCtxt, ctxt);
  htmlCtxtUseOptions(ctxt, xml_options);

#if LIBXML_VERSION >= 20707
  /* Big hack here, but htmlCtxtUseOptions doesn't support HTML_PARSE_NOIMPLIED.
     So do it ourselves. There must be a better way??? */
  if (xml_options & HTML_PARSE_NOIMPLIED)
  {
    ctxt->options |= HTML_PARSE_NOIMPLIED;
  }
#endif

  return self;
}