Scala TagSoup

Do you like this?

Summary:
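This short post shows two ways of using the TagSoup library from Scala: driving its SAX parser directly, and plugging it into JDOM2 to query the parsed page with XPath.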


Content:

Example 1:

import org.ccil.cowan.tagsoup.jaxp.SAXParserImpl
import java.net._
import org.xml.sax.helpers.DefaultHandler
import org.xml.sax.Attributes


/**
 * Example 1: streams an HTML page through TagSoup's SAX parser and prints the
 * `class` attribute of every `div` element whose class value contains
 * "field-item even".
 */
object Test extends App {

  val url = new URL("http://your-site.com")
  val urlc = url.openConnection()
  // Send a browser-like User-Agent header with the request.
  urlc.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7")

  // SAX callback invoked once per opening tag encountered by the parser.
  val handler: DefaultHandler = new DefaultHandler() {
    override def startElement(uri: String, localName: String,
      name: String, a: Attributes): Unit = {
      if (name.equalsIgnoreCase("div"))
        // getValue yields null when the attribute is absent; Option(...) maps
        // that to None so no explicit null check is needed.
        Option(a.getValue("class"))
          .filter(_.contains("field-item even"))
          .foreach(cls => System.out.println(cls))
    }
  }

  // Close the response stream even if parsing throws, so the underlying
  // connection is not leaked.
  val input = urlc.getInputStream()
  try SAXParserImpl.newInstance(null).parse(input, handler)
  finally input.close()
}
Example 2:
import java.net._
import org.jdom2.input.SAXBuilder
import org.jdom2.xpath.jaxen.JDOMXPath
import org.jdom2.Element
import java.io.BufferedReader
import java.io.InputStreamReader
import org.jdom2.output.XMLOutputter

/**
 * Example 2: parses an HTML page into a JDOM2 document using TagSoup as the
 * underlying SAX driver, selects the root `html` element with an XPath
 * expression, and prints it serialized as XML.
 */
object Test2 extends App {

  val url = new URL("http://your-site.com")
  val urlc = url.openConnection()
  // Send a browser-like User-Agent header with the request.
  urlc.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7")

  // NOTE(review): InputStreamReader without an explicit charset decodes with
  // the JVM default charset; pass the page's real charset here if it differs.
  val reader = new BufferedReader(
    new InputStreamReader(urlc.getInputStream()))

  // Use TagSoup as the SAX driver for JDOM's builder.
  val builder = new SAXBuilder("org.ccil.cowan.tagsoup.Parser")

  // Close the reader (and the connection stream beneath it) as soon as the
  // document is built, even when parsing fails.
  val doc = try builder.build(reader) finally reader.close()

  // The XPath binds prefix "h" to the XHTML namespace URI so that "/h:html"
  // can address the namespaced root element.
  val domPath = new JDOMXPath("/h:html")
  domPath.addNamespace("h", "http://www.w3.org/1999/xhtml")

  // selectSingleNode returns an untyped Object and may yield null when nothing
  // matches; pattern matching avoids both the unchecked cast and handing a
  // null element to the outputter.
  domPath.selectSingleNode(doc) match {
    case content: Element =>
      System.out.println(new XMLOutputter().outputString(content))
    case _ =>
      System.err.println("No /h:html element found in the parsed document")
  }
}

 
comments powered by Disqus