Java HTML Tidy

Updated 3 Sep 1999


This is a Java version of HTML Tidy Release 26 Jul 1999 Copyright © 1999 W3C, see Tidy.java for the copyright notice.

I have made available:

To use the Tidy Java Bean, just include JTidy\lib\Tidy.jar in your classpath.

To build Tidy from the source, you need a Java compiler/runtime environment, supporting Java 1.1 or higher. First, download and expand the archive. For Win 9x/NT, build it using the batch file JTidy\make\build.bat as follows:

    cd JTidy\make
    build c: 26jul1999

Where c: is the root where you expanded the JTidy archive, and 26jul1999 is the directory under JTidy\src where the source is located. NOTE: build.bat assumes that the environment variable java_home points to your JDK installation, and that the JDK tools are in your path.

For Unix environments there is no build script yet; carry out the steps in build.bat by hand. Sorry about that, I will try to get a more generic build procedure going in the future.

The main class is: org.w3c.tidy.Tidy


Release News


Code example of how to use the Tidy Java Bean

import java.io.IOException;
import java.net.URL;
import java.io.BufferedInputStream;
import java.io.FileOutputStream;
import java.io.PrintWriter;
import java.io.FileWriter;
import org.w3c.tidy.Tidy;


/**
 * This program shows how HTML could be tidied directly from
 * a URL stream, and running on separate threads.  Note the use
 * of the 'parse' method to parse from an InputStream, and send
 * the pretty-printed result to an OutputStream.
 * In this example thread th1 outputs XML, and thread th2 outputs
 * HTML.  This shows that properties are per instance of Tidy.
 */

public class Test16 implements Runnable {

    private String url;
    private String outFileName;
    private String errOutFileName;
    private boolean xmlOut;

    /**
     * Creates one tidy job.
     *
     * @param url            URL to read the HTML from
     * @param outFileName    file that receives the pretty-printed result
     * @param errOutFileName file that receives Tidy's error output
     * @param xmlOut         value passed to Tidy.setXmlOut for this instance
     */
    public Test16(String url, String outFileName,
                  String errOutFileName, boolean xmlOut)
    {
        this.url = url;
        this.outFileName = outFileName;
        this.errOutFileName = errOutFileName;
        this.xmlOut = xmlOut;
    }

    /**
     * Opens the URL, runs it through a private Tidy instance and writes
     * the result to the output file.  I/O failures are reported on
     * System.out; every stream opened here is closed before returning.
     */
    public void run()
    {
        BufferedInputStream in = null;
        FileOutputStream out = null;
        PrintWriter errout = null;
        Tidy tidy = new Tidy();

        tidy.setXmlOut(xmlOut);
        try {
            errout = new PrintWriter(new FileWriter(errOutFileName), true);
            tidy.setErrout(errout);
            in = new BufferedInputStream(new URL(url).openStream());
            out = new FileOutputStream(outFileName);
            tidy.parse(in, out);
        }
        catch ( IOException e ) {
            System.out.println( this.toString() + e.toString() );
        }
        finally {
            // Release whatever was opened, whether or not parsing succeeded.
            // Each close is guarded separately so one failure cannot keep
            // the remaining resources open.
            if (in != null) {
                try {
                    in.close();
                }
                catch ( IOException e ) {
                    System.out.println( this.toString() + e.toString() );
                }
            }
            if (out != null) {
                try {
                    out.close();
                }
                catch ( IOException e ) {
                    System.out.println( this.toString() + e.toString() );
                }
            }
            if (errout != null) {
                errout.close();
            }
        }
    }

    /**
     * Starts two tidy jobs on separate threads: the first with XML output
     * enabled, the second with it disabled.
     *
     * @param args url1 out1 err1 url2 out2 err2
     */
    public static void main( String[] args ) {
        if (args.length < 6) {
            System.err.println(
                "usage: java Test16 url1 out1 err1 url2 out2 err2");
            return;
        }

        Test16 t1 = new Test16(args[0], args[1], args[2], true);
        Test16 t2 = new Test16(args[3], args[4], args[5], false);
        Thread th1 = new Thread(t1);
        Thread th2 = new Thread(t2);

        th1.start();
        th2.start();
    }

}

Code example of using Java Tidy as a parser

import java.io.IOException;
import java.io.FileInputStream;
import org.w3c.tidy.Tidy;
import org.w3c.tidy.Node;

/**
 * This program shows how to use Tidy as an HTML parser.
 * It creates an instance of Tidy, calls the parse method
 * to parse a file input stream, and dumps a text representation
 * of the parse tree to System.out.
 */

public class Test17 {

    private static final String spaces =
        "                                                             ";

    /**
     * Returns a string of exactly {@code indent} blanks.  The shared
     * constant is used when it is long enough; deeper nesting falls back
     * to building the string so that deep trees do not overflow it.
     */
    private static String indentPrefix(int indent)
    {
        if (indent <= spaces.length()) {
            return spaces.substring(0, indent);
        }
        StringBuffer sb = new StringBuffer(indent);
        for (int i = 0; i < indent; i++) {
            sb.append(' ');
        }
        return sb.toString();
    }

    /**
     * Prints a text representation of {@code node}, all of its following
     * siblings (via {@code next}) and, recursively, their children
     * (via {@code content}) to System.out.
     *
     * @param node   first node of the sibling chain to print; may be null
     * @param indent number of blanks to put in front of every line
     */
    private static void dump(Node node, int indent)
    {
        String prefix = indentPrefix(indent);
        Node n = node;

        while (n != null) {
            System.out.println( prefix + "------Node-------");
            // Switch on the current sibling, not on the head of the chain.
            switch (n.type) {
            case Node.RootNode:
                System.out.println( prefix + "type: RootNode");
                break;
            case Node.DocTypeTag:
                System.out.println( prefix + "type: DocTypeTag");
                break;
            case Node.CommentTag:
                System.out.println( prefix + "type: CommentTag");
                break;
            case Node.ProcInsTag:
                System.out.println( prefix + "type: ProcInsTag");
                break;
            case Node.TextNode:
                System.out.println( prefix + "type: TextNode");
                // String concatenation renders a null value as "null".
                System.out.println( prefix + "value: " + n.getNodeValue());
                break;
            case Node.StartTag:
                System.out.println( prefix + "type: StartTag");
                break;
            case Node.EndTag:
                System.out.println( prefix + "type: EndTag");
                break;
            case Node.StartEndTag:
                System.out.println( prefix + "type: StartEndTag");
                break;
            case Node.AspTag:
                System.out.println( prefix + "type: AspTag");
                break;
            default:
                System.out.println( prefix + "invalid type");
                break;
            }
            if (n.element != null) {
                System.out.println( prefix + "element: " + n.element);
            }
            dump(n.content, indent+4);
            n = n.next;
        }
    }

    /**
     * Parses the file named by the first argument and dumps the
     * resulting parse tree.
     *
     * @param args args[0] is the path of the HTML file to parse
     */
    public static void main( String[] args )
    {
        if (args.length < 1) {
            System.err.println("usage: java Test17 file");
            return;
        }

        FileInputStream in = null;
        Tidy tidy = new Tidy();
        Node root = null;

        try {
            in = new FileInputStream(args[0]);
            tidy.setMakeClean(true);
            root = tidy.parse(in, null);
            dump(root, 0);
        }
        catch ( IOException e ) {
            System.out.println( e.toString() );
        }
        finally {
            // Close the input file whether or not parsing succeeded.
            if (in != null) {
                try {
                    in.close();
                }
                catch ( IOException e ) {
                    System.out.println( e.toString() );
                }
            }
        }
    }

}

Send questions, comments, or bug reports to Andy Quick.