In this example, you will see the complete steps to extract the content and metadata from an HTML file by using the Apache Tika HtmlParser.
Sample File

Complete Example
import java.io.File; import java.io.FileInputStream; import java.io.IOException; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.html.HtmlParser; import org.apache.tika.sax.BodyContentHandler; import org.xml.sax.SAXException; public class TikaHTMLParserExample { public static void main(final String[] args) throws IOException, SAXException, TikaException { // detecting the file type BodyContentHandler handler = new BodyContentHandler(); Metadata metadata = new Metadata(); FileInputStream inputstream = new FileInputStream(new File("C:\\Users\\Saurabh Gupta\\Desktop\\TIKA\\HTML-FILE.html")); ParseContext pcontext = new ParseContext(); // Html parser to get content HtmlParser htmlparser = new HtmlParser(); htmlparser.parse(inputstream, handler, metadata, pcontext); System.out.println("Contents of the HTML document:" + handler.toString()); System.out.println("Metadata of the HTML document:"); String[] metadataNames = metadata.names(); for (String name : metadataNames) { System.out.println(name + ": " + metadata.get(name)); } } }
Output
Contents of the HTML document: First Name Last Name DOB
Saurabh Gupta 10-Dec-2085
Gaurav Kumar 12-May-1986
Rahul Roi 12-Jun-2010
Raghvendra Rana 05-Jan-2095
Tanaya Jain 13-Mar-1985
Metadata of the HTML document:
Content-Encoding: windows-1252
Content-Type: text/html; charset=windows-1252