In this program, you will see a complete example of extracting content and metadata from a PDF file using the Apache Tika PDFParser.
Sample File
Complete Example
import java.io.File; import java.io.FileInputStream; import java.io.IOException; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.pdf.PDFParser; import org.apache.tika.sax.BodyContentHandler; import org.xml.sax.SAXException; public class TikaPDFParserExample { public static void main(final String[] args) throws IOException,TikaException, SAXException { BodyContentHandler handler = new BodyContentHandler(); Metadata metadata = new Metadata(); FileInputStream inputstream = new FileInputStream(new File("D:\\Leraning Material\\Blogs Data\\Bharti Ticket Original.pdf")); ParseContext pcontext = new ParseContext(); //document parsing using PDF parser PDFParser pdfparser = new PDFParser(); pdfparser.parse(inputstream, handler, metadata,pcontext); //extract content of the document System.out.println("Contents of the PDF File :" + handler.toString()); //get metadata of the document System.out.println("Metadata of the PDF File:"); String[] metadataNames = metadata.names(); for(String name : metadataNames) { System.out.println(name+ " : " + metadata.get(name)); } } }
Output
Contents of the PDF File :
TEXT-FILE.txt
You are in FacingIssuesOnIT.
Learn from Others Experience.
Page 1
Metadata of the PDF File:
date : 2019-11-22T23:49:59Z
pdf:unmappedUnicodeCharsPerPage : 0
pdf:PDFVersion : 1.7
pdf:docinfo:title : TEXT-FILE.txt - Notepad
access_permission:modify_annotations : true
access_permission:can_print_degraded : true
dc:creator : Saurabh Gupta
dcterms:created : 2019-11-22T23:49:59Z
Last-Modified : 2019-11-22T23:49:59Z
dcterms:modified : 2019-11-22T23:49:59Z
dc:format : application/pdf; version=1.7
title : TEXT-FILE.txt - Notepad
Last-Save-Date : 2019-11-22T23:49:59Z
access_permission:fill_in_form : true
pdf:docinfo:modified : 2019-11-22T23:49:59Z
meta:save-date : 2019-11-22T23:49:59Z
pdf:encrypted : false
dc:title : TEXT-FILE.txt - Notepad
modified : 2019-11-22T23:49:59Z
Content-Type : application/pdf
pdf:docinfo:creator : Saurabh Gupta
creator : Saurabh Gupta
meta:author : Saurabh Gupta
meta:creation-date : 2019-11-22T23:49:59Z
created : 2019-11-22T23:49:59Z
access_permission:extract_for_accessibility : true
access_permission:assemble_document : true
xmpTPg:NPages : 1
Creation-Date : 2019-11-22T23:49:59Z
pdf:charsPerPage : 76
access_permission:extract_content : true
access_permission:can_print : true
Author : Saurabh Gupta
producer : Microsoft: Print To PDF
access_permission:can_modify : true
pdf:docinfo:producer : Microsoft: Print To PDF
pdf:docinfo:created : 2019-11-22T23:49:59Z
You must be logged in to post a comment.