In this program, you will see the complete steps to extract the content and metadata of an Open Document Format (ODF) file by using the TIKA OpenDocumentParser.
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.odf.OpenDocumentParser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.SAXException;
/**
 * Prints the text content and the metadata of an OpenDocument file using
 * Tika's {@link OpenDocumentParser}.
 */
public class TikaOpenDocumentParserExample {
    /**
     * Parses {@code TIKA-OPEN-DOCUMENT.odp} (a relative path) and writes the extracted
     * body text, followed by one line per metadata name, to standard output.
     *
     * @param args not used
     * @throws IOException if the file cannot be opened or read
     * @throws SAXException if the content handler fails while receiving parse events
     * @throws TikaException if the parser cannot process the document
     */
    public static void main(final String[] args) throws IOException, SAXException, TikaException {
        // NOTE(review): the no-arg BodyContentHandler limits how much text it captures
        // (Tika documents a 100,000-character default) - confirm that suits the input.
        BodyContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        ParseContext pcontext = new ParseContext();
        // The parser is chosen explicitly; no file type detection takes place here.
        OpenDocumentParser openofficeparser = new OpenDocumentParser();
        // try-with-resources closes the stream even when parse() throws.
        try (FileInputStream inputstream = new FileInputStream(new File("TIKA-OPEN-DOCUMENT.odp"))) {
            openofficeparser.parse(inputstream, handler, metadata, pcontext);
        }
        System.out.println("Contents of the Open Office document:" + handler.toString());
        System.out.println("Metadata of the Open Office document:");
        String[] metadataNames = metadata.names();
        for (String name : metadataNames) {
            // get(name) yields a single value per metadata name
            System.out.println(name + " : " + metadata.get(name));
        }
    }
}
Output
Contents of the Open Office document:You are in FacingIssuesOnIT.
Learn from Others Experience.
Metadata of the Open Office document:
editing-cycles: 4
meta:creation-date: 2009-04-16T11:32:32.86
dcterms:modified: 2014-09-28T07:46:13.03
meta:save-date: 2014-09-28T07:46:13.03
Last-Modified: 2014-09-28T07:46:13.03
dcterms:created: 2009-04-16T11:32:32.86
date: 2014-09-28T07:46:13.03
modified: 2014-09-28T07:46:13.03
nbObject: 36
Edit-Time: PT32M6S
Creation-Date: 2009-04-16T11:32:32.86
Object-Count: 36
meta:object-count: 36
generator: OpenOffice/4.1.0$Win32 OpenOffice.org_project/410m18$Build-9764
Content-Type: application/vnd.oasis.opendocument.presentation
Last-Save-Date: 2014-09-28T07:46:13.03
Like this: Like Loading...
In this program, you will see the complete steps to extract the content and metadata of an MS-Excel file by using the TIKA OOXMLParser.
Sample File
TIKA MS Excel File Content and Metadata extraction
Complete Example
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.microsoft.ooxml.OOXMLParser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.SAXException;
/**
 * Prints the text content and the metadata of an Excel (.xlsx) workbook using
 * Tika's {@link OOXMLParser}.
 */
public class TikaMSExcelParserExample {
    /**
     * Parses the workbook at the hard-coded path below and writes the extracted body
     * text, followed by one line per metadata name, to standard output.
     *
     * @param args not used
     * @throws IOException if the file cannot be opened or read
     * @throws TikaException if the parser cannot process the document
     * @throws SAXException if the content handler fails while receiving parse events
     */
    public static void main(final String[] args) throws IOException, TikaException, SAXException {
        // NOTE(review): the no-arg BodyContentHandler limits how much text it captures
        // (Tika documents a 100,000-character default) - confirm that suits the input.
        BodyContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        ParseContext pcontext = new ParseContext();
        // The parser is chosen explicitly; no file type detection takes place here.
        OOXMLParser msofficeparser = new OOXMLParser();
        // try-with-resources closes the stream even when parse() throws.
        try (FileInputStream inputstream = new FileInputStream(
                new File("C:\\Users\\Saurabh Gupta\\Desktop\\TIKA\\TIKA-MS-EXCEL.xlsx"))) {
            msofficeparser.parse(inputstream, handler, metadata, pcontext);
        }
        System.out.println("Contents of the excel document:" + handler.toString());
        System.out.println("Metadata of the excel document:");
        String[] metadataNames = metadata.names();
        for (String name : metadataNames) {
            // get(name) yields a single value per metadata name
            System.out.println(name + ": " + metadata.get(name));
        }
    }
}
Output
Contents of the excel document:Sheet1
First Name Last Name DOB
Saurabh Gupta 10-Dec-85
Gaurav Kumar 12-May-86
Rahul Roi 12-Jun-10
Raghvendra Rana 5-Jan-95
Tanaya Jain 13-Mar-85
Metadata of the excel document:
date: 2019-11-23T00:25:08Z
extended-properties:AppVersion: 15.0300
meta:creation-date: 2006-09-16T00:00:00Z
extended-properties:Application: Microsoft Excel
extended-properties:Company:
Creation-Date: 2006-09-16T00:00:00Z
dcterms:created: 2006-09-16T00:00:00Z
custom:WorkbookGuid: e742a774-13a6-49b2-8ba3-1b6118163781
dcterms:modified: 2019-11-23T00:25:08Z
Last-Modified: 2019-11-23T00:25:08Z
Last-Save-Date: 2019-11-23T00:25:08Z
Application-Version: 15.0300
protected: false
meta:save-date: 2019-11-23T00:25:08Z
Application-Name: Microsoft Excel
modified: 2019-11-23T00:25:08Z
publisher:
Content-Type: application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
dc:publisher:
Like this: Like Loading...
In this program, you will see a complete example of extracting the content and metadata of a PDF file by using the TIKA PDFParser.
Sample File
Complete Example
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.pdf.PDFParser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.SAXException;
/**
 * Prints the text content and the metadata of a PDF file using Tika's
 * {@link PDFParser}.
 */
public class TikaPDFParserExample {
    /**
     * Parses the PDF at the hard-coded path below and writes the extracted body text,
     * followed by one line per metadata name, to standard output.
     *
     * @param args not used
     * @throws IOException if the file cannot be opened or read
     * @throws TikaException if the parser cannot process the document
     * @throws SAXException if the content handler fails while receiving parse events
     */
    public static void main(final String[] args) throws IOException, TikaException, SAXException {
        // NOTE(review): the no-arg BodyContentHandler limits how much text it captures
        // (Tika documents a 100,000-character default) - confirm that suits the input.
        BodyContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        ParseContext pcontext = new ParseContext();
        // document parsing using PDF parser
        PDFParser pdfparser = new PDFParser();
        // try-with-resources closes the stream even when parse() throws.
        try (FileInputStream inputstream = new FileInputStream(
                new File("D:\\Leraning Material\\Blogs Data\\Bharti Ticket Original.pdf"))) {
            pdfparser.parse(inputstream, handler, metadata, pcontext);
        }
        // extracted content of the document
        System.out.println("Contents of the PDF File :" + handler.toString());
        // metadata of the document
        System.out.println("Metadata of the PDF File:");
        String[] metadataNames = metadata.names();
        for (String name : metadataNames) {
            // get(name) yields a single value per metadata name
            System.out.println(name + " : " + metadata.get(name));
        }
    }
}
Output
Contents of the PDF File :
TEXT-FILE.txt
You are in FacingIssuesOnIT.
Learn from Others Experience.
Page 1
Metadata of the PDF File:
date : 2019-11-22T23:49:59Z
pdf:unmappedUnicodeCharsPerPage : 0
pdf:PDFVersion : 1.7
pdf:docinfo:title : TEXT-FILE.txt - Notepad
access_permission:modify_annotations : true
access_permission:can_print_degraded : true
dc:creator : Saurabh Gupta
dcterms:created : 2019-11-22T23:49:59Z
Last-Modified : 2019-11-22T23:49:59Z
dcterms:modified : 2019-11-22T23:49:59Z
dc:format : application/pdf; version=1.7
title : TEXT-FILE.txt - Notepad
Last-Save-Date : 2019-11-22T23:49:59Z
access_permission:fill_in_form : true
pdf:docinfo:modified : 2019-11-22T23:49:59Z
meta:save-date : 2019-11-22T23:49:59Z
pdf:encrypted : false
dc:title : TEXT-FILE.txt - Notepad
modified : 2019-11-22T23:49:59Z
Content-Type : application/pdf
pdf:docinfo:creator : Saurabh Gupta
creator : Saurabh Gupta
meta:author : Saurabh Gupta
meta:creation-date : 2019-11-22T23:49:59Z
created : 2019-11-22T23:49:59Z
access_permission:extract_for_accessibility : true
access_permission:assemble_document : true
xmpTPg:NPages : 1
Creation-Date : 2019-11-22T23:49:59Z
pdf:charsPerPage : 76
access_permission:extract_content : true
access_permission:can_print : true
Author : Saurabh Gupta
producer : Microsoft: Print To PDF
access_permission:can_modify : true
pdf:docinfo:producer : Microsoft: Print To PDF
pdf:docinfo:created : 2019-11-22T23:49:59Z
Like this: Like Loading...
In this program, you will see complete steps to extract XML file content and metadata by using XMLParser.
Sample File
TIKA XML File Content and Metadata Extraction
Complete Example
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.xml.XMLParser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.SAXException;
/**
 * Prints the text content and the metadata of an XML file using Tika's
 * {@link XMLParser}.
 */
public class TikaXmlParserExample {
    /**
     * Parses the XML file at the hard-coded path below and writes the extracted body
     * text, followed by one line per metadata name, to standard output.
     *
     * @param args not used
     * @throws IOException if the file cannot be opened or read
     * @throws SAXException if the content handler fails while receiving parse events
     * @throws TikaException if the parser cannot process the document
     */
    public static void main(final String[] args) throws IOException, SAXException, TikaException {
        // NOTE(review): the no-arg BodyContentHandler limits how much text it captures
        // (Tika documents a 100,000-character default) - confirm that suits the input.
        BodyContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        ParseContext pcontext = new ParseContext();
        // The parser is chosen explicitly; no file type detection takes place here.
        XMLParser xmlparser = new XMLParser();
        // try-with-resources closes the stream even when parse() throws.
        try (FileInputStream inputstream = new FileInputStream(
                new File("C:\\Users\\Saurabh Gupta\\Desktop\\TIKA\\EmployeeData.xml"))) {
            xmlparser.parse(inputstream, handler, metadata, pcontext);
        }
        System.out.println("Contents of the XML document:" + handler.toString());
        System.out.println("Metadata of the XML document:");
        String[] metadataNames = metadata.names();
        for (String name : metadataNames) {
            // get(name) yields a single value per metadata name
            System.out.println(name + ": " + metadata.get(name));
        }
    }
}
Output
Contents of the XML document:
Saurabh
Gupta
13-Jun-1980
Gaurav
Kumar
12-Dec-1987
Rajesh
Gupta
10-Jan-1999
Raghvendra
Prasad
14-Mar-1985
Rahul
Jain
11-May-1981
Metadata of the XML document:
Content-Type: application/xml
Like this: Like Loading...
“Learn From Others Experience”
You must be logged in to post a comment.