/*
 * Decompiled with CFR 0.152.
 */
package net.yacy.document.parser;

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Date;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.CommonPattern;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
import org.apache.poi.hwpf.HWPFDocumentCore;
import org.apache.poi.hwpf.OldWordFileFormatException;
import org.apache.poi.hwpf.extractor.Word6Extractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;

/**
 * Parser for binary Microsoft Word ({@code .doc}) files.
 *
 * <p>Text and summary metadata are read with Apache POI. Documents that POI
 * rejects with {@link OldWordFileFormatException} are handed to
 * {@link #parseOldWordDoc(DigestURL, String, POIFSFileSystem)}, which uses
 * {@link Word6Extractor} instead of {@link WordExtractor}.
 */
public class docParser extends AbstractParser implements Parser {
    /** Number of leading characters of the document text considered when deriving a title. */
    private static final int TITLE_SCAN_LENGTH = 240;

    /** Maximum length of the derived title before runs of spaces are collapsed. */
    private static final int TITLE_MAX_LENGTH = 80;

    /**
     * Registers the {@code doc} extension and the Word MIME type aliases this parser accepts.
     */
    public docParser() {
        super("Word Document Parser");
        this.SUPPORTED_EXTENSIONS.add("doc");
        this.SUPPORTED_MIME_TYPES.add("application/msword");
        this.SUPPORTED_MIME_TYPES.add("application/doc");
        this.SUPPORTED_MIME_TYPES.add("appl/text");
        this.SUPPORTED_MIME_TYPES.add("application/vnd.msword");
        this.SUPPORTED_MIME_TYPES.add("application/vnd.ms-word");
        this.SUPPORTED_MIME_TYPES.add("application/winword");
        this.SUPPORTED_MIME_TYPES.add("application/word");
        this.SUPPORTED_MIME_TYPES.add("application/x-msw6");
        this.SUPPORTED_MIME_TYPES.add("application/x-msword");
    }

    /**
     * Parses a Word document from the given stream.
     *
     * @param location URL of the document; stored in the result and used in failure messages
     * @param mimeType MIME type recorded in the resulting document
     * @param charset not used by this parser; the result is always declared as UTF-8
     * @param scraper not used by this parser
     * @param timezoneOffset not used by this parser
     * @param source stream holding the raw document bytes
     * @return a one-element array with the parsed document
     * @throws Parser.Failure if the file system or extractor cannot be created, or text extraction fails
     * @throws InterruptedException declared by the interface; not thrown by this implementation itself
     */
    @Override
    public Document[] parse(DigestURL location, String mimeType, String charset, VocabularyScraper scraper, int timezoneOffset, InputStream source) throws Parser.Failure, InterruptedException {
        WordExtractor extractor;
        // The POIFS is built first so it can be reused by the old-format fallback
        // without having to read the input stream a second time.
        POIFSFileSystem poifs = null;
        try {
            poifs = HWPFDocumentCore.verifyAndBuildPOIFS(source);
            extractor = new WordExtractor(poifs);
        }
        catch (OldWordFileFormatException isOldWordDoc) {
            return this.parseOldWordDoc(location, mimeType, poifs);
        }
        catch (Exception e) {
            throw new Parser.Failure("error in docParser, WordTextExtractorFactory: " + e.getMessage(), location);
        }
        try {
            String text;
            try {
                text = extractor.getText();
            }
            catch (Exception e) {
                throw new Parser.Failure("error in docParser, getText: " + e.getMessage(), location);
            }
            // POI may return null for either property set when the file carries no
            // (readable) summary stream, so every access is guarded.
            boolean hasSummary = extractor.getSummaryInformation() != null;
            String keywords = hasSummary ? extractor.getSummaryInformation().getKeywords() : null;
            String subject = hasSummary ? extractor.getSummaryInformation().getSubject() : null;
            String author = hasSummary ? extractor.getSummaryInformation().getAuthor() : null;
            Date lastSaved = hasSummary ? extractor.getSummaryInformation().getLastSaveDateTime() : null;
            String company = extractor.getDocSummaryInformation() != null ? extractor.getDocSummaryInformation().getCompany() : null;
            return this.buildDocuments(location, mimeType, text, keywords, subject, author, company, lastSaved);
        }
        finally {
            // Close on every exit path, including unexpected runtime exceptions.
            try {
                extractor.close();
            }
            catch (IOException ignored) {
                // best effort: a failed close must not mask the parse result or the original error
            }
        }
    }

    /**
     * Parses a document in the legacy Word format using {@link Word6Extractor}.
     *
     * @param location URL of the document; stored in the result and used in failure messages
     * @param mimeType MIME type recorded in the resulting document
     * @param poifs already opened POI file system containing the document
     * @return a one-element array with the parsed document
     * @throws Parser.Failure if the extractor cannot be created or text extraction fails
     */
    public Document[] parseOldWordDoc(DigestURL location, String mimeType, POIFSFileSystem poifs) throws Parser.Failure {
        Word6Extractor extractor;
        try {
            extractor = new Word6Extractor(poifs);
        }
        catch (Exception e) {
            throw new Parser.Failure("error in docParser, WordTextExtractorFactory: " + e.getMessage(), location);
        }
        try {
            String text;
            try {
                text = extractor.getText();
            }
            catch (Exception e) {
                throw new Parser.Failure("error in docParser, getText: " + e.getMessage(), location);
            }
            // POI may return null for either property set when the file carries no
            // (readable) summary stream, so every access is guarded.
            boolean hasSummary = extractor.getSummaryInformation() != null;
            String keywords = hasSummary ? extractor.getSummaryInformation().getKeywords() : null;
            String subject = hasSummary ? extractor.getSummaryInformation().getSubject() : null;
            String author = hasSummary ? extractor.getSummaryInformation().getAuthor() : null;
            Date lastSaved = hasSummary ? extractor.getSummaryInformation().getLastSaveDateTime() : null;
            String company = extractor.getDocSummaryInformation() != null ? extractor.getDocSummaryInformation().getCompany() : null;
            return this.buildDocuments(location, mimeType, text, keywords, subject, author, company, lastSaved);
        }
        finally {
            // Close on every exit path, including unexpected runtime exceptions.
            try {
                extractor.close();
            }
            catch (IOException ignored) {
                // best effort: a failed close must not mask the parse result or the original error
            }
        }
    }

    /**
     * Assembles the single-document result from extracted text and metadata.
     * Any of the metadata arguments may be {@code null}.
     */
    private Document[] buildDocuments(DigestURL location, String mimeType, String text, String keywords, String subject, String author, String company, Date lastSaved) {
        // Treat a missing text as empty rather than indexing the literal "null".
        String contents = text == null ? "" : text;
        String[] keywlist = keywords != null && !keywords.isEmpty() ? CommonPattern.COMMA.split(keywords) : null;
        ArrayList<String> descriptions = new ArrayList<String>();
        if (subject != null && !subject.isEmpty()) {
            descriptions.add(subject);
        }
        return new Document[]{new Document(location, mimeType, StandardCharsets.UTF_8.name(), this, null, keywlist, docParser.singleList(docParser.buildTitle(contents)), author, company, null, descriptions, 0.0, 0.0, contents, null, null, null, false, lastSaved)};
    }

    /**
     * Derives a title from the beginning of the document text: line breaks and
     * tabs become spaces, the result is cut to {@link #TITLE_MAX_LENGTH}
     * characters, and runs of spaces are then collapsed to a single space.
     */
    private static String buildTitle(String contents) {
        String title = contents.length() > TITLE_SCAN_LENGTH ? contents.substring(0, TITLE_SCAN_LENGTH) : contents.trim();
        title = title.replace('\r', ' ').replace('\n', ' ').replace('\t', ' ').trim();
        if (title.length() > TITLE_MAX_LENGTH) {
            title = title.substring(0, TITLE_MAX_LENGTH);
        }
        // One pass over runs of two or more spaces gives the same result as
        // repeatedly replacing double spaces until the length stops changing.
        return title.replaceAll(" {2,}", " ");
    }
}

