/*
 * Decompiled with CFR 0.152.
 */
package net.yacy.crawler;

import java.io.IOException;
import java.net.InetAddress;
import java.net.MalformedURLException;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.atomic.AtomicInteger;
import net.yacy.cora.date.ISO8601Formatter;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.solr.FailCategory;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.ftp.FTPClient;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.CrawlSwitchboard;
import net.yacy.crawler.HarvestProcess;
import net.yacy.crawler.IllegalCrawlProfileException;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.CrawlQueues;
import net.yacy.crawler.data.NoticedURL;
import net.yacy.crawler.retrieval.Request;
import net.yacy.crawler.robots.RobotsTxt;
import net.yacy.document.TextParser;
import net.yacy.kelondro.workflow.WorkflowProcessor;
import net.yacy.kelondro.workflow.WorkflowTask;
import net.yacy.peers.SeedDB;
import net.yacy.repository.Blacklist;
import net.yacy.repository.FilterEngine;
import net.yacy.search.Switchboard;
import net.yacy.search.index.Segment;

public final class CrawlStacker
implements WorkflowTask<Request> {
    /** Prefix of the reject reason returned by {@code checkAcceptanceChangeable} when a URL fails the profile's must-match URL pattern; the formatted pattern is appended. */
    public static String ERROR_NO_MATCH_MUST_MATCH_FILTER = "url does not match must-match filter ";
    /** Prefix of the reject reason returned by {@code checkAcceptanceChangeable} when a URL hits the profile's must-not-match URL pattern; the pattern is appended. */
    public static String ERROR_MATCH_WITH_MUST_NOT_MATCH_FILTER = "url matches must-not-match filter ";
    /** Message prefix for redirects. Not referenced inside this class. */
    public static String ERROR_REDIRECT = "Redirect of ";
    /** Reject reasons starting with this prefix are not reported to the error URL list by {@code process}. */
    public static final String CRAWL_REJECT_REASON_DOUBLE_IN_PREFIX = "double in";
    /** Logger shared by all stacker instances. */
    private static final ConcurrentLog log = new ConcurrentLog("STACKCRAWL");
    /** Robots.txt handler that is handed to {@code noticeURL.push} together with each request. */
    private final RobotsTxt robots;
    /** Worker queue that feeds enqueued requests into {@code process}. */
    private final WorkflowProcessor<Request> requestQueue;
    /** Crawl queues: accepted requests are pushed to its {@code noticeURL}, rejected ones are reported to its {@code errorURL}. */
    public final CrawlQueues nextQueue;
    /** Source of crawl profiles (lookup by handle, and the default pack/proxy/remote profiles). */
    private final CrawlSwitchboard crawler;
    /** Index segment used to look up load times and to remove documents when entries are replaced. */
    private final Segment indexSegment;
    /** Seed database; {@code mySeed()} supplies this peer's hash and seniority flags. */
    private final SeedDB peers;
    /** Whether URLs for which {@code isLocal()} is true are accepted by {@code urlInAcceptedDomain}. */
    private final boolean acceptLocalURLs;
    /** Whether URLs for which {@code isLocal()} is false are accepted by {@code urlInAcceptedDomain}. */
    private final boolean acceptGlobalURLs;
    /** Optional domain filter; when non-null, {@code urlInAcceptedDomain} rejects every URL it does not list. */
    private final FilterEngine domainList;

    public CrawlStacker(RobotsTxt robots2, CrawlQueues cq, CrawlSwitchboard cs, Segment indexSegment, SeedDB peers, boolean acceptLocalURLs, boolean acceptGlobalURLs, FilterEngine domainList) {
        this.robots = robots2;
        this.nextQueue = cq;
        this.crawler = cs;
        this.indexSegment = indexSegment;
        this.peers = peers;
        this.acceptLocalURLs = acceptLocalURLs;
        this.acceptGlobalURLs = acceptGlobalURLs;
        this.domainList = domainList;
        this.requestQueue = new WorkflowProcessor<Request>("CrawlStacker", "This process checks new urls before they are enqueued into the balancer (proper, double-check, correct domain, filter)", new String[]{"Balancer"}, this, 10000, null, WorkflowProcessor.availableCPU);
        log.info("STACKCRAWL thread initialized.");
    }

    /**
     * @return the queue size reported by the request processor
     */
    public int size() {
        final int queued = this.requestQueue.getQueueSize();
        return queued;
    }

    /**
     * @return {@code true} when the request processor reports an empty queue
     */
    public boolean isEmpty() {
        final boolean nothingQueued = this.requestQueue.queueIsEmpty();
        return nothingQueued;
    }

    /**
     * Clears the request processor's queue.
     */
    public void clear() {
        requestQueue.clear();
    }

    /**
     * Logs the number of pending entries and asks the request processor to shut down.
     * Unlike {@link #close()} this does not wait and does not clear the queue.
     */
    public void announceClose() {
        final String message = "Flushing remaining " + this.size() + " crawl stacker job entries.";
        log.info(message);
        requestQueue.shutdown();
    }

    /**
     * Shuts the request processor down, waits up to ten seconds (in one-second
     * steps) for the queue to drain, and finally clears whatever is left.
     * <p>
     * If the waiting thread is interrupted, waiting stops immediately and the
     * interrupt status is restored so that callers can still observe it.
     */
    public synchronized void close() {
        log.info("Shutdown. waiting for remaining " + this.size() + " crawl stacker job entries. please wait.");
        this.requestQueue.shutdown();
        for (int i = 0; i < 10 && this.size() > 0; ++i) {
            try {
                Thread.sleep(1000L);
            } catch (InterruptedException e) {
                // Somebody wants this thread to stop: do not keep sleeping, and
                // keep the interrupt visible to the caller.
                Thread.currentThread().interrupt();
                break;
            }
        }
        log.info("Shutdown. Closing stackCrawl queue.");
        this.clear();
    }

    /**
     * Workflow callback: runs {@link #stackCrawl(Request)} for one queued entry.
     * A reject reason that does not start with
     * {@link #CRAWL_REJECT_REASON_DOUBLE_IN_PREFIX} is reported to the error URL
     * list; any exception is logged as a warning.
     *
     * @param entry2 the request to stack, may be {@code null}
     * @return always {@code null}
     */
    @Override
    public Request process(Request entry2) {
        if (entry2 != null) {
            try {
                final String rejectReason = this.stackCrawl(entry2);
                final boolean reportable = rejectReason != null && !rejectReason.startsWith(CRAWL_REJECT_REASON_DOUBLE_IN_PREFIX);
                if (reportable) {
                    final CrawlProfile profile2 = this.crawler.get(UTF8.getBytes(entry2.profileHandle()));
                    this.nextQueue.errorURL.push(entry2.url(), entry2.depth(), profile2, FailCategory.FINAL_LOAD_CONTEXT, rejectReason, -1);
                }
            } catch (Exception e) {
                log.warn("Error while processing stackCrawl entry.\nEntry: " + entry2.toString() + "Error: " + e.toString(), e);
            }
        }
        // nothing is ever handed on from here
        return null;
    }

    /**
     * Hands a request to the request processor; at log level finest the
     * request's key properties are logged first.
     *
     * @param entry2 the request to enqueue
     */
    public void enqueueEntry(Request entry2) {
        if (log.isFinest()) {
            final String target = String.valueOf(entry2.url());
            final String referer = String.valueOf(entry2.referrerhash());
            final String initiatorName = entry2.initiator() == null ? "" : ASCII.String(entry2.initiator());
            log.finest("ENQUEUE " + target
                    + ", referer=" + referer
                    + ", initiator=" + initiatorName
                    + ", name=" + entry2.name()
                    + ", appdate=" + String.valueOf(entry2.appdate())
                    + ", depth=" + entry2.depth());
        }
        requestQueue.enQueue(entry2);
    }

    /**
     * Runs {@link #enqueueEntries(byte[], String, List, boolean, int)} with
     * {@code replace = true} in a newly started thread named
     * "enqueueEntriesAsynchronous" and returns immediately.
     *
     * @param initiator initiator hash passed on to the created requests
     * @param profileHandle handle of the crawl profile to use
     * @param hyperlinks the URLs to enqueue
     * @param timezoneOffset time zone offset passed on to the created requests
     */
    public void enqueueEntriesAsynchronous(final byte[] initiator, final String profileHandle, final List<AnchorURL> hyperlinks, final int timezoneOffset) {
        new Thread("enqueueEntriesAsynchronous") {
            @Override
            public void run() {
                CrawlStacker.this.enqueueEntries(initiator, profileHandle, hyperlinks, true, timezoneOffset);
            }
        }.start();
    }

    /**
     * Enqueues a list of start URLs for the given crawl profile.
     * <p>
     * With {@code replace} set, the hosts of all URLs are first removed from the
     * error URL list, and for every URL the index entry of the URL itself and of
     * its "index.html" variant is removed (the variant is also removed from the
     * notice queue). URLs with protocol "ftp" are delegated to
     * {@link #enqueueEntriesFTP}; all others are enqueued as depth-0 requests.
     * {@code null} entries in the list are skipped.
     *
     * @param initiator initiator hash for the created requests
     * @param profileHandle handle of the crawl profile to use
     * @param hyperlinks the URLs to enqueue; may contain {@code null} entries
     * @param replace whether existing index/queue/error entries are removed first
     * @param timezoneOffset time zone offset for the created requests
     * @throws IllegalCrawlProfileException if no profile exists for {@code profileHandle}
     */
    public void enqueueEntries(byte[] initiator, String profileHandle, List<AnchorURL> hyperlinks, boolean replace, int timezoneOffset) {
        final byte[] handle = UTF8.getBytes(profileHandle);
        final CrawlProfile profile2 = this.crawler.get(handle);
        if (profile2 == null) {
            // The list may contain null entries (they are skipped below), so the
            // single-URL message is only built when that one entry really exists.
            final AnchorURL single = hyperlinks.size() == 1 ? hyperlinks.get(0) : null;
            final String error = single != null
                    ? "Rejected URL : " + single.toNormalform(false) + ". Reason : LOST STACKER PROFILE HANDLE '" + profileHandle + "'"
                    : "Rejected " + hyperlinks.size() + " crawl entries. Reason : LOST STACKER PROFILE HANDLE '" + profileHandle + "'";
            log.info(error);
            throw new IllegalCrawlProfileException("Profile " + profileHandle + " is no more active");
        }
        if (replace) {
            // forget earlier failures for all affected hosts in one go
            final HashSet<String> hosthashes = new HashSet<String>();
            for (AnchorURL url : hyperlinks) {
                if (url == null) {
                    continue;
                }
                hosthashes.add(url.hosthash());
            }
            this.nextQueue.errorURL.removeHosts(hosthashes);
        }
        for (AnchorURL url : hyperlinks) {
            if (url == null) {
                continue;
            }
            final byte[] urlhash = url.hash();
            if (replace) {
                this.indexSegment.fulltext().remove(urlhash);
                // also drop the "index.html" variant of directory-like URLs
                String u = url.toNormalform(true);
                if (u.endsWith("/")) {
                    u = u + "index.html";
                } else if (!u.contains(".")) {
                    u = u + "/index.html";
                }
                try {
                    final byte[] uh = new DigestURL(u).hash();
                    this.indexSegment.fulltext().remove(uh);
                    this.nextQueue.noticeURL.removeByURLHash(uh);
                } catch (MalformedURLException ignored) {
                    // the derived variant is not a valid URL, so there is nothing to remove
                }
            }
            if (url.getProtocol().equals("ftp")) {
                this.enqueueEntriesFTP(initiator, profile2, url, replace, timezoneOffset);
                continue;
            }
            this.enqueueEntry(new Request(initiator, url, null, url.getNameProperty(), new Date(), profileHandle, 0, timezoneOffset));
        }
    }

    /**
     * Lists an FTP site in a newly started thread named "enqueueEntriesFTP" and
     * enqueues one request per listed entry.
     * <p>
     * Credentials are taken from the URL's user info ("user:password"). Without
     * user info the login is "anonymous"/"anomic"; user info without a ':' is
     * used as the user name with the default password. The depth of each request
     * is the number of path elements of the entry beyond those of
     * {@code ftpURL} (never negative).
     *
     * @param initiator initiator hash for the created requests
     * @param profile2 crawl profile; its depth bounds the listing and its handle is stored in the requests
     * @param ftpURL the FTP start URL
     * @param replace whether index and notice-queue entries of listed URLs are removed first
     * @param timezoneOffset time zone offset for the created requests
     */
    public void enqueueEntriesFTP(final byte[] initiator, final CrawlProfile profile2, final DigestURL ftpURL, final boolean replace, final int timezoneOffset) {
        final CrawlQueues cq = this.nextQueue;
        final String userInfo = ftpURL.getUserInfo();
        final int p = userInfo == null ? -1 : userInfo.indexOf(':');
        final String user;
        final String pw;
        if (userInfo == null || userInfo.isEmpty()) {
            user = "anonymous";
            pw = "anomic";
        } else if (p < 0) {
            // user name without password: substring(0, -1) would throw here
            user = userInfo;
            pw = "anomic";
        } else {
            user = userInfo.substring(0, p);
            pw = userInfo.substring(p + 1);
        }
        final String host = ftpURL.getHost();
        final int port = ftpURL.getPort();
        final int pathParts = ftpURL.getPaths().length;
        new Thread("enqueueEntriesFTP") {
            @Override
            public void run() {
                try {
                    final BlockingQueue<FTPClient.entryInfo> queue = FTPClient.sitelist(host, port, user, pw, ftpURL.getPath(), profile2.depth());
                    FTPClient.entryInfo entry2;
                    // the listing ends when the queue yields the poison entry
                    while ((entry2 = queue.take()) != FTPClient.POISON_entryInfo) {
                        final DigestURL url;
                        try {
                            url = new DigestURL("ftp://" + user + ":" + pw + "@" + host + (port == 21 ? "" : ":" + port) + String.valueOf(MultiProtocolURL.escape(entry2.name)));
                        } catch (MalformedURLException e) {
                            // skip entries that do not form a valid URL
                            continue;
                        }
                        final byte[] urlhash = url.hash();
                        if (replace) {
                            CrawlStacker.this.indexSegment.fulltext().remove(urlhash);
                            cq.noticeURL.removeByURLHash(urlhash);
                        }
                        final int nextDepth = Math.max(0, url.getPaths().length - pathParts);
                        CrawlStacker.this.enqueueEntry(new Request(initiator, url, null, MultiProtocolURL.unescape(entry2.name), entry2.date, profile2.handle(), nextDepth, timezoneOffset));
                    }
                } catch (IOException e1) {
                    ConcurrentLog.logException(e1);
                } catch (InterruptedException e) {
                    // stop listing and keep the interrupt status for the thread's owner
                    Thread.currentThread().interrupt();
                }
            }
        }.start();
    }

    /**
     * Stacks a single URL as a depth-0 "CRAWLING-ROOT" request, using the default
     * pack profile and this peer's seed hash as initiator.
     *
     * @param url the URL to crawl
     * @return the reject reason from {@link #stackCrawl(Request)}, or {@code null} if none
     */
    public String stackSimpleCrawl(DigestURL url) {
        final CrawlProfile pe = this.crawler.defaultPackProfile;
        // Encode the seed hash the same way stackCrawl() does when it compares
        // initiators, instead of relying on the platform default charset.
        final byte[] initiator = UTF8.getBytes(this.peers.mySeed().hash);
        return this.stackCrawl(new Request(initiator, url, null, "CRAWLING-ROOT", new Date(), pe.handle(), 0, 0));
    }

    /**
     * Checks a request and, if acceptable, pushes it onto the matching notice
     * stack.
     * <p>
     * Steps: look up the crawl profile; run {@link #checkAcceptanceChangeable}
     * and {@link #checkAcceptanceInitially}; classify the request as local,
     * proxy, remote and/or global; then push it to the NOLOAD, GLOBAL, LOCAL or
     * REMOTE stack. A warning returned by the push is only logged (level fine)
     * and does not count as a rejection.
     *
     * @param entry2 the request to stack
     * @return {@code null} if the request was handed to a notice stack, otherwise the reject reason
     */
    public String stackCrawl(Request entry2) {
        final byte[] handle = UTF8.getBytes(entry2.profileHandle());
        final CrawlProfile profile2 = this.crawler.get(handle);
        if (profile2 == null) {
            final String lost = "LOST STACKER PROFILE HANDLE '" + entry2.profileHandle() + "' for URL " + entry2.url().toNormalform(true);
            log.info(lost);
            return lost;
        }

        String error = this.checkAcceptanceChangeable(entry2.url(), profile2, entry2.depth());
        if (error != null) {
            return error;
        }
        error = this.checkAcceptanceInitially(entry2.url(), profile2);
        if (error != null) {
            return error;
        }

        // classify where the request comes from
        final boolean local = Base64Order.enhancedCoder.equal(entry2.initiator(), UTF8.getBytes(this.peers.mySeed().hash));
        final boolean proxy = (entry2.initiator() == null || entry2.initiator().length == 0 || ASCII.String(entry2.initiator()).equals("------------")) && profile2.handle().equals(this.crawler.defaultProxyProfile.handle());
        final boolean remote = profile2.handle().equals(this.crawler.defaultRemoteProfile.handle());
        final boolean global = profile2.remoteIndexing() && entry2.depth() == profile2.depth() && (this.peers.mySeed().isSenior() || this.peers.mySeed().isPrincipal());
        if (!(local || global || remote || proxy)) {
            error = "URL '" + entry2.url().toString() + "' cannot be crawled. initiator = " + (entry2.initiator() == null ? "" : ASCII.String(entry2.initiator())) + ", profile.handle = " + profile2.handle();
            log.severe(error);
            return error;
        }

        String warning = null;
        // A non-null result of supportsExtension is treated as "extension not supported".
        if (!profile2.isCrawlerAlwaysCheckMediaType() && TextParser.supportsExtension(entry2.url()) != null) {
            if (profile2.isIndexNonParseableUrls()) {
                warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.NOLOAD, entry2, profile2, this.robots);
                if (warning != null && log.isFine()) {
                    log.fine("CrawlStacker.stackCrawl of URL " + entry2.url().toNormalform(true) + " - not pushed to " + NoticedURL.StackType.NOLOAD + " stack : " + warning);
                }
                return null;
            }
            error = "URL '" + entry2.url().toString() + "' file extension is not supported and indexing of linked non-parsable documents is disabled.";
            log.info(error);
            return error;
        }

        // precedence: global, then local, then proxy, then remote
        if (global) {
            if (proxy) {
                log.warn("URL '" + entry2.url().toString() + "' has conflicting initiator properties: global = true, proxy = true, initiator = proxy, profile.handle = " + profile2.handle());
            }
            if (remote) {
                log.warn("URL '" + entry2.url().toString() + "' has conflicting initiator properties: global = true, remote = true, initiator = " + ASCII.String(entry2.initiator()) + ", profile.handle = " + profile2.handle());
            }
            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.GLOBAL, entry2, profile2, this.robots);
        } else if (local) {
            if (proxy) {
                log.warn("URL '" + entry2.url().toString() + "' has conflicting initiator properties: local = true, proxy = true, initiator = proxy, profile.handle = " + profile2.handle());
            }
            if (remote) {
                log.warn("URL '" + entry2.url().toString() + "' has conflicting initiator properties: local = true, remote = true, initiator = " + ASCII.String(entry2.initiator()) + ", profile.handle = " + profile2.handle());
            }
            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.LOCAL, entry2, profile2, this.robots);
        } else if (proxy) {
            if (remote) {
                log.warn("URL '" + entry2.url().toString() + "' has conflicting initiator properties: proxy = true, remote = true, initiator = " + ASCII.String(entry2.initiator()) + ", profile.handle = " + profile2.handle());
            }
            // proxy requests share the LOCAL stack
            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.LOCAL, entry2, profile2, this.robots);
        } else if (remote) {
            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.REMOTE, entry2, profile2, this.robots);
        }
        if (warning != null && log.isFine()) {
            log.fine("CrawlStacker.stackCrawl of URL " + entry2.url().toNormalform(true) + " - not pushed: " + warning);
        }
        return null;
    }

    /**
     * Acceptance checks that depend on the current crawl and index state: is the
     * URL already known to the crawl queues, has the profile's per-domain page
     * limit been reached, and is an already indexed document old enough to be
     * crawled again.
     *
     * @param url the URL to check
     * @param profile2 the crawl profile supplying the page limit and the recrawl date
     * @return {@code null} if the URL is accepted, otherwise the reject reason
     */
    public String checkAcceptanceInitially(DigestURL url, CrawlProfile profile2) {
        // already present in one of the crawl queues?
        final HarvestProcess knownIn = this.nextQueue.exists(url.hash());
        if (knownIn != null) {
            return "double in: " + knownIn.name();
        }

        final String counterLogUrl = url.toNormalform(false);
        final long lastLoadTime = this.indexSegment.getLoadTime(url.hash());

        // per-domain page limit; 0 and Integer.MAX_VALUE switch the check off
        final int pageLimit = profile2.domMaxPages();
        if (pageLimit > 0 && pageLimit < Integer.MAX_VALUE) {
            final AtomicInteger hostCounter = profile2.getCount(url.getHost());
            if (hostCounter != null && hostCounter.get() >= pageLimit) {
                if (log.isFine()) {
                    log.fine("URL '" + counterLogUrl + "' appeared too often in crawl stack, a maximum of " + pageLimit + " is allowed.");
                }
                return "crawl stack domain counter exceeded (test by profile)";
            }
        }

        // a negative load time means there is nothing to compare against
        if (lastLoadTime < 0L) {
            return null;
        }

        final boolean oldEnough = profile2.recrawlIfOlder() > lastLoadTime;
        final String recrawlLogUrl = url.toNormalform(false);
        if (!oldEnough) {
            return "double in: local index, recrawl rejected. Document date = " + ISO8601Formatter.FORMATTER.format(new Date(lastLoadTime)) + " is not older than crawl profile recrawl minimum date = " + ISO8601Formatter.FORMATTER.format(new Date(profile2.recrawlIfOlder()));
        }
        if (log.isFine()) {
            final long ageInDays = (System.currentTimeMillis() - lastLoadTime) / 60000L / 60L / 24L;
            log.fine("RE-CRAWL of URL '" + recrawlLogUrl + "': this url was crawled " + ageInDays + " days ago.");
        }
        return null;
    }

    /**
     * Acceptance checks that depend only on the URL, the profile settings and the
     * crawl depth: supported protocol, accepted domain, blacklist, URL
     * must-match / must-not-match patterns, individual and POST URLs, IP
     * must-match / must-not-match patterns and the country list.
     * <p>
     * The URL pattern, IP pattern and country checks are applied only for
     * {@code depth > 0}, i.e. not to start URLs.
     *
     * @param url the URL to check
     * @param profile2 the crawl profile supplying the filters
     * @param depth crawl depth of the URL
     * @return {@code null} if the URL is accepted, otherwise the reject reason
     */
    public String checkAcceptanceChangeable(DigestURL url, CrawlProfile profile2, int depth) {
        final String urlProtocol = url.getProtocol();
        final String urlstring = url.toNormalform(true);
        if (!Switchboard.getSwitchboard().loader.isSupportedProtocol(urlProtocol)) {
            log.severe("Unsupported protocol in URL '" + urlstring + "'.");
            return "unsupported protocol";
        }
        final String urlRejectReason = this.urlInAcceptedDomain(url);
        if (urlRejectReason != null) {
            if (log.isFine()) {
                log.fine("URL not in accepted Domain (" + urlRejectReason + ")");
            }
            return "denied_(" + urlRejectReason + ")";
        }
        if (Switchboard.urlBlacklist.isListed(Blacklist.BlacklistType.CRAWLER, url)) {
            if (log.isFine()) {
                log.fine("URL '" + urlstring + "' is in blacklist.");
            }
            return "url in blacklist";
        }
        if (depth > 0 && !profile2.urlMustMatchPattern().matcher(urlstring).matches()) {
            final String patternStr = profile2.formattedUrlMustMatchPattern();
            if (log.isFine()) {
                log.fine("URL '" + urlstring + "' does not match must-match crawling filter '" + patternStr + "'.");
            }
            return ERROR_NO_MATCH_MUST_MATCH_FILTER + patternStr;
        }
        if (depth > 0 && profile2.urlMustNotMatchPattern().matcher(urlstring).matches()) {
            if (log.isFine()) {
                log.fine("URL '" + urlstring + "' matches must-not-match crawling filter '" + profile2.urlMustNotMatchPattern().toString() + "'.");
            }
            return ERROR_MATCH_WITH_MUST_NOT_MATCH_FILTER + profile2.urlMustNotMatchPattern().toString();
        }
        if (url.isIndividual() && !profile2.crawlingQ()) {
            if (log.isFine()) {
                log.fine("URL '" + urlstring + "' is CGI URL.");
            }
            return "individual url (sessionid etc) not wanted";
        }
        if (url.isPOST() && !profile2.crawlingQ()) {
            if (log.isFine()) {
                log.fine("URL '" + urlstring + "' is post URL.");
            }
            return "post url not allowed";
        }
        // NOTE(review): getInetAddress() is dereferenced without a null check, as
        // before; confirm whether it can return null for unresolvable hosts.
        if (depth > 0 && profile2.ipMustMatchPattern() != CrawlProfile.MATCH_ALL_PATTERN && url.getHost() != null) {
            // resolve the address once and reuse it for the check and the messages
            final String ip = url.getInetAddress().getHostAddress();
            if (!profile2.ipMustMatchPattern().matcher(ip).matches()) {
                if (log.isFine()) {
                    log.fine("IP " + ip + " of URL '" + urlstring + "' does not match must-match crawling filter '" + profile2.ipMustMatchPattern().toString() + "'.");
                }
                return "ip " + ip + " of url does not match must-match filter";
            }
        }
        if (depth > 0 && profile2.ipMustNotMatchPattern() != CrawlProfile.MATCH_NEVER_PATTERN && url.getHost() != null) {
            final String ip = url.getInetAddress().getHostAddress();
            if (profile2.ipMustNotMatchPattern().matcher(ip).matches()) {
                if (log.isFine()) {
                    log.fine("IP " + ip + " of URL '" + urlstring + "' matches must-not-match crawling filter '" + profile2.ipMustNotMatchPattern().toString() + "'.");
                }
                return "ip " + ip + " of url matches must-not-match filter";
            }
        }
        final String[] countryMatchList = profile2.countryMustMatchList();
        if (depth > 0 && countryMatchList != null && countryMatchList.length > 0) {
            final Locale locale = url.getLocale();
            // a URL without a locale is not subject to the country filter
            if (locale != null) {
                final String c0 = locale.getCountry();
                boolean granted = false;
                for (String c : countryMatchList) {
                    if (c0.equals(c)) {
                        granted = true;
                        break;
                    }
                }
                if (!granted) {
                    if (log.isFine()) {
                        // report the country and the country list, not the IP filter
                        log.fine("Country " + c0 + " of URL '" + urlstring + "' does not match must-match country list '" + String.join(",", countryMatchList) + "'.");
                    }
                    return "country " + c0 + " of url does not match must-match filter for countries";
                }
            }
        }
        return null;
    }

    /**
     * Tests whether a URL belongs to a domain this stacker accepts.
     * The URL must be listed in the domain filter (if one is set), and its
     * local/global classification must be enabled for this stacker.
     *
     * @param url the URL to test, may be {@code null}
     * @return {@code null} if the URL is accepted, otherwise a description of why it is not
     */
    public String urlInAcceptedDomain(DigestURL url) {
        if (url == null) {
            return "url is null";
        }
        if (this.domainList != null && !this.domainList.isListed(url, null)) {
            return "the url '" + url + "' is not in domainList of this network";
        }

        final boolean isLocal = url.isLocal();
        final boolean accepted = isLocal ? this.acceptLocalURLs : this.acceptGlobalURLs;
        if (accepted) {
            return null;
        }

        // rejected: build an explanation that includes the resolved address
        final String host = url.getHost();
        if (host == null) {
            return "url.host is null (you must switch to intranet mode to crawl these sources)";
        }
        final InetAddress resolved = Domains.dnsResolve(host);
        if (isLocal) {
            final String address = resolved == null ? "DNS lookup resulted in null (unknown host name)" : resolved.getHostAddress();
            return "the host '" + host + "' is local, but local addresses are not accepted: " + address;
        }
        final String address = resolved == null ? "null" : resolved.getHostAddress();
        return "the host '" + host + "' is global, but global addresses are not accepted: " + address;
    }

    /**
     * Variant of {@link #urlInAcceptedDomain(DigestURL)} that works on a URL hash:
     * the local/global classification is derived from the hash alone and the
     * domain filter is not consulted.
     *
     * @param urlhash the URL hash to test, may be {@code null}
     * @return {@code null} if the hash is accepted, otherwise a description of why it is not
     */
    public String urlInAcceptedDomainHash(byte[] urlhash) {
        if (urlhash == null) {
            return "url is null";
        }

        final boolean isLocal = DigestURL.isLocal(urlhash);
        final boolean accepted = isLocal ? this.acceptLocalURLs : this.acceptGlobalURLs;
        if (accepted) {
            return null;
        }

        if (isLocal) {
            return "the urlhash '" + ASCII.String(urlhash) + "' is local, but local addresses are not accepted";
        }
        return "the urlhash '" + ASCII.String(urlhash) + "' is global, but global addresses are not accepted";
    }

    /**
     * @return whether this stacker accepts local URLs
     */
    public boolean acceptLocalURLs() {
        return acceptLocalURLs;
    }

    /**
     * @return whether this stacker accepts non-local (global) URLs
     */
    public boolean acceptGlobalURLs() {
        return acceptGlobalURLs;
    }
}

