package it.unimi.di.law.bubing.test;

import com.google.common.io.Files;
import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import it.unimi.di.law.bubing.parser.HTMLParser;
import it.unimi.di.law.bubing.store.UnbufferedFileStore;
import it.unimi.di.law.bubing.store.WarcStore;
import it.unimi.di.law.warc.io.CompressedWarcReader;
import it.unimi.di.law.warc.records.HttpResponseWarcRecord;
import it.unimi.di.law.warc.records.WarcRecord;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.io.FastBufferedInputStream;
import it.unimi.dsi.fastutil.longs.LongBigList;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import it.unimi.dsi.logging.ProgressLogger;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.net.URI;
import java.util.Arrays;
import java.util.Iterator;
import java.util.Set;
import org.apache.commons.io.Charsets;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:WEB-INF/lib/bubing-0.9.11.jar:it/unimi/di/law/bubing/test/ExtractPagesAndCheckOccurrenceOfLinks.class */
/**
 * Command-line tool that checks, for a list of URLs grouped into components, which components
 * are linked from a given set of stored pages.
 *
 * <p>Inputs (see {@link #main(String[])}):
 * <ul>
 *   <li><em>member</em>: tab-separated file; the second column is parsed as an integer component
 *       and the third column is the URL to look for (the first column is not read here);</li>
 *   <li><em>repeated</em>: tab-separated file; the first column is an integer used to look up a
 *       position in the index, the second column is only used for logging;</li>
 *   <li><em>index</em>: a serialized {@link LongBigList} of positions passed to the store reader;</li>
 *   <li><em>store</em>: the compressed WARC file the records are read from.</li>
 * </ul>
 *
 * <p>All results are kept in static fields that are populated as a side effect of constructing an
 * instance; {@code main} then prints every component index for which no link was found.
 */
public class ExtractPagesAndCheckOccurrenceOfLinks {
    private static final Logger LOGGER = LoggerFactory.getLogger(ExtractPagesAndCheckOccurrenceOfLinks.class);

    /** Parser used to extract the links of every inspected record. */
    HTMLParser<Void> htmlParser = new HTMLParser<>();

    /** URLs to look for (third column of the member file), in file order. */
    private static String[] urlToBeFoundArray;
    /** Non-empty lines of the repeated file. */
    private static String[] urlRepeatedArray;
    /** Component (second column of the member file) of the URL at the same position in {@link #urlToBeFoundArray}. */
    private static int[] fromUrlToBeFoundToComponent;
    /** Maps each URL to be found to its position in {@link #urlToBeFoundArray}. */
    private static Object2IntOpenHashMap<String> urlToBeFoundMap;
    /** {@code found[i]} is set once the i-th URL to be found has been seen as a link. */
    private static boolean[] found;
    /** {@code foundComponent[c]} is set once any URL of component {@code c} has been seen as a link. */
    private static boolean[] foundComponent;
    /** Positions handed to the store reader, loaded from {@link #indexFileName}. */
    private static LongBigList index;

    private static String urlToBeFoundList = "member";
    private static String urlRepeatedList = "repeated";
    private static String indexFileName = "store.index";
    private static String storeFileName = WarcStore.STORE_NAME;

    /**
     * Loads the index and both URL lists, then reads and processes every record referenced by the
     * repeated list, filling the static result arrays.
     *
     * @throws IOException if one of the input files cannot be read.
     * @throws ClassNotFoundException if the serialized index cannot be deserialized.
     * @throws IllegalArgumentException if a line of an input file has too few columns, a
     *         non-numeric value where a number is expected, or a negative component.
     */
    ExtractPagesAndCheckOccurrenceOfLinks() throws IOException, ClassNotFoundException {
        index = (LongBigList) BinIO.loadObject(indexFileName);
        String memberText = Files.toString(new File(urlToBeFoundList), Charsets.UTF_8);
        String repeatedText = Files.toString(new File(urlRepeatedList), Charsets.UTF_8);
        loadUrlsToBeFound(splitNonEmptyLines(memberText));
        urlRepeatedArray = splitNonEmptyLines(repeatedText);
        inspectRepeatedPages();
    }

    /**
     * Splits a text into its lines, accepting both LF and CRLF terminators and dropping empty
     * lines, so that a trailing carriage return never ends up inside a column value.
     */
    private static String[] splitNonEmptyLines(String text) {
        String[] lines = text.split("[\\r\\n]+");
        int kept = 0;
        // Compact in place: kept never exceeds the index being read.
        for (String line : lines) {
            if (!line.isEmpty()) {
                lines[kept++] = line;
            }
        }
        return Arrays.copyOf(lines, kept);
    }

    /** Parses the member lines and initializes the lookup map and the result arrays. */
    private static void loadUrlsToBeFound(String[] lines) {
        final int n = lines.length;
        urlToBeFoundArray = new String[n];
        fromUrlToBeFoundToComponent = new int[n];
        final int[] positions = new int[n];
        int maxComponent = -1;
        for (int i = 0; i < n; i++) {
            LOGGER.trace("Parsing line {}", lines[i]);
            final String[] columns = lines[i].split("\t");
            if (columns.length < 3) {
                throw new IllegalArgumentException("Expected three tab-separated columns in " + urlToBeFoundList + " but found: " + lines[i]);
            }
            final int component = Integer.parseInt(columns[1]);
            if (component < 0) {
                // A negative component could not be used as an index into foundComponent.
                throw new IllegalArgumentException("Negative component in " + urlToBeFoundList + ": " + lines[i]);
            }
            urlToBeFoundArray[i] = columns[2];
            fromUrlToBeFoundToComponent[i] = component;
            positions[i] = i;
            if (component > maxComponent) {
                maxComponent = component;
            }
        }
        LOGGER.info("The maximum component is " + maxComponent);
        foundComponent = new boolean[maxComponent + 1];
        found = new boolean[n];
        if (LOGGER.isDebugEnabled()) {
            LOGGER.debug("Creating map urlToBeFoundMap with keys:" + Arrays.toString(urlToBeFoundArray));
        }
        urlToBeFoundMap = new Object2IntOpenHashMap<>(urlToBeFoundArray, positions);
    }

    /**
     * Reads from the store the record of every line of the repeated list and processes it.
     * The store streams are closed when the scan ends, whether normally or with an exception.
     */
    private void inspectRepeatedPages() throws IOException {
        try (FileInputStream fileStream = new FileInputStream(new File(storeFileName));
                FastBufferedInputStream storeStream = new FastBufferedInputStream(fileStream)) {
            final CompressedWarcReader reader = new CompressedWarcReader(storeStream);
            final ProgressLogger progressLogger = new ProgressLogger(LOGGER);
            progressLogger.expectedUpdates = urlRepeatedArray.length;
            // The logger must be started before updates so that its timings are meaningful.
            progressLogger.start("Inspecting pages...");
            for (String line : urlRepeatedArray) {
                progressLogger.lightUpdate();
                final String[] columns = line.split("\t");
                if (columns.length < 2) {
                    throw new IllegalArgumentException("Expected two tab-separated columns in " + urlRepeatedList + " but found: " + line);
                }
                reader.position(index.getLong(Integer.parseInt(columns[0])));
                final WarcRecord record = reader.read();
                LOGGER.debug("Processing page {}", columns[1]);
                process(record);
            }
            progressLogger.done();
        }
    }

    /** Returns the links collected by the receiver, each followed by a tab character. */
    private String outlinksToString(HTMLParser.SetLinkReceiver setLinkReceiver) {
        final StringBuilder links = new StringBuilder();
        final Iterator<URI> iterator = setLinkReceiver.iterator();
        while (iterator.hasNext()) {
            links.append(iterator.next()).append('\t');
        }
        return links.toString();
    }

    /**
     * Parses a record and marks as found every URL to be found (and its component) that appears
     * among the links of the page.
     *
     * <p>Any exception raised while parsing or matching is logged and swallowed, so that one bad
     * record does not stop the scan of the remaining pages.
     *
     * @param warcRecord the record to inspect; it is cast to {@link HttpResponseWarcRecord}.
     */
    public void process(WarcRecord warcRecord) {
        final URI pageUri = warcRecord.getWarcTargetURI();
        LOGGER.debug("Processing record for page {}", pageUri);
        LOGGER.trace("The record is {}", warcRecord);
        final HTMLParser.SetLinkReceiver linkReceiver = new HTMLParser.SetLinkReceiver();
        try {
            this.htmlParser.parse(pageUri, (HttpResponseWarcRecord) warcRecord, linkReceiver);
            final Set<URI> links = linkReceiver.urls;
            if (LOGGER.isDebugEnabled()) {
                // Guarded: building the link list is wasted work when debug logging is off.
                LOGGER.debug("URI:\t{}\tLINKS:\t{}", pageUri, outlinksToString(linkReceiver));
            }
            for (URI uri : links) {
                final String url = uri.toString();
                if (urlToBeFoundMap.containsKey(url)) {
                    final int position = urlToBeFoundMap.getInt(url);
                    final int component = fromUrlToBeFoundToComponent[position];
                    found[position] = true;
                    foundComponent[component] = true;
                    LOGGER.info("Found URL " + uri + " in page with URL " + pageUri + " - Found link to COMPONENT " + component);
                }
            }
        } catch (Exception e) {
            LOGGER.error("Unexpected exception during parsing", e);
        }
    }

    /**
     * Parses the command line, runs the scan and prints to standard output, one per line, the
     * index of every component for which no link was found.
     *
     * @param strArr the command-line arguments ({@code -m}, {@code -r}, {@code -i}, {@code -s}).
     * @throws IOException if one of the input files cannot be read.
     * @throws JSAPException if the command-line parser cannot be configured.
     * @throws ClassNotFoundException if the serialized index cannot be deserialized.
     */
    public static void main(String[] strArr) throws IOException, JSAPException, ClassNotFoundException {
        SimpleJSAP simpleJSAP = new SimpleJSAP(
                ExtractPagesAndCheckOccurrenceOfLinks.class.getName(),
                "This program wants to test for each component in member whether there exists an url in the component contained in a page specified by repeated. In other words, for each url in repeated, we inspect the corresponding page and we mark the urls in member (and the components) seen  ",
                new Parameter[]{
                    new FlaggedOption("member", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, true, 'm', "member", "The three colum file containing the list of URLs to be found: the first column is the position, the second is the component, the third is the url name"),
                    new FlaggedOption("repeated", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, true, 'r', "repeated", "The two column file containing the list of URLs whose record has to be inspected: the first column is the position while the second is the url name"),
                    new FlaggedOption("index", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, true, 'i', "index", "The index of the positions"),
                    new FlaggedOption(UnbufferedFileStore.STORE_NAME, JSAP.STRING_PARSER, JSAP.NO_DEFAULT, true, 's', UnbufferedFileStore.STORE_NAME, "The store file")
                });
        JSAPResult parse = simpleJSAP.parse(strArr);
        if (simpleJSAP.messagePrinted()) {
            return;
        }
        urlToBeFoundList = parse.getString("member");
        urlRepeatedList = parse.getString("repeated");
        indexFileName = parse.getString("index");
        storeFileName = parse.getString(UnbufferedFileStore.STORE_NAME);
        // Constructing the instance performs the whole scan and fills the static result arrays.
        new ExtractPagesAndCheckOccurrenceOfLinks();
        for (int component = 0; component < foundComponent.length; component++) {
            if (!foundComponent[component]) {
                System.out.println(component);
            }
        }
    }
}
