package it.unimi.di.law.bubing.test;

import com.google.common.base.Charsets;
import com.google.common.io.Files;
import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.Switch;
import it.unimi.di.law.bubing.parser.HTMLParser;
import it.unimi.di.law.bubing.store.UnbufferedFileStore;
import it.unimi.di.law.warc.io.CompressedWarcReader;
import it.unimi.di.law.warc.io.WarcFormatException;
import it.unimi.di.law.warc.io.gzarc.GZIPArchive;
import it.unimi.di.law.warc.records.HttpResponseWarcRecord;
import it.unimi.di.law.warc.records.WarcRecord;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.io.FastBufferedInputStream;
import it.unimi.dsi.fastutil.longs.LongBigList;
import it.unimi.dsi.fastutil.objects.Object2LongFunction;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.URI;
import java.util.Iterator;
import java.util.Set;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:WEB-INF/lib/bubing-0.9.11.jar:it/unimi/di/law/bubing/test/LoadPage.class */
/**
 * Command-line utility that loads one or more records from a compressed WARC store, parses each
 * one with {@link HTMLParser}, logs the links that were found and copies the HTTP entity of the
 * record to standard output.
 *
 * <p>Records are addressed by their ordinal in the index of store offsets; the ordinal is either
 * given directly ({@code --position}) or looked up from a URL through a serialized
 * URL-to-ordinal map ({@code --url} together with {@code --map}). With {@code --file} the
 * {@code --position}/{@code --url} value is the name of a UTF-8 text file holding one entry per
 * line.
 */
public class LoadPage {
    private static final Logger LOGGER = LoggerFactory.getLogger(LoadPage.class);

    /**
     * Fourth constructor argument handed to {@link HTMLParser} when {@code --all} is not given.
     * NOTE(review): presumably the parser's buffer size, as the {@code --all} help text talks
     * about the parser buffer — confirm against HTMLParser.
     */
    private static final int BOUNDED_PARSER_BUFFER_SIZE = 128 * 1024;

    /**
     * Parses the command line, resolves the requested record ordinals and processes each of them
     * with {@link #extractURLsandContent(long, String, LongBigList, boolean)}.
     *
     * <p>Nothing is loaded or processed when the option combination is unusable: neither or both
     * of {@code position}/{@code url} given, or {@code url} given without {@code map}. An error
     * is logged and the method returns.
     *
     * @param strArr the command-line arguments.
     * @throws IOException if the index, the map, a list file or the store cannot be read.
     * @throws JSAPException if the option set cannot be built.
     * @throws ClassNotFoundException if a serialized index or map refers to an unknown class.
     */
    public static void main(String[] strArr) throws IOException, JSAPException, ClassNotFoundException {
        SimpleJSAP jsap = new SimpleJSAP(LoadPage.class.getName(), "LoadPage", new Parameter[] {
            new FlaggedOption("position", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, false, 'p', "position", "The position"),
            new FlaggedOption("index", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, true, 'i', "index", "The index of the positions"),
            new FlaggedOption("url", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, false, 'u', "url", "The url"),
            new FlaggedOption("map", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, false, 'm', "map", "The map url 2 positions (usually created by using constposurl)"),
            new FlaggedOption(UnbufferedFileStore.STORE_NAME, JSAP.STRING_PARSER, JSAP.NO_DEFAULT, true, 's', UnbufferedFileStore.STORE_NAME, "The store file"),
            new Switch("file", 'f', "file", "if true positions or url are name of files"),
            new Switch("all", 'a', "all", "if true the buffer of HTMLParser is infinite")
        });
        JSAPResult args = jsap.parse(strArr);
        if (jsap.messagePrinted()) {
            return;
        }

        boolean hasPosition = args.userSpecified("position");
        boolean hasUrl = args.userSpecified("url");
        boolean hasMap = args.userSpecified("map");
        if (!hasPosition && !hasUrl) {
            LOGGER.error("You should specify the url or the store position");
            return;
        }
        if (hasPosition && hasUrl) {
            LOGGER.error("You cannot specify both the url and the store position");
            return;
        }
        if (hasUrl != hasMap) {
            LOGGER.error("You should specify both map and url");
            if (hasUrl) {
                // Without a map there is no way to translate the URL into a position.
                return;
            }
            // A map without a url is merely superfluous: the explicit position is still usable.
        }

        boolean unboundedBuffer = args.getBoolean("all");
        String storeFile = args.getString(UnbufferedFileStore.STORE_NAME);
        LongBigList index = (LongBigList) BinIO.loadObject(args.getString("index"));

        for (long position : resolvePositions(args)) {
            extractURLsandContent(position, storeFile, index, unboundedBuffer);
        }
    }

    /**
     * Loads the record with the given ordinal from the store, parses it, logs its URI, outgoing
     * links and headers, and copies its HTTP entity to {@link System#out}.
     *
     * <p>A parsing failure is logged and does not abort the method: whatever links were collected
     * up to that point are still reported and returned. The store file is opened for the
     * duration of the call and closed before returning, also on failure.
     *
     * @param position ordinal of the record in {@code index}; {@code -1} means "not found".
     * @param storeFile name of the compressed WARC store file.
     * @param index list mapping record ordinals to offsets in the store.
     * @param unboundedBuffer if true the parser is created with its no-argument constructor,
     *     otherwise with {@link #BOUNDED_PARSER_BUFFER_SIZE}.
     * @return the links collected while parsing, or {@code null} if {@code position} is {@code -1}.
     * @throws FileNotFoundException if the store file does not exist.
     * @throws IOException if the store cannot be read.
     * @throws WarcFormatException if the record is not a well-formed WARC record.
     * @throws GZIPArchive.FormatException if the compressed container is malformed.
     */
    public static Set<URI> extractURLsandContent(long position, String storeFile, LongBigList index, boolean unboundedBuffer) throws FileNotFoundException, IOException, WarcFormatException, GZIPArchive.FormatException {
        if (position == -1) {
            LOGGER.error("The position was -1");
            return null;
        }
        try (FileInputStream rawStore = new FileInputStream(new File(storeFile));
                FastBufferedInputStream bufferedStore = new FastBufferedInputStream(rawStore)) {
            CompressedWarcReader reader = new CompressedWarcReader(bufferedStore);
            long offset = index.getLong(position);
            reader.position(offset);
            WarcRecord record = reader.read();
            URI uri = record.getWarcTargetURI();
            LOGGER.info("Processing page {}", uri);

            HTMLParser parser = unboundedBuffer ? new HTMLParser() : new HTMLParser(null, null, false, BOUNDED_PARSER_BUFFER_SIZE);
            LOGGER.debug("Processing record for page {}", uri);
            LOGGER.trace("The record is {}", record);

            HTMLParser.SetLinkReceiver linkReceiver = new HTMLParser.SetLinkReceiver();
            try {
                parser.parse(uri, (HttpResponseWarcRecord) record, linkReceiver);
            } catch (Exception e) {
                // Best effort: keep going so that headers and content are still dumped.
                LOGGER.error("Unexpected exception during parsing", e);
            }
            Set<URI> links = linkReceiver.urls;
            LOGGER.info("URI:\t{}\tLINKS:\t{}", uri, outlinksToString(links));
            LOGGER.info("HEADERs:\t{}", record);

            // Re-read the record from its offset to obtain a fresh entity stream for the dump.
            reader.position(offset);
            IOUtils.copy(((HttpResponseWarcRecord) reader.read()).getEntity().getContent(), System.out);
            return links;
        }
    }

    /**
     * Turns the {@code position}/{@code url} option into the record ordinals to process.
     *
     * <p>URLs are translated through the map named by the {@code map} option; a URL the map does
     * not know yields {@code -1}.
     */
    private static long[] resolvePositions(JSAPResult args) throws IOException, ClassNotFoundException {
        boolean fromFile = args.getBoolean("file");

        if (args.userSpecified("position")) {
            String value = args.getString("position");
            if (!fromFile) {
                return new long[] { Long.parseLong(value) };
            }
            String[] lines = readNonBlankLines(value);
            long[] positions = new long[lines.length];
            for (int i = 0; i < lines.length; i++) {
                positions[i] = Long.parseLong(lines[i]);
            }
            return positions;
        }

        String value = args.getString("url");
        Object2LongFunction<?> urlToPosition = (Object2LongFunction<?>) BinIO.loadObject(args.getString("map"));
        urlToPosition.defaultReturnValue(-1L);
        String[] urls = fromFile ? readNonBlankLines(value) : new String[] { value };
        long[] positions = new long[urls.length];
        for (int i = 0; i < urls.length; i++) {
            positions[i] = urlToPosition.getLong(urls[i]);
        }
        return positions;
    }

    /**
     * Reads a UTF-8 text file and returns its lines with surrounding whitespace removed, skipping
     * blank lines, so that CRLF line endings and empty lines do not produce bogus entries.
     */
    private static String[] readNonBlankLines(String fileName) throws IOException {
        String[] lines = Files.toString(new File(fileName), Charsets.UTF_8).split("\n");
        int kept = 0;
        for (String line : lines) {
            String trimmed = line.trim();
            if (!trimmed.isEmpty()) {
                // Compact in place: kept never runs ahead of the element being read.
                lines[kept++] = trimmed;
            }
        }
        String[] result = new String[kept];
        System.arraycopy(lines, 0, result, 0, kept);
        return result;
    }

    /** Renders the given links as a single string, each link followed by a tab character. */
    private static String outlinksToString(Set<URI> links) {
        StringBuilder joined = new StringBuilder();
        for (URI link : links) {
            joined.append(link).append('\t');
        }
        return joined.toString();
    }
}
