package it.unimi.di.law.warc.processors;

import it.unimi.di.law.bubing.spam.sztaki.SztakiSpamDetector;
import it.unimi.di.law.spam.SpamConfig;
import it.unimi.di.law.spam.TermsFrequencyAccumulator;
import it.unimi.di.law.warc.processors.ParallelFilteredProcessorRunner;
import it.unimi.di.law.warc.processors.SpamTermExtractorProcessor;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import it.unimi.dsi.fastutil.objects.Object2ObjectMap;
import it.unimi.dsi.fastutil.objects.Object2ObjectOpenHashMap;
import it.unimi.dsi.fastutil.objects.ObjectIterator;
import it.unimi.dsi.fastutil.shorts.Short2ShortOpenHashMap;
import it.unimi.dsi.logging.ProgressLogger;
import java.io.IOException;
import java.io.PrintStream;
import java.net.URI;
import java.util.Date;
import java.util.Map;
import java.util.TimeZone;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.time.FastDateFormat;
import org.hsqldb.DatabaseURL;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:WEB-INF/lib/bubing-0.9.11.jar:it/unimi/di/law/warc/processors/SpamTermHostWriter.class */
public class SpamTermHostWriter implements ParallelFilteredProcessorRunner.Writer<SpamTermExtractorProcessor.UriTermsFrequencySize> {
    private static final Logger LOGGER;
    private static Object2IntOpenHashMap<String> host2WrittenPages;
    public static final Object2ObjectOpenHashMap<String, TermsFrequencyAccumulator> host2Terms;
    private static PrintStream out;
    private static SztakiSpamDetector detector;
    static ProgressLogger pl;
    static final /* synthetic */ boolean $assertionsDisabled;

    /* JADX WARN: Multi-variable type inference failed */
    @Override // java.io.Closeable, java.lang.AutoCloseable
    public void close() throws IOException {
        pl.done();
        pl.start("Starting writing");
        ObjectIterator<String> it2 = host2Terms.object2ObjectEntrySet().iterator();
        while (it2.hasNext()) {
            Object2ObjectMap.Entry entry = (Object2ObjectMap.Entry) it2.next();
            pl.lightUpdate();
            LOGGER.info("Processing HOST " + ((String) entry.getKey()));
            writeDataForEntry(SpamConfig.PW_WARC, (String) entry.getKey(), (TermsFrequencyAccumulator) entry.getValue());
            if (entry != null) {
                for (int i = 0; i < SpamConfig.PAGE_THRESHOLDs.length; i++) {
                    if (host2WrittenPages.getInt(entry.getKey()) < SpamConfig.PAGE_THRESHOLDs[i]) {
                        writeDataForEntry(SpamConfig.PAGE_THRESHOLDs_STREAM[i], (String) entry.getKey(), (TermsFrequencyAccumulator) entry.getValue());
                    }
                }
            }
        }
        SpamConfig.close();
        pl.done();
    }

    /* JADX WARN: Type inference failed for: r0v28, types: [it.unimi.dsi.fastutil.objects.ObjectSet] */
    private void writeDataForEntry(FastBufferedOutputStream fastBufferedOutputStream, String str, TermsFrequencyAccumulator termsFrequencyAccumulator) {
        try {
            String str2 = DatabaseURL.S_HTTP + str + "/";
            fastBufferedOutputStream.write(toByteArray("WARC/1.0\r\n"));
            String format = FastDateFormat.getInstance("yyyy-MM-dd'T'HH:mm:ss'Z'").format(new Date());
            fastBufferedOutputStream.write(toByteArray("WARC-Date: " + format + "\r\n"));
            fastBufferedOutputStream.write(toByteArray("WARC-Target-URI: " + str2 + "\r\n"));
            fastBufferedOutputStream.write(toByteArray("WARC-Type: " + StringUtils.lowerCase("response") + "\r\n"));
            String str3 = "HTTP/1.1 200 OK\r\nDate: " + format + "\r\nLocation: " + str2 + "\r\nContent-Type: text/html";
            String str4 = "SIZE: " + ((int) termsFrequencyAccumulator.sum()) + "\r\n";
            ObjectIterator it2 = termsFrequencyAccumulator.getTermFrequencyMap().entrySet().iterator();
            while (it2.hasNext()) {
                Map.Entry entry = (Map.Entry) it2.next();
                str4 = str4 + entry.getKey() + ":" + entry.getValue() + " ";
            }
            String str5 = str4 + "\r\n";
            fastBufferedOutputStream.write(toByteArray("Content-Length: " + toByteArray(str3 + "\r\n\r\n" + str5).length + "\r\n\r\n"));
            fastBufferedOutputStream.write(toByteArray(str3 + "\r\n\r\n"));
            fastBufferedOutputStream.write(toByteArray(str5 + "\r\n\r\n"));
        } catch (Exception e) {
            LOGGER.warn("Unexpected Exception while writing terms for host: " + str + "\n" + e.getMessage());
        }
    }

    @Override // it.unimi.di.law.warc.processors.ParallelFilteredProcessorRunner.Writer
    public void write(SpamTermExtractorProcessor.UriTermsFrequencySize uriTermsFrequencySize, long j, PrintStream printStream) throws IOException {
        pl.lightUpdate();
        if (uriTermsFrequencySize == null) {
            return;
        }
        URI uri = uriTermsFrequencySize.getURI();
        Short2ShortOpenHashMap termFrequencyMap = uriTermsFrequencySize.getTermFrequencyMap();
        String host = uri.getHost();
        LOGGER.trace("Getting info for host " + host + " from URI " + uri);
        TermsFrequencyAccumulator termsFrequencyAccumulator = host2Terms.get(host);
        if (termsFrequencyAccumulator != null) {
            termsFrequencyAccumulator.add(termFrequencyMap);
        } else {
            host2Terms.put(host, new TermsFrequencyAccumulator(termFrequencyMap));
        }
        out = printStream;
        host2WrittenPages.addTo(host, 1);
        if (termsFrequencyAccumulator != null) {
            for (int i = 0; i < SpamConfig.PAGE_THRESHOLDs.length; i++) {
                if (host2WrittenPages.getInt(host) == SpamConfig.PAGE_THRESHOLDs[i]) {
                    writeDataForEntry(SpamConfig.PAGE_THRESHOLDs_STREAM[i], host, termsFrequencyAccumulator);
                }
            }
        }
    }

    private static byte[] toByteArray(String str) {
        byte[] bArr = new byte[str.length()];
        int length = str.length();
        while (true) {
            int i = length;
            length--;
            if (i == 0) {
                return bArr;
            }
            if (!$assertionsDisabled && str.charAt(length) >= 128) {
                throw new AssertionError(str.charAt(length));
            }
            bArr[length] = (byte) str.charAt(length);
        }
    }

    static {
        $assertionsDisabled = !SpamTermHostWriter.class.desiredAssertionStatus();
        LOGGER = LoggerFactory.getLogger((Class<?>) SpamTermHostWriter.class);
        host2WrittenPages = new Object2IntOpenHashMap<>();
        host2Terms = new Object2ObjectOpenHashMap<>();
        pl = new ProgressLogger(LOGGER);
        pl.start("Aggregation by host starts...");
        try {
            detector = (SztakiSpamDetector) BinIO.loadObject(SpamConfig.SZTAKI_SPAM_DETECTOR_FILE + ".detector");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
