package it.unimi.di.law.warc.processors;

import com.google.common.base.CharMatcher;
import it.unimi.di.law.spam.PorterStemmer;
import it.unimi.di.law.spam.SpamConfig;
import it.unimi.di.law.warc.processors.ParallelFilteredProcessorRunner;
import it.unimi.di.law.warc.records.HttpResponseWarcRecord;
import it.unimi.di.law.warc.records.WarcRecord;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import it.unimi.dsi.fastutil.objects.Object2LongOpenHashMap;
import it.unimi.dsi.fastutil.objects.ObjectIterator;
import it.unimi.dsi.fastutil.shorts.Short2ShortMap;
import it.unimi.dsi.fastutil.shorts.Short2ShortOpenHashMap;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.logging.ProgressLogger;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.net.URI;
import net.htmlparser.jericho.Source;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:WEB-INF/lib/bubing-0.9.11.jar:it/unimi/di/law/warc/processors/SpamTermExtractorProcessor.class */
/**
 * A WARC record processor that extracts the text of an HTTP response, splits it into
 * words and counts how often each word known to the term map ({@link #termSetOnthology})
 * occurs in the page.
 *
 * <p>The class is a stateless singleton ({@link #INSTANCE}); {@link #copy()} hands out the
 * same instance. The only shared mutable state is the per-host page counter, which is
 * guarded by a private lock, and the static term map, which is loaded once by the static
 * initializer from {@code SpamConfig.SZTAKI_SPAM_DETECTOR_FILE + ".map"}.
 *
 * <p>If loading the term map fails, the failure is logged and the map stays {@code null};
 * every subsequent call to {@link #process(WarcRecord, long)} will then fail inside its
 * own try/catch, log the error and return {@code null}.
 */
public class SpamTermExtractorProcessor implements ParallelFilteredProcessorRunner.Processor<SpamTermExtractorProcessor.UriTermsFrequencySize> {
    /** Map from term to its numeric identifier; lookups of unknown terms return {@code -1}. */
    static Object2LongOpenHashMap<MutableString> termSetOnthology;
    /** The shared singleton instance. */
    public static final SpamTermExtractorProcessor INSTANCE;
    private static final Logger LOGGER = LoggerFactory.getLogger(SpamTermExtractorProcessor.class);
    /** Snapshot, taken at class-initialization time, of whether trace logging is enabled. */
    public static final boolean TRACE_ENABLED = LOGGER.isTraceEnabled();
    /** Number of pages accepted so far for each host; guarded by {@link #host2ProcessedPagesLock}. */
    private static final Object2IntOpenHashMap<String> host2ProcessedPages = new Object2IntOpenHashMap<>();
    private static final Object host2ProcessedPagesLock = new Object();
    static ProgressLogger pl = new ProgressLogger(LOGGER);

    /**
     * The result of processing one page: its URI, the per-term occurrence counts and the
     * total number of recognized term occurrences.
     */
    public static final class UriTermsFrequencySize {
        URI uri;
        int size;
        Short2ShortOpenHashMap termFrequencyMap;

        /**
         * @param uri the URI of the processed page.
         * @param short2ShortOpenHashMap map from term identifier to number of occurrences.
         * @param i the total number of recognized term occurrences.
         */
        public UriTermsFrequencySize(URI uri, Short2ShortOpenHashMap short2ShortOpenHashMap, int i) {
            this.uri = uri;
            this.termFrequencyMap = short2ShortOpenHashMap;
            this.size = i;
        }

        /**
         * Returns a multi-line dump of this result. Terms and frequencies are listed in the
         * iteration order of the underlying map, each preceded by a space; entries whose
         * frequency is zero are omitted.
         */
        @Override
        public String toString() {
            final StringBuilder terms = new StringBuilder();
            final StringBuilder frequencies = new StringBuilder();
            for (Short2ShortMap.Entry entry : this.termFrequencyMap.short2ShortEntrySet()) {
                final short frequency = entry.getShortValue();
                if (frequency != 0) {
                    // Widen to int so the numeric value is appended.
                    terms.append(' ').append((int) entry.getShortKey());
                    frequencies.append(' ').append((int) frequency);
                }
            }
            return "----------------------------------------------\nURI: " + this.uri + "\nSIZE: " + this.size + "\nTERMS: " + terms + "\nFREQUENCY: " + frequencies + "\n----------------------------------------------\n";
        }

        /** @return the URI of the processed page. */
        public URI getURI() {
            return this.uri;
        }

        /** @return the map from term identifier to number of occurrences (not copied). */
        public Short2ShortOpenHashMap getTermFrequencyMap() {
            return this.termFrequencyMap;
        }

        /** @return the total number of recognized term occurrences. */
        public long getSize() {
            return this.size;
        }
    }

    private SpamTermExtractorProcessor() {
    }

    /** @return the shared singleton, {@link #INSTANCE}. */
    public static SpamTermExtractorProcessor getInstance() {
        return INSTANCE;
    }

    /** Does nothing: this processor holds no per-instance resources. */
    @Override
    public void close() throws IOException {
    }

    /** @return the shared singleton; no new object is created. */
    @Override
    public ParallelFilteredProcessorRunner.Processor<UriTermsFrequencySize> copy() {
        return INSTANCE;
    }

    /**
     * Extracts the text of an HTTP response record and counts the occurrences of the terms
     * found in {@link #termSetOnthology}.
     *
     * @param warcRecord the record to process; it is cast to {@link HttpResponseWarcRecord}.
     * @param j unused by this implementation.
     * @return the term counts for the page, or {@code null} if the per-host page limit has
     *         been reached or if any exception occurred while processing (the exception is
     *         logged, not propagated).
     */
    @Override
    public UriTermsFrequencySize process(WarcRecord warcRecord, long j) {
        pl.lightUpdate();
        final URI uri = warcRecord.getWarcTargetURI();
        final String host = uri.getHost();
        // A threshold of Integer.MAX_VALUE disables per-host counting altogether.
        if (SpamConfig.PAGE_MAXIMUM_THRESHOLD != Integer.MAX_VALUE && !countPageForHost(host)) {
            LOGGER.debug("Skipping page {}: page limit reached for host {}", uri, host);
            return null;
        }
        if (TRACE_ENABLED) {
            LOGGER.debug("Processing {}...", uri);
        }
        final Short2ShortOpenHashMap frequencies = new Short2ShortOpenHashMap();
        frequencies.defaultReturnValue((short) 0);
        try {
            // NOTE(review): the body is decoded with the platform default charset and the
            // content stream is not closed here, exactly as before; confirm whether the
            // response charset should be honoured and who owns the stream.
            final String body = IOUtils.toString(((HttpResponseWarcRecord) warcRecord).response().getEntity().getContent());
            final String text = normalizeText(new Source(body).getTextExtractor().toString());
            final StringBuilder trace = TRACE_ENABLED ? new StringBuilder() : null;
            int recognized = 0;
            // Read the text directly as characters: encoding it to bytes and decoding it
            // again would corrupt characters the platform charset cannot represent.
            try (FastBufferedReader reader = new FastBufferedReader(new StringReader(text))) {
                final MutableString word = new MutableString();
                final MutableString nonWord = new MutableString();
                while (reader.next(word, nonWord)) {
                    // The identifier is truncated to 16 bits; -1 is the map's "unknown" value.
                    final short termId = (short) termSetOnthology.getLong(word);
                    if (trace != null) {
                        trace.append(word).append(" : ").append((int) termId).append('\n');
                    }
                    if (termId != -1) {
                        recognized++;
                        frequencies.addTo(termId, (short) 1);
                    }
                }
            }
            if (trace != null) {
                LOGGER.debug("==========================" + uri + "==========================" + text + "\n" + trace);
            }
            return new UriTermsFrequencySize(uri, frequencies, recognized);
        } catch (Exception e) {
            // Best effort: a page that cannot be processed is reported and skipped.
            LOGGER.error("Exception while processing URL " + uri, e);
            return null;
        }
    }

    /**
     * Atomically checks the per-host page counter and, if the host is still allowed,
     * increments it.
     *
     * <p>A host is refused only once its counter is strictly greater than
     * {@code SpamConfig.PAGE_MAXIMUM_THRESHOLD}.
     *
     * @return {@code true} if the page was counted, {@code false} if the host is over the limit.
     */
    private static boolean countPageForHost(final String host) {
        synchronized (host2ProcessedPagesLock) {
            if (host2ProcessedPages.getInt(host) > SpamConfig.PAGE_MAXIMUM_THRESHOLD) {
                return false;
            }
            host2ProcessedPages.addTo(host, 1);
            return true;
        }
    }

    /**
     * Replaces every character that is neither a letter/digit nor whitespace with a space,
     * lower-cases the result, collapses each run of whitespace into a single space and
     * trims both ends.
     */
    private static String normalizeText(final String text) {
        final String alphanumericOnly = CharMatcher.JAVA_LETTER_OR_DIGIT.or(CharMatcher.WHITESPACE).negate().replaceFrom(text, ' ');
        final String collapsed = CharMatcher.WHITESPACE.collapseFrom(StringUtils.lowerCase(alphanumericOnly), ' ');
        return StringUtils.trim(collapsed);
    }

    static {
        // This block must stay after the field initializers above: it uses both pl and LOGGER.
        termSetOnthology = null;
        pl.start("Reading and Processing starts...");
        final String termMapFile = SpamConfig.SZTAKI_SPAM_DETECTOR_FILE + ".map";
        try {
            @SuppressWarnings("unchecked") // The file is expected to contain a map keyed by MutableString.
            final Object2LongOpenHashMap<MutableString> loaded = (Object2LongOpenHashMap<MutableString>) BinIO.loadObject(termMapFile);
            loaded.defaultReturnValue(-1L);
            termSetOnthology = loaded;
        } catch (Exception e) {
            // Class initialization is deliberately not aborted; see the class comment.
            LOGGER.error("Cannot load term map from " + termMapFile, e);
        }
        INSTANCE = new SpamTermExtractorProcessor();
    }
}
