package it.unimi.di.big.mg4j.document;

import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.Switch;
import com.martiansoftware.jsap.UnflaggedOption;
import it.unimi.di.big.mg4j.document.PropertyBasedDocumentFactory;
import it.unimi.di.law.bubing.parser.HTMLParser;
import it.unimi.di.law.warc.io.UncompressedWarcReader;
import it.unimi.di.law.warc.io.WarcReader;
import it.unimi.di.law.warc.records.HttpResponseWarcRecord;
import it.unimi.di.law.warc.records.WarcHeader;
import it.unimi.di.law.warc.records.WarcRecord;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.io.FastBufferedInputStream;
import it.unimi.dsi.fastutil.io.InspectableFileCachedInputStream;
import it.unimi.dsi.fastutil.objects.Reference2ObjectOpenHashMap;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.util.zip.GZIPInputStream;
import org.apache.commons.io.IOUtils;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.naming.factory.Constants;
import org.eclipse.jetty.http.HttpHeaderValues;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* JADX WARN: Classes with same name are omitted:
  
 */
/* loaded from: input_file:WEB-INF/lib/mg4j-big-5.4.3.jar:it/unimi/di/big/mg4j/document/WarcDocumentSequence.class */
/**
 * A document sequence backed by a list of WARC files.
 *
 * <p>The files named in {@link #warcFile} are read one after the other, optionally through a
 * {@link GZIPInputStream}. Every record whose WARC type is {@link WarcRecord.Type#RESPONSE} is
 * turned into a {@link Document} by {@link #factory}; all other records are skipped.
 *
 * <p>Instances are serializable; {@link #main(String[])} builds one from command-line arguments
 * and stores it to a file.
 */
public class WarcDocumentSequence extends AbstractDocumentSequence implements Serializable {
    private static final long serialVersionUID = 0L;
    private static final Logger LOGGER = LoggerFactory.getLogger(WarcDocumentSequence.class);

    /** Default I/O buffer size, in the textual form accepted by the command-line parser. */
    public static final String DEFAULT_BUFFER_SIZE = "64Ki";

    /** Charset assumed when neither the HTTP headers, the WARC headers nor the content provide one. */
    private static final String FALLBACK_CHARSET = "ISO-8859-1";

    /** The factory used to build a document out of each response record. */
    protected final DocumentFactory factory;
    /** The buffer size, in bytes, handed to the stream wrapping each WARC file. */
    protected final int bufferSize;
    /** Whether the WARC files must be read through a {@link GZIPInputStream}. */
    protected final boolean useGzip;
    /** The names of the WARC files, in the order in which they are read. */
    protected final String[] warcFile;

    /**
     * Creates a sequence sharing the configuration (factory, file list, gzip flag and buffer
     * size) of another sequence. The file-name array is shared, not copied.
     *
     * @param warcDocumentSequence the sequence whose configuration is reused.
     */
    protected WarcDocumentSequence(WarcDocumentSequence warcDocumentSequence) {
        this.factory = warcDocumentSequence.factory;
        this.warcFile = warcDocumentSequence.warcFile;
        this.useGzip = warcDocumentSequence.useGzip;
        this.bufferSize = warcDocumentSequence.bufferSize;
    }

    /**
     * Creates a sequence over the given WARC files.
     *
     * @param strArr the names of the WARC files, in reading order (the array is not copied).
     * @param documentFactory the factory used to build documents.
     * @param z whether the files are gzip-compressed.
     * @param i the I/O buffer size, in bytes.
     */
    public WarcDocumentSequence(String[] strArr, DocumentFactory documentFactory, boolean z, int i) {
        this.warcFile = strArr;
        this.useGzip = z;
        this.bufferSize = i;
        this.factory = documentFactory;
    }

    @Override // it.unimi.di.big.mg4j.document.DocumentSequence
    public DocumentFactory factory() {
        return this.factory;
    }

    /**
     * Builds the document for a WARC record.
     *
     * <p>The encoding passed to the factory is chosen as follows, later rules overriding earlier
     * ones:
     * <ol>
     * <li>{@code ISO-8859-1} as a fallback;
     * <li>the charset extracted from the HTTP {@code Content-Type} header, if any;
     * <li>the value of the {@code BUBING_GUESSED_CHARSET} WARC header, if present; otherwise,
     * when the entity content is an {@link InspectableFileCachedInputStream}, whatever
     * {@code HTMLParser.getCharsetName} returns for its inspectable buffer, if non-null.
     * </ol>
     *
     * <p>Besides the encoding, the metadata map carries the value of the {@code WARC-TREC-ID}
     * header as title (when present), the WARC target URI, and the raw {@code Content-Type}
     * value as MIME type (when present).
     *
     * @param warcRecord the record to convert; it is cast to {@link HttpResponseWarcRecord}.
     * @return the document produced by {@link #factory} from the entity content and the metadata.
     * @throws IOException if the entity content cannot be obtained or the factory fails.
     * @throws ClassCastException if {@code warcRecord} is not an {@link HttpResponseWarcRecord}.
     */
    protected Document getCurrentDocument(WarcRecord warcRecord) throws IOException {
        final HttpResponseWarcRecord response = (HttpResponseWarcRecord) warcRecord;
        final HttpEntity entity = response.getEntity();
        final Header contentType = entity.getContentType();

        String encoding = FALLBACK_CHARSET;
        if (contentType != null) {
            final String headerCharset = HTMLParser.getCharsetNameFromHeader(contentType.getValue());
            if (headerCharset != null) {
                encoding = headerCharset;
            }
        }

        final InputStream content = entity.getContent();
        final Header guessedCharset = response.getWarcHeader(WarcHeader.Name.BUBING_GUESSED_CHARSET);
        if (guessedCharset != null) {
            // A charset recorded in the WARC headers wins over everything else.
            encoding = guessedCharset.getValue();
        } else if (content instanceof InspectableFileCachedInputStream) {
            final InspectableFileCachedInputStream inspectable = (InspectableFileCachedInputStream) content;
            final String contentCharset = HTMLParser.getCharsetName(inspectable.buffer, inspectable.inspectable);
            if (contentCharset != null) {
                encoding = contentCharset;
            }
        }

        final Reference2ObjectOpenHashMap<Enum<?>, Object> metadata = new Reference2ObjectOpenHashMap<>();
        metadata.put(PropertyBasedDocumentFactory.MetadataKeys.ENCODING, encoding);
        final Header trecId = response.getWarcHeaders().getFirstHeader("WARC-TREC-ID");
        if (trecId != null) {
            metadata.put(PropertyBasedDocumentFactory.MetadataKeys.TITLE, trecId.getValue());
        }
        metadata.put(PropertyBasedDocumentFactory.MetadataKeys.URI, response.getWarcTargetURI());
        if (contentType != null) {
            metadata.put(PropertyBasedDocumentFactory.MetadataKeys.MIMETYPE, contentType.getValue());
        }
        // The content is requested again rather than reusing the stream obtained above, as the
        // original code did; whether the entity returns the same stream is not visible here.
        return this.factory.getDocument(entity.getContent(), metadata);
    }

    /**
     * Opens a WARC file, wrapping it according to {@link #useGzip} and {@link #bufferSize}.
     *
     * @param filename the name of the file to open.
     * @return a gzip-decoding stream if {@link #useGzip} is set, a buffered stream otherwise.
     * @throws IOException if the file cannot be opened or the wrapper cannot be created.
     */
    private InputStream openWarcStream(String filename) throws IOException {
        final FileInputStream fileStream = new FileInputStream(filename);
        try {
            return this.useGzip
                    ? new GZIPInputStream(fileStream, this.bufferSize)
                    : new FastBufferedInputStream(fileStream, this.bufferSize);
        } catch (IOException | RuntimeException e) {
            // The wrapper was never created, so nobody else can close the underlying file.
            try {
                fileStream.close();
            } catch (IOException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
    }

    /**
     * Returns an iterator over the documents of all WARC files, in file order.
     *
     * <p>Files are opened lazily, one at a time. A failure while reading a record is logged and
     * treated as the end of the current file. Closing the iterator closes the file currently
     * open, if any.
     */
    @Override // it.unimi.di.big.mg4j.document.DocumentSequence
    public DocumentIterator iterator() throws IOException {
        return new AbstractDocumentIterator() { // from class: it.unimi.di.big.mg4j.document.WarcDocumentSequence.1
            /** The stream over the file being read, or {@code null} if no file is open. */
            private InputStream currentStream;
            /** The index into {@link WarcDocumentSequence#warcFile} of the next file to open. */
            private int n;
            /** The reader over {@link #currentStream}. */
            private WarcReader reader;

            @Override // it.unimi.di.big.mg4j.document.DocumentIterator
            public Document nextDocument() throws IOException {
                while (true) {
                    if (this.currentStream == null) {
                        if (this.n == WarcDocumentSequence.this.warcFile.length) {
                            return null;
                        }
                        // The index is advanced before opening, so a file that fails to open
                        // is not retried by a subsequent call.
                        this.currentStream = WarcDocumentSequence.this.openWarcStream(WarcDocumentSequence.this.warcFile[this.n++]);
                        this.reader = new UncompressedWarcReader(this.currentStream);
                    }
                    final WarcRecord responseRecord = readNextResponse();
                    if (responseRecord != null) {
                        return WarcDocumentSequence.this.getCurrentDocument(responseRecord);
                    }
                    // Current file exhausted (or unreadable): release it and move to the next one.
                    closeCurrentStream();
                }
            }

            /**
             * Closes the file currently open, if any, and marks the iterator as exhausted so
             * that later calls to {@link #nextDocument()} return {@code null}.
             */
            @Override
            public void close() throws IOException {
                this.n = WarcDocumentSequence.this.warcFile.length;
                try {
                    closeCurrentStream();
                } finally {
                    super.close();
                }
            }

            /**
             * Reads records from the current reader until one of type RESPONSE is found.
             *
             * @return the next response record, or {@code null} if the reader returned
             * {@code null} or threw an exception (which is logged).
             */
            private WarcRecord readNextResponse() {
                while (true) {
                    WarcRecord candidate = null;
                    try {
                        candidate = this.reader.read();
                    } catch (Exception e) {
                        // Deliberately best-effort: a damaged file ends here instead of aborting the whole scan.
                        WarcDocumentSequence.LOGGER.error("Unexpected exception reading WARC file", e);
                    }
                    if (candidate == null || candidate.getWarcType() == WarcRecord.Type.RESPONSE) {
                        return candidate;
                    }
                }
            }

            /** Closes and forgets the current stream; does nothing if no file is open. */
            private void closeCurrentStream() throws IOException {
                final InputStream stream = this.currentStream;
                this.currentStream = null;
                if (stream != null) {
                    stream.close();
                }
            }
        };
    }

    /**
     * Command-line entry point: builds a {@link WarcDocumentSequence} from the given file names
     * (or from the lines of standard input when none is given) and stores it, serialized, in the
     * file named by the {@code sequence} argument.
     *
     * @param strArr the command-line arguments.
     * @throws Exception if argument parsing, factory instantiation or serialization fails.
     */
    public static void main(String[] strArr) throws Exception {
        // NOTE(review): the "factory" and "gzip" option ids are taken from constants of
        // unrelated libraries (Constants.FACTORY, HttpHeaderValues.GZIP); presumably their
        // values are "factory" and "gzip" - confirm and replace with local literals.
        final SimpleJSAP jsap = new SimpleJSAP(
                WarcDocumentSequence.class.getName(),
                "Saves a serialised Warc document sequence based on a set of file names.",
                new Parameter[]{
                    new FlaggedOption(Constants.FACTORY, JSAP.CLASS_PARSER, IdentityDocumentFactory.class.getName(), false, 'f', Constants.FACTORY, "A document factory with a standard constructor."),
                    new FlaggedOption("property", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, false, 'p', "property", "A 'key=value' specification, or the name of a property file").setAllowMultipleDeclarations(true),
                    new Switch(HttpHeaderValues.GZIP, 'z', HttpHeaderValues.GZIP, "Expect gzip-ed WARC content (files should end in .warc.gz)."),
                    new FlaggedOption("bufferSize", JSAP.INTSIZE_PARSER, DEFAULT_BUFFER_SIZE, false, 'b', "buffer-size", "The size of an I/O buffer."),
                    new UnflaggedOption("sequence", JSAP.STRING_PARSER, true, "The filename for the serialized sequence."),
                    new UnflaggedOption("basename", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, false, true, "A list of basename files that will be indexed. If missing, a list of files will be read from standard input.")
                });
        final JSAPResult arguments = jsap.parse(strArr);
        if (jsap.messagePrinted()) {
            System.exit(1);
        }

        final DocumentFactory documentFactory = PropertyBasedDocumentFactory.getInstance(arguments.getClass(Constants.FACTORY), arguments.getStringArray("property"));
        final boolean gzipped = arguments.getBoolean(HttpHeaderValues.GZIP);

        String[] fileNames = arguments.getStringArray("basename");
        if (fileNames.length == 0) {
            // No file names on the command line: read them, one per line, from standard input.
            fileNames = (String[]) IOUtils.readLines(System.in).toArray(new String[0]);
        }
        if (fileNames.length == 0) {
            LOGGER.warn("Empty fileset");
        }

        BinIO.storeObject(new WarcDocumentSequence(fileNames, documentFactory, gzipped, arguments.getInt("bufferSize")), arguments.getString("sequence"));
    }
}
