package it.unimi.di.big.mg4j.document;

import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.Switch;
import com.martiansoftware.jsap.UnflaggedOption;
import it.unimi.di.big.mg4j.document.DocumentFactory;
import it.unimi.di.big.mg4j.tool.Scan;
import it.unimi.di.big.mg4j.util.MG4JClassParser;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.io.WordReader;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.logging.ProgressLogger;
import java.io.DataOutputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.io.Reader;
import java.lang.reflect.InvocationTargetException;
import java.util.List;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
import org.apache.naming.factory.Constants;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* JADX WARN: Classes with same name are omitted:
  
 */
/* loaded from: input_file:WEB-INF/lib/mg4j-big-5.4.3.jar:it/unimi/di/big/mg4j/document/ZipDocumentCollectionBuilder.class */
/**
 * A {@link DocumentCollectionBuilder} that stores documents as entries of a zip file.
 *
 * <p>Each document becomes one zip entry named after its zero-based ordinal; the document
 * title is stored as the entry comment, and the URI followed by the field contents is
 * written as the entry data. When the builder is closed, a {@link ZipDocumentCollection}
 * describing the zip file is serialised next to it.
 *
 * <p>In <em>exact</em> mode both words and non-words of text fields are stored; otherwise
 * only non-empty words are stored.
 */
public class ZipDocumentCollectionBuilder implements DocumentCollectionBuilder {
    private static final Logger LOGGER = LoggerFactory.getLogger(ZipDocumentCollectionBuilder.class);
    private static final boolean DEBUG = false;

    /** Name (and long flag) of the command-line option selecting the document factory. */
    private static final String FACTORY_OPTION = "factory";

    /** The basename shared by all files produced by this builder. */
    private final String basename;
    /** The basename followed by the suffix passed to the last {@link #open(CharSequence)} call. */
    private String basenameSuffix;
    /** The zip stream currently being written. */
    private ZipOutputStream zipOut;
    /** A data-stream view over {@link #zipOut}, used for binary integers. */
    private DataOutputStream zipDataOutputStream;
    /** The number of documents completed since the last {@link #open(CharSequence)} call. */
    private int numberOfDocuments;
    /** Whether non-words are stored alongside words. */
    private final boolean exact;
    /** The factory handed over to the resulting {@link ZipDocumentCollection}. */
    private final DocumentFactory factory;
    /** Whether we are between {@link #startTextField()} and {@link #endTextField()}. */
    private boolean inTextField;

    /**
     * Creates a new builder.
     *
     * @param basename the basename of the files that will be produced.
     * @param factory the document factory of the documents that will be stored.
     * @param exact if true, non-words are stored too; otherwise only non-empty words are stored.
     */
    public ZipDocumentCollectionBuilder(String basename, DocumentFactory factory, boolean exact) {
        this.basename = basename;
        this.factory = factory;
        this.exact = exact;
    }

    /**
     * Starts a new zip file named {@code basename + suffix + ".zip"} and resets the document count.
     *
     * @param suffix a suffix appended to the basename.
     * @throws FileNotFoundException if the zip file cannot be created.
     */
    @Override // it.unimi.di.big.mg4j.document.DocumentCollectionBuilder
    public void open(CharSequence suffix) throws FileNotFoundException {
        this.basenameSuffix = this.basename + ((Object) suffix);
        final ZipOutputStream zipStream = new ZipOutputStream(new FileOutputStream(this.basenameSuffix + ".zip"));
        this.zipOut = zipStream;
        this.zipDataOutputStream = new DataOutputStream(zipStream);
        this.numberOfDocuments = 0;
    }

    /**
     * Returns the basename provided at construction time.
     *
     * @return the basename of this builder.
     */
    @Override // it.unimi.di.big.mg4j.document.DocumentCollectionBuilder
    public String basename() {
        return this.basename;
    }

    /**
     * Starts a new zip entry for the next document and writes its URI.
     *
     * @param title the document title, stored as the entry comment; must not be {@code null}.
     * @param uri the document URI, or {@code null} (an empty string is written in that case).
     * @throws IOException if the underlying zip stream fails.
     */
    @Override // it.unimi.di.big.mg4j.document.DocumentCollectionBuilder
    public void startDocument(CharSequence title, CharSequence uri) throws IOException {
        final ZipEntry entry = new ZipEntry(Integer.toString(this.numberOfDocuments));
        entry.setComment(title.toString());
        this.zipOut.putNextEntry(entry);
        new MutableString(uri != null ? uri : "").writeSelfDelimUTF8(this.zipOut);
    }

    /**
     * Closes the current zip entry and counts the document as stored.
     *
     * @throws IOException if the underlying zip stream fails.
     */
    @Override // it.unimi.di.big.mg4j.document.DocumentCollectionBuilder
    public void endDocument() throws IOException {
        this.zipOut.closeEntry();
        this.numberOfDocuments++;
    }

    /** Marks the beginning of a text field; subsequent {@link #add} calls will be recorded. */
    @Override // it.unimi.di.big.mg4j.document.DocumentCollectionBuilder
    public void startTextField() {
        this.inTextField = true;
    }

    /**
     * Stores a non-text field by Java-serialising its content into the current entry.
     *
     * @param content the field content.
     * @throws IOException if serialisation or the underlying zip stream fails.
     */
    @Override // it.unimi.di.big.mg4j.document.DocumentCollectionBuilder
    public void nonTextField(Object content) throws IOException {
        // The object stream is deliberately flushed rather than closed: closing it
        // would close the shared zip stream too.
        final ObjectOutputStream objectStream = new ObjectOutputStream(this.zipOut);
        objectStream.writeObject(content);
        objectStream.flush();
    }

    /**
     * Stores a virtual field as a fragment count followed by, for each fragment,
     * its document specifier and its text.
     *
     * @param fragments the fragments making up the virtual field.
     * @throws IOException if the underlying zip stream fails.
     */
    @Override // it.unimi.di.big.mg4j.document.DocumentCollectionBuilder
    public void virtualField(List<Scan.VirtualDocumentFragment> fragments) throws IOException {
        this.zipDataOutputStream.writeInt(fragments.size());
        for (Scan.VirtualDocumentFragment fragment : fragments) {
            fragment.documentSpecifier().writeSelfDelimUTF8(this.zipOut);
            fragment.text().writeSelfDelimUTF8(this.zipOut);
        }
    }

    /**
     * Terminates the current text field.
     *
     * <p>One zero byte is written, plus a second one in exact mode; presumably these
     * encode empty self-delimited strings acting as end-of-field markers for the word
     * and non-word streams (to be confirmed against the reader side).
     *
     * @throws IOException if the underlying zip stream fails.
     * @throws IllegalStateException if no text field is currently open.
     */
    @Override // it.unimi.di.big.mg4j.document.DocumentCollectionBuilder
    public void endTextField() throws IOException {
        if (!this.inTextField) {
            throw new IllegalStateException("endTextField() called without a matching startTextField()");
        }
        this.inTextField = false;
        this.zipOut.write(0);
        if (this.exact) {
            this.zipOut.write(0);
        }
    }

    /**
     * Appends a word/non-word pair to the current text field.
     *
     * <p>Calls made outside a text field are ignored. In exact mode both strings are always
     * written; otherwise only the word is written, and only if it is non-empty.
     *
     * @param word the word.
     * @param nonWord the non-word following the word.
     * @throws IOException if the underlying zip stream fails.
     */
    @Override // it.unimi.di.big.mg4j.document.DocumentCollectionBuilder
    public void add(MutableString word, MutableString nonWord) throws IOException {
        if (!this.inTextField) {
            return;
        }
        if (this.exact || word.length() > 0) {
            word.writeSelfDelimUTF8(this.zipOut);
        }
        if (this.exact) {
            nonWord.writeSelfDelimUTF8(this.zipOut);
        }
    }

    /**
     * Finishes the zip file and serialises a {@link ZipDocumentCollection} describing it
     * to {@code basename + suffix + DocumentCollection.DEFAULT_EXTENSION}.
     *
     * @throws IOException if the zip stream or the serialisation fails.
     */
    @Override // it.unimi.di.big.mg4j.document.DocumentCollectionBuilder
    public void close() throws IOException {
        if (this.numberOfDocuments == 0) {
            // Add a placeholder entry so that the archive is never empty.
            this.zipOut.putNextEntry(new ZipEntry("dummy"));
        }
        this.zipDataOutputStream.close();
        final ZipDocumentCollection collection = new ZipDocumentCollection(this.basenameSuffix + ".zip", this.factory, this.numberOfDocuments, this.exact);
        try {
            BinIO.storeObject(collection, this.basenameSuffix + DocumentCollection.DEFAULT_EXTENSION);
        } finally {
            // Release the collection even if serialisation fails.
            collection.close();
        }
    }

    /**
     * Copies a whole document sequence into a zip collection named after the bare basename.
     *
     * <p>This method opens the output itself (with an empty suffix) and closes it at the
     * end, so callers must <em>not</em> invoke {@link #open(CharSequence)} beforehand.
     *
     * @param documentSequence the sequence to copy; its factory must be the very same
     *        instance passed to the constructor.
     * @throws IOException if reading the sequence or writing the collection fails.
     * @throws IllegalStateException if the factory of the sequence is not the builder's factory.
     */
    public void build(DocumentSequence documentSequence) throws IOException {
        this.numberOfDocuments = 0;
        // Validate before acquiring the iterator, so that a mismatch does not leak it.
        if (this.factory != documentSequence.factory()) {
            throw new IllegalStateException("The factory provided by the constructor does not correspond to the factory of the input sequence");
        }
        final int numberOfFields = this.factory.numberOfFields();
        final MutableString word = new MutableString();
        final MutableString nonWord = new MutableString();

        final DocumentIterator documents = documentSequence.iterator();
        try {
            open("");
            Document document;
            while ((document = documents.nextDocument()) != null) {
                try {
                    startDocument(document.title(), document.uri());
                    appendFields(document, numberOfFields, word, nonWord);
                } finally {
                    document.close();
                }
                endDocument();
            }
        } finally {
            documents.close();
        }
        close();
    }

    /** Writes every field of {@code document} to the current zip entry, dispatching on the field type. */
    private void appendFields(Document document, int numberOfFields, MutableString word, MutableString nonWord) throws IOException {
        for (int field = 0; field < numberOfFields; field++) {
            final Object content = document.content(field);
            final DocumentFactory.FieldType type = this.factory.fieldType(field);
            if (type == DocumentFactory.FieldType.TEXT) {
                startTextField();
                final WordReader wordReader = document.wordReader(field);
                wordReader.setReader((Reader) content);
                while (wordReader.next(word, nonWord)) {
                    add(word, nonWord);
                }
                endTextField();
            } else if (type == DocumentFactory.FieldType.VIRTUAL) {
                // The content of a virtual field is handed over as a list of fragments.
                @SuppressWarnings("unchecked")
                final List<Scan.VirtualDocumentFragment> fragments = (List<Scan.VirtualDocumentFragment>) content;
                virtualField(fragments);
            } else {
                nonTextField(content);
            }
        }
    }

    /**
     * Command-line entry point: builds a zip document collection from a document sequence.
     *
     * @param strArr the command-line arguments (see the usage message for details).
     */
    public static void main(String[] strArr) throws JSAPException, IOException, ClassNotFoundException, InvocationTargetException, NoSuchMethodException, IllegalAccessException, InstantiationException, IllegalArgumentException, SecurityException {
        final SimpleJSAP jsap = new SimpleJSAP(
                ZipDocumentCollectionBuilder.class.getName(),
                "Produces a zip document collection from an existing document sequence.",
                new Parameter[]{
                    new FlaggedOption("sequence", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, false, 'S', "sequence", "A serialised document sequence that will be used instead of stdin."),
                    new FlaggedOption(FACTORY_OPTION, MG4JClassParser.getParser(), IdentityDocumentFactory.class.getName(), false, 'f', FACTORY_OPTION, "A document factory with a standard constructor."),
                    new FlaggedOption("property", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, false, 'p', "property", "A 'key=value' specification, or the name of a property file").setAllowMultipleDeclarations(true),
                    // NOTE(review): the default 10 is presumably the code of the line-feed character - confirm.
                    new FlaggedOption("delimiter", JSAP.INTEGER_PARSER, Integer.toString(10), false, 'd', "delimiter", "The document delimiter."),
                    new Switch("approximated", 'a', "approximated", "If specified, non-words will not be copied."),
                    new FlaggedOption("logInterval", JSAP.LONG_PARSER, Long.toString(10000L), false, 'l', "log-interval", "The minimum time interval between activity logs in milliseconds."),
                    new UnflaggedOption("basename", JSAP.STRING_PARSER, true, "The basename for the collection.")
                });
        final JSAPResult options = jsap.parse(strArr);
        if (jsap.messagePrinted()) {
            return;
        }
        final DocumentSequence sequence = Scan.getSequence(options.getString("sequence"), options.getClass(FACTORY_OPTION), options.getStringArray("property"), options.getInt("delimiter"), LOGGER);
        final ZipDocumentCollectionBuilder builder = new ZipDocumentCollectionBuilder(options.getString("basename"), sequence.factory(), !options.getBoolean("approximated"));
        // build() opens the output itself; opening it here as well would leak a file handle.
        builder.build(sequence);
    }
}
