package it.unimi.di.big.mg4j.tool;

import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.Switch;
import com.martiansoftware.jsap.UnflaggedOption;
import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet;
import it.unimi.dsi.logging.ProgressLogger;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Iterator;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLEventWriter;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.XMLEvent;
import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* JADX WARN: Classes with same name are omitted:
  
 */
/* loaded from: input_file:WEB-INF/lib/mg4j-big-5.4.3.jar:it/unimi/di/big/mg4j/tool/FilterOutWikipediaDuplicates.class */
/**
 * Command-line tool that copies a Wikipedia XML dump while dropping duplicate pages.
 *
 * <p>A page is considered a duplicate when the text of its {@code <title>} element is
 * equal to the title of a page that appeared earlier in the stream. Everything outside
 * {@code <page>} elements is copied to the output unchanged.
 */
public class FilterOutWikipediaDuplicates {
    private static final Logger LOGGER = LoggerFactory.getLogger(FilterOutWikipediaDuplicates.class);

    /**
     * Parses the command line, opens the (optionally bzip2-compressed) input and output,
     * and filters the dump.
     *
     * @param strArr command-line arguments: an optional bzip2 switch, the input file
     *               ({@code -} for stdin) and the output file ({@code -} for stdout).
     * @throws IOException if a stream cannot be opened, read, written or closed.
     * @throws JSAPException if the command-line parser cannot be configured.
     * @throws XMLStreamException if the dump cannot be parsed or the output cannot be written.
     */
    public static void main(String[] strArr) throws IOException, JSAPException, XMLStreamException {
        SimpleJSAP simpleJSAP = new SimpleJSAP(FilterOutWikipediaDuplicates.class.getName(), "Reads a Wikipedia XML dump and outputs the same dump after eliminating duplicate pages. A duplicate page is a page whose title appeared earlier in the XML stream. ", new Parameter[]{new Switch(CompressorStreamFactory.BZIP2, 'b', CompressorStreamFactory.BZIP2, "The (input and output) files are compressed with bzip2"), new UnflaggedOption("infile", JSAP.STRING_PARSER, true, "The input file containing the Wikipedia dump (- for stdin)."), new UnflaggedOption("outfile", JSAP.STRING_PARSER, true, "The output file containing the Wikipedia dump (- for stdout).")});
        JSAPResult parse = simpleJSAP.parse(strArr);
        if (simpleJSAP.messagePrinted()) {
            return;
        }
        final boolean bzip2 = parse.userSpecified(CompressorStreamFactory.BZIP2);
        final String inputName = parse.getString("infile");
        final String outputName = parse.getString("outfile");

        ProgressLogger progressLogger = new ProgressLogger(LOGGER);
        progressLogger.itemsName = "pages";

        InputStream input = "-".equals(inputName) ? System.in : new FileInputStream(inputName);
        try {
            if (bzip2) {
                input = new BZip2CompressorInputStream(input);
            }
            OutputStream output = "-".equals(outputName) ? System.out : new FileOutputStream(outputName);
            try {
                if (bzip2) {
                    if (output == System.out) {
                        LOGGER.warn("Going to produce bzip'd output onto stdout");
                    }
                    output = new BZip2CompressorOutputStream(output);
                }
                // Start the logger so that the statistics printed by done() refer to this run.
                progressLogger.start();
                copyWithoutDuplicatePages(input, output, progressLogger);
            } finally {
                // Plain stdout is left open; a bzip2 wrapper around it must still be
                // closed so that the compressed stream is properly terminated.
                if (output != System.out) {
                    output.close();
                }
            }
        } finally {
            if (input != System.in) {
                input.close();
            }
        }
        progressLogger.done();
    }

    /**
     * Copies the XML document read from {@code input} to {@code output}, omitting every
     * {@code <page>} element whose title was already seen.
     *
     * <p>Events belonging to a page are buffered until the closing {@code page} tag is
     * reached, at which point the whole page is either written or discarded. Events
     * outside a page (header, inter-page whitespace, closing root tag) are written
     * immediately.
     *
     * @param input the stream to read the dump from; not closed by this method.
     * @param output the stream to write the filtered dump to; not closed by this method.
     * @param progressLogger updated once for every page encountered.
     * @throws XMLStreamException if reading or writing XML events fails.
     */
    private static void copyWithoutDuplicatePages(InputStream input, OutputStream output, ProgressLogger progressLogger) throws XMLStreamException {
        XMLInputFactory inputFactory = XMLInputFactory.newInstance();
        inputFactory.setProperty(XMLInputFactory.IS_VALIDATING, Boolean.FALSE);
        inputFactory.setProperty(XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES, Boolean.TRUE);
        // The dump is external data: never resolve external entities (XXE).
        inputFactory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
        XMLEventReader reader = inputFactory.createXMLEventReader(input);
        XMLEventWriter writer = XMLOutputFactory.newInstance().createXMLEventWriter(output);

        ArrayList<XMLEvent> pageEvents = new ArrayList<>();
        StringBuilder title = new StringBuilder();
        ObjectOpenHashSet<String> seenTitles = new ObjectOpenHashSet<>();
        boolean inPage = false;
        boolean inTitle = false;

        while (reader.hasNext()) {
            XMLEvent event = reader.nextEvent();
            if (event.isStartElement()) {
                String name = event.asStartElement().getName().getLocalPart();
                if (name.equalsIgnoreCase("page")) {
                    inPage = true;
                    // Reset per-page state so that a page lacking a title cannot
                    // inherit the title of the previous page.
                    inTitle = false;
                    title.setLength(0);
                    pageEvents.clear();
                }
                if (inPage) {
                    pageEvents.add(event);
                    if (name.equalsIgnoreCase("title")) {
                        inTitle = true;
                        title.setLength(0);
                    }
                } else {
                    writer.add(event);
                }
            } else if (event.isEndElement()) {
                if (!inPage) {
                    writer.add(event);
                    continue;
                }
                String name = event.asEndElement().getName().getLocalPart();
                pageEvents.add(event);
                if (name.equalsIgnoreCase("title")) {
                    inTitle = false;
                } else if (name.equalsIgnoreCase("page")) {
                    progressLogger.update();
                    String pageTitle = title.toString();
                    // add() returns false when the title was already present.
                    if (seenTitles.add(pageTitle)) {
                        for (XMLEvent pageEvent : pageEvents) {
                            writer.add(pageEvent);
                        }
                    } else {
                        LOGGER.info("Skipping duplicate page: {}", pageTitle);
                    }
                    pageEvents.clear();
                    // Leave page mode: otherwise everything following the page
                    // (including the closing root tag) would be buffered and lost.
                    inPage = false;
                }
            } else if (inPage) {
                pageEvents.add(event);
                if (inTitle && event.isCharacters()) {
                    title.append(event.asCharacters().getData());
                }
            } else {
                writer.add(event);
            }
        }
        // Flushes any buffered XML output; the underlying stream is closed by the caller.
        writer.close();
        reader.close();
    }
}
