package it.unimi.di.big.mg4j.document;

import com.google.common.base.Charsets;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterables;
import com.google.common.html.HtmlEscapers;
import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.Switch;
import com.martiansoftware.jsap.UnflaggedOption;
import gate.creole.ANNIEConstants;
import info.bliki.wiki.filter.Encoder;
import info.bliki.wiki.filter.HTMLConverter;
import info.bliki.wiki.filter.PlainTextConverter;
import info.bliki.wiki.model.WikiModel;
import it.unimi.di.big.mg4j.document.DocumentFactory;
import it.unimi.di.big.mg4j.document.HtmlDocumentFactory;
import it.unimi.di.big.mg4j.document.PropertyBasedDocumentFactory;
import it.unimi.di.big.mg4j.tool.URLMPHVirtualDocumentResolver;
import it.unimi.di.big.mg4j.tool.VirtualDocumentResolver;
import it.unimi.di.big.mg4j.util.parser.callback.AnchorExtractor;
import it.unimi.dsi.big.io.FileLinesCollection;
import it.unimi.dsi.big.util.ShiftAddXorSignedStringMap;
import it.unimi.dsi.big.util.StringMap;
import it.unimi.dsi.bits.TransformationStrategies;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.io.FastBufferedInputStream;
import it.unimi.dsi.fastutil.objects.AbstractObject2LongFunction;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import it.unimi.dsi.fastutil.objects.Object2LongFunction;
import it.unimi.dsi.fastutil.objects.Object2LongLinkedOpenHashMap;
import it.unimi.dsi.fastutil.objects.Object2ObjectOpenHashMap;
import it.unimi.dsi.fastutil.objects.ObjectArrayList;
import it.unimi.dsi.fastutil.objects.ObjectBigList;
import it.unimi.dsi.fastutil.objects.ObjectIterator;
import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet;
import it.unimi.dsi.fastutil.objects.Reference2ObjectMap;
import it.unimi.dsi.fastutil.objects.Reference2ObjectOpenHashMap;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.WordReader;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.logging.ProgressLogger;
import it.unimi.dsi.sux4j.mph.GOV3Function;
import it.unimi.dsi.util.TextPattern;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Collection;
import java.util.Map;
import java.util.concurrent.ArrayBlockingQueue;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xbill.DNS.TTL;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/* JADX WARN: Classes with same name are omitted:
  
 */
/* loaded from: input_file:WEB-INF/lib/mg4j-big-5.4.3.jar:it/unimi/di/big/mg4j/document/WikipediaDocumentSequence.class */
public class WikipediaDocumentSequence extends AbstractDocumentSequence implements Serializable {
    /** Serialization version identifier. */
    private static final long serialVersionUID = 1;
    /** Composite factory (a WikipediaHeaderFactory followed by an HtmlDocumentFactory) assembled in the main constructor. */
    private final DocumentFactory factory;
    /** Constructor-supplied flag. NOTE(review): presumably tells whether the dump is bzip2-compressed; its use is not visible in this chunk. */
    private final boolean bzipped;
    /** When false, the SAX handler neither accumulates the page text nor runs the wiki rendering step. */
    private final boolean parseText;
    /** When false, a page whose title prefix (up to the first ':') belongs to {@link #nameSpaces} is not turned into a document. */
    private final boolean keepNamespaced;
    /** Constructor-supplied dump location. NOTE(review): presumably a file name; its use is not visible in this chunk. */
    private final String wikipediaXmlDump;
    /** Prefix prepended to URL-encoded page titles to build document and redirect URIs. */
    private final String baseURL;
    /** {@link #baseURL} followed by the literal "${title}"; handed to the wiki model as link base URL. */
    private final String linkBaseURL;
    /** {@link #baseURL} followed by the literal "${image}"; handed to the wiki model as image base URL. */
    private final String imageBaseURL;
    /** Lower-cased namespace names read from the dump; assigned by the SAX handler when the "namespaces" element closes. */
    private ImmutableSet<MutableString> nameSpaces;
    /**
     * Pending redirect anchors: appended to by the SAX handler on "redirect" elements and drained when the
     * header factory's redirect field is read. The visible accesses synchronize on the list itself.
     */
    private final ObjectArrayList<AnchorExtractor.Anchor> redirectAnchors;
    /** Wiki model used by the SAX handler to render page text as plain text and as HTML. */
    private final MyWikiModel wikiModel;
    private static final Logger LOGGER = LoggerFactory.getLogger((Class<?>) WikipediaDocumentSequence.class);
    /** Opening of a category link in wiki markup. */
    private static final TextPattern CATEGORY_START = new TextPattern("[[Category:");
    /** Closing of a wiki link. */
    private static final TextPattern BRACKETS_CLOSED = new TextPattern("]]");
    /** Closing of a template call. */
    private static final TextPattern BRACES_CLOSED = new TextPattern("}}");
    /** Template opening searched for in the page text to trigger the disambiguation-specific link rewriting. */
    private static final TextPattern DISAMBIGUATION = new TextPattern("{{disambiguation");
    /** Opening of a wiki link. */
    private static final TextPattern BRACKETS_OPEN = new TextPattern("[[");
    /** Characters that terminate the target part of a link while scanning pages matched by {@link #DISAMBIGUATION}. */
    private static final char[] END_OF_DISAMBIGUATION_LINK = {'|', ']'};
    /** Pair with both components null. NOTE(review): presumably an end-of-stream marker; its use is not visible in this chunk. */
    private static final DocumentAndFactory END = new DocumentAndFactory(null, null);

    /* JADX WARN: Classes with same name are omitted:
      
     */
    /* loaded from: input_file:WEB-INF/lib/mg4j-big-5.4.3.jar:it/unimi/di/big/mg4j/document/WikipediaDocumentSequence$DocumentAndFactory.class */
    /**
     * Immutable pair made of a document and the factory instance that produced it.
     * Both components are stored exactly as given (including {@code null}).
     */
    private static final class DocumentAndFactory {
        /** The document of this pair. */
        public final Document document;
        /** The factory associated with {@link #document}. */
        public final DocumentFactory factory;

        /**
         * Creates a new pair.
         *
         * @param doc the document (stored as is).
         * @param producer the factory (stored as is).
         */
        public DocumentAndFactory(final Document doc, final DocumentFactory producer) {
            this.factory = producer;
            this.document = doc;
        }
    }

    /* JADX WARN: Classes with same name are omitted:
      
     */
    /* loaded from: input_file:WEB-INF/lib/mg4j-big-5.4.3.jar:it/unimi/di/big/mg4j/document/WikipediaDocumentSequence$MetadataKeys.class */
    /**
     * Keys of the Wikipedia-specific metadata that the SAX handler stores for each page and
     * that {@code WikipediaHeaderFactory} reads back when building the header fields.
     */
    public enum MetadataKeys {
        /** Page identifier, stored as a {@link Long} parsed from the "id" element. */
        ID,
        /** Last-edit date, parsed from the "timestamp" element. */
        LASTEDIT,
        /** Category names extracted from the page text, each followed by the separator " OXOXO ". */
        CATEGORY,
        /** First paragraph extracted from the plain-text rendering of the page (possibly empty). */
        FIRSTPAR,
        /** The shared list of pending redirect anchors. */
        REDIRECT;
    }

    /* JADX WARN: Classes with same name are omitted:
      
     */
    /* loaded from: input_file:WEB-INF/lib/mg4j-big-5.4.3.jar:it/unimi/di/big/mg4j/document/WikipediaDocumentSequence$MyWikiModel.class */
    /**
     * A {@link WikiModel} that can be told to drop template calls instead of expanding them.
     */
    private static class MyWikiModel extends WikiModel {
        /** If false, template calls produce no output at all. */
        private final boolean keepTemplates;

        /**
         * Creates a new model.
         *
         * @param imageBaseURL first argument of the superclass constructor (the enclosing class passes its image base URL).
         * @param linkBaseURL second argument of the superclass constructor (the enclosing class passes its link base URL).
         * @param keepTemplates whether template calls should be delegated to the superclass.
         */
        public MyWikiModel(String imageBaseURL, String linkBaseURL, boolean keepTemplates) {
            super(imageBaseURL, linkBaseURL);
            this.keepTemplates = keepTemplates;
        }

        /**
         * Delegates to the superclass when templates are kept; otherwise does nothing,
         * so that nothing is appended for the template call.
         */
        @Override
        public void substituteTemplateCall(String templateName, Map<String, String> parameterMap, Appendable writer) throws IOException {
            if (!this.keepTemplates) {
                return;
            }
            super.substituteTemplateCall(templateName, parameterMap, writer);
        }
    }

    /* JADX WARN: Classes with same name are omitted:
      
     */
    /* loaded from: input_file:WEB-INF/lib/mg4j-big-5.4.3.jar:it/unimi/di/big/mg4j/document/WikipediaDocumentSequence$SignedRedirectedStringMap.class */
    /**
     * A string map that wraps a signed function and remaps part of its output through a target array.
     *
     * <p>The wrapped function is expected to return -1 for unknown keys. Values smaller than the
     * number of documents are returned unchanged; any other value <var>x</var> is replaced by
     * {@code target[x - numberOfDocuments]}.
     */
    public static final class SignedRedirectedStringMap extends AbstractObject2LongFunction<CharSequence> implements StringMap<CharSequence> {
        private static final long serialVersionUID = 1;
        /** Values returned by {@link #signedFunction} below this threshold are returned as they are. */
        private final long numberOfDocuments;
        /** The wrapped function; -1 means "unknown key". */
        private final Object2LongFunction<CharSequence> signedFunction;
        /** Replacement values for outputs of {@link #signedFunction} that are at least {@link #numberOfDocuments}. */
        private final long[] target;

        /**
         * Creates a new map.
         *
         * @param numberOfDocuments the threshold separating direct values from remapped ones; also the reported size.
         * @param signedFunction the wrapped function, returning -1 on unknown keys.
         * @param target the array indexed by {@code value - numberOfDocuments} for remapped values.
         */
        public SignedRedirectedStringMap(long numberOfDocuments, Object2LongFunction<CharSequence> signedFunction, long[] target) {
            this.numberOfDocuments = numberOfDocuments;
            this.signedFunction = signedFunction;
            this.target = target;
        }

        /**
         * Returns the (possibly remapped) value associated with a key.
         *
         * @param key the key to look up.
         * @return -1 if the wrapped function returns -1; the wrapped value if it is smaller than the number
         * of documents; otherwise the corresponding entry of the target array.
         */
        @Override
        public long getLong(Object key) {
            final long value = this.signedFunction.getLong(key);
            if (value == -1) {
                return -1L;
            }
            if (value < this.numberOfDocuments) {
                return value;
            }
            // The offset is narrowed to int because it indexes a plain Java array.
            return this.target[(int) (value - this.numberOfDocuments)];
        }

        /** Returns true if and only if the wrapped function does not return -1 for the given key. */
        @Override
        public boolean containsKey(Object key) {
            return this.signedFunction.getLong(key) != -1;
        }

        /** Returns the number of documents passed at construction time. */
        @Override
        public long size64() {
            return this.numberOfDocuments;
        }

        /**
         * Returns {@link #size64()} clamped to {@link Integer#MAX_VALUE}.
         *
         * @deprecated use {@link #size64()}.
         */
        @Override
        @Deprecated
        public int size() {
            // Integer.MAX_VALUE is the intended bound; no need to borrow an equal-valued constant from a DNS library.
            return (int) Math.min(Integer.MAX_VALUE, size64());
        }

        /** Always returns {@code null}: this map does not provide a list view of its keys. */
        @Override
        public ObjectBigList<? extends CharSequence> list() {
            return null;
        }
    }

    /* JADX WARN: Classes with same name are omitted:
      
     */
    /* loaded from: input_file:WEB-INF/lib/mg4j-big-5.4.3.jar:it/unimi/di/big/mg4j/document/WikipediaDocumentSequence$WikipediaHeaderFactory.class */
    /**
     * A factory exposing six header fields of a Wikipedia page: title, id, lastedit, category,
     * firstpar and redirect (in this order).
     *
     * <p>The content of each field is taken from the metadata map passed to
     * {@link #getDocument(InputStream, Reference2ObjectMap)}; the input stream is not read.
     */
    public static final class WikipediaHeaderFactory extends AbstractDocumentFactory {
        private static final long serialVersionUID = 1;
        /** Maps field names to field indices; names that are not fields map to -1. */
        private static final Object2IntOpenHashMap<String> FIELD_2_INDEX = new Object2IntOpenHashMap<>(new String[]{"title", "id", "lastedit", "category", "firstpar", "redirect"}, new int[]{0, 1, 2, 3, 4, 5});

        static {
            FIELD_2_INDEX.defaultReturnValue(-1);
        }

        /** The word reader returned for every field of every document created by this factory. */
        private final WordReader wordReader = new FastBufferedReader();

        /** Returns the number of fields, which is always six. */
        @Override
        public int numberOfFields() {
            return 6;
        }

        /**
         * Returns the name of a field.
         *
         * @param field a field index between 0 and 5.
         * @return the name of the field.
         * @throws IllegalArgumentException if the index is out of range.
         */
        @Override
        public String fieldName(int field) {
            switch (field) {
                case 0:
                    return "title";
                case 1:
                    return "id";
                case 2:
                    return "lastedit";
                case 3:
                    return "category";
                case 4:
                    return "firstpar";
                case 5:
                    return "redirect";
                default:
                    throw new IllegalArgumentException();
            }
        }

        /**
         * Returns the index of a field.
         *
         * @param fieldName a field name.
         * @return the index of the field, or -1 if there is no field with that name.
         */
        @Override
        public int fieldIndex(String fieldName) {
            return FIELD_2_INDEX.getInt(fieldName);
        }

        /**
         * Returns the type of a field.
         *
         * @param field a field index between 0 and 5.
         * @return the type of the field.
         * @throws IllegalArgumentException if the index is out of range.
         */
        @Override
        public DocumentFactory.FieldType fieldType(int field) {
            switch (field) {
                case 0:
                case 3:
                case 4:
                    return DocumentFactory.FieldType.TEXT;
                case 1:
                    return DocumentFactory.FieldType.INT;
                case 2:
                    return DocumentFactory.FieldType.DATE;
                case 5:
                    return DocumentFactory.FieldType.VIRTUAL;
                default:
                    throw new IllegalArgumentException();
            }
        }

        /**
         * Returns a document whose fields are views over the given metadata.
         *
         * @param inputStream ignored by this factory.
         * @param metadata the metadata map the returned document reads from, lazily, on each call.
         * @return a document backed by {@code metadata}.
         */
        @Override
        public Document getDocument(InputStream inputStream, final Reference2ObjectMap<Enum<?>, Object> metadata) throws IOException {
            return new AbstractDocument() {
                @Override
                public WordReader wordReader(int field) {
                    return WikipediaHeaderFactory.this.wordReader;
                }

                @Override
                public CharSequence uri() {
                    return (CharSequence) metadata.get(PropertyBasedDocumentFactory.MetadataKeys.URI);
                }

                @Override
                public CharSequence title() {
                    return (CharSequence) metadata.get(PropertyBasedDocumentFactory.MetadataKeys.TITLE);
                }

                /**
                 * Returns the content of a field.
                 *
                 * @param field a field index between 0 and 5.
                 * @return a reader for text fields, the stored metadata value for the id and
                 * last-edit fields, or an immutable list of anchors for the redirect field.
                 * @throws IllegalArgumentException if the index is out of range.
                 */
                @Override
                public Object content(int field) throws IOException {
                    switch (field) {
                        case 0:
                            return new FastBufferedReader((MutableString) metadata.get(PropertyBasedDocumentFactory.MetadataKeys.TITLE));
                        case 1:
                            return metadata.get(MetadataKeys.ID);
                        case 2:
                            return metadata.get(MetadataKeys.LASTEDIT);
                        case 3:
                            return new FastBufferedReader((MutableString) metadata.get(MetadataKeys.CATEGORY));
                        case 4:
                            return new FastBufferedReader((MutableString) metadata.get(MetadataKeys.FIRSTPAR));
                        case 5: {
                            // The metadata map is untyped, so the element type of the list cannot be checked here.
                            @SuppressWarnings("unchecked")
                            final ObjectArrayList<AnchorExtractor.Anchor> pendingRedirects = (ObjectArrayList<AnchorExtractor.Anchor>) metadata.get(MetadataKeys.REDIRECT);
                            synchronized (pendingRedirects) {
                                // Add an anchor made of this page's URI and title, then hand out a snapshot
                                // and empty the list so that each pending anchor is emitted only once.
                                pendingRedirects.add(new AnchorExtractor.Anchor((MutableString) metadata.get(PropertyBasedDocumentFactory.MetadataKeys.URI), (MutableString) metadata.get(PropertyBasedDocumentFactory.MetadataKeys.TITLE)));
                                final ImmutableList<AnchorExtractor.Anchor> snapshot = ImmutableList.copyOf(pendingRedirects);
                                pendingRedirects.clear();
                                return snapshot;
                            }
                        }
                        default:
                            throw new IllegalArgumentException();
                    }
                }
            };
        }

        /** Returns a new, independent instance of this factory. */
        @Override
        public DocumentFactory copy() {
            return new WikipediaHeaderFactory();
        }
    }

    /**
     * Creates a sequence that drops namespaced pages and template calls.
     * Equivalent to the six-argument constructor with both extra flags set to {@code false}.
     *
     * @param wikipediaXmlDump the Wikipedia XML dump.
     * @param bzipped the "bzipped" flag of the dump.
     * @param baseURL the prefix used to build document URIs from page titles.
     * @param parseText whether the page text should be accumulated and rendered.
     */
    public WikipediaDocumentSequence(String wikipediaXmlDump, boolean bzipped, String baseURL, boolean parseText) {
        this(wikipediaXmlDump, bzipped, baseURL, parseText, false, false);
    }

    /**
     * Creates a sequence using the default anchor settings: 32, 512 and 16 as the three
     * anchor-related integer arguments of the full constructor, and no delimiter.
     *
     * @param wikipediaXmlDump the Wikipedia XML dump.
     * @param bzipped the "bzipped" flag of the dump.
     * @param baseURL the prefix used to build document URIs from page titles.
     * @param parseText whether the page text should be accumulated and rendered.
     * @param keepNamespaced whether pages whose title starts with a known namespace should be kept.
     * @param keepTemplates whether template calls should be expanded by the wiki model.
     */
    public WikipediaDocumentSequence(String wikipediaXmlDump, boolean bzipped, String baseURL, boolean parseText, boolean keepNamespaced, boolean keepTemplates) {
        this(wikipediaXmlDump, bzipped, baseURL, parseText, keepNamespaced, keepTemplates, 32, 512, 16, null);
    }

    /**
     * Creates a sequence with full control over every setting.
     *
     * <p>The document factory is a composite of a {@link WikipediaHeaderFactory} and an
     * {@link HtmlDocumentFactory}, with fields named title, id, lastedit, category, firstpar,
     * redirect, text, dummy and anchor.
     *
     * @param wikipediaXmlDump the Wikipedia XML dump.
     * @param bzipped the "bzipped" flag of the dump.
     * @param baseURL the prefix used to build document URIs from page titles; it is also used,
     * followed by "${title}" and "${image}", as link and image base URL of the wiki model.
     * @param parseText whether the page text should be accumulated and rendered.
     * @param keepNamespaced whether pages whose title starts with a known namespace should be kept.
     * @param keepTemplates whether template calls should be expanded by the wiki model.
     * @param maxPreAnchor value stored under {@code HtmlDocumentFactory.MetadataKeys.MAXPREANCHOR}.
     * @param maxAnchor value stored under {@code HtmlDocumentFactory.MetadataKeys.MAXANCHOR}.
     * @param maxPostAnchor value stored under {@code HtmlDocumentFactory.MetadataKeys.MAXPOSTANCHOR}.
     * @param delimiter value stored under {@code HtmlDocumentFactory.MetadataKeys.DELIMITER}, or
     * {@code null} to leave that key unset.
     */
    public WikipediaDocumentSequence(String wikipediaXmlDump, boolean bzipped, String baseURL, boolean parseText, boolean keepNamespaced, boolean keepTemplates, int maxPreAnchor, int maxAnchor, int maxPostAnchor, String delimiter) {
        this.redirectAnchors = new ObjectArrayList<>();
        this.wikipediaXmlDump = wikipediaXmlDump;
        this.bzipped = bzipped;
        this.baseURL = baseURL;
        this.parseText = parseText;
        this.keepNamespaced = keepNamespaced;

        // Default metadata handed to the HTML factory.
        final Reference2ObjectOpenHashMap<Enum<?>, Object> htmlFactoryMetadata = new Reference2ObjectOpenHashMap<Enum<?>, Object>(
                new Enum<?>[]{HtmlDocumentFactory.MetadataKeys.MAXPREANCHOR, HtmlDocumentFactory.MetadataKeys.MAXANCHOR, HtmlDocumentFactory.MetadataKeys.MAXPOSTANCHOR},
                new Integer[]{Integer.valueOf(maxPreAnchor), Integer.valueOf(maxAnchor), Integer.valueOf(maxPostAnchor)});
        if (delimiter != null) {
            htmlFactoryMetadata.put(HtmlDocumentFactory.MetadataKeys.DELIMITER, delimiter);
        }

        // The first six names are the header fields; the remaining three name the HTML factory's fields.
        this.factory = new CompositeDocumentFactory(
                new DocumentFactory[]{new WikipediaHeaderFactory(), new HtmlDocumentFactory(htmlFactoryMetadata)},
                new String[]{"title", "id", "lastedit", "category", "firstpar", "redirect", "text", "dummy", "anchor"});

        this.linkBaseURL = baseURL + "${title}";
        this.imageBaseURL = baseURL + "${image}";
        this.wikiModel = new MyWikiModel(this.imageBaseURL, this.linkBaseURL, keepTemplates);
    }

    /**
     * String-only variant of the four-argument constructor. Flags are parsed with
     * {@link Boolean#parseBoolean(String)}, so anything but "true" (ignoring case) means {@code false}.
     *
     * @param wikipediaXmlDump the Wikipedia XML dump.
     * @param bzipped the "bzipped" flag of the dump, as a string.
     * @param baseURL the prefix used to build document URIs from page titles.
     * @param parseText whether the page text should be accumulated and rendered, as a string.
     */
    public WikipediaDocumentSequence(String wikipediaXmlDump, String bzipped, String baseURL, String parseText) {
        this(wikipediaXmlDump, Boolean.parseBoolean(bzipped), baseURL, Boolean.parseBoolean(parseText));
    }

    /**
     * String-only variant of the six-argument constructor. Flags are parsed with
     * {@link Boolean#parseBoolean(String)}, so anything but "true" (ignoring case) means {@code false}.
     *
     * @param wikipediaXmlDump the Wikipedia XML dump.
     * @param bzipped the "bzipped" flag of the dump, as a string.
     * @param baseURL the prefix used to build document URIs from page titles.
     * @param parseText whether the page text should be accumulated and rendered, as a string.
     * @param keepNamespaced whether namespaced pages should be kept, as a string.
     * @param keepTemplates whether template calls should be expanded, as a string.
     */
    public WikipediaDocumentSequence(String wikipediaXmlDump, String bzipped, String baseURL, String parseText, String keepNamespaced, String keepTemplates) {
        this(wikipediaXmlDump,
                Boolean.parseBoolean(bzipped),
                baseURL,
                Boolean.parseBoolean(parseText),
                Boolean.parseBoolean(keepNamespaced),
                Boolean.parseBoolean(keepTemplates));
    }

    /**
     * String-only variant of the full constructor without a delimiter. Flags are parsed with
     * {@link Boolean#parseBoolean(String)} and numbers with {@link Integer#parseInt(String)}.
     *
     * @param wikipediaXmlDump the Wikipedia XML dump.
     * @param bzipped the "bzipped" flag of the dump, as a string.
     * @param baseURL the prefix used to build document URIs from page titles.
     * @param parseText whether the page text should be accumulated and rendered, as a string.
     * @param keepNamespaced whether namespaced pages should be kept, as a string.
     * @param keepTemplates whether template calls should be expanded, as a string.
     * @param maxPreAnchor the first anchor-related integer setting, as a string.
     * @param maxAnchor the second anchor-related integer setting, as a string.
     * @param maxPostAnchor the third anchor-related integer setting, as a string.
     * @throws NumberFormatException if one of the last three arguments is not a parsable integer.
     */
    public WikipediaDocumentSequence(String wikipediaXmlDump, String bzipped, String baseURL, String parseText, String keepNamespaced, String keepTemplates, String maxPreAnchor, String maxAnchor, String maxPostAnchor) {
        this(wikipediaXmlDump,
                Boolean.parseBoolean(bzipped),
                baseURL,
                Boolean.parseBoolean(parseText),
                Boolean.parseBoolean(keepNamespaced),
                Boolean.parseBoolean(keepTemplates),
                Integer.parseInt(maxPreAnchor),
                Integer.parseInt(maxAnchor),
                Integer.parseInt(maxPostAnchor),
                null);
    }

    /**
     * String-only variant of the full constructor. Flags are parsed with
     * {@link Boolean#parseBoolean(String)} and numbers with {@link Integer#parseInt(String)};
     * the delimiter is passed through unchanged.
     *
     * @param wikipediaXmlDump the Wikipedia XML dump.
     * @param bzipped the "bzipped" flag of the dump, as a string.
     * @param baseURL the prefix used to build document URIs from page titles.
     * @param parseText whether the page text should be accumulated and rendered, as a string.
     * @param keepNamespaced whether namespaced pages should be kept, as a string.
     * @param keepTemplates whether template calls should be expanded, as a string.
     * @param maxPreAnchor the first anchor-related integer setting, as a string.
     * @param maxAnchor the second anchor-related integer setting, as a string.
     * @param maxPostAnchor the third anchor-related integer setting, as a string.
     * @param delimiter the delimiter handed to the full constructor (possibly {@code null}).
     * @throws NumberFormatException if one of the three integer arguments is not a parsable integer.
     */
    public WikipediaDocumentSequence(String wikipediaXmlDump, String bzipped, String baseURL, String parseText, String keepNamespaced, String keepTemplates, String maxPreAnchor, String maxAnchor, String maxPostAnchor, String delimiter) {
        this(wikipediaXmlDump,
                Boolean.parseBoolean(bzipped),
                baseURL,
                Boolean.parseBoolean(parseText),
                Boolean.parseBoolean(keepNamespaced),
                Boolean.parseBoolean(keepTemplates),
                Integer.parseInt(maxPreAnchor),
                Integer.parseInt(maxAnchor),
                Integer.parseInt(maxPostAnchor),
                delimiter);
    }

    @Override // it.unimi.di.big.mg4j.document.DocumentSequence
    public DocumentIterator iterator() throws IOException {
        SAXParserFactory newInstance = SAXParserFactory.newInstance();
        newInstance.setNamespaceAware(true);
        final MutableString mutableString = new MutableString();
        final ObjectOpenHashSet objectOpenHashSet = new ObjectOpenHashSet();
        final ArrayBlockingQueue arrayBlockingQueue = new ArrayBlockingQueue(16);
        int remainingCapacity = arrayBlockingQueue.remainingCapacity();
        while (true) {
            int i = remainingCapacity;
            remainingCapacity--;
            if (i == 0) {
                final ArrayBlockingQueue arrayBlockingQueue2 = new ArrayBlockingQueue(arrayBlockingQueue.size());
                try {
                    final SAXParser newSAXParser = newInstance.newSAXParser();
                    final DefaultHandler defaultHandler = new DefaultHandler() { // from class: it.unimi.di.big.mg4j.document.WikipediaDocumentSequence.1
                        private boolean inText;
                        private boolean inTitle;
                        private boolean inId;
                        private boolean inTimestamp;
                        private boolean inNamespaceDef;
                        private boolean redirect;
                        private final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'");
                        private MutableString text = new MutableString();
                        private MutableString title = new MutableString();
                        private MutableString id = new MutableString();
                        private MutableString timestamp = new MutableString();
                        private final Reference2ObjectMap<Enum<?>, Object> metadata = new Reference2ObjectOpenHashMap();

                        {
                            this.metadata.put(PropertyBasedDocumentFactory.MetadataKeys.ENCODING, "UTF-8");
                            this.metadata.put(MetadataKeys.REDIRECT, WikipediaDocumentSequence.this.redirectAnchors);
                        }

                        @Override // org.xml.sax.helpers.DefaultHandler, org.xml.sax.ContentHandler
                        public void startElement(String str, String str2, String str3, Attributes attributes) throws SAXException {
                            if ("page".equals(str2)) {
                                this.inTimestamp = false;
                                this.inId = false;
                                this.inTitle = false;
                                this.inText = false;
                                this.redirect = false;
                                this.text.length(0);
                                this.title.length(0);
                                this.id.length(0);
                                this.timestamp.length(0);
                                return;
                            }
                            if ("text".equals(str2)) {
                                this.inText = true;
                                return;
                            }
                            if ("title".equals(str2) && this.title.length() == 0) {
                                this.inTitle = true;
                                return;
                            }
                            if ("id".equals(str2) && this.id.length() == 0) {
                                this.inId = true;
                                return;
                            }
                            if ("timestamp".equals(str2) && this.timestamp.length() == 0) {
                                this.inTimestamp = true;
                                return;
                            }
                            if (!"redirect".equals(str2)) {
                                if ("namespace".equals(str2)) {
                                    this.inNamespaceDef = true;
                                    mutableString.length(0);
                                    return;
                                }
                                return;
                            }
                            this.redirect = true;
                            if (attributes.getValue("title") != null) {
                                synchronized (WikipediaDocumentSequence.this.redirectAnchors) {
                                    String encodeTitleToUrl = Encoder.encodeTitleToUrl(attributes.getValue("title"), true);
                                    WikipediaDocumentSequence.this.redirectAnchors.add(new AnchorExtractor.Anchor(new MutableString(WikipediaDocumentSequence.this.baseURL.length() + encodeTitleToUrl.length()).append(WikipediaDocumentSequence.this.baseURL).append(encodeTitleToUrl), this.title.copy()));
                                }
                            }
                        }

                        @Override // org.xml.sax.helpers.DefaultHandler, org.xml.sax.ContentHandler
                        public void endElement(String str, String str2, String str3) throws SAXException {
                            int indexOf;
                            if ("namespace".equals(str2)) {
                                if (mutableString.length() != 0) {
                                    objectOpenHashSet.add(mutableString.copy().toLowerCase());
                                    return;
                                }
                                return;
                            }
                            if ("namespaces".equals(str2)) {
                                WikipediaDocumentSequence.this.nameSpaces = ImmutableSet.copyOf((Collection) objectOpenHashSet);
                                return;
                            }
                            if (this.redirect) {
                                return;
                            }
                            if ("title".equals(str2)) {
                                this.metadata.put(PropertyBasedDocumentFactory.MetadataKeys.TITLE, this.title.copy());
                                String encodeTitleToUrl = Encoder.encodeTitleToUrl(this.title.toString(), true);
                                this.metadata.put(PropertyBasedDocumentFactory.MetadataKeys.URI, new MutableString(WikipediaDocumentSequence.this.baseURL.length() + encodeTitleToUrl.length()).append(WikipediaDocumentSequence.this.baseURL).append(encodeTitleToUrl));
                                this.inTitle = false;
                                return;
                            }
                            if ("id".equals(str2)) {
                                this.metadata.put(MetadataKeys.ID, Long.valueOf(this.id.toString()));
                                this.inId = false;
                                return;
                            }
                            if ("timestamp".equals(str2)) {
                                try {
                                    this.metadata.put(MetadataKeys.LASTEDIT, this.dateFormat.parse(this.timestamp.toString()));
                                    this.inTimestamp = false;
                                    return;
                                } catch (ParseException e) {
                                    throw new RuntimeException(e.getMessage(), e);
                                }
                            }
                            if ("text".equals(str2)) {
                                this.inText = false;
                                if (WikipediaDocumentSequence.this.keepNamespaced || (indexOf = this.title.indexOf(':')) == -1 || !WikipediaDocumentSequence.this.nameSpaces.contains(this.title.substring(0, indexOf).toLowerCase())) {
                                    try {
                                        MutableString mutableString2 = new MutableString();
                                        try {
                                            DocumentFactory documentFactory = (DocumentFactory) arrayBlockingQueue.take();
                                            if (WikipediaDocumentSequence.this.parseText) {
                                                if (WikipediaDocumentSequence.DISAMBIGUATION.search(this.text) != -1) {
                                                    MutableString mutableString3 = new MutableString();
                                                    int i2 = 0;
                                                    while (true) {
                                                        int search = WikipediaDocumentSequence.BRACKETS_OPEN.search(this.text, i2);
                                                        if (search == -1) {
                                                            break;
                                                        }
                                                        int i3 = search;
                                                        int indexOfAnyOf = this.text.indexOfAnyOf(WikipediaDocumentSequence.END_OF_DISAMBIGUATION_LINK, search);
                                                        if (indexOfAnyOf != -1) {
                                                            mutableString3.append(this.text.array(), search, indexOfAnyOf - search).append('|').append(this.title).append("]]\n");
                                                            i3 = indexOfAnyOf;
                                                        }
                                                        i2 = i3 + 1;
                                                    }
                                                    this.text.append(mutableString3);
                                                }
                                                MutableString mutableString4 = new MutableString();
                                                int i4 = 0;
                                                while (true) {
                                                    int search2 = WikipediaDocumentSequence.CATEGORY_START.search(this.text, i4);
                                                    if (search2 == -1) {
                                                        break;
                                                    }
                                                    TextPattern textPattern = WikipediaDocumentSequence.BRACKETS_CLOSED;
                                                    MutableString mutableString5 = this.text;
                                                    int length = search2 + WikipediaDocumentSequence.CATEGORY_START.length();
                                                    int search3 = textPattern.search(mutableString5, length);
                                                    if (search3 == -1) {
                                                        break;
                                                    }
                                                    mutableString4.append(this.text.subSequence(length, search3)).append(" OXOXO ");
                                                    i4 = search3;
                                                }
                                                this.metadata.put(MetadataKeys.CATEGORY, mutableString4);
                                                this.metadata.put(MetadataKeys.FIRSTPAR, new MutableString());
                                                String render = WikipediaDocumentSequence.this.wikiModel.render(new PlainTextConverter(true), this.text.toString());
                                                int i5 = 0;
                                                while (i5 < render.length()) {
                                                    if (!Character.isWhitespace(render.charAt(i5))) {
                                                        if (render.charAt(i5) == '{') {
                                                            int search4 = WikipediaDocumentSequence.BRACES_CLOSED.search(render, i5);
                                                            if (search4 == -1) {
                                                                break;
                                                            } else {
                                                                i5 = search4 + 1;
                                                            }
                                                        } else if (render.charAt(i5) == '[') {
                                                            int search5 = WikipediaDocumentSequence.BRACKETS_CLOSED.search(render, i5);
                                                            if (search5 == -1) {
                                                                break;
                                                            } else {
                                                                i5 = search5 + 1;
                                                            }
                                                        } else {
                                                            int indexOf2 = render.indexOf(10, i5);
                                                            if (indexOf2 != -1) {
                                                                this.metadata.put(MetadataKeys.FIRSTPAR, new MutableString(render.substring(i5, indexOf2)));
                                                            }
                                                        }
                                                    }
                                                    i5++;
                                                }
                                                try {
                                                    WikipediaDocumentSequence.this.wikiModel.render(new HTMLConverter(), this.text.toString(), mutableString2, false, true);
                                                    for (Map.Entry<String, String> entry : WikipediaDocumentSequence.this.wikiModel.getCategories().entrySet()) {
                                                        String key = entry.getKey();
                                                        if (entry.getValue().trim().length() != 0) {
                                                            mutableString2.append("\n<a href=\"").append(WikipediaDocumentSequence.this.baseURL).append("Category:").append(Encoder.encodeTitleToUrl(key, true)).append("\">").append(HtmlEscapers.htmlEscaper().escape(key)).append("</a>\n");
                                                        }
                                                    }
                                                } catch (Exception e2) {
                                                    WikipediaDocumentSequence.LOGGER.error("Unexpected exception while parsing " + ((Object) this.title), (Throwable) e2);
                                                }
                                            }
                                            arrayBlockingQueue2.put(new DocumentAndFactory(documentFactory.getDocument(IOUtils.toInputStream(mutableString2, Charsets.UTF_8), new Reference2ObjectOpenHashMap((Reference2ObjectMap) this.metadata)), documentFactory));
                                        } catch (InterruptedException e3) {
                                            throw new RuntimeException(e3.getMessage(), e3);
                                        }
                                    } catch (IOException e4) {
                                        throw new RuntimeException(e4.getMessage(), e4);
                                    } catch (InterruptedException e5) {
                                        throw new RuntimeException(e5.getMessage(), e5);
                                    }
                                }
                            }
                        }

                        /**
                         * Routes a chunk of SAX character data to every accumulator whose element is
                         * currently open.
                         *
                         * <p>Text is collected only when the enclosing sequence has {@code parseText} set.
                         * Title, id and timestamp are collected whenever their flag is raised. For a
                         * namespace definition the chunk is appended to the captured {@code mutableString}
                         * and the flag is lowered immediately, so only the first chunk delivered for that
                         * element is recorded.
                         *
                         * @param cArr the character buffer supplied by the parser.
                         * @param i2 the offset of the first character to read in {@code cArr}.
                         * @param i3 the number of characters to read.
                         */
                        @Override // org.xml.sax.helpers.DefaultHandler, org.xml.sax.ContentHandler
                        public void characters(char[] cArr, int i2, int i3) throws SAXException {
                            final boolean collectingText = inText && WikipediaDocumentSequence.this.parseText;
                            if (collectingText) {
                                text.append(cArr, i2, i3);
                            }
                            if (inTitle) {
                                title.append(cArr, i2, i3);
                            }
                            if (inId) {
                                id.append(cArr, i2, i3);
                            }
                            if (inTimestamp) {
                                timestamp.append(cArr, i2, i3);
                            }
                            if (!inNamespaceDef) {
                                return;
                            }
                            // Record this chunk, then stop listening for further namespace characters.
                            mutableString.append(cArr, i2, i3);
                            inNamespaceDef = false;
                        }

                        /**
                         * Treats ignorable whitespace like ordinary characters for the text and title
                         * accumulators, so whitespace reported through this callback is not lost.
                         *
                         * @param cArr the character buffer supplied by the parser.
                         * @param i2 the offset of the first character to read in {@code cArr}.
                         * @param i3 the number of characters to read.
                         */
                        @Override // org.xml.sax.helpers.DefaultHandler, org.xml.sax.ContentHandler
                        public void ignorableWhitespace(char[] cArr, int i2, int i3) throws SAXException {
                            final boolean collectingText = inText && WikipediaDocumentSequence.this.parseText;
                            if (collectingText) {
                                text.append(cArr, i2, i3);
                            }
                            if (inTitle) {
                                title.append(cArr, i2, i3);
                            }
                        }
                    };
                    new Thread() { // from class: it.unimi.di.big.mg4j.document.WikipediaDocumentSequence.2
                        /**
                         * Parses the Wikipedia XML dump on this thread, feeding the SAX handler, and
                         * enqueues the {@code END} marker once parsing completes.
                         *
                         * <p>The dump is decompressed on the fly when the sequence is flagged as bzipped.
                         * The input stream is always closed when this method exits, whether parsing
                         * succeeded or not. Any failure is rethrown as a {@link RuntimeException} carrying
                         * the original cause.
                         *
                         * <p>NOTE(review): when parsing fails, {@code END} is never enqueued, so a consumer
                         * blocked on the queue is not woken up; confirm whether that is intended.
                         */
                        @Override // java.lang.Thread, java.lang.Runnable
                        public void run() {
                            InputStream dumpStream = null;
                            try {
                                dumpStream = new FileInputStream(WikipediaDocumentSequence.this.wikipediaXmlDump);
                                if (WikipediaDocumentSequence.this.bzipped) {
                                    // If the wrapper constructor throws, dumpStream still refers to the raw
                                    // file stream, which the finally block below closes.
                                    dumpStream = new BZip2CompressorInputStream(dumpStream);
                                }
                                newSAXParser.parse(new InputSource(new InputStreamReader(new FastBufferedInputStream(dumpStream), Charsets.UTF_8)), defaultHandler);
                                arrayBlockingQueue2.put(WikipediaDocumentSequence.END);
                            } catch (Exception e) {
                                throw new RuntimeException(e.getMessage(), e);
                            } finally {
                                if (dumpStream != null) {
                                    try {
                                        dumpStream.close();
                                    } catch (IOException ignored) {
                                        // The stream was only read from; a failure while closing it cannot
                                        // lose data and must not mask an exception raised by parsing.
                                    }
                                }
                            }
                        }
                    }.start();
                    return new AbstractDocumentIterator() { // from class: it.unimi.di.big.mg4j.document.WikipediaDocumentSequence.3
                        private DocumentFactory lastFactory;

                        /**
                         * Returns the next document taken from the queue filled by the parsing thread.
                         *
                         * <p>Blocks until an element is available. Before returning, the factory that was
                         * handed out with the previously returned document (if any) is put back on the
                         * factory queue (presumably so the parsing thread can reuse it; confirm against
                         * the handler code).
                         *
                         * @return the next document, or {@code null} when the {@code END} marker is
                         * dequeued.
                         * @throws IOException declared by the interface; not thrown by this implementation.
                         * @throws RuntimeException if the calling thread is interrupted while waiting; the
                         * thread's interrupt status is restored before the exception is thrown.
                         */
                        @Override // it.unimi.di.big.mg4j.document.DocumentIterator
                        public Document nextDocument() throws IOException {
                            try {
                                DocumentAndFactory documentAndFactory = (DocumentAndFactory) arrayBlockingQueue2.take();
                                if (this.lastFactory != null) {
                                    arrayBlockingQueue.put(this.lastFactory);
                                }
                                if (documentAndFactory == WikipediaDocumentSequence.END) {
                                    return null;
                                }
                                this.lastFactory = documentAndFactory.factory;
                                return documentAndFactory.document;
                            } catch (InterruptedException e) {
                                // Catching InterruptedException clears the interrupt flag; set it again so
                                // that code further up the stack can still observe the interruption.
                                Thread.currentThread().interrupt();
                                throw new RuntimeException(e.getMessage(), e);
                            }
                        }
                    };
                } catch (Exception e) {
                    throw new RuntimeException(e.getMessage(), e);
                }
            }
            arrayBlockingQueue.add(this.factory.copy());
        }
    }

    /**
     * Returns the document factory held by this sequence.
     *
     * @return the value of the {@code factory} field.
     */
    @Override // it.unimi.di.big.mg4j.document.DocumentSequence
    public DocumentFactory factory() {
        return factory;
    }

    /**
     * Command-line entry point: extracts the redirects of a Wikipedia dump and merges them into an
     * existing virtual document resolver.
     *
     * <p>The work proceeds in three steps:
     * <ol>
     * <li>The dump is scanned with a SAX parser; for every page containing a {@code redirect} element
     * with a {@code title} attribute, the pair (page title, redirect target) is recorded.
     * <li>Each pair is turned into two URLs by prefixing the base URL to the encoded titles. A redirect
     * is accepted only if its source URL is unknown to the precomputed resolver and its target URL is
     * known; otherwise a warning is logged.
     * <li>A new resolver, built over the original URIs followed by the accepted redirect URLs, is
     * serialized to the {@code redvdr} file.
     * </ol>
     *
     * @param strArr the command-line arguments, as described by the JSAP parameter list below.
     * @throws ParserConfigurationException if a SAX parser cannot be created.
     * @throws SAXException if the dump cannot be parsed.
     * @throws IOException if reading the inputs or writing the resulting resolver fails.
     * @throws JSAPException if the command-line definition is invalid.
     * @throws ClassNotFoundException if the precomputed resolver cannot be deserialized.
     */
    public static void main(String[] strArr) throws ParserConfigurationException, SAXException, IOException, JSAPException, ClassNotFoundException {
        SimpleJSAP simpleJSAP = new SimpleJSAP(
                WikipediaDocumentSequence.class.getName(),
                "Computes the redirects of a Wikipedia dump and integrate them into an existing virtual document resolver for the dump.",
                new Parameter[]{
                    new Switch(CompressorStreamFactory.BZIP2, 'b', CompressorStreamFactory.BZIP2, "The file is compressed with bzip2"),
                    new Switch("iso", 'i', "iso", "Use ISO-8859-1 coding internally (i.e., just use the lower eight bits of each character)."),
                    new FlaggedOption("width", JSAP.INTEGER_PARSER, Integer.toString(64), false, 'w', "width", "The width, in bits, of the signatures used to sign the function from URIs to their rank."),
                    new UnflaggedOption("file", JSAP.STRING_PARSER, true, "The file containing the Wikipedia dump."),
                    new UnflaggedOption("baseURL", JSAP.STRING_PARSER, true, "The base URL for the collection (e.g., http://en.wikipedia.org/wiki/)."),
                    new UnflaggedOption("uris", JSAP.STRING_PARSER, true, "The URIs of the documents in the collection (generated by ScanMetadata)."),
                    new UnflaggedOption("vdr", JSAP.STRING_PARSER, true, "The name of a precomputed virtual document resolver for the collection."),
                    new UnflaggedOption("redvdr", JSAP.STRING_PARSER, true, "The name of the resulting virtual document resolver.")
                });
        JSAPResult jsapResult = simpleJSAP.parse(strArr);
        if (simpleJSAP.messagePrinted()) {
            return;
        }

        SAXParserFactory saxParserFactory = SAXParserFactory.newInstance();
        saxParserFactory.setNamespaceAware(true);
        // Maps the title of each redirecting page to the title it redirects to.
        final Object2ObjectOpenHashMap<MutableString, String> redirects = new Object2ObjectOpenHashMap<MutableString, String>();
        final String baseURL = jsapResult.getString("baseURL");
        final ProgressLogger progressLogger = new ProgressLogger(LOGGER);
        progressLogger.itemsName = "redirects";
        progressLogger.start("Extracting redirects...");
        SAXParser saxParser = saxParserFactory.newSAXParser();
        DefaultHandler redirectHandler = new DefaultHandler() {
            /** Whether we are inside the first {@code title} element of the current page. */
            private boolean inTitle;
            /** The title of the current page. */
            private MutableString title = new MutableString();

            @Override
            public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
                if ("page".equals(localName)) {
                    // A new page starts: forget the previous title.
                    this.inTitle = false;
                    this.title.length(0);
                } else if ("title".equals(localName) && this.title.length() == 0) {
                    // Only a title element seen while no title has been collected yet is recorded.
                    this.inTitle = true;
                } else if ("redirect".equals(localName)) {
                    final String target = attributes.getValue("title");
                    if (target != null) {
                        progressLogger.update();
                        // The title buffer is reused across pages, so a copy is stored as the key.
                        redirects.put(this.title.copy(), target);
                    }
                }
            }

            @Override
            public void endElement(String uri, String localName, String qName) throws SAXException {
                if ("title".equals(localName)) {
                    this.inTitle = false;
                }
            }

            @Override
            public void characters(char[] chars, int start, int length) throws SAXException {
                if (this.inTitle) {
                    this.title.append(chars, start, length);
                }
            }

            @Override
            public void ignorableWhitespace(char[] chars, int start, int length) throws SAXException {
                if (this.inTitle) {
                    this.title.append(chars, start, length);
                }
            }
        };

        InputStream dumpStream = new FileInputStream(jsapResult.getString("file"));
        try {
            if (jsapResult.userSpecified(CompressorStreamFactory.BZIP2)) {
                // If the wrapper constructor throws, dumpStream still refers to the raw file stream,
                // which the finally block below closes.
                dumpStream = new BZip2CompressorInputStream(dumpStream);
            }
            saxParser.parse(new InputSource(new InputStreamReader(new FastBufferedInputStream(dumpStream), Charsets.UTF_8)), redirectHandler);
        } finally {
            try {
                dumpStream.close();
            } catch (IOException ignored) {
                // The stream was only read from; a failure while closing it cannot lose data and
                // must not mask an exception raised by parsing.
            }
        }
        progressLogger.done();

        // Accepted redirects, in insertion order: source URL -> index of the target document.
        final Object2LongLinkedOpenHashMap<MutableString> resolved = new Object2LongLinkedOpenHashMap<MutableString>();
        final VirtualDocumentResolver resolver = (VirtualDocumentResolver) BinIO.loadObject(jsapResult.getString("vdr"));
        progressLogger.expectedUpdates = redirects.size();
        progressLogger.start("Examining redirects...");
        for (Map.Entry<MutableString, String> redirect : redirects.entrySet()) {
            final MutableString sourceURL = new MutableString().append(baseURL).append(Encoder.encodeTitleToUrl(redirect.getKey().toString(), true));
            final MutableString targetURL = new MutableString().append(baseURL).append(Encoder.encodeTitleToUrl(redirect.getValue(), true));
            if (resolver.resolve(sourceURL) == -1) {
                final long target = resolver.resolve(targetURL);
                if (target != -1) {
                    resolved.put(sourceURL.copy(), target);
                } else {
                    LOGGER.warn("Failed redirect: " + sourceURL + " -> " + targetURL);
                }
            } else {
                LOGGER.warn("URL " + sourceURL + " is already known to the virtual document resolver");
            }
            progressLogger.lightUpdate();
        }
        progressLogger.done();

        // The original URIs come first, followed by the accepted redirect URLs in insertion order,
        // which is the same order as the values of the linked map passed below.
        final Iterable<MutableString> allURIs = Iterables.concat(new FileLinesCollection(jsapResult.getString("uris"), "UTF-8"), resolved.keySet());
        BinIO.storeObject(
                new URLMPHVirtualDocumentResolver(
                        new SignedRedirectedStringMap(
                                resolver.numberOfDocuments(),
                                new ShiftAddXorSignedStringMap(
                                        allURIs.iterator(),
                                        new GOV3Function.Builder<MutableString>()
                                                .keys(allURIs)
                                                .transform(jsapResult.userSpecified("iso") ? TransformationStrategies.iso() : TransformationStrategies.utf16())
                                                .build(),
                                        jsapResult.getInt("width")),
                                resolved.values().toLongArray())),
                jsapResult.getString("redvdr"));
    }
}
