package gate.corpora;

import com.google.gwt.uibinder.client.impl.AbstractUiRenderer;
import gate.Document;
import gate.GateConstants;
import gate.Resource;
import gate.TextualDocument;
import gate.creole.ResourceInstantiationException;
import gate.creole.metadata.AutoInstance;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.event.StatusListener;
import gate.html.NekoHtmlDocumentHandler;
import gate.util.DocumentFormatException;
import gate.util.Out;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.net.URLConnection;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import org.apache.xerces.xni.parser.XMLInputSource;
import org.cyberneko.html.HTMLConfiguration;
import org.cyberneko.html.HTMLScanner;
import org.eclipse.jetty.http.HttpHeaderValues;

/**
 * Document format that unpacks HTML markup using the NekoHTML parser.
 *
 * <p>On {@link #init()} the format registers itself for the {@code text/html}
 * MIME type (also mapped from {@code application/xhtml+xml}), the
 * {@code html}/{@code htm} file suffixes and the {@code <html} magic string.
 */
@CreoleResource(name = "GATE HTML Document Format", isPrivate = true, autoinstances = {@AutoInstance(hidden = true)})
/* loaded from: input_file:WEB-INF/lib/gate-core-9.0.1.jar:gate/corpora/NekoHtmlDocumentFormat.class */
public class NekoHtmlDocumentFormat extends TextualDocumentFormat {
    private static final long serialVersionUID = -3163147687966075651L;
    private static final boolean DEBUG = false;

    /** Value of the HTTP {@code Content-Encoding} header that marks a gzip-compressed body. */
    private static final String GZIP_CONTENT_ENCODING = "gzip";

    /**
     * Matches the empty position at the beginning of a line. Compiled once with
     * {@link Pattern#MULTILINE} so that {@code ^} matches after line terminators
     * and not only at the start of the input.
     */
    private static final Pattern AFTER_NEWLINE_PATTERN = Pattern.compile("^", Pattern.MULTILINE);

    /** Names of the HTML tags whose text content is handed to the handler as ignorable. */
    private Set<String> ignorableTags = null;

    /**
     * Sets the tags whose text content should be ignored.
     *
     * @param set the tag names; stored as-is (no defensive copy)
     */
    @CreoleParameter(comment = "HTML tags whose text content should be ignored", defaultValue = "script;style;iframe")
    public void setIgnorableTags(Set<String> set) {
        this.ignorableTags = set;
    }

    /**
     * Returns the tags whose text content should be ignored.
     *
     * @return the set previously passed to {@link #setIgnorableTags(Set)}, or {@code null} if none was set
     */
    public Set<String> getIgnorableTags() {
        return this.ignorableTags;
    }

    /**
     * Reports whether this format supports repositioning information.
     *
     * @return always {@link Boolean#TRUE}
     */
    @Override // gate.DocumentFormat
    public Boolean supportsRepositioning() {
        return Boolean.TRUE;
    }

    /**
     * Unpacks the markup of the given document without collecting repositioning information.
     *
     * @param document the document to parse
     * @throws DocumentFormatException see {@link #unpackMarkup(Document, RepositioningInfo, RepositioningInfo)}
     */
    @Override // gate.corpora.TextualDocumentFormat, gate.DocumentFormat
    public void unpackMarkup(Document document) throws DocumentFormatException {
        unpackMarkup(document, null, null);
    }

    /**
     * Parses the document with NekoHTML, feeding the parse events to a
     * {@link NekoHtmlDocumentHandler} created for the document.
     *
     * <p>The input is chosen as follows: the in-memory content when
     * {@code hasContentButNoValidUrl(document)} is true; otherwise, for a
     * {@link TextualDocument}, the source URL read with the document's own
     * encoding (transparently un-gzipping when the connection reports a
     * {@code gzip} content encoding); otherwise the source URL, leaving stream
     * handling to the parser.
     *
     * <p>Failures other than I/O errors mark the document with the
     * {@code parsingError} feature. They are rethrown only if the document
     * feature {@link GateConstants#THROWEX_FORMAT_PROPERTY_NAME} is {@code true};
     * otherwise a warning and the stack trace are written to {@link Out}.
     *
     * @param document the document to parse
     * @param repositioningInfo passed to the handler as its repositioning info; may be {@code null}
     * @param repositioningInfo2 passed to the handler as its "amp coding" info; may be {@code null}
     * @throws DocumentFormatException if the document is {@code null} or has neither source URL nor
     *         content, if an I/O error occurs, or if parsing fails and rethrowing was requested
     */
    @Override // gate.corpora.TextualDocumentFormat, gate.DocumentFormat
    public void unpackMarkup(Document document, RepositioningInfo repositioningInfo, RepositioningInfo repositioningInfo2) throws DocumentFormatException {
        if (document == null || (document.getSourceUrl() == null && document.getContent() == null)) {
            throw new DocumentFormatException("GATE document is null or no content found. Nothing to parse!");
        }
        // Relays status messages from the handler to the listeners of this format.
        StatusListener statusListener = new StatusListener() {
            @Override // gate.event.StatusListener
            public void statusChanged(String str) {
                NekoHtmlDocumentFormat.this.fireStatusChanged(str);
            }
        };
        boolean hasContentButNoValidUrl = hasContentButNoValidUrl(document);
        NekoHtmlDocumentHandler handler = null;
        // Stream opened by this method for the URL case; closed in the finally block below.
        InputStream urlStream = null;
        try {
            HTMLConfiguration parser = new HTMLConfiguration();
            parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
            parser.setProperty("http://cyberneko.org/html/properties/names/attrs", "lower");
            parser.setFeature(NekoHtmlDocumentHandler.AUGMENTATIONS, true);

            handler = new NekoHtmlDocumentHandler(document, null, this.ignorableTags);
            handler.addStatusListener(statusListener);
            handler.setRepositioningInfo(repositioningInfo);
            handler.setAmpCodingInfo(repositioningInfo2);
            handler.setLineOffsets(buildLineOffsets(document.getContent().toString()));
            parser.setDocumentHandler(handler);
            parser.setErrorHandler(handler);

            XMLInputSource input;
            if (hasContentButNoValidUrl) {
                input = new XMLInputSource((String) null, (String) null, (String) null, new StringReader(document.getContent().toString()), (String) null);
            } else if (document instanceof TextualDocument) {
                String encoding = ((TextualDocument) document).getEncoding();
                URLConnection connection = document.getSourceUrl().openConnection();
                urlStream = connection.getInputStream();
                if (GZIP_CONTENT_ENCODING.equals(connection.getContentEncoding())) {
                    // If the wrapper constructor throws, urlStream still refers to the raw
                    // stream, so the finally block closes it either way.
                    urlStream = new GZIPInputStream(urlStream);
                }
                String systemId = document.getSourceUrl().toString();
                input = new XMLInputSource((String) null, systemId, systemId, new InputStreamReader(urlStream, encoding), encoding);
                // The reader already decodes with the document's encoding, so any charset
                // declared inside the HTML must not override it.
                parser.setFeature(HTMLScanner.IGNORE_SPECIFIED_CHARSET, true);
            } else {
                input = new XMLInputSource(null, document.getSourceUrl().toString(), document.getSourceUrl().toString());
            }

            parser.parse(input);
            ((DocumentImpl) document).setNextAnnotationId(handler.getCustomObjectsId());
        } catch (IOException e) {
            // Concatenation (rather than toString()) keeps this safe when the source URL is null.
            throw new DocumentFormatException("I/O exception for " + document.getSourceUrl(), e);
        } catch (Exception e) {
            document.getFeatures().put("parsingError", Boolean.TRUE);
            Boolean rethrow = (Boolean) document.getFeatures().get(GateConstants.THROWEX_FORMAT_PROPERTY_NAME);
            if (rethrow != null && rethrow.booleanValue()) {
                throw new DocumentFormatException(e);
            }
            Out.println("Warning: Document remains unparsed. \n\n  Stack Dump: ");
            e.printStackTrace(Out.getPrintWriter());
        } finally {
            if (handler != null) {
                handler.removeStatusListener(statusListener);
            }
            closeSourceStreamQuietly(urlStream);
        }
    }

    /**
     * Closes the stream opened for a URL-backed document, if any.
     *
     * @param stream the stream to close; may be {@code null}
     */
    private static void closeSourceStreamQuietly(InputStream stream) {
        if (stream == null) {
            return;
        }
        try {
            stream.close();
        } catch (IOException ignored) {
            // Best effort: parsing has already finished or failed, and a close failure
            // must not mask the primary outcome.
        }
    }

    /**
     * Computes the character offsets at which {@link #AFTER_NEWLINE_PATTERN} matches in the text.
     *
     * @param str the text to scan
     * @return the match start offsets, in increasing order
     */
    private int[] buildLineOffsets(String str) {
        Matcher matcher = AFTER_NEWLINE_PATTERN.matcher(str);
        // First pass: count the matches so the array can be sized exactly.
        int lineCount = 0;
        while (matcher.find()) {
            lineCount++;
        }
        // Second pass: record where each match starts.
        int[] offsets = new int[lineCount];
        matcher.reset();
        for (int line = 0; line < offsets.length; line++) {
            matcher.find();
            offsets[line] = matcher.start();
        }
        return offsets;
    }

    /**
     * Registers this format in the inherited MIME type, suffix and magic-number lookup maps.
     *
     * @return this instance
     * @throws ResourceInstantiationException declared by the overridden method; not thrown here
     */
    @Override // gate.corpora.TextualDocumentFormat, gate.creole.AbstractResource, gate.Resource
    public Resource init() throws ResourceInstantiationException {
        MimeType mimeType = new MimeType("text", "html");
        String mimeKey = mimeType.getType() + "/" + mimeType.getSubtype();
        mimeString2ClassHandlerMap.put(mimeKey, this);
        mimeString2mimeTypeMap.put(mimeKey, mimeType);
        mimeString2mimeTypeMap.put("application/xhtml+xml", mimeType);
        suffixes2mimeTypeMap.put("html", mimeType);
        suffixes2mimeTypeMap.put("htm", mimeType);
        magic2mimeTypeMap.put("<html", mimeType);
        setMimeType(mimeType);
        return this;
    }
}
