package it.unimi.di.law.bubing.parser;

import com.google.common.base.Charsets;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hasher;
import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.Switch;
import com.martiansoftware.jsap.UnflaggedOption;
import it.unimi.di.law.bubing.parser.Parser;
import it.unimi.di.law.bubing.util.BURL;
import it.unimi.di.law.bubing.util.ByteArrayCharSequence;
import it.unimi.di.law.bubing.util.Util;
import it.unimi.di.law.warc.filters.URIResponse;
import it.unimi.di.law.warc.records.WarcHeader;
import it.unimi.di.law.warc.records.WarcRecord;
import it.unimi.di.law.warc.util.StringHttpMessages;
import it.unimi.dsi.fastutil.io.InspectableFileCachedInputStream;
import it.unimi.dsi.fastutil.objects.ObjectLinkedOpenHashSet;
import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet;
import it.unimi.dsi.fastutil.objects.Reference2ObjectOpenHashMap;
import it.unimi.dsi.lang.ObjectParser;
import it.unimi.dsi.util.TextPattern;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintStream;
import java.lang.reflect.InvocationTargetException;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.security.NoSuchAlgorithmException;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import net.htmlparser.jericho.CharacterReference;
import net.htmlparser.jericho.EndTag;
import net.htmlparser.jericho.EndTagType;
import net.htmlparser.jericho.HTMLElementName;
import net.htmlparser.jericho.HTMLElements;
import net.htmlparser.jericho.Segment;
import net.htmlparser.jericho.StartTag;
import net.htmlparser.jericho.StartTagType;
import net.htmlparser.jericho.StreamedSource;
import org.apache.commons.codec.binary.Hex;
import org.apache.commons.codec.digest.MessageDigestAlgorithms;
import org.apache.commons.io.IOUtils;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.impl.client.HttpClients;
import org.hibernate.boot.jaxb.Origin;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:WEB-INF/lib/bubing-0.9.11.jar:it/unimi/di/law/bubing/parser/HTMLParser.class */
public class HTMLParser<T> implements Parser<T> {
    /** Logger used for the debug messages emitted while parsing. */
    private static final Logger LOGGER = LoggerFactory.getLogger((Class<?>) HTMLParser.class);
    /** Pattern searched in the {@code content} attribute of a META refresh to locate the target URL (assigned in the static initializer). */
    protected static final TextPattern URLEQUAL_PATTERN;
    /** Default size, in characters, of the parser buffer used by the constructors that do not take an explicit size. */
    public static final int CHAR_BUFFER_SIZE = 131072;
    /** The character buffer handed to the streamed source, or {@code null} if the parser was created with size zero. */
    protected final char[] buffer;
    /** The charset name guessed by the last call to {@code parse}. */
    protected String guessedCharset;
    /** The appendable computing the page digest, or {@code null} if no hash function was provided. */
    protected final DigestAppendable digestAppendable;
    /** The processor receiving the page text, or {@code null}. */
    protected final Parser.TextProcessor<T> textProcessor;
    /** The resolved target of the {@code Location} HTTP header found by the last call to {@code parse}, if any. */
    protected URI location;
    /** The resolved target of a META {@code http-equiv="location"} element found by the last call to {@code parse}, if any. */
    protected URI metaLocation;
    /** If true, the host of the page URI is not mixed into the digest (see the digest initialization in {@code parse}). */
    protected boolean crossAuthorityDuplicates;
    /** Pattern locating the start of a META element in raw bytes (assigned in the static initializer). */
    protected static final TextPattern META_PATTERN;
    /** Regular expression recognizing a {@code http-equiv="content-type"} attribute inside a META element. */
    protected static final Pattern HTTP_EQUIV_PATTERN;
    /** Regular expression capturing (group 2) the quoted value of a {@code content} attribute inside a META element. */
    protected static final Pattern CONTENT_PATTERN;
    /** Regular expression capturing (group 1) the value of a {@code charset=} specification. */
    protected static final Pattern CHARSET_PATTERN;

    /* loaded from: input_file:WEB-INF/lib/bubing-0.9.11.jar:it/unimi/di/law/bubing/parser/HTMLParser$DigestAppendable.class */
    public static final class DigestAppendable implements Appendable {
        private static final boolean DEBUG = false;
        private PrintStream debugStream;
        private File debugFile;
        protected static final Reference2ObjectOpenHashMap<String, byte[]> startTags;
        protected static final Reference2ObjectOpenHashMap<String, byte[]> endTags;
        protected final HashFunction hashFunction;
        protected Hasher hasher;
        protected boolean lastAppendedWasSpace;
        protected byte[] digest;

        public DigestAppendable(HashFunction hashFunction) {
            this.hashFunction = hashFunction;
        }

        public void init(URI uri) {
            this.hasher = this.hashFunction.newHasher();
            this.digest = null;
            if (uri != null) {
                this.hasher.putUnencodedChars((CharSequence) uri.getHost());
                this.hasher.putByte((byte) 0);
            }
            this.lastAppendedWasSpace = false;
        }

        @Override // java.lang.Appendable
        public Appendable append(CharSequence charSequence, int i, int i2) {
            for (int i3 = i; i3 < i2; i3++) {
                append(charSequence.charAt(i3));
            }
            return this;
        }

        @Override // java.lang.Appendable
        public Appendable append(char c) {
            if (!Character.isWhitespace(c) && !Character.isDigit(c)) {
                this.hasher.putChar(c);
                this.lastAppendedWasSpace = false;
            } else if (!this.lastAppendedWasSpace) {
                this.hasher.putChar(' ');
                this.lastAppendedWasSpace = true;
            }
            return this;
        }

        @Override // java.lang.Appendable
        public Appendable append(CharSequence charSequence) {
            return append(charSequence, 0, charSequence.length());
        }

        /* JADX INFO: Access modifiers changed from: private */
        public void append(byte[] bArr) {
            this.hasher.putBytes(bArr);
        }

        public byte[] digest() {
            if (this.digest == null) {
                this.digest = this.hasher.hash().asBytes();
            }
            return this.digest;
        }

        public void startTag(StartTag startTag) {
            String attributeValue;
            String name = startTag.getName();
            append(startTags.get(name));
            if ((name == "iframe" || name == "frame") && (attributeValue = startTag.getAttributeValue("src")) != null) {
                append('\"');
                append(attributeValue);
                append('\"');
            }
            this.lastAppendedWasSpace = false;
        }

        public void endTag(EndTag endTag) {
            append(endTags.get(endTag.getName()));
            this.lastAppendedWasSpace = false;
        }

        static {
            List<String> elementNames = HTMLElements.getElementNames();
            startTags = new Reference2ObjectOpenHashMap<>(elementNames.size());
            endTags = new Reference2ObjectOpenHashMap<>(elementNames.size());
            startTags.defaultReturnValue(Util.toByteArray(Origin.UNKNOWN_FILE_PATH));
            endTags.defaultReturnValue(Util.toByteArray("</unknown>"));
            for (String str : elementNames) {
                startTags.put(str, Util.toByteArray("<" + str + ">"));
                endTags.put(str, Util.toByteArray("</" + str + ">"));
            }
        }
    }

    /* loaded from: input_file:WEB-INF/lib/bubing-0.9.11.jar:it/unimi/di/law/bubing/parser/HTMLParser$SetLinkReceiver.class */
    /**
     * A link receiver that accumulates every URI it is notified of, whatever its
     * origin (header location, META location, META refresh or ordinary link),
     * into a single set exposed as {@link #urls}.
     */
    public static final class SetLinkReceiver implements Parser.LinkReceiver {
        /** The URIs gathered since the last call to {@link #init(URI)}. */
        public final Set<URI> urls = new ObjectLinkedOpenHashSet<>();

        /** Stores a URI in the set; all notification methods funnel through here. */
        private void collect(URI uri) {
            this.urls.add(uri);
        }

        /** Records the target of a {@code Location} header. */
        @Override // it.unimi.di.law.bubing.parser.Parser.LinkReceiver
        public void location(URI uri) {
            collect(uri);
        }

        /** Records the target of a META location element. */
        @Override // it.unimi.di.law.bubing.parser.Parser.LinkReceiver
        public void metaLocation(URI uri) {
            collect(uri);
        }

        /** Records the target of a META refresh element. */
        @Override // it.unimi.di.law.bubing.parser.Parser.LinkReceiver
        public void metaRefresh(URI uri) {
            collect(uri);
        }

        /** Records an ordinary link. */
        @Override // it.unimi.di.law.bubing.parser.Parser.LinkReceiver
        public void link(URI uri) {
            collect(uri);
        }

        /** Empties the set in preparation for a new page; the page URI itself is not used. */
        @Override // it.unimi.di.law.bubing.parser.Parser.LinkReceiver
        public void init(URI uri) {
            this.urls.clear();
        }

        /** Returns an iterator over the gathered URIs. */
        @Override // java.lang.Iterable
        public Iterator<URI> iterator() {
            return this.urls.iterator();
        }

        /** Returns the number of distinct URIs gathered so far. */
        @Override // it.unimi.di.law.bubing.parser.Parser.LinkReceiver
        public int size() {
            return this.urls.size();
        }
    }

    /**
     * Creates a parser computing digests with the given hash function, with no
     * text processor and with the host mixed into the digest.
     *
     * @param hashFunction the hash function used for digests, or {@code null} for no digest.
     */
    public HTMLParser(HashFunction hashFunction) {
        this(hashFunction, false);
    }

    /**
     * Creates a parser; every other constructor eventually delegates here.
     *
     * @param hashFunction the hash function used for digests, or {@code null} for no digest.
     * @param textProcessor the processor receiving the page text, or {@code null}.
     * @param z if true, the page host is not mixed into the digest.
     * @param i the size of the character buffer, or 0 to allocate no buffer.
     */
    public HTMLParser(HashFunction hashFunction, Parser.TextProcessor<T> textProcessor, boolean z, int i) {
        if (i == 0) {
            this.buffer = null;
        } else {
            this.buffer = new char[i];
        }
        if (hashFunction == null) {
            this.digestAppendable = null;
        } else {
            this.digestAppendable = new DigestAppendable(hashFunction);
        }
        this.textProcessor = textProcessor;
        this.crossAuthorityDuplicates = z;
    }

    /**
     * Creates a parser with no text processor and the default buffer size.
     *
     * @param hashFunction the hash function used for digests, or {@code null} for no digest.
     * @param z if true, the page host is not mixed into the digest.
     */
    public HTMLParser(HashFunction hashFunction, boolean z) {
        this(hashFunction, null, z, CHAR_BUFFER_SIZE);
    }

    /**
     * Creates a parser with the default buffer size.
     *
     * @param hashFunction the hash function used for digests, or {@code null} for no digest.
     * @param textProcessor the processor receiving the page text, or {@code null}.
     * @param z if true, the page host is not mixed into the digest.
     */
    public HTMLParser(HashFunction hashFunction, Parser.TextProcessor<T> textProcessor, boolean z) {
        this(hashFunction, textProcessor, z, CHAR_BUFFER_SIZE);
    }

    /**
     * Creates a parser whose hash function is obtained by name through {@code BinaryParser.forName}.
     *
     * @param str the name of the hash function.
     */
    public HTMLParser(String str) throws NoSuchAlgorithmException {
        this(BinaryParser.forName(str));
    }

    /**
     * String-based variant taking the hash function name and the cross-authority flag.
     *
     * @param str the name of the hash function.
     * @param str2 the cross-authority-duplicates flag, parsed with {@code Util.parseBoolean}.
     */
    public HTMLParser(String str, String str2) throws NoSuchAlgorithmException {
        this(BinaryParser.forName(str), Util.parseBoolean(str2));
    }

    /**
     * String-based variant that also instantiates a text processor from a specification.
     *
     * @param str the name of the hash function.
     * @param str2 the text processor specification, resolved with {@code ObjectParser.fromSpec}.
     * @param str3 the cross-authority-duplicates flag, parsed with {@code Util.parseBoolean}.
     */
    public HTMLParser(String str, String str2, String str3) throws NoSuchAlgorithmException, IllegalArgumentException, ClassNotFoundException, IllegalAccessException, InvocationTargetException, InstantiationException, NoSuchMethodException, IOException {
        this(BinaryParser.forName(str), (Parser.TextProcessor) ObjectParser.fromSpec(str2), Util.parseBoolean(str3));
    }

    /** Creates a parser that computes no digest and has no text processor, with the default buffer size. */
    public HTMLParser() {
        this(null, null, false, CHAR_BUFFER_SIZE);
    }

    /**
     * Parses a URL found in the page and, if it is valid, passes it to the link
     * receiver resolved against the given base.
     *
     * @param linkReceiver the receiver to notify.
     * @param uri the base URI against which the link is resolved.
     * @param str the raw attribute value, possibly {@code null} (in which case nothing happens).
     */
    protected void process(Parser.LinkReceiver linkReceiver, URI uri, String str) {
        if (str == null) {
            return;
        }
        final URI target = BURL.parse(str);
        if (target != null) {
            linkReceiver.link(uri.resolve(target));
        }
    }

    /**
     * Parses an HTTP response as HTML: guesses the charset, reports the links it
     * finds, feeds the page text to the text processor (if any) and computes the
     * page digest (if a hash function was provided).
     *
     * <p>The charset defaults to ISO-8859-1 and is overridden, in increasing order
     * of priority, by the {@code Content-Type} header, and then by either the
     * guessed-charset WARC header (when the response is a {@link WarcRecord}
     * carrying it) or a META element found in the inspectable prefix of the
     * content. Illegal or unsupported charset names fall back to ISO-8859-1 for
     * decoding, although {@link #guessedCharset()} still reports the name found.
     *
     * <p>Text inside {@code script} and {@code style} elements is skipped.
     *
     * @param uri the URI of the page, used as initial base for relative links.
     * @param httpResponse the response to parse; its entity must not be {@code null}.
     * @param linkReceiver the receiver notified of links, or {@code null} to ignore them.
     * @return the digest of the page, or {@code null} if this parser has no hash function.
     * @throws IOException if reading the content fails.
     */
    @Override // it.unimi.di.law.bubing.parser.Parser
    public byte[] parse(URI uri, HttpResponse httpResponse, Parser.LinkReceiver linkReceiver) throws IOException {
        this.guessedCharset = "ISO-8859-1";
        final HttpEntity entity = httpResponse.getEntity();

        // First source: the Content-Type HTTP header.
        final Header contentType = entity.getContentType();
        if (contentType != null) {
            final String headerCharset = getCharsetNameFromHeader(contentType.getValue());
            if (headerCharset != null) {
                this.guessedCharset = headerCharset;
            }
        }

        // Second source: a previously stored guess (WARC header) or, failing that,
        // a META element in the part of the content that can be inspected.
        final InputStream content = entity.getContent();
        final Header storedGuess = httpResponse instanceof WarcRecord
                ? ((WarcRecord) httpResponse).getWarcHeader(WarcHeader.Name.BUBING_GUESSED_CHARSET)
                : null;
        if (storedGuess != null) {
            this.guessedCharset = storedGuess.getValue();
        } else if (content instanceof InspectableFileCachedInputStream) {
            final InspectableFileCachedInputStream inspectableContent = (InspectableFileCachedInputStream) content;
            final String metaCharset = getCharsetName(inspectableContent.buffer, inspectableContent.inspectable);
            if (metaCharset != null) {
                this.guessedCharset = metaCharset;
            }
        }
        LOGGER.debug("Guessing charset \"{}\" for URL {}", this.guessedCharset, uri);

        Charset charset = Charsets.ISO_8859_1;
        try {
            charset = Charset.forName(this.guessedCharset);
        } catch (IllegalCharsetNameException e) {
            // Deliberately best-effort: keep decoding with the ISO-8859-1 fallback.
            LOGGER.debug("Response for {} contained an illegal charset name: \"{}\"", uri, this.guessedCharset);
        } catch (UnsupportedCharsetException e) {
            LOGGER.debug("Response for {} contained an unsupported charset: \"{}\"", uri, this.guessedCharset);
        }

        // The receiver is optional: the tag loop below already tolerates null,
        // so the set-up code must do the same instead of failing here.
        if (linkReceiver != null) {
            linkReceiver.init(uri);
        }
        if (this.textProcessor != null) {
            this.textProcessor.init(uri);
        }

        this.location = null;
        this.metaLocation = null;
        final Header locationHeader = httpResponse.getFirstHeader("Location");
        if (locationHeader != null) {
            final URI headerTarget = BURL.parse(locationHeader.getValue());
            if (headerTarget != null) {
                if (!headerTarget.isAbsolute()) {
                    LOGGER.debug("Found relative header location URL: \"{}\"", headerTarget);
                }
                this.location = uri.resolve(headerTarget);
                if (linkReceiver != null) {
                    linkReceiver.location(this.location);
                }
            }
        }

        final StreamedSource streamedSource = new StreamedSource(new InputStreamReader(content, charset));
        if (this.buffer != null) {
            streamedSource.setBuffer(this.buffer);
        }
        if (this.digestAppendable != null) {
            this.digestAppendable.init(this.crossAuthorityDuplicates ? null : uri);
        }

        URI base = uri;
        int lastSegmentEnd = 0;
        // Nesting depth of script/style elements; text is only processed at depth zero.
        int specialTextDepth = 0;
        for (final Segment segment : streamedSource) {
            // Skip segments entirely contained in something already processed.
            if (segment.getEnd() <= lastSegmentEnd) {
                continue;
            }
            lastSegmentEnd = segment.getEnd();

            if (segment instanceof StartTag) {
                final StartTag startTag = (StartTag) segment;
                if (startTag.getTagType() != StartTagType.NORMAL) {
                    continue;
                }
                final String name = startTag.getName();
                if ((HTMLElementName.STYLE.equals(name) || HTMLElementName.SCRIPT.equals(name)) && !startTag.isSyntacticalEmptyElementTag()) {
                    specialTextDepth++;
                }
                if (this.digestAppendable != null) {
                    this.digestAppendable.startTag(startTag);
                }
                if (linkReceiver == null) {
                    continue;
                }
                if (HTMLElementName.IFRAME.equals(name) || HTMLElementName.FRAME.equals(name) || HTMLElementName.EMBED.equals(name)
                        || HTMLElementName.IMG.equals(name) || HTMLElementName.SCRIPT.equals(name)) {
                    process(linkReceiver, base, startTag.getAttributeValue("src"));
                } else if (HTMLElementName.OBJECT.equals(name)) {
                    process(linkReceiver, base, startTag.getAttributeValue("data"));
                } else if (HTMLElementName.A.equals(name) || HTMLElementName.AREA.equals(name) || HTMLElementName.LINK.equals(name)) {
                    process(linkReceiver, base, startTag.getAttributeValue("href"));
                } else if (HTMLElementName.BASE.equals(name)) {
                    final String href = startTag.getAttributeValue("href");
                    final URI newBase = href == null ? null : BURL.parse(href);
                    if (newBase != null) {
                        if (newBase.isAbsolute()) {
                            base = newBase;
                        } else {
                            LOGGER.debug("Found relative BASE URL: \"{}\"", newBase);
                        }
                    }
                } else if (HTMLElementName.META.equals(name)) {
                    processMetaTag(linkReceiver, base, startTag);
                }
            } else if (segment instanceof EndTag) {
                final EndTag endTag = (EndTag) segment;
                final String name = endTag.getName();
                if (HTMLElementName.STYLE.equals(name) || HTMLElementName.SCRIPT.equals(name)) {
                    // Never go below zero on unbalanced end tags.
                    specialTextDepth = Math.max(0, specialTextDepth - 1);
                }
                if (this.digestAppendable != null && endTag.getTagType() == EndTagType.NORMAL) {
                    this.digestAppendable.endTag(endTag);
                }
            } else if (specialTextDepth == 0) {
                if (this.textProcessor != null) {
                    if (segment instanceof CharacterReference) {
                        ((CharacterReference) segment).appendCharTo(this.textProcessor);
                    } else {
                        this.textProcessor.append(segment);
                    }
                }
                if (this.digestAppendable != null) {
                    if (segment instanceof CharacterReference) {
                        ((CharacterReference) segment).appendCharTo(this.digestAppendable);
                    } else {
                        this.digestAppendable.append(segment);
                    }
                }
            }
        }

        if (this.digestAppendable == null) {
            return null;
        }
        // For redirects the targets are part of the digest, so that pages with the
        // same body but redirecting to different places are not considered equal.
        if (httpResponse.getStatusLine().getStatusCode() / 100 == 3) {
            this.digestAppendable.append((char) 0);
            if (this.location != null) {
                this.digestAppendable.append(BURL.toByteArray(this.location));
            }
            this.digestAppendable.append((char) 0);
            if (this.metaLocation != null) {
                this.digestAppendable.append(BURL.toByteArray(this.metaLocation));
            }
            this.digestAppendable.append((char) 0);
        }
        return this.digestAppendable.digest();
    }

    /**
     * Handles a META element carrying {@code http-equiv="refresh"} or
     * {@code http-equiv="location"}, notifying the receiver of the target URL.
     * The {@code http-equiv} value is matched case-insensitively.
     */
    private void processMetaTag(Parser.LinkReceiver linkReceiver, URI base, StartTag startTag) {
        final String equiv = startTag.getAttributeValue("http-equiv");
        final String content = startTag.getAttributeValue("content");
        if (equiv == null || content == null) {
            return;
        }
        // e.g. http-equiv="refresh" content="0;URL=http://example.com/"
        if (equiv.equalsIgnoreCase("refresh")) {
            final int pos = URLEQUAL_PATTERN.search(content);
            if (pos != -1) {
                final String urlSpec = content.substring(pos + URLEQUAL_PATTERN.length());
                final URI refreshTarget = BURL.parse(urlSpec);
                if (refreshTarget != null) {
                    if (!refreshTarget.isAbsolute()) {
                        LOGGER.debug("Found relative META refresh URL: \"{}\"", urlSpec);
                    }
                    linkReceiver.metaRefresh(base.resolve(refreshTarget));
                }
            }
        }
        // e.g. http-equiv="location" content="http://example.com/"
        if (equiv.equalsIgnoreCase("location")) {
            final URI locationTarget = BURL.parse(content);
            if (locationTarget != null) {
                if (!locationTarget.isAbsolute()) {
                    LOGGER.debug("Found relative META location URL: \"{}\"", content);
                }
                this.metaLocation = base.resolve(locationTarget);
                linkReceiver.metaLocation(this.metaLocation);
            }
        }
    }

    /**
     * Returns the charset name guessed during the last call to {@code parse}.
     *
     * @return the value of the {@code guessedCharset} field, which {@code parse} sets
     *         at its start and then possibly overrides.
     */
    @Override // it.unimi.di.law.bubing.parser.Parser
    public String guessedCharset() {
        return this.guessedCharset;
    }

    /**
     * Returns the redirect target found during the last parse, preferring the
     * HTTP header location over the META location.
     *
     * @return the header location if set, otherwise the META location, which may be {@code null}.
     */
    public URI location() {
        return this.location != null ? this.location : this.metaLocation;
    }

    /**
     * Scans the first {@code i} bytes of {@code bArr} for a META element declaring
     * {@code http-equiv="content-type"} and extracts the charset from its
     * {@code content} attribute.
     *
     * @param bArr the raw bytes of (a prefix of) the page.
     * @param i the number of valid bytes in {@code bArr}.
     * @return the charset specification of the first matching META element (which may
     *         itself be {@code null} if it contains no charset), or {@code null} if
     *         no complete matching element is found.
     */
    public static String getCharsetName(byte[] bArr, int i) {
        final int metaLength = META_PATTERN.length();
        int from = 0;
        int metaStart;
        while ((metaStart = META_PATTERN.search(bArr, from, i)) != -1) {
            // Locate the closing '>' of this element.
            int tagEnd = metaStart;
            while (tagEnd < i && bArr[tagEnd] != '>') {
                tagEnd++;
            }
            if (tagEnd == i) {
                // The element is truncated: nothing more can be found.
                return null;
            }
            final ByteArrayCharSequence attributes = new ByteArrayCharSequence(bArr, metaStart + metaLength, tagEnd - metaStart - metaLength);
            if (HTTP_EQUIV_PATTERN.matcher(attributes).matches()) {
                final Matcher contentMatcher = CONTENT_PATTERN.matcher(attributes);
                if (contentMatcher.matches()) {
                    return getCharsetNameFromHeader(contentMatcher.group(2));
                }
            }
            from = tagEnd + 1;
        }
        return null;
    }

    /**
     * Extracts the charset name from a header-like value such as
     * {@code text/html; charset=UTF-8}. A leading and/or trailing quote
     * (single or double) around the name is removed.
     *
     * @param str the header value; {@code null} is accepted and yields {@code null}.
     * @return the charset name, or {@code null} if {@code str} is {@code null},
     *         contains no charset specification, or the name is empty once the
     *         quotes are removed.
     */
    public static String getCharsetNameFromHeader(String str) {
        if (str == null) {
            // A header may carry no value at all; there is simply nothing to extract.
            return null;
        }
        final Matcher matcher = CHARSET_PATTERN.matcher(str);
        if (!matcher.matches()) {
            return null;
        }
        final String spec = matcher.group(1);
        int start = 0;
        int end = spec.length();
        // The two quotes are stripped independently, so unbalanced quoting is tolerated.
        if (end > 0 && (spec.charAt(0) == '"' || spec.charAt(0) == '\'')) {
            start = 1;
        }
        if (end > 0 && (spec.charAt(end - 1) == '"' || spec.charAt(end - 1) == '\'')) {
            end--;
        }
        return start < end ? spec.substring(start, end) : null;
    }

    /**
     * Tells whether this parser applies to the given response, that is, whether
     * its {@code Content-Type} header value starts with {@code text/}.
     *
     * @param uRIResponse the URI/response pair to test.
     * @return true if the response has a content type starting with {@code text/}.
     */
    @Override // com.google.common.base.Predicate
    public boolean apply(URIResponse uRIResponse) {
        final Header contentType = uRIResponse.response().getEntity().getContentType();
        if (contentType == null) {
            return false;
        }
        return contentType.getValue().startsWith("text/");
    }

    /* renamed from: clone, reason: merged with bridge method [inline-methods] */
    /**
     * Creates a new parser configured like this one: same hash function, a copy of
     * the text processor (if any), same cross-authority setting and same buffer size.
     *
     * @return an independent parser with the same configuration.
     */
    public HTMLParser<T> m3617clone() {
        // A parser created with buffer size 0 has no buffer: propagate that choice
        // instead of dereferencing a null array.
        final int bufferSize = this.buffer == null ? 0 : this.buffer.length;
        return new HTMLParser<>(
                this.digestAppendable == null ? null : this.digestAppendable.hashFunction,
                this.textProcessor == null ? null : this.textProcessor.copy(),
                this.crossAuthorityDuplicates,
                bufferSize);
    }

    /**
     * Returns a copy of this parser; delegates to {@code m3617clone()}.
     *
     * @return a new parser created by {@code m3617clone()}.
     */
    @Override // it.unimi.dsi.lang.FlyweightPrototype
    public HTMLParser<T> copy() {
        return m3617clone();
    }

    /**
     * Returns the result produced by the text processor.
     *
     * @return the text processor's result, or {@code null} if this parser has no text processor.
     */
    @Override // it.unimi.di.law.bubing.parser.Parser
    public T result() {
        return this.textProcessor != null ? this.textProcessor.result() : null;
    }

    /**
     * Command-line entry point: prints the digest and the links of a page, which
     * is either downloaded from the given URL (without following redirects) or
     * read from a local file.
     *
     * @param strArr the command-line arguments (see the JSAP parameter list below).
     */
    public static void main(String[] strArr) throws IllegalArgumentException, IOException, URISyntaxException, JSAPException, NoSuchAlgorithmException {
        final SimpleJSAP simpleJSAP = new SimpleJSAP(HTMLParser.class.getName(), "Produce the digest of a page: the page is downloaded or passed as argument by specifying a file", new Parameter[]{
            new UnflaggedOption("url", JSAP.STRING_PARSER, true, "The url of the page."),
            new Switch("crossAuthorityDuplicates", 'c', "cross-authority-duplicates"),
            new FlaggedOption("charBufferSize", JSAP.INTSIZE_PARSER, Integer.toString(CHAR_BUFFER_SIZE), false, 'b', "buffer", "The size of the parser character buffer (0 for dynamic sizing)."),
            new FlaggedOption("file", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, false, 'f', "file", "The page to be processed."),
            new FlaggedOption("digester", JSAP.STRING_PARSER, MessageDigestAlgorithms.MD5, false, 'd', "digester", "The digester to be used.")
        });
        final JSAPResult jsapResult = simpleJSAP.parse(strArr);
        if (simpleJSAP.messagePrinted()) {
            System.exit(1);
        }

        final String url = jsapResult.getString("url");
        final HTMLParser<Object> parser = new HTMLParser<Object>(BinaryParser.forName(jsapResult.getString("digester")), null, jsapResult.userSpecified("crossAuthorityDuplicates"), jsapResult.getInt("charBufferSize"));
        final SetLinkReceiver linkReceiver = new SetLinkReceiver();

        final byte[] digest;
        if (jsapResult.userSpecified("file")) {
            // Close the file as soon as its content has been read into a string.
            final String page;
            try (InputStream fileStream = new FileInputStream(jsapResult.getString("file"));
                    InputStreamReader reader = new InputStreamReader(fileStream)) {
                page = IOUtils.toString(reader);
            }
            digest = parser.parse(BURL.parse(url), new StringHttpMessages.HttpResponse(page), linkReceiver);
        } else {
            final URI uri = new URI(url);
            final HttpGet httpGet = new HttpGet(uri);
            httpGet.setConfig(RequestConfig.custom().setRedirectsEnabled(false).build());
            digest = parser.parse(uri, HttpClients.createDefault().execute((HttpUriRequest) httpGet), linkReceiver);
        }

        System.out.println("DigestHexString: " + Hex.encodeHexString(digest));
        System.out.println("Links: " + linkReceiver.urls);

        // Sanity check: distinct URIs should also be distinct as strings.
        final ObjectOpenHashSet<String> urlStrings = new ObjectOpenHashSet<String>();
        for (final URI link : linkReceiver.urls) {
            urlStrings.add(link.toString());
        }
        if (urlStrings.size() != linkReceiver.urls.size()) {
            System.out.println("There are " + linkReceiver.urls.size() + " URIs but " + urlStrings.size() + " strings");
        }
    }

    static {
        StartTagType.SERVER_COMMON.deregister();
        StartTagType.SERVER_COMMON_COMMENT.deregister();
        StartTagType.SERVER_COMMON_ESCAPED.deregister();
        URLEQUAL_PATTERN = new TextPattern("URL=", 1);
        META_PATTERN = new TextPattern("<meta", 1);
        HTTP_EQUIV_PATTERN = Pattern.compile(".*http-equiv\\s*=\\s*('|\")?content-type('|\")?.*", 2);
        CONTENT_PATTERN = Pattern.compile(".*content\\s*=\\s*('|\")([^'\"]*)('|\").*", 2);
        CHARSET_PATTERN = Pattern.compile(".*charset\\s*=\\s*(([\\041-\\0176&&[^<>\\{\\}\\\\/:,;@?=]])+|\"[^\"]*\").*", 2);
    }
}
