/*
 * Decompiled with CFR 0.152.
 */
package org.tribuo.util.tokens.universal;

import com.oracle.labs.mlrg.olcut.config.Config;
import com.oracle.labs.mlrg.olcut.config.Configurable;
import com.oracle.labs.mlrg.olcut.provenance.ConfiguredObjectProvenance;
import com.oracle.labs.mlrg.olcut.provenance.impl.ConfiguredObjectProvenanceImpl;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.Queue;
import org.tribuo.util.tokens.Token;
import org.tribuo.util.tokens.Tokenizer;
import org.tribuo.util.tokens.universal.Range;

/**
 * A character-class driven {@link Tokenizer}.
 *
 * <p>The input is scanned one {@code char} at a time by a three-state machine
 * (see {@link State}):
 * <ul>
 *   <li>letters and digits are accumulated into a single token,</li>
 *   <li>characters accepted by {@link #isNgram(char)} are accumulated into a run
 *       that is emitted as single characters and/or overlapping character pairs,
 *       as selected by {@link #setGenerateUnigrams(boolean)} and
 *       {@link #setGenerateNgrams(boolean)},</li>
 *   <li>everything else ends the current token and, when punctuation sending is
 *       enabled, non-whitespace characters are emitted as tokens of their own.</li>
 * </ul>
 *
 * <p>Instances keep mutable per-sequence state (position, buffer, queued tokens).
 * Call {@link #reset(CharSequence)} before the first {@link #advance()}, and use
 * {@link #clone()} to obtain an independent instance.
 */
public class UniversalTokenizer implements Tokenizer {
    /** Accumulated tokens are flushed once they reach this many characters. */
    protected int maxTokenLength = 256;
    /** Set once the end of the current sequence has been processed. */
    private boolean eofReached = false;
    /** Index of the character currently being examined. */
    private int pos;
    /** Index in the sequence of the first character held in {@link #buffer}. */
    private int start;
    private boolean generateUnigrams = true;
    private boolean generateNgrams = true;
    private State state;
    /** The sequence being tokenized; {@code null} until {@link #reset(CharSequence)} is called. */
    private CharSequence cs;
    /** Characters of the token (or n-gram run) currently being accumulated. */
    private char[] buffer;
    private String currToken;
    private Token.TokenType currType;
    private int currPos;
    private int startOffset;
    private int endOffset;
    /** Number of valid characters in {@link #buffer}. */
    private int tokenLength;
    private boolean firstToken;
    /** True once a token has been made current by {@link #advance()}. */
    private boolean ready;
    @Config(description="Send punctuation through as tokens.")
    private boolean sendPunct = false;
    /** Ranges produced by the scanner that have not yet been handed out. */
    private Queue<Range> queuedTokens;
    /** Previously handed-out ranges kept for reuse. */
    private Queue<Range> pool;
    /** The character currently being examined. */
    private char c;

    /**
     * Creates a tokenizer.
     *
     * @param sendPunct if true, non-whitespace characters that end a token are
     *                  emitted as tokens themselves
     */
    public UniversalTokenizer(boolean sendPunct) {
        this.sendPunct = sendPunct;
        this.buffer = new char[this.maxTokenLength];
        this.tokenLength = 0;
        this.state = State.SKIPPING;
        this.queuedTokens = new LinkedList<Range>();
        this.pool = new LinkedList<Range>();
    }

    /** Creates a tokenizer that does not emit punctuation tokens. */
    public UniversalTokenizer() {
        this(false);
    }

    /**
     * Tests whether a character is treated as a letter or digit.
     *
     * <p>ASCII letters and digits are accepted directly. All remaining characters
     * up to U+007F, plus U+00D2 and U+00D3, are rejected. Characters in
     * U+0BCD..U+0BD5 are accepted. Everything else is delegated to
     * {@link Character#isLetterOrDigit(char)}.
     *
     * @param c the character to test
     * @return true if the character counts as a letter or digit
     */
    public static boolean isLetterOrDigit(char c) {
        // Fast path for the most common characters.
        if (c <= 'z' && c >= 'a' || c <= 'Z' && c >= 'A' || c <= '9' && c >= '0') {
            return true;
        }
        // Remaining ASCII (control characters, whitespace, punctuation) and two
        // explicitly excluded Latin-1 code points.
        if (c <= '`' || c == '\u00d2' || c == '\u00d3' || c >= '{' && c <= '\u007f') {
            return false;
        }
        // NOTE(review): this range is decimal 3021..3029; it may have been meant as
        // hexadecimal U+3021..U+3029 - confirm before changing, as it alters output.
        if (c >= '\u0bcd' && c <= '\u0bd5') {
            return true;
        }
        return Character.isLetterOrDigit(c);
    }

    /**
     * Tests whether a character is treated as a digit.
     *
     * <p>ASCII digits are accepted, every other character up to U+00FF is rejected,
     * and characters above that are delegated to {@link Character#isDigit(char)}.
     *
     * @param c the character to test
     * @return true if the character counts as a digit
     */
    public static boolean isDigit(char c) {
        if (c <= '9' && c >= '0') {
            return true;
        }
        if (c <= '\u00ff') {
            return false;
        }
        return Character.isDigit(c);
    }

    /**
     * Tests whether a character is treated as whitespace.
     *
     * <p>Space, U+0009..U+000D and U+0001..U+0004 are accepted, every other
     * character up to U+00FF is rejected, and characters above that are delegated
     * to {@link Character#isWhitespace(char)}.
     *
     * @param c the character to test
     * @return true if the character counts as whitespace
     */
    public static boolean isWhitespace(char c) {
        if (c == ' ' || c <= '\r' && c >= '\t' || c <= '\u0004' && c >= '\u0001') {
            return true;
        }
        if (c <= '\u00ff') {
            return false;
        }
        return Character.isWhitespace(c);
    }

    /**
     * Tests whether a character belongs to a range that is tokenized as
     * character n-grams rather than as delimiter-separated words.
     *
     * <p>Characters above U+3002 up to U+D7FF qualify unless they fall in
     * U+3040..U+30FF (the hiragana and katakana blocks). Outside that span a fixed
     * list of ranges qualifies, for example U+0600..U+06FF and U+0E00..U+0E7F.
     *
     * @param c the character to test
     * @return true if the character should be tokenized as n-grams
     */
    public static boolean isNgram(char c) {
        if (c > '\u3002' && c <= '\ud7ff') {
            return c < '\u3040' || c > '\u30ff';
        }
        return c >= '\u0600' && c <= '\u06ff'
                || c >= '\uf900' && c <= '\ufaff'
                || c >= '\u1100' && c <= '\u11ff'
                || c >= '\ufb50' && c <= '\ufe2f'
                || c >= '\ufe30' && c <= '\ufe4f'
                || c >= '\ufe70' && c <= '\ufeff'
                || c >= '\uff60' && c <= '\uffdf'
                || c >= '\u0e00' && c <= '\u0e7f'
                || c >= '\u0e80' && c <= '\u0eff'
                || c >= '\u0f00' && c <= '\u0fbf'
                || c >= '\u0b80' && c <= '\u0bff'
                || c >= '\u0c00' && c <= '\u0c7f'
                || c >= '\u0c80' && c <= '\u0cff'
                || c >= '\u0d00' && c <= '\u0d7f'
                || c >= '\u10a0' && c <= '\u10ff';
    }

    /**
     * @return true if each character of an n-gram run is emitted as its own token
     */
    public boolean isGenerateUnigrams() {
        return this.generateUnigrams;
    }

    /**
     * @param generateUnigrams whether each character of an n-gram run is emitted
     *                         as its own token
     */
    public void setGenerateUnigrams(boolean generateUnigrams) {
        this.generateUnigrams = generateUnigrams;
    }

    /**
     * @return true if adjacent character pairs of an n-gram run are emitted as tokens
     */
    public boolean isGenerateNgrams() {
        return this.generateNgrams;
    }

    /**
     * @param generateNgrams whether adjacent character pairs of an n-gram run are
     *                       emitted as tokens
     */
    public void setGenerateNgrams(boolean generateNgrams) {
        this.generateNgrams = generateNgrams;
    }

    /**
     * @return the length at which an accumulated token is flushed
     */
    public int getMaxTokenLength() {
        return this.maxTokenLength;
    }

    /**
     * Sets the length at which an accumulated token is flushed. The internal
     * buffer is not resized here; {@link #addChar()} grows it on demand.
     *
     * @param maxTokenLength the new maximum token length
     */
    public void setMaxTokenLength(int maxTokenLength) {
        this.maxTokenLength = maxTokenLength;
    }

    /**
     * @return a provenance object describing this tokenizer's configuration
     */
    public ConfiguredObjectProvenance getProvenance() {
        return new ConfiguredObjectProvenanceImpl((Configurable)this, "Tokenizer");
    }

    /**
     * Moves to the next token.
     *
     * <p>Tokens already queued are served first. Otherwise the input is scanned
     * until at least one token is queued or the end of the sequence is reached,
     * at which point any partially accumulated token is flushed.
     *
     * @return true if a token is now current, false if the sequence is exhausted
     * @throws IllegalStateException if {@link #reset(CharSequence)} has not been called
     */
    @Override
    public final boolean advance() {
        if (this.cs == null) {
            throw new IllegalStateException("UniversalTokenizer has not been reset.");
        }
        if (!this.queuedTokens.isEmpty()) {
            this.handleQueued();
            return true;
        }
        if (this.eofReached) {
            return false;
        }
        while (this.pos < this.cs.length()) {
            this.c = this.cs.charAt(this.pos);
            this.handleChar();
            ++this.pos;
            if (!this.queuedTokens.isEmpty()) {
                this.handleQueued();
                return true;
            }
        }
        // End of input: flush whatever is still sitting in the buffer.
        this.eofReached = true;
        this.makeTokens();
        if (!this.queuedTokens.isEmpty()) {
            this.handleQueued();
            return true;
        }
        return false;
    }

    /**
     * Makes the head of the token queue the current token and returns its
     * {@link Range} to the pool for reuse.
     */
    private void handleQueued() {
        this.ready = true;
        Range range = this.queuedTokens.poll();
        this.currToken = new String(range.buff, 0, range.len);
        this.startOffset = range.start;
        this.endOffset = range.end;
        // The first token handed out after a reset gets an increment of 1 if it
        // would otherwise be 0.
        if (this.firstToken && range.incr == 0) {
            range.incr = 1;
            this.firstToken = false;
        }
        this.currType = range.type;
        this.currPos = range.incr;
        this.pool.offer(range);
    }

    /**
     * Classifies the current character and updates the state machine.
     *
     * <p>The checks run in a fixed order: ASCII letters, the space character,
     * n-gram characters, ignored characters, other whitespace, digits, other
     * letters/digits, and finally everything else (treated as punctuation).
     */
    protected void handleChar() {
        // ASCII letters, checked first as the most common case.
        if (this.c >= 'a' && this.c <= 'z' || this.c >= 'A' && this.c <= 'Z') {
            this.collectLetter();
            return;
        }
        if (this.c == ' ') {
            this.endTokenAndSkip();
            return;
        }
        if (UniversalTokenizer.isNgram(this.c)) {
            if (this.state == State.COLLECTING) {
                this.makeTokens();
            }
            this.state = State.NGRAM;
            this.addChar();
            return;
        }
        // NUL is always ignored; U+000A..U+000D inside an n-gram run are ignored
        // so they do not split the run.
        if (this.c == '\u0000' || this.state == State.NGRAM && this.c >= '\n' && this.c <= '\r') {
            return;
        }
        if (UniversalTokenizer.isWhitespace(this.c)) {
            this.endTokenAndSkip();
            return;
        }
        if (UniversalTokenizer.isDigit(this.c)) {
            if (this.state == State.NGRAM) {
                this.makeTokens();
            }
            this.state = State.COLLECTING;
            this.addChar();
            return;
        }
        if (UniversalTokenizer.isLetterOrDigit(this.c)) {
            this.collectLetter();
            return;
        }
        // Anything else is punctuation.
        this.endTokenAndSkip();
    }

    /**
     * Appends the current character to a word token, flushing a pending n-gram
     * run first. The state is switched to COLLECTING only after the character has
     * been added, so a flush triggered inside {@link #addChar()} still sees the
     * previous state.
     */
    private void collectLetter() {
        if (this.state == State.NGRAM) {
            this.makeTokens();
        }
        this.addChar();
        this.state = State.COLLECTING;
    }

    /**
     * Ends the token in progress (if any), optionally emits the current character
     * as punctuation, and switches to SKIPPING.
     */
    private void endTokenAndSkip() {
        if (this.state != State.SKIPPING) {
            this.makeTokens();
        }
        this.sendPunct();
        this.state = State.SKIPPING;
    }

    /**
     * Queues the current character as a punctuation token when punctuation
     * sending is enabled and the character is not whitespace.
     */
    private void sendPunct() {
        if (this.sendPunct && !UniversalTokenizer.isWhitespace(this.c)) {
            Range r = this.getRange();
            r.punct(this.c, this.pos);
            this.queuedTokens.add(r);
        }
    }

    /**
     * Appends the current character to the buffer, growing the buffer if needed,
     * and flushes the buffer once it reaches {@link #maxTokenLength}.
     */
    protected void addChar() {
        if (this.buffer.length <= this.tokenLength) {
            this.buffer = Arrays.copyOf(this.buffer, this.tokenLength + 32);
        }
        if (this.tokenLength == 0) {
            // First character of a new token: remember where it began.
            this.start = this.pos;
        }
        this.buffer[this.tokenLength++] = this.c;
        if (this.tokenLength >= this.maxTokenLength) {
            this.makeTokens();
        }
    }

    /**
     * @return the start offset of the current token
     * @throws IllegalStateException if no token has been made current
     */
    @Override
    public int getStart() {
        if (this.ready) {
            return this.startOffset;
        }
        throw new IllegalStateException("UniversalTokenizer is not ready.");
    }

    /**
     * @return the end offset of the current token
     * @throws IllegalStateException if no token has been made current
     */
    @Override
    public int getEnd() {
        if (this.ready) {
            return this.endOffset;
        }
        throw new IllegalStateException("UniversalTokenizer is not ready.");
    }

    /**
     * @return the text of the current token
     * @throws IllegalStateException if no token has been made current
     */
    @Override
    public String getText() {
        if (this.ready) {
            return this.currToken;
        }
        throw new IllegalStateException("UniversalTokenizer is not ready.");
    }

    /**
     * @return the type of the current token
     * @throws IllegalStateException if no token has been made current
     */
    @Override
    public Token.TokenType getType() {
        if (this.ready) {
            return this.currType;
        }
        throw new IllegalStateException("UniversalTokenizer is not ready.");
    }

    /**
     * Returns the increment recorded for the most recently delivered token.
     * Unlike the other accessors this does not check that a token is current.
     *
     * @return the increment of the most recently delivered token
     */
    public int getPos() {
        return this.currPos;
    }

    /**
     * Creates an independent tokenizer with the same configuration. The copy
     * has its own buffer, queue and pool, and must be
     * {@link #reset(CharSequence) reset} before use.
     *
     * @return a fresh copy of this tokenizer
     */
    @Override
    public Tokenizer clone() {
        try {
            UniversalTokenizer copy = (UniversalTokenizer)super.clone();
            // Replace every piece of mutable state shared by the shallow copy.
            copy.buffer = new char[this.maxTokenLength];
            copy.tokenLength = 0;
            copy.state = State.SKIPPING;
            copy.pool = new LinkedList<Range>();
            copy.queuedTokens = new LinkedList<Range>();
            copy.currToken = null;
            copy.ready = false;
            copy.cs = null;
            return copy;
        }
        catch (CloneNotSupportedException e) {
            // Keep the original exception as the cause so the failure is diagnosable.
            throw new AssertionError("UniversalTokenizer is Cloneable, but clone call failed", e);
        }
    }

    /**
     * Starts tokenizing a new sequence, discarding all state from the previous one.
     *
     * @param cs the sequence to tokenize
     */
    @Override
    public void reset(CharSequence cs) {
        this.cs = cs;
        this.pos = 0;
        this.tokenLength = 0;
        this.start = -1;
        this.state = State.SKIPPING;
        this.eofReached = false;
        this.firstToken = true;
        this.c = '\u0000';
        this.startOffset = -1;
        this.endOffset = -1;
        this.currToken = null;
        this.ready = false;
        // Tokens still queued belong to the previous sequence; without this the
        // next advance() would hand them out as if they came from the new one.
        if (!this.queuedTokens.isEmpty()) {
            this.pool.addAll(this.queuedTokens);
            this.queuedTokens.clear();
        }
    }

    /**
     * @return a pooled {@link Range} if one is available, otherwise a new one
     */
    private Range getRange() {
        if (this.pool.isEmpty()) {
            return new Range();
        }
        return this.pool.remove();
    }

    /**
     * Converts the buffered characters into queued tokens and empties the buffer.
     *
     * <p>In the NGRAM state a single buffered character yields one token; a longer
     * run yields, per position, a one-character token (if unigrams are enabled)
     * and a two-character token with its successor (if n-grams are enabled). In
     * any other state the whole buffer becomes one token.
     */
    protected void makeTokens() {
        if (this.tokenLength <= 0) {
            return;
        }
        if (this.state == State.NGRAM) {
            if (this.tokenLength == 1) {
                // A lone n-gram character is always emitted, whatever the flags say.
                Range single = this.getRange();
                single.set(this.buffer[0], this.start);
                this.queuedTokens.add(single);
                this.tokenLength = 0;
                return;
            }
            int last = this.tokenLength - 1;
            for (int i = 0; i < this.tokenLength; ++i) {
                if (this.generateUnigrams) {
                    Range unigram = this.getRange();
                    unigram.set(this.buffer[i], this.start + i);
                    this.queuedTokens.add(unigram);
                }
                if (this.generateNgrams && i < last) {
                    Range bigram = this.getRange();
                    bigram.set(this.buffer[i], this.buffer[i + 1], this.start + i);
                    this.queuedTokens.add(bigram);
                }
            }
        } else {
            Range word = this.getRange();
            word.set(this.buffer, this.tokenLength, this.start);
            this.queuedTokens.add(word);
        }
        this.tokenLength = 0;
    }

    /** Scanner states. */
    private static enum State {
        /** Not inside any token. */
        SKIPPING,
        /** Accumulating a letter/digit token. */
        COLLECTING,
        /** Accumulating a run of n-gram characters. */
        NGRAM;

    }
}

