Commit 7793b1ab authored by Rene Saarsoo's avatar Rene Saarsoo
Browse files

Remove the old JavaScript Lexer.

The class was for long no more in use, but the file was left undeleted.
parent 8af897f3
Loading
Loading
Loading
Loading

lib/jsduck/lexer.rb

deleted100644 → 0
+0 −251
Original line number Diff line number Diff line
require 'strscan'

module JsDuck

  # Tokenizes JavaScript code into lexical tokens.
  #
  # Each token has a type and value.
  # Types and possible values for them are as follows:
  #
  # - :number      -- 25
  # - :string      -- "Hello world"
  # - :ident       -- "foo"
  # - :regex       -- "/abc/i"
  # - :operator    -- "+"
  # - :doc_comment -- "/** My comment */"
  #
  # Plus a separate types for all keywords: :if, :while, :function, ...
  # For keywords the type and value are the same.
  #
  # Notice that doc-comments are recognized as tokens while normal
  # comments are ignored just as whitespace.
  #
  class Lexer
    # Input can be either a String or StringScanner.
    #
    # In the latter case we ensure that only #next will advance the
    # scanpointer of StringScanner - this allows context-switching
    # while parsing some string.  Specifically we need this feature to
    # parse some JavaScript inside doc-comments.
    def initialize(input)
      @input = input.is_a?(StringScanner) ? input : StringScanner.new(input)
      @buffer = []
    end

    # Tests if given pattern matches the tokens that follow at current
    # position.
    #
    # Takes list of strings and symbols.  Symbols are compared to
    # token type, while strings to token value.  For example:
    #
    #     look(:ident, "=", :regex)
    #
    def look(*tokens)
      buffer_tokens(tokens.length)
      i = 0
      tokens.all? do |t|
        tok = @buffer[i]
        i += 1
        if !tok
          false
        elsif t.instance_of?(Symbol)
          tok[:type] == t
        else
          tok[:value] == t
        end
      end
    end

    # Returns the value of next token, moving the current token cursor
    # also to next token.
    #
    # When full=true, returns full token as hash like so:
    #
    #     {:type => :ident, :value => "foo"}
    #
    # For doc-comments the full token also contains the field :linenr,
    # pointing to the line where the doc-comment began.
    #
    def next(full=false)
      buffer_tokens(1)
      tok = @buffer.shift
      # advance the scanpointer to the position after this token
      @input.pos = tok[:pos]
      full ? tok : tok[:value]
    end

    # True when no more tokens.
    def empty?
      buffer_tokens(1)
      return !@buffer.first
    end

    # Ensures next n tokens are read in buffer
    #
    # At the end of buffering the initial position scanpointer is
    # restored.  Only the #next method will advance the scanpointer in
    # a way that's visible outside this class.
    def buffer_tokens(n)
      prev_pos = @input.pos
      @input.pos = @buffer.last[:pos] if @buffer.last
      (n - @buffer.length).times do
        @previous_token = tok = next_token
        if tok
          # remember scanpointer position after each token
          tok[:pos] = @input.pos
          @buffer << tok
        end
      end
      @input.pos = prev_pos
    end

    # Parses out next token from input stream.
    #
    # For efficency we look for tokens in order of frequency in
    # JavaScript source code:
    #
    # - first check for most common operators.
    # - then for identifiers and keywords.
    # - then strings
    # - then comments
    #
    # The remaining token types are less frequent, so these are left
    # to the end.
    #
    def next_token
      while !@input.eos? do
        skip_white
        if @input.check(/[.(),;={}:]/)
          return {
            :type => :operator,
            :value => @input.scan(/./)
          }
        elsif @input.check(/[a-zA-Z_$]/)
          value = @input.scan(/[$\w]+/)
          kw = KEYWORDS[value]
          return {
            :type => kw || :ident,
            :value => kw || value
          }
        elsif @input.check(/'/)
          return {
            :type => :string,
            :value => @input.scan(/'([^'\\]|\\.)*('|\Z)/m).gsub(/\A'|'\Z/m, "")
          }
        elsif @input.check(/"/)
          return {
            :type => :string,
            :value => @input.scan(/"([^"\\]|\\.)*("|\Z)/m).gsub(/\A"|"\Z/m, "")
          }
        elsif @input.check(/\//)
          # Several things begin with dash:
          # - comments, regexes, division-operators
          if @input.check(/\/\*\*[^\/]/)
            return {
              :type => :doc_comment,
              # Calculate current line number, starting with 1
              :linenr => @input.string[0...@input.pos].count("\n") + 1,
              :value => @input.scan_until(/\*\/|\Z/).sub(/\A\/\*\*/, "").sub(/\*\/\Z/, "")
            }
          elsif @input.check(/\/\*/)
            # skip multiline comment
            @input.scan_until(/\*\/|\Z/)
          elsif @input.check(/\/\//)
            # skip line comment
            @input.scan_until(/\n|\Z/)
          elsif regex?
            return {
              :type => :regex,
              :value => @input.scan(META_REGEX)
            }
          else
            return {
              :type => :operator,
              :value => @input.scan(/\//)
            }
          end
        elsif @input.check(/[0-9]+/)
          nr = @input.scan(/[0-9]+(\.[0-9]*)?/)
          return {
            :type => :number,
            :value => nr
          }
        elsif  @input.check(/./)
          return {
            :type => :operator,
            :value => @input.scan(/./)
          }
        end
      end
    end

    # A slash "/" is a division operator if it follows:
    # - identifier
    # - the "this" keyword
    # - number
    # - closing bracket )
    # - closing square-bracket ]
    # Otherwise it's a beginning of regex
    def regex?
      if @previous_token
        type = @previous_token[:type]
        value = @previous_token[:value]
        if type == :ident || type == :number
          return false
        elsif type == :this
          return false
        elsif type == :operator && (value == ")" || value == "]")
          return false
        end
      end
      return true
    end

    def skip_white
      @input.scan(/\s+/)
    end

    # A regex to match a regex
    META_REGEX = %r{
      /               (?# beginning    )
      (
        [^/\[\\]      (?# any character except \ / [    )
        |
        \\.           (?# an escaping \ followed by any character    )
        |
        \[ ([^\]\\]|\\.)* \]    (?# [...] containing any characters including /    )
                                (?# except \ ] which have to be escaped    )
      )*
      (/[gim]*|\Z)   (?# ending + modifiers    )
    }x

    KEYWORDS = {
      "break" => :break,
      "case" => :case,
      "catch" => :catch,
      "continue" => :continue,
      "default" => :default,
      "delete" => :delete,
      "do" => :do,
      "else" => :else,
      "finally" => :finally,
      "for" => :for,
      "function" => :function,
      "if" => :if,
      "in" => :in,
      "instanceof" => :instanceof,
      "new" => :new,
      "return" => :return,
      "switch" => :switch,
      "this" => :this,
      "throw" => :throw,
      "try" => :try,
      "typeof" => :typeof,
      "var" => :var,
      "void" => :void,
      "while" => :while,
      "with" => :with,
    }
  end

end