Remove the old JavaScript Lexer. (7793b1ab) · Commits · extjs / jsduck

lib/jsduck/lexer.rb

deleted100644 → 0

+0 −251

Original line number	Diff line number	Diff line
		require 'strscan'

		module JsDuck

		# Tokenizes JavaScript code into lexical tokens.
		#
		# Each token has a type and value.
		# Types and possible values for them are as follows:
		#
		# - :number -- 25
		# - :string -- "Hello world"
		# - :ident -- "foo"
		# - :regex -- "/abc/i"
		# - :operator -- "+"
		# - :doc_comment -- "/** My comment */"
		#
		# Plus a separate type for each keyword: :if, :while, :function, ...
		# For keywords the type and value are the same.
		#
		# Notice that doc-comments are recognized as tokens while normal
		# comments are ignored just as whitespace.
		#
		class Lexer
		  # A regex to match a JavaScript regex literal, including trailing
		  # modifiers.  An unterminated literal matches up to end of input.
		  META_REGEX = %r{
		    /                        (?# beginning )
		    (
		      [^/\[\\]               (?# any character except \ / [ )
		      |
		      \\.                    (?# an escaping \ followed by any character )
		      |
		      \[ ([^\]\\]|\\.)* \]   (?# [...] containing any characters including / )
		                             (?# except \ ] which have to be escaped )
		    )*
		    (/[gim]*|\Z)             (?# ending + modifiers )
		  }x.freeze

		  # Maps keyword text to the symbol used as both type and value of
		  # the keyword token.
		  KEYWORDS = {
		    "break" => :break,
		    "case" => :case,
		    "catch" => :catch,
		    "continue" => :continue,
		    "default" => :default,
		    "delete" => :delete,
		    "do" => :do,
		    "else" => :else,
		    "finally" => :finally,
		    "for" => :for,
		    "function" => :function,
		    "if" => :if,
		    "in" => :in,
		    "instanceof" => :instanceof,
		    "new" => :new,
		    "return" => :return,
		    "switch" => :switch,
		    "this" => :this,
		    "throw" => :throw,
		    "try" => :try,
		    "typeof" => :typeof,
		    "var" => :var,
		    "void" => :void,
		    "while" => :while,
		    "with" => :with,
		  }.freeze

		  # Input can be either a String or StringScanner.
		  #
		  # In the latter case we ensure that only #next will advance the
		  # scanpointer of StringScanner - this allows context-switching
		  # while parsing some string. Specifically we need this feature to
		  # parse some JavaScript inside doc-comments.
		  def initialize(input)
		    @input = input.is_a?(StringScanner) ? input : StringScanner.new(input)
		    # Tokens that have been read ahead but not yet consumed by #next.
		    @buffer = []
		    # Last token produced by the scanner; consulted by #regex?.
		    @previous_token = nil
		  end

		  # Tests if given pattern matches the tokens that follow at current
		  # position.
		  #
		  # Takes list of strings and symbols.  Symbols are compared to
		  # token type, while strings to token value.  For example:
		  #
		  #     look(:ident, "=", :regex)
		  #
		  # Returns false when fewer tokens remain than were asked for.
		  def look(*tokens)
		    buffer_tokens(tokens.length)
		    # zip pads with nil when the buffer holds fewer tokens than requested.
		    tokens.zip(@buffer).all? do |expected, tok|
		      tok && (expected.instance_of?(Symbol) ? tok[:type] == expected : tok[:value] == expected)
		    end
		  end

		  # Returns the value of next token, moving the current token cursor
		  # also to next token.  Returns nil when there are no more tokens.
		  #
		  # When full=true, returns full token as hash like so:
		  #
		  #     {:type => :ident, :value => "foo"}
		  #
		  # The full token also carries :pos (scanpointer position right
		  # after the token) and, for doc-comments, :linenr pointing to the
		  # line where the doc-comment began.
		  def next(full=false)
		    buffer_tokens(1)
		    tok = @buffer.shift
		    # Nothing left to consume: leave the scanpointer where it is.
		    return unless tok
		    # advance the scanpointer to the position after this token
		    @input.pos = tok[:pos]
		    full ? tok : tok[:value]
		  end

		  # True when no more tokens.
		  def empty?
		    buffer_tokens(1)
		    @buffer.empty?
		  end

		  # Ensures next n tokens are read in buffer (fewer when the input
		  # runs out first).
		  #
		  # At the end of buffering the initial position of scanpointer is
		  # restored.  Only the #next method will advance the scanpointer in
		  # a way that's visible outside this class.
		  def buffer_tokens(n)
		    prev_pos = @input.pos
		    # Continue scanning right after the last token already buffered.
		    @input.pos = @buffer.last[:pos] unless @buffer.empty?
		    (n - @buffer.length).times do
		      tok = next_token
		      # End of input: no point in asking the scanner again.
		      break unless tok
		      @previous_token = tok
		      # remember scanpointer position after each token
		      tok[:pos] = @input.pos
		      @buffer << tok
		    end
		    @input.pos = prev_pos
		  end

		  # Parses out next token from input stream.  Returns the token as
		  # a hash with :type and :value, or nil at the end of input.
		  #
		  # For efficiency we look for tokens in order of frequency in
		  # JavaScript source code:
		  #
		  # - first check for most common operators.
		  # - then for identifiers and keywords.
		  # - then strings
		  # - then comments
		  #
		  # The remaining token types are less frequent, so these are left
		  # to the end.
		  def next_token
		    until @input.eos?
		      skip_white
		      if @input.check(/[.(),;={}:]/)
		        return {:type => :operator, :value => @input.scan(/./)}
		      elsif @input.check(/[a-zA-Z_$]/)
		        value = @input.scan(/[$\w]+/)
		        kw = KEYWORDS[value]
		        # For keywords the type and value are the same symbol.
		        return {:type => kw || :ident, :value => kw || value}
		      elsif @input.check(/'/)
		        # The value excludes the surrounding quotes; escapes are kept as written.
		        return {
		          :type => :string,
		          :value => @input.scan(/'([^'\\]|\\.)*('|\Z)/m).gsub(/\A'|'\Z/m, "")
		        }
		      elsif @input.check(/"/)
		        return {
		          :type => :string,
		          :value => @input.scan(/"([^"\\]|\\.)*("|\Z)/m).gsub(/\A"|"\Z/m, "")
		        }
		      elsif @input.check(/\//)
		        tok = slash_token
		        # nil means an ordinary comment was skipped; keep scanning.
		        return tok if tok
		      elsif @input.check(/[0-9]+/)
		        return {:type => :number, :value => @input.scan(/[0-9]+(\.[0-9]*)?/)}
		      elsif @input.check(/./)
		        # Anything else is treated as a single-character operator.
		        return {:type => :operator, :value => @input.scan(/./)}
		      end
		    end
		    nil
		  end

		  # A slash "/" is a division operator if it follows:
		  # - identifier
		  # - the "this" keyword
		  # - number
		  # - closing bracket )
		  # - closing square-bracket ]
		  # Otherwise it's a beginning of regex
		  def regex?
		    return true unless @previous_token
		    type = @previous_token[:type]
		    value = @previous_token[:value]
		    return false if type == :ident || type == :number || type == :this
		    return false if type == :operator && (value == ")" || value == "]")
		    true
		  end

		  # Advances the scanpointer over any whitespace at current position.
		  def skip_white
		    @input.scan(/\s+/)
		  end

		  private

		  # Handles input that begins with a slash.  Several things begin
		  # with slash: comments, regexes and division-operators.
		  #
		  # Returns a token for doc-comments, regexes and the division
		  # operator.  Ordinary comments are skipped and nil is returned.
		  def slash_token
		    if @input.check(/\/\*\*[^\/]/)
		      {
		        :type => :doc_comment,
		        # Calculate current line number, starting with 1.  The
		        # scanpointer position is a byte offset, so slice by bytes
		        # to stay correct when multibyte characters precede it.
		        :linenr => @input.string.byteslice(0, @input.pos).count("\n") + 1,
		        # Strip the opening /** and (when present) the closing */
		        :value => @input.scan_until(/\*\/|\Z/).sub(/\A\/\*\*/, "").sub(/\*\/\Z/, "")
		      }
		    elsif @input.check(/\/\*/)
		      # skip multiline comment
		      @input.scan_until(/\*\/|\Z/)
		      nil
		    elsif @input.check(/\/\//)
		      # skip line comment
		      @input.scan_until(/\n|\Z/)
		      nil
		    elsif regex?
		      {:type => :regex, :value => @input.scan(META_REGEX)}
		    else
		      {:type => :operator, :value => @input.scan(/\//)}
		    end
		  end
		end

		end