Implement proper CSS/SCSS lexer. (abe2aafb) · Commits · extjs / jsduck

lib/jsduck/css_lexer.rb

0 → 100644

+201 −0

Original line number	Diff line number	Diff line
		require 'strscan'

		module JsDuck

		# Tokenizes CSS or SCSS code into lexical tokens.
		#
		# Each token has a type and value.
		# Types and possible values for them are as follows:
		#
		# - :number -- "25.8"
		# - :percentage -- "25%"
		# - :dimension -- "2em"
		# - :string -- '"Hello world"'
		# - :ident -- "foo-bar"
		# - :at_keyword -- "@mixin"
		# - :hash -- "#00FF66"
		# - :delim -- "{"
		# - :doc_comment -- "/** My comment */"
		#
		# Notice that doc-comments are recognized as tokens while normal
		# comments are ignored just as whitespace.
		#
		class CssLexer
		# Initializes lexer with input string.
		def initialize(input)
		@input = StringScanner.new(input)
		@buffer = []
		end

		# Tests if given pattern matches the tokens that follow at current
		# position.
		#
		# Takes list of strings and symbols. Symbols are compared to
		# token type, while strings to token value. For example:
		#
		# look(:ident, ":", :dimension)
		#
		def look(*tokens)
		buffer_tokens(tokens.length)
		i = 0
		tokens.all? do \|t\|
		tok = @buffer[i]
		i += 1
		if !tok
		false
		elsif t.instance_of?(Symbol)
		tok[:type] == t
		else
		tok[:value] == t
		end
		end
		end

		# Returns the value of next token, moving the current token cursor
		# also to next token.
		#
		# When full=true, returns full token as hash like so:
		#
		# {:type => :ident, :value => "foo"}
		#
		# For doc-comments the full token also contains the field :linenr,
		# pointing to the line where the doc-comment began.
		#
		def next(full=false)
		buffer_tokens(1)
		tok = @buffer.shift
		# advance the scanpointer to the position after this token
		@input.pos = tok[:pos]
		full ? tok : tok[:value]
		end

		# True when no more tokens.
		def empty?
		buffer_tokens(1)
		return !@buffer.first
		end

		# Ensures next n tokens are read in buffer
		#
		# At the end of buffering the initial position scanpointer is
		# restored. Only the #next method will advance the scanpointer in
		# a way that's visible outside this class.
		def buffer_tokens(n)
		prev_pos = @input.pos
		@input.pos = @buffer.last[:pos] if @buffer.last
		(n - @buffer.length).times do
		@previous_token = tok = next_token
		if tok
		# remember scanpointer position after each token
		tok[:pos] = @input.pos
		@buffer << tok
		end
		end
		@input.pos = prev_pos
		end

		# Parses out next token from input stream.
		def next_token
		while !@input.eos? do
		skip_white
		if @input.check(IDENT)
		return {
		:type => :ident,
		:value => @input.scan(IDENT)
		}
		elsif @input.check(/'/)
		return {
		:type => :string,
		:value => @input.scan(/'([^'\\]\|\\.)*('\|\Z)/m)
		}
		elsif @input.check(/"/)
		return {
		:type => :string,
		:value => @input.scan(/"([^"\\]\|\\.)*("\|\Z)/m)
		}
		elsif @input.check(/\//)
		# Several things begin with dash:
		# - comments, regexes, division-operators
		if @input.check(/\/\\[^\/]/)
		return {
		:type => :doc_comment,
		# Calculate current line number, starting with 1
		:linenr => @input.string[0...@input.pos].count("\n") + 1,
		:value => @input.scan_until(/\*\/\|\Z/)
		}
		elsif @input.check(/\/\*/)
		# skip multiline comment
		@input.scan_until(/\*\/\|\Z/)
		elsif @input.check(/\/\//)
		# skip line comment
		@input.scan_until(/\n\|\Z/)
		else
		return {
		:type => :operator,
		:value => @input.scan(/\//)
		}
		end
		elsif @input.check(NUM)
		nr = @input.scan(NUM)
		if @input.check(/%/)
		return {
		:type => :percentage,
		:value => nr + @input.scan(/%/)
		}
		elsif @input.check(IDENT)
		return {
		:type => :dimension,
		:value => nr + @input.scan(IDENT)
		}
		else
		return {
		:type => :number,
		:value => nr
		}
		end
		elsif @input.check(/@/)
		return maybe(:at_keyword, /@/, IDENT)
		elsif @input.check(/#/)
		return maybe(:hash, /#/, NAME)
		elsif @input.check(/\$/)
		return maybe(:var, /\$/, IDENT)
		elsif @input.check(/./)
		return {
		:type => :delim,
		:value => @input.scan(/./)
		}
		end
		end
		end

		# Returns token of given type when both regexes match.
		# Otherwise returns :delim token with value of first regex match.
		# First regex must always match.
		def maybe(token_type, before_re, after_re)
		before = @input.scan(before_re)
		if @input.check(after_re)
		return {
		:type => token_type,
		:value => before + @input.scan(after_re)
		}
		else
		return {
		:type => :delim,
		:value => before
		}
		end
		end

		def skip_white
		@input.scan(/\s+/)
		end

		# Simplified token syntax based on:
		# http://www.w3.org/TR/CSS21/syndata.html
		IDENT = /-?[_a-z][_a-z0-9-]*/i
		NAME = /[_a-z0-9-]+/i
		NUM = /[0-9]*\.[0-9]+\|[0-9]+/

		end

		end

spec/css_lexer_spec.rb

0 → 100644

+122 −0

Original line number	Diff line number	Diff line
		require "jsduck/css_lexer"

		describe JsDuck::CssLexer do

		def lex(source)
		lex = JsDuck::CssLexer.new(source)
		tokens = []
		while !lex.empty?
		t = lex.next(true)
		tokens << [t[:type], t[:value]]
		if t[:linenr]
		tokens.last << t[:linenr]
		end
		end
		tokens
		end

		it "tokenizes simple selector" do
		lex("a { font-size: 3em; }").should == [
		[:ident, "a"],
		[:delim, "{"],
		[:ident, "font-size"],
		[:delim, ":"],
		[:dimension, "3em"],
		[:delim, ";"],
		[:delim, "}"],
		]
		end

		it "tokenizes simple at-rule" do
		lex("@foo;").should == [[:at_keyword, "@foo"], [:delim, ";"]]
		end

		it "tokenizes color value" do
		lex("color: #cc00FF;").should == [[:ident, "color"], [:delim, ":"], [:hash, "#cc00FF"], [:delim, ";"]]
		end

		it "tokenizes various numbers" do
		lex("10 5.6 .14").should == [[:number, "10"], [:number, "5.6"], [:number, ".14"]]
		end

		it "identifies SCSS variable" do
		lex("$foo-bar").should == [[:var, "$foo-bar"]]
		end

		describe "identifies strings" do

		before do
		@d = '"' # double-quote
		@s = "'" # single-quote
		@b = "\\" # backslash
		end

		it "when single-quote inside double-quoted string" do
		lex(@d+@s+@d + ' "blah"').should == [[:string, @d+@s+@d], [:string, '"blah"']]
		end

		it "when double-quote inside single-quoted string" do
		lex(@s+@d+@s + ' "blah"').should == [[:string, @s+@d+@s], [:string, '"blah"']]
		end

		it "when escaped double-quote inside double-quoted string" do
		lex(@d+@b+@d+@d + ' "blah"').should == [[:string, @d+@b+@d+@d], [:string, '"blah"']]
		end

		it "when escaped single-quote inside single-quoted string" do
		lex(@s+@b+@s+@s + ' "blah"').should == [[:string, @s+@b+@s+@s], [:string, '"blah"']]
		end
		end

		it "ignores one-line comments" do
		lex("a // foo\n b").should == [[:ident, "a"], [:ident, "b"]]
		end

		it "ignores multi-line comments" do
		lex("a /* foo */ b").should == [[:ident, "a"], [:ident, "b"]]
		end

		it "ignores empty multi-line comments" do
		lex("a /**/ b").should == [[:ident, "a"], [:ident, "b"]]
		end

		it "identifies doc-comments together with line numbers" do
		lex("/** foo /").should == [[:doc_comment, "/* foo */", 1]]
		end

		it "counts line numbers correctly" do
		tokens = lex(<<-EOS)
		foo = {
		bar: foo,
		/**
		* My comment.
		*/
		EOS
		tokens.last.last.should == 3
		end

		describe "handles unfinished" do

		it "single-line comment" do
		lex("// ").should == []
		end

		it "multi-line comment" do
		lex("/* ").should == []
		end

		it "doc-comment" do
		lex("/ ").should == [[:doc_comment, "/ ", 1]]
		end

		it "single-quoted string" do
		lex("' ").should == [[:string, "' "]]
		end

		it "double-quoted string" do
		lex('" ').should == [[:string, '" ']]
		end
		end

		end