Commit abe2aafb authored by Rene Saarsoo
Browse files

Implement proper CSS/SCSS lexer.

parent 26f8c533
Loading
Loading
Loading
Loading
+201 −0
Original line number Diff line number Diff line
require 'strscan'

module JsDuck

  # Tokenizes CSS or SCSS code into lexical tokens.
  #
  # Each token has a type and value.
  # Types and possible values for them are as follows:
  #
  # - :number      -- "25.8"
  # - :percentage  -- "25%"
  # - :dimension   -- "2em"
  # - :string      -- '"Hello world"'
  # - :ident       -- "foo-bar"
  # - :at_keyword  -- "@mixin"
  # - :hash        -- "#00FF66"
  # - :var         -- "$foo-bar"
  # - :operator    -- "/"
  # - :delim       -- "{"
  # - :doc_comment -- "/** My comment */"
  #
  # Notice that doc-comments are recognized as tokens while normal
  # comments are ignored just as whitespace.
  #
  class CssLexer
    # Simplified token syntax based on:
    # http://www.w3.org/TR/CSS21/syndata.html
    IDENT = /-?[_a-z][_a-z0-9-]*/i
    NAME = /[_a-z0-9-]+/i
    NUM = /[0-9]*\.[0-9]+|[0-9]+/

    # Quoted strings.  An unterminated string extends to the end of
    # input.  The optional backslash before \Z lets a string whose
    # very last character is a lone "\" still match; without it the
    # scan would fail and the scanner would never advance.
    SINGLE_QUOTED = /'([^'\\]|\\.)*('|\\?\Z)/m
    DOUBLE_QUOTED = /"([^"\\]|\\.)*("|\\?\Z)/m

    # End of a /* ... */ comment, or end of input when unterminated.
    COMMENT_END = /\*\/|\Z/

    # Initializes lexer with input string.
    def initialize(input)
      @input = StringScanner.new(input)
      @buffer = []
    end

    # Tests if given pattern matches the tokens that follow at current
    # position.
    #
    # Takes list of strings and symbols.  Symbols are compared to
    # token type, while strings to token value.  For example:
    #
    #     look(:ident, ":", :dimension)
    #
    # Returns false when fewer tokens remain than the pattern needs.
    def look(*tokens)
      buffer_tokens(tokens.length)
      tokens.each_with_index.all? do |expected, i|
        tok = @buffer[i]
        next false unless tok
        expected.is_a?(Symbol) ? tok[:type] == expected : tok[:value] == expected
      end
    end

    # Returns the value of next token, moving the current token cursor
    # also to next token.
    #
    # When full=true, returns full token as hash like so:
    #
    #     {:type => :ident, :value => "foo"}
    #
    # For doc-comments the full token also contains the field :linenr,
    # pointing to the line where the doc-comment began.
    #
    # Returns nil when there are no more tokens.
    def next(full=false)
      buffer_tokens(1)
      tok = @buffer.shift
      # At the end of input there is nothing to shift; report that
      # with nil instead of failing on tok[:pos].
      return nil unless tok
      # advance the scanpointer to the position after this token
      @input.pos = tok[:pos]
      full ? tok : tok[:value]
    end

    # True when no more tokens.
    def empty?
      buffer_tokens(1)
      @buffer.empty?
    end

    # Ensures next n tokens are read in buffer
    #
    # At the end of buffering the initial position scanpointer is
    # restored.  Only the #next method will advance the scanpointer in
    # a way that's visible outside this class.
    def buffer_tokens(n)
      prev_pos = @input.pos
      # continue reading from where the last buffered token ended
      @input.pos = @buffer.last[:pos] if @buffer.last
      (n - @buffer.length).times do
        tok = next_token
        # nil means end of input; no further tokens can follow
        break unless tok
        # remember scanpointer position after each token
        tok[:pos] = @input.pos
        @buffer << tok
      end
      @input.pos = prev_pos
    end

    # Parses out next token from input stream.
    #
    # Whitespace and normal comments are skipped.  Returns the token
    # as a hash with :type and :value, or nil at the end of input.
    def next_token
      until @input.eos?
        skip_white
        if @input.check(IDENT)
          return {:type => :ident, :value => @input.scan(IDENT)}
        elsif @input.check(/'/)
          return {:type => :string, :value => @input.scan(SINGLE_QUOTED)}
        elsif @input.check(/"/)
          return {:type => :string, :value => @input.scan(DOUBLE_QUOTED)}
        elsif @input.check(/\//)
          # nil here means a comment was skipped; keep looping
          tok = slash_token
          return tok if tok
        elsif @input.check(NUM)
          return number_token
        elsif @input.check(/@/)
          return maybe(:at_keyword, /@/, IDENT)
        elsif @input.check(/#/)
          return maybe(:hash, /#/, NAME)
        elsif @input.check(/\$/)
          return maybe(:var, /\$/, IDENT)
        elsif @input.check(/./)
          return {:type => :delim, :value => @input.scan(/./)}
        end
      end
      nil
    end

    # Returns token of given type when both regexes match.
    # Otherwise returns :delim token with value of first regex match.
    # First regex must always match.
    def maybe(token_type, before_re, after_re)
      before = @input.scan(before_re)
      if @input.check(after_re)
        {:type => token_type, :value => before + @input.scan(after_re)}
      else
        {:type => :delim, :value => before}
      end
    end

    # Skips over any whitespace at the current position.
    def skip_white
      @input.scan(/\s+/)
    end

    private

    # Handles input beginning with a slash.  Several things do:
    # doc-comments, normal comments, and the plain "/" operator.
    #
    # Returns a :doc_comment or :operator token, or nil when a normal
    # comment was consumed and skipped.
    def slash_token
      if @input.check(/\/\*\*[^\/]/)
        # take the line number before the scan moves the scanpointer
        linenr = current_linenr
        {
          :type => :doc_comment,
          :linenr => linenr,
          :value => @input.scan_until(COMMENT_END)
        }
      elsif @input.check(/\/\*/)
        # Skip multiline comment.  The opening "/*" is consumed first,
        # so that its "*" can not pair up with a directly following
        # "/" and make "/*/" look like a complete comment.
        @input.scan(/\/\*/)
        @input.scan_until(COMMENT_END)
        nil
      elsif @input.check(/\/\//)
        # skip line comment
        @input.scan_until(/\n|\Z/)
        nil
      else
        {:type => :operator, :value => @input.scan(/\//)}
      end
    end

    # Reads a number and classifies it by what directly follows it:
    # "%" makes a :percentage, an identifier makes a :dimension,
    # anything else leaves a plain :number.
    def number_token
      nr = @input.scan(NUM)
      if @input.check(/%/)
        {:type => :percentage, :value => nr + @input.scan(/%/)}
      elsif @input.check(IDENT)
        {:type => :dimension, :value => nr + @input.scan(IDENT)}
      else
        {:type => :number, :value => nr}
      end
    end

    # Calculates current line number, starting with 1.
    #
    # The scanpointer position is a byte offset, so the consumed part
    # of the input is taken with byteslice.  Indexing the string by
    # characters would reach past the scanpointer whenever multi-byte
    # characters precede it, counting newlines not yet consumed.
    def current_linenr
      @input.string.byteslice(0, @input.pos).count("\n") + 1
    end

  end

end

spec/css_lexer_spec.rb

0 → 100644
+122 −0
Original line number Diff line number Diff line
require "jsduck/css_lexer"

describe JsDuck::CssLexer do

  # Runs the lexer over the whole source and collects each token as
  # [type, value], with the line number appended when the token
  # carries one.
  def lex(source)
    lexer = JsDuck::CssLexer.new(source)
    collected = []
    until lexer.empty?
      token = lexer.next(true)
      entry = [token[:type], token[:value]]
      entry << token[:linenr] if token[:linenr]
      collected << entry
    end
    collected
  end

  it "tokenizes simple selector" do
    expected = [
      [:ident, "a"],
      [:delim, "{"],
      [:ident, "font-size"],
      [:delim, ":"],
      [:dimension, "3em"],
      [:delim, ";"],
      [:delim, "}"],
    ]
    lex("a { font-size: 3em; }").should == expected
  end

  it "tokenizes simple at-rule" do
    lex("@foo;").should == [
      [:at_keyword, "@foo"],
      [:delim, ";"],
    ]
  end

  it "tokenizes color value" do
    lex("color: #cc00FF;").should == [
      [:ident, "color"],
      [:delim, ":"],
      [:hash, "#cc00FF"],
      [:delim, ";"],
    ]
  end

  it "tokenizes various numbers" do
    lex("10 5.6 .14").should == [
      [:number, "10"],
      [:number, "5.6"],
      [:number, ".14"],
    ]
  end

  it "identifies SCSS variable" do
    lex("$foo-bar").should == [[:var, "$foo-bar"]]
  end

  describe "identifies strings" do

    # Quote and escape characters are kept in variables, so that the
    # inputs below can be assembled without nested escaping.
    before do
      @d = '"' # double-quote
      @s = "'" # single-quote
      @b = "\\" # backslash
    end

    it "when single-quote inside double-quoted string" do
      first = @d + @s + @d
      lex(first + ' "blah"').should == [[:string, first], [:string, '"blah"']]
    end

    it "when double-quote inside single-quoted string" do
      first = @s + @d + @s
      lex(first + ' "blah"').should == [[:string, first], [:string, '"blah"']]
    end

    it "when escaped double-quote inside double-quoted string" do
      first = @d + @b + @d + @d
      lex(first + ' "blah"').should == [[:string, first], [:string, '"blah"']]
    end

    it "when escaped single-quote inside single-quoted string" do
      first = @s + @b + @s + @s
      lex(first + ' "blah"').should == [[:string, first], [:string, '"blah"']]
    end
  end

  it "ignores one-line comments" do
    lex("a // foo\n b").should == [[:ident, "a"], [:ident, "b"]]
  end

  it "ignores multi-line comments" do
    lex("a /* foo */ b").should == [[:ident, "a"], [:ident, "b"]]
  end

  it "ignores empty multi-line comments" do
    lex("a /**/ b").should == [[:ident, "a"], [:ident, "b"]]
  end

  it "identifies doc-comments together with line numbers" do
    lex("/** foo */").should == [[:doc_comment, "/** foo */", 1]]
  end

  it "counts line numbers correctly" do
    tokens = lex(<<-EOS)
      foo = {
        bar: foo,
        /**
         * My comment.
         */
    EOS
    # the doc-comment is the last token; its last field is the line number
    doc_comment = tokens.last
    doc_comment.last.should == 3
  end

  describe "handles unfinished" do

    it "single-line comment" do
      lex("// ").should == []
    end

    it "multi-line comment" do
      lex("/* ").should == []
    end

    it "doc-comment" do
      lex("/** ").should == [[:doc_comment, "/** ", 1]]
    end

    it "single-quoted string" do
      lex("' ").should == [[:string, "' "]]
    end

    it "double-quoted string" do
      lex('" ').should == [[:string, '" ']]
    end
  end

end