t/lib-shell-parser.pl - git - Git at Google

 # Copyright (c) 2021-2022 Eric Sunshine <sunshine@sunshineco.com>
 #
 # Shared shell script parser for test lint tools. Provides Lexer,
 # ShellParser, and ScriptParser. Subclass ScriptParser and override
 # check_test() to implement lint checks.

 use strict;
 use warnings;

 # Lexer tokenizes POSIX shell scripts. It is roughly modeled after section 2.3
 # "Token Recognition" of POSIX chapter 2 "Shell Command Language". Although
 # similar to lexical analyzers for other languages, this one differs in a few
 # substantial ways due to quirks of the shell command language.
 #
 # For instance, in many languages, newline is just whitespace like space or
 # TAB, but in shell a newline is a command separator, thus a distinct lexical
 # token. A newline is significant and returned as a distinct token even at the
 # end of a shell comment.
 #
 # In other languages, `1+2` would typically be scanned as three tokens
 # (`1`, `+`, and `2`), but in shell it is a single token. However, the similar
 # `1 + 2`, which embeds whitepaces, is scanned as three token in shell, as well.
 # In shell, several characters with special meaning lose that meaning when not
 # surrounded by whitespace. For instance, the negation operator `!` is special
 # when standing alone surrounded by whitespace; whereas in `foo!uucp` it is
 # just a plain character in the longer token "foo!uucp". In many other
 # languages, `"string"/foo:'string'` might be scanned as five tokens ("string",
 # `/`, `foo`, `:`, and 'string'), but in shell, it is just a single token.
 #
 # The lexical analyzer for the shell command language is also somewhat unusual
 # in that it recursively invokes the parser to handle the body of `$(...)`
 # expressions which can contain arbitrary shell code. Such expressions may be
 # encountered both inside and outside of double-quoted strings.
 #
 # The lexical analyzer is responsible for consuming shell here-doc bodies which
 # extend from the line following a `<<TAG` operator until a line consisting
 # solely of `TAG`. Here-doc consumption begins when a newline is encountered.
 # It is legal for multiple here-doc `<<TAG` operators to be present on a single
 # line, in which case their bodies must be present one following the next, and
 # are consumed in the (left-to-right) order the `<<TAG` operators appear on the
 # line. A special complication is that the bodies of all here-docs must be
 # consumed when the newline is encountered even if the parse context depth has
 # changed. For instance, in `cat <<A && x=$(cat <<B &&\n`, bodies of here-docs
 # "A" and "B" must be consumed even though "A" was introduced outside the
 # recursive parse context in which "B" was introduced and in which the newline
 # is encountered.
 package Lexer;

 sub new {
 	my ($class, $parser, $s) = @_;
 	bless {
 		parser => $parser,
 		buff => $s,
 		lineno => 1,
 		heretags => []
 	} => $class;
 }

 sub scan_heredoc_tag {
 	my $self = shift @_;
 	${$self->{buff}} =~ /\G(-?)/gc;
 	my $indented = $1;
 	my $token = $self->scan_token();
 	return "<<$indented" unless $token;
 	my $tag = $token->[0];
 	$tag =~ s/['"\\]//g;
 	$$token[0] = $indented ? "\t$tag" : "$tag";
 	push(@{$self->{heretags}}, $token);
 	return "<<$indented$tag";
 }

 sub scan_op {
 	my ($self, $c) = @_;
 	my $b = $self->{buff};
 	return $c unless $$b =~ /\G(.)/sgc;
 	my $cc = $c . $1;
 	return scan_heredoc_tag($self) if $cc eq '<<';
 	return $cc if $cc =~ /^(?:&&|\|\||>>|;;|<&|>&|<>|>\|)$/;
 	pos($$b)--;
 	return $c;
 }

 sub scan_sqstring {
 	my $self = shift @_;
 	${$self->{buff}} =~ /\G([^']*'|.*\z)/sgc;
 	my $s = $1;
 	$self->{lineno} += () = $s =~ /\n/sg;
 	return "'" . $s;
 }

 sub scan_dqstring {
 	my $self = shift @_;
 	my $b = $self->{buff};
 	my $s = '"';
 	while (1) {
 		# Slurp non-special characters; count newlines here because
 		# newlines inside $() are already counted by the recursive parse.
 		if ($$b =~ /\G([^"\$\\]+)/gc) {
 			$s .= $1;
 			$self->{lineno} += $1 =~ tr/\n//;
 		}
 		# handle special characters
 		last unless $$b =~ /\G(.)/sgc;
 		my $c = $1;
 		$s .= '"', last if $c eq '"';
 		$s .= '$' . $self->scan_dollar(), next if $c eq '$';
 		if ($c eq '\\') {
 			$s .= '\\', last unless $$b =~ /\G(.)/sgc;
 			$c = $1;
 			$self->{lineno}++, next if $c eq "\n"; # line splice
 			# backslash escapes only $, `, ", \ in dq-string
 			$s .= '\\' unless $c =~ /^[\$`"\\]$/;
 			$s .= $c;
 			next;
 		}
 		die("internal error scanning dq-string '$c'\n");
 	}
 	return $s;
 }

 sub scan_balanced {
 	my ($self, $c1, $c2) = @_;
 	my $b = $self->{buff};
 	my $depth = 1;
 	my $s = $c1;
 	while ($$b =~ /\G([^\Q$c1$c2\E]*(?:[\Q$c1$c2\E]|\z))/gc) {
 		$s .= $1;
 		$depth++, next if $s =~ /\Q$c1\E$/;
 		$depth--;
 		last if $depth == 0;
 	}
 	$self->{lineno} += () = $s =~ /\n/sg;
 	return $s;
 }

 sub scan_subst {
 	my $self = shift @_;
 	my @tokens = $self->{parser}->parse(qr/^\)$/);
 	$self->{parser}->next_token(); # closing ")"
 	return @tokens;
 }

 sub scan_dollar {
 	my $self = shift @_;
 	my $b = $self->{buff};
 	return $self->scan_balanced('(', ')') if $$b =~ /\G\((?=\()/gc; # $((...))
 	return '(' . join(' ', map {$_->[0]} $self->scan_subst()) . ')' if $$b =~ /\G\(/gc; # $(...)
 	return $self->scan_balanced('{', '}') if $$b =~ /\G\{/gc; # ${...}
 	return $1 if $$b =~ /\G(\w+)/gc; # $var
 	return $1 if $$b =~ /\G([@*#?$!0-9-])/gc; # $*, $1, $$, etc.
 	return '';
 }

 sub swallow_heredocs {
 	my $self = shift @_;
 	my $b = $self->{buff};
 	my $tags = $self->{heretags};
 	while (my $tag = shift @$tags) {
 		my $start = pos($$b);
 		my $indent = $$tag[0] =~ s/^\t// ? '\\s*' : '';
 		$$b =~ /(?:\G|\n)$indent\Q$$tag[0]\E(?:\n|\z)/gc;
 		if (pos($$b) > $start) {
 			my $body = substr($$b, $start, pos($$b) - $start);
 			$self->{parser}->{heredocs}->{$$tag[0]} = {
 				content => substr($body, 0, length($body) - length($&)),
 				start_line => $self->{lineno},
 		        };
 			$self->{lineno} += () = $body =~ /\n/sg;
 			next;
 		}
 		push(@{$self->{parser}->{problems}}, ['HEREDOC', $tag]);
 		$$b =~ /(?:\G|\n).*\z/gc; # consume rest of input
 		my $body = substr($$b, $start, pos($$b) - $start);
 		$self->{lineno} += () = $body =~ /\n/sg;
 		last;
 	}
 }

 sub scan_token {
 	my $self = shift @_;
 	my $b = $self->{buff};
 	my $token = '';
 	my ($start, $startln);
 RESTART:
 	$startln = $self->{lineno};
 	$$b =~ /\G[ \t]+/gc; # skip whitespace (but not newline)
 	$start = pos($$b) || 0;
 	$self->{lineno}++, return ["\n", $start, pos($$b), $startln, $startln] if $$b =~ /\G#[^\n]*(?:\n|\z)/gc; # comment
 	while (1) {
 		# slurp up non-special characters
 		$token .= $1 if $$b =~ /\G([^\\;&|<>(){}'"\$\s]+)/gc;
 		# handle special characters
 		last unless $$b =~ /\G(.)/sgc;
 		my $c = $1;
 		pos($$b)--, last if $c =~ /^[ \t]$/; # whitespace ends token
 		pos($$b)--, last if length($token) && $c =~ /^[;&|<>(){}\n]$/;
 		$token .= $self->scan_sqstring(), next if $c eq "'";
 		$token .= $self->scan_dqstring(), next if $c eq '"';
 		$token .= $c . $self->scan_dollar(), next if $c eq '$';
 		$self->{lineno}++, $self->swallow_heredocs(), $token = $c, last if $c eq "\n";
 		$token = $self->scan_op($c), last if $c =~ /^[;&|<>]$/;
 		$token = $c, last if $c =~ /^[(){}]$/;
 		if ($c eq '\\') {
 			$token .= '\\', last unless $$b =~ /\G(.)/sgc;
 			$c = $1;
 			$self->{lineno}++, next if $c eq "\n" && length($token); # line splice
 			$self->{lineno}++, goto RESTART if $c eq "\n"; # line splice
 			$token .= '\\' . $c;
 			next;
 		}
 		die("internal error scanning character '$c'\n");
 	}
 	return length($token) ? [$token, $start, pos($$b), $startln, $self->{lineno}] : undef;
 }

 # ShellParser parses POSIX shell scripts (with minor extensions for Bash). It
 # is a recursive descent parser very roughly modeled after section 2.10 "Shell
 # Grammar" of POSIX chapter 2 "Shell Command Language".

 package ShellParser;

 sub new {
 	my ($class, $s) = @_;
 	my $self = bless {
 		buff => [],
 		stop => [],
 		output => [],
 		heredocs => {},
 		insubshell => 0,
 	} => $class;
 	$self->{lexer} = Lexer->new($self, $s);
 	return $self;
 }

 sub next_token {
 	my $self = shift @_;
 	return pop(@{$self->{buff}}) if @{$self->{buff}};
 	return $self->{lexer}->scan_token();
 }

 sub untoken {
 	my $self = shift @_;
 	push(@{$self->{buff}}, @_);
 }

 sub peek {
 	my $self = shift @_;
 	my $token = $self->next_token();
 	return undef unless defined($token);
 	$self->untoken($token);
 	return $token;
 }

 sub stop_at {
 	my ($self, $token) = @_;
 	return 1 unless defined($token);
 	my $stop = ${$self->{stop}}[-1] if @{$self->{stop}};
 	return defined($stop) && $token->[0] =~ $stop;
 }

 sub expect {
 	my ($self, $expect) = @_;
 	my $token = $self->next_token();
 	return $token if defined($token) && $token->[0] eq $expect;
 	push(@{$self->{output}}, "?!ERR?! expected '$expect' but found '" . (defined($token) ? $token->[0] : "<end-of-input>") . "'\n");
 	$self->untoken($token) if defined($token);
 	return ();
 }

 sub optional_newlines {
 	my $self = shift @_;
 	my @tokens;
 	while (my $token = $self->peek()) {
 		last unless $token->[0] eq "\n";
 		push(@tokens, $self->next_token());
 	}
 	return @tokens;
 }

 sub parse_group {
 	my $self = shift @_;
 	return ($self->parse(qr/^}$/),
 		$self->expect('}'));
 }

 sub parse_subshell {
 	my $self = shift @_;
 	$self->{insubshell}++;
 	my @tokens = ($self->parse(qr/^\)$/),
 		      $self->expect(')'));
 	$self->{insubshell}--;
 	return @tokens;
 }

 sub parse_case_pattern {
 	my $self = shift @_;
 	my @tokens;
 	while (defined(my $token = $self->next_token())) {
 		push(@tokens, $token);
 		last if $token->[0] eq ')';
 	}
 	return @tokens;
 }

 sub parse_case {
 	my $self = shift @_;
 	my @tokens;
 	push(@tokens,
 	     $self->next_token(), # subject
 	     $self->optional_newlines(),
 	     $self->expect('in'),
 	     $self->optional_newlines());
 	while (1) {
 		my $token = $self->peek();
 		last unless defined($token) && $token->[0] ne 'esac';
 		push(@tokens,
 		     $self->parse_case_pattern(),
 		     $self->optional_newlines(),
 		     $self->parse(qr/^(?:;;|esac)$/)); # item body
 		$token = $self->peek();
 		last unless defined($token) && $token->[0] ne 'esac';
 		push(@tokens,
 		     $self->expect(';;'),
 		     $self->optional_newlines());
 	}
 	push(@tokens, $self->expect('esac'));
 	return @tokens;
 }

 sub parse_for {
 	my $self = shift @_;
 	my @tokens;
 	push(@tokens,
 	     $self->next_token(), # variable
 	     $self->optional_newlines());
 	my $token = $self->peek();
 	if (defined($token) && $token->[0] eq 'in') {
 		push(@tokens,
 		     $self->expect('in'),
 		     $self->optional_newlines());
 	}
 	push(@tokens,
 	     $self->parse(qr/^do$/), # items
 	     $self->expect('do'),
 	     $self->optional_newlines(),
 	     $self->parse_loop_body(),
 	     $self->expect('done'));
 	return @tokens;
 }

 sub parse_if {
 	my $self = shift @_;
 	my @tokens;
 	while (1) {
 		push(@tokens,
 		     $self->parse(qr/^then$/), # if/elif condition
 		     $self->expect('then'),
 		     $self->optional_newlines(),
 		     $self->parse(qr/^(?:elif|else|fi)$/)); # if/elif body
 		my $token = $self->peek();
 		last unless defined($token) && $token->[0] eq 'elif';
 		push(@tokens, $self->expect('elif'));
 	}
 	my $token = $self->peek();
 	if (defined($token) && $token->[0] eq 'else') {
 		push(@tokens,
 		     $self->expect('else'),
 		     $self->optional_newlines(),
 		     $self->parse(qr/^fi$/)); # else body
 	}
 	push(@tokens, $self->expect('fi'));
 	return @tokens;
 }

 sub parse_loop_body {
 	my $self = shift @_;
 	return $self->parse(qr/^done$/);
 }

 sub parse_loop {
 	my $self = shift @_;
 	return ($self->parse(qr/^do$/), # condition
 		$self->expect('do'),
 		$self->optional_newlines(),
 		$self->parse_loop_body(),
 		$self->expect('done'));
 }

 sub parse_func {
 	my $self = shift @_;
 	return ($self->expect('('),
 		$self->expect(')'),
 		$self->optional_newlines(),
 		$self->parse_cmd()); # body
 }

 sub parse_bash_array_assignment {
 	my $self = shift @_;
 	my @tokens = $self->expect('(');
 	while (defined(my $token = $self->next_token())) {
 		push(@tokens, $token);
 		last if $token->[0] eq ')';
 	}
 	return @tokens;
 }

 my %compound = (
 	'{' => \&parse_group,
 	'(' => \&parse_subshell,
 	'case' => \&parse_case,
 	'for' => \&parse_for,
 	'if' => \&parse_if,
 	'until' => \&parse_loop,
 	'while' => \&parse_loop);

 sub parse_cmd {
 	my $self = shift @_;
 	my $cmd = $self->next_token();
 	return () unless defined($cmd);
 	return $cmd if $cmd->[0] eq "\n";

 	my $token;
 	my @tokens = $cmd;
 	if ($cmd->[0] eq '!') {
 		push(@tokens, $self->parse_cmd());
 		return @tokens;
 	} elsif (my $f = $compound{$cmd->[0]}) {
 		push(@tokens, $self->$f());
 	} elsif (defined($token = $self->peek()) && $token->[0] eq '(') {
 		if ($cmd->[0] !~ /\w=$/) {
 			push(@tokens, $self->parse_func());
 			return @tokens;
 		}
 		my @array = $self->parse_bash_array_assignment();
 		$tokens[-1]->[0] .= join(' ', map {$_->[0]} @array);
 		$tokens[-1]->[2] = $array[$#array][2] if @array;
 	}

 	while (defined(my $token = $self->next_token())) {
 		$self->untoken($token), last if $self->stop_at($token);
 		push(@tokens, $token);
 		last if $token->[0] =~ /^(?:[;&\n|]|&&|\|\|)$/;
 	}
 	push(@tokens, $self->next_token()) if $tokens[-1]->[0] ne "\n" && defined($token = $self->peek()) && $token->[0] eq "\n";
 	return @tokens;
 }

 sub accumulate {
 	my ($self, $tokens, $cmd) = @_;
 	push(@$tokens, @$cmd);
 }

 sub parse {
 	my ($self, $stop) = @_;
 	push(@{$self->{stop}}, $stop);
 	goto DONE if $self->stop_at($self->peek());
 	my @tokens;
 	while (my @cmd = $self->parse_cmd()) {
 		$self->accumulate(\@tokens, \@cmd);
 		last if $self->stop_at($self->peek());
 	}
 DONE:
 	pop(@{$self->{stop}});
 	return @tokens;
 }

 # ScriptParser is a subclass of ShellParser which identifies individual test
 # definitions within test scripts and passes each test body to check_test().
 # ScriptParser detects test definitions not only at the top-level of test
 # scripts but also within compound commands such as loops and function
 # definitions.

 package ScriptParser;

 our @ISA = ('ShellParser');

 sub new {
 	my $class = shift @_;
 	my $self = $class->SUPER::new(@_);
 	$self->{ntests} = 0;
 	$self->{nerrs} = 0;
 	return $self;
 }

 # extract the raw content of a token, which may be a single string or a
 # composition of multiple strings and non-string character runs; for instance,
 # `"test body"` unwraps to `test body`; `word"a b"42'c d'` to `worda b42c d`
 sub unwrap {
 	my $token = (@_ ? shift @_ : $_)->[0];
 	# simple case: 'sqstring' or "dqstring"
 	return $token if $token =~ s/^'([^']*)'$/$1/;
 	return $token if $token =~ s/^"([^"]*)"$/$1/;

 	# composite case
 	my ($s, $q, $escaped);
 	while (1) {
 		# slurp up non-special characters
 		$s .= $1 if $token =~ /\G([^\\'"]*)/gc;
 		# handle special characters
 		last unless $token =~ /\G(.)/sgc;
 		my $c = $1;
 		$q = undef, next if defined($q) && $c eq $q;
 		$q = $c, next if !defined($q) && $c =~ /^['"]$/;
 		if ($c eq '\\') {
 			last unless $token =~ /\G(.)/sgc;
 			$c = $1;
 			$s .= '\\' if $c eq "\n"; # preserve line splice
 		}
 		$s .= $c;
 	}
 	return $s
 }

 sub check_test {
 	# no-op; subclass and override to implement lint checks
 }

 sub parse_cmd {
 	my $self = shift @_;
 	my @tokens = $self->SUPER::parse_cmd();
 	return @tokens unless @tokens && $tokens[0]->[0] =~ /^test_expect_(?:success|failure)$/;
 	my $n = $#tokens;
 	$n-- while $n >= 0 && $tokens[$n]->[0] =~ /^(?:[;&\n|]|&&|\|\|)$/;
 	my $herebody;
 	if ($n >= 2 && $tokens[$n-1]->[0] eq '-' && $tokens[$n]->[0] =~ /^<<-?(.+)$/) {
 		$herebody = $self->{heredocs}->{$1};
 		$n--;
 	}
 	$self->check_test($tokens[1], $tokens[2], $herebody) if $n == 2; # title body
 	$self->check_test($tokens[2], $tokens[3], $herebody) if $n > 2;  # prereq title body
 	return @tokens;
 }

 1;
	# Copyright (c) 2021-2022 Eric Sunshine <sunshine@sunshineco.com>
	#
	# Shared shell script parser for test lint tools. Provides Lexer,
	# ShellParser, and ScriptParser. Subclass ScriptParser and override
	# check_test() to implement lint checks.

	use strict;
	use warnings;

	# Lexer tokenizes POSIX shell scripts. It is roughly modeled after section 2.3
	# "Token Recognition" of POSIX chapter 2 "Shell Command Language". Although
	# similar to lexical analyzers for other languages, this one differs in a few
	# substantial ways due to quirks of the shell command language.
	#
	# For instance, in many languages, newline is just whitespace like space or
	# TAB, but in shell a newline is a command separator, thus a distinct lexical
	# token. A newline is significant and returned as a distinct token even at the
	# end of a shell comment.
	#
	# In other languages, `1+2` would typically be scanned as three tokens
	# (`1`, `+`, and `2`), but in shell it is a single token. However, the similar
	# `1 + 2`, which embeds whitepaces, is scanned as three token in shell, as well.
	# In shell, several characters with special meaning lose that meaning when not
	# surrounded by whitespace. For instance, the negation operator `!` is special
	# when standing alone surrounded by whitespace; whereas in `foo!uucp` it is
	# just a plain character in the longer token "foo!uucp". In many other
	# languages, `"string"/foo:'string'` might be scanned as five tokens ("string",
	# `/`, `foo`, `:`, and 'string'), but in shell, it is just a single token.
	#
	# The lexical analyzer for the shell command language is also somewhat unusual
	# in that it recursively invokes the parser to handle the body of `$(...)`
	# expressions which can contain arbitrary shell code. Such expressions may be
	# encountered both inside and outside of double-quoted strings.
	#
	# The lexical analyzer is responsible for consuming shell here-doc bodies which
	# extend from the line following a `<<TAG` operator until a line consisting
	# solely of `TAG`. Here-doc consumption begins when a newline is encountered.
	# It is legal for multiple here-doc `<<TAG` operators to be present on a single
	# line, in which case their bodies must be present one following the next, and
	# are consumed in the (left-to-right) order the `<<TAG` operators appear on the
	# line. A special complication is that the bodies of all here-docs must be
	# consumed when the newline is encountered even if the parse context depth has
	# changed. For instance, in `cat <<A && x=$(cat <<B &&\n`, bodies of here-docs
	# "A" and "B" must be consumed even though "A" was introduced outside the
	# recursive parse context in which "B" was introduced and in which the newline
	# is encountered.
	package Lexer;

	sub new {
	my ($class, $parser, $s) = @_;
	bless {
	parser => $parser,
	buff => $s,
	lineno => 1,
	heretags => []
	} => $class;
	}

	sub scan_heredoc_tag {
	my $self = shift @_;
	${$self->{buff}} =~ /\G(-?)/gc;
	my $indented = $1;
	my $token = $self->scan_token();
	return "<<$indented" unless $token;
	my $tag = $token->[0];
	$tag =~ s/['"\\]//g;
	$$token[0] = $indented ? "\t$tag" : "$tag";
	push(@{$self->{heretags}}, $token);
	return "<<$indented$tag";
	}

	sub scan_op {
	my ($self, $c) = @_;
	my $b = $self->{buff};
	return $c unless $$b =~ /\G(.)/sgc;
	my $cc = $c . $1;
	return scan_heredoc_tag($self) if $cc eq '<<';
	return $cc if $cc =~ /^(?:&&\|\\|\\|\|>>\|;;\|<&\|>&\|<>\|>\\|)$/;
	pos($$b)--;
	return $c;
	}

	sub scan_sqstring {
	my $self = shift @_;
	${$self->{buff}} =~ /\G([^']'\|.\z)/sgc;
	my $s = $1;
	$self->{lineno} += () = $s =~ /\n/sg;
	return "'" . $s;
	}

	sub scan_dqstring {
	my $self = shift @_;
	my $b = $self->{buff};
	my $s = '"';
	while (1) {
	# Slurp non-special characters; count newlines here because
	# newlines inside $() are already counted by the recursive parse.
	if ($$b =~ /\G([^"\$\\]+)/gc) {
	$s .= $1;
	$self->{lineno} += $1 =~ tr/\n//;
	}
	# handle special characters
	last unless $$b =~ /\G(.)/sgc;
	my $c = $1;
	$s .= '"', last if $c eq '"';
	$s .= '$' . $self->scan_dollar(), next if $c eq '$';
	if ($c eq '\\') {
	$s .= '\\', last unless $$b =~ /\G(.)/sgc;
	$c = $1;
	$self->{lineno}++, next if $c eq "\n"; # line splice
	# backslash escapes only $, `, ", \ in dq-string
	$s .= '\\' unless $c =~ /^[\$`"\\]$/;
	$s .= $c;
	next;
	}
	die("internal error scanning dq-string '$c'\n");
	}
	return $s;
	}

	sub scan_balanced {
	my ($self, $c1, $c2) = @_;
	my $b = $self->{buff};
	my $depth = 1;
	my $s = $c1;
	while ($$b =~ /\G([^\Q$c1$c2\E]*(?:[\Q$c1$c2\E]\|\z))/gc) {
	$s .= $1;
	$depth++, next if $s =~ /\Q$c1\E$/;
	$depth--;
	last if $depth == 0;
	}
	$self->{lineno} += () = $s =~ /\n/sg;
	return $s;
	}

	sub scan_subst {
	my $self = shift @_;
	my @tokens = $self->{parser}->parse(qr/^\)$/);
	$self->{parser}->next_token(); # closing ")"
	return @tokens;
	}

	sub scan_dollar {
	my $self = shift @_;
	my $b = $self->{buff};
	return $self->scan_balanced('(', ')') if $$b =~ /\G\((?=\()/gc; # $((...))
	return '(' . join(' ', map {$_->[0]} $self->scan_subst()) . ')' if $$b =~ /\G\(/gc; # $(...)
	return $self->scan_balanced('{', '}') if $$b =~ /\G\{/gc; # ${...}
	return $1 if $$b =~ /\G(\w+)/gc; # $var
	return $1 if $$b =~ /\G([@#?$!0-9-])/gc; # $, $1, $$, etc.
	return '';
	}

	sub swallow_heredocs {
	my $self = shift @_;
	my $b = $self->{buff};
	my $tags = $self->{heretags};
	while (my $tag = shift @$tags) {
	my $start = pos($$b);
	my $indent = $$tag[0] =~ s/^\t// ? '\\s*' : '';
	$$b =~ /(?:\G\|\n)$indent\Q$$tag[0]\E(?:\n\|\z)/gc;
	if (pos($$b) > $start) {
	my $body = substr($$b, $start, pos($$b) - $start);
	$self->{parser}->{heredocs}->{$$tag[0]} = {
	content => substr($body, 0, length($body) - length($&)),
	start_line => $self->{lineno},
	};
	$self->{lineno} += () = $body =~ /\n/sg;
	next;
	}
	push(@{$self->{parser}->{problems}}, ['HEREDOC', $tag]);
	$$b =~ /(?:\G\|\n).*\z/gc; # consume rest of input
	my $body = substr($$b, $start, pos($$b) - $start);
	$self->{lineno} += () = $body =~ /\n/sg;
	last;
	}
	}

	sub scan_token {
	my $self = shift @_;
	my $b = $self->{buff};
	my $token = '';
	my ($start, $startln);
	RESTART:
	$startln = $self->{lineno};
	$$b =~ /\G[ \t]+/gc; # skip whitespace (but not newline)
	$start = pos($$b) \|\| 0;
	$self->{lineno}++, return ["\n", $start, pos($$b), $startln, $startln] if $$b =~ /\G#[^\n]*(?:\n\|\z)/gc; # comment
	while (1) {
	# slurp up non-special characters
	$token .= $1 if $$b =~ /\G([^\\;&\|<>(){}'"\$\s]+)/gc;
	# handle special characters
	last unless $$b =~ /\G(.)/sgc;
	my $c = $1;
	pos($$b)--, last if $c =~ /^[ \t]$/; # whitespace ends token
	pos($$b)--, last if length($token) && $c =~ /^[;&\|<>(){}\n]$/;
	$token .= $self->scan_sqstring(), next if $c eq "'";
	$token .= $self->scan_dqstring(), next if $c eq '"';
	$token .= $c . $self->scan_dollar(), next if $c eq '$';
	$self->{lineno}++, $self->swallow_heredocs(), $token = $c, last if $c eq "\n";
	$token = $self->scan_op($c), last if $c =~ /^[;&\|<>]$/;
	$token = $c, last if $c =~ /^[(){}]$/;
	if ($c eq '\\') {
	$token .= '\\', last unless $$b =~ /\G(.)/sgc;
	$c = $1;
	$self->{lineno}++, next if $c eq "\n" && length($token); # line splice
	$self->{lineno}++, goto RESTART if $c eq "\n"; # line splice
	$token .= '\\' . $c;
	next;
	}
	die("internal error scanning character '$c'\n");
	}
	return length($token) ? [$token, $start, pos($$b), $startln, $self->{lineno}] : undef;
	}

	# ShellParser parses POSIX shell scripts (with minor extensions for Bash). It
	# is a recursive descent parser very roughly modeled after section 2.10 "Shell
	# Grammar" of POSIX chapter 2 "Shell Command Language".

	package ShellParser;

	sub new {
	my ($class, $s) = @_;
	my $self = bless {
	buff => [],
	stop => [],
	output => [],
	heredocs => {},
	insubshell => 0,
	} => $class;
	$self->{lexer} = Lexer->new($self, $s);
	return $self;
	}

	sub next_token {
	my $self = shift @_;
	return pop(@{$self->{buff}}) if @{$self->{buff}};
	return $self->{lexer}->scan_token();
	}

	sub untoken {
	my $self = shift @_;
	push(@{$self->{buff}}, @_);
	}

	sub peek {
	my $self = shift @_;
	my $token = $self->next_token();
	return undef unless defined($token);
	$self->untoken($token);
	return $token;
	}

	sub stop_at {
	my ($self, $token) = @_;
	return 1 unless defined($token);
	my $stop = ${$self->{stop}}[-1] if @{$self->{stop}};
	return defined($stop) && $token->[0] =~ $stop;
	}

	sub expect {
	my ($self, $expect) = @_;
	my $token = $self->next_token();
	return $token if defined($token) && $token->[0] eq $expect;
	push(@{$self->{output}}, "?!ERR?! expected '$expect' but found '" . (defined($token) ? $token->[0] : "<end-of-input>") . "'\n");
	$self->untoken($token) if defined($token);
	return ();
	}

	sub optional_newlines {
	my $self = shift @_;
	my @tokens;
	while (my $token = $self->peek()) {
	last unless $token->[0] eq "\n";
	push(@tokens, $self->next_token());
	}
	return @tokens;
	}

	sub parse_group {
	my $self = shift @_;
	return ($self->parse(qr/^}$/),
	$self->expect('}'));
	}

	sub parse_subshell {
	my $self = shift @_;
	$self->{insubshell}++;
	my @tokens = ($self->parse(qr/^\)$/),
	$self->expect(')'));
	$self->{insubshell}--;
	return @tokens;
	}

	sub parse_case_pattern {
	my $self = shift @_;
	my @tokens;
	while (defined(my $token = $self->next_token())) {
	push(@tokens, $token);
	last if $token->[0] eq ')';
	}
	return @tokens;
	}

	sub parse_case {
	my $self = shift @_;
	my @tokens;
	push(@tokens,
	$self->next_token(), # subject
	$self->optional_newlines(),
	$self->expect('in'),
	$self->optional_newlines());
	while (1) {
	my $token = $self->peek();
	last unless defined($token) && $token->[0] ne 'esac';
	push(@tokens,
	$self->parse_case_pattern(),
	$self->optional_newlines(),
	$self->parse(qr/^(?:;;\|esac)$/)); # item body
	$token = $self->peek();
	last unless defined($token) && $token->[0] ne 'esac';
	push(@tokens,
	$self->expect(';;'),
	$self->optional_newlines());
	}
	push(@tokens, $self->expect('esac'));
	return @tokens;
	}

	sub parse_for {
	my $self = shift @_;
	my @tokens;
	push(@tokens,
	$self->next_token(), # variable
	$self->optional_newlines());
	my $token = $self->peek();
	if (defined($token) && $token->[0] eq 'in') {
	push(@tokens,
	$self->expect('in'),
	$self->optional_newlines());
	}
	push(@tokens,
	$self->parse(qr/^do$/), # items
	$self->expect('do'),
	$self->optional_newlines(),
	$self->parse_loop_body(),
	$self->expect('done'));
	return @tokens;
	}

	sub parse_if {
	my $self = shift @_;
	my @tokens;
	while (1) {
	push(@tokens,
	$self->parse(qr/^then$/), # if/elif condition
	$self->expect('then'),
	$self->optional_newlines(),
	$self->parse(qr/^(?:elif\|else\|fi)$/)); # if/elif body
	my $token = $self->peek();
	last unless defined($token) && $token->[0] eq 'elif';
	push(@tokens, $self->expect('elif'));
	}
	my $token = $self->peek();
	if (defined($token) && $token->[0] eq 'else') {
	push(@tokens,
	$self->expect('else'),
	$self->optional_newlines(),
	$self->parse(qr/^fi$/)); # else body
	}
	push(@tokens, $self->expect('fi'));
	return @tokens;
	}

	sub parse_loop_body {
	my $self = shift @_;
	return $self->parse(qr/^done$/);
	}

	sub parse_loop {
	my $self = shift @_;
	return ($self->parse(qr/^do$/), # condition
	$self->expect('do'),
	$self->optional_newlines(),
	$self->parse_loop_body(),
	$self->expect('done'));
	}

	sub parse_func {
	my $self = shift @_;
	return ($self->expect('('),
	$self->expect(')'),
	$self->optional_newlines(),
	$self->parse_cmd()); # body
	}

	sub parse_bash_array_assignment {
	my $self = shift @_;
	my @tokens = $self->expect('(');
	while (defined(my $token = $self->next_token())) {
	push(@tokens, $token);
	last if $token->[0] eq ')';
	}
	return @tokens;
	}

	my %compound = (
	'{' => \&parse_group,
	'(' => \&parse_subshell,
	'case' => \&parse_case,
	'for' => \&parse_for,
	'if' => \&parse_if,
	'until' => \&parse_loop,
	'while' => \&parse_loop);

	sub parse_cmd {
	my $self = shift @_;
	my $cmd = $self->next_token();
	return () unless defined($cmd);
	return $cmd if $cmd->[0] eq "\n";

	my $token;
	my @tokens = $cmd;
	if ($cmd->[0] eq '!') {
	push(@tokens, $self->parse_cmd());
	return @tokens;
	} elsif (my $f = $compound{$cmd->[0]}) {
	push(@tokens, $self->$f());
	} elsif (defined($token = $self->peek()) && $token->[0] eq '(') {
	if ($cmd->[0] !~ /\w=$/) {
	push(@tokens, $self->parse_func());
	return @tokens;
	}
	my @array = $self->parse_bash_array_assignment();
	$tokens[-1]->[0] .= join(' ', map {$_->[0]} @array);
	$tokens[-1]->[2] = $array[$#array][2] if @array;
	}

	while (defined(my $token = $self->next_token())) {
	$self->untoken($token), last if $self->stop_at($token);
	push(@tokens, $token);
	last if $token->[0] =~ /^(?:[;&\n\|]\|&&\|\\|\\|)$/;
	}
	push(@tokens, $self->next_token()) if $tokens[-1]->[0] ne "\n" && defined($token = $self->peek()) && $token->[0] eq "\n";
	return @tokens;
	}

	sub accumulate {
	my ($self, $tokens, $cmd) = @_;
	push(@$tokens, @$cmd);
	}

	sub parse {
	my ($self, $stop) = @_;
	push(@{$self->{stop}}, $stop);
	goto DONE if $self->stop_at($self->peek());
	my @tokens;
	while (my @cmd = $self->parse_cmd()) {
	$self->accumulate(\@tokens, \@cmd);
	last if $self->stop_at($self->peek());
	}
	DONE:
	pop(@{$self->{stop}});
	return @tokens;
	}

	# ScriptParser is a subclass of ShellParser which identifies individual test
	# definitions within test scripts and passes each test body to check_test().
	# ScriptParser detects test definitions not only at the top-level of test
	# scripts but also within compound commands such as loops and function
	# definitions.

	package ScriptParser;

	our @ISA = ('ShellParser');

	sub new {
	my $class = shift @_;
	my $self = $class->SUPER::new(@_);
	$self->{ntests} = 0;
	$self->{nerrs} = 0;
	return $self;
	}

	# extract the raw content of a token, which may be a single string or a
	# composition of multiple strings and non-string character runs; for instance,
	# `"test body"` unwraps to `test body`; `word"a b"42'c d'` to `worda b42c d`
	sub unwrap {
	my $token = (@_ ? shift @_ : $_)->[0];
	# simple case: 'sqstring' or "dqstring"
	return $token if $token =~ s/^'([^']*)'$/$1/;
	return $token if $token =~ s/^"([^"]*)"$/$1/;

	# composite case
	my ($s, $q, $escaped);
	while (1) {
	# slurp up non-special characters
	$s .= $1 if $token =~ /\G([^\\'"]*)/gc;
	# handle special characters
	last unless $token =~ /\G(.)/sgc;
	my $c = $1;
	$q = undef, next if defined($q) && $c eq $q;
	$q = $c, next if !defined($q) && $c =~ /^['"]$/;
	if ($c eq '\\') {
	last unless $token =~ /\G(.)/sgc;
	$c = $1;
	$s .= '\\' if $c eq "\n"; # preserve line splice
	}
	$s .= $c;
	}
	return $s
	}

	sub check_test {
	# no-op; subclass and override to implement lint checks
	}

	sub parse_cmd {
	my $self = shift @_;
	my @tokens = $self->SUPER::parse_cmd();
	return @tokens unless @tokens && $tokens[0]->[0] =~ /^test_expect_(?:success\|failure)$/;
	my $n = $#tokens;
	$n-- while $n >= 0 && $tokens[$n]->[0] =~ /^(?:[;&\n\|]\|&&\|\\|\\|)$/;
	my $herebody;
	if ($n >= 2 && $tokens[$n-1]->[0] eq '-' && $tokens[$n]->[0] =~ /^<<-?(.+)$/) {
	$herebody = $self->{heredocs}->{$1};
	$n--;
	}
	$self->check_test($tokens[1], $tokens[2], $herebody) if $n == 2; # title body
	$self->check_test($tokens[2], $tokens[3], $herebody) if $n > 2; # prereq title body
	return @tokens;
	}

	1;