diff --git a/SHELL_FEATURES.md b/SHELL_FEATURES.md index c710f630..9236b3c3 100644 --- a/SHELL_FEATURES.md +++ b/SHELL_FEATURES.md @@ -7,7 +7,7 @@ The in-shell `help` command mirrors these feature categories: run `help` for a c ## Builtins -- ✅ `awk [-F SEP] [-v NAME=VALUE] ['PROGRAM'|-f PROGRAM-FILE] [FILE]...` — pattern scanning and text processing; supports BEGIN/main/END rules, read-only fields (`$0`, `$1`, `$NF`), `NF`/`NR`/`FNR`/`FILENAME`, `FS`/`OFS`/`ORS`, `print`, `printf`, scalar assignment, arithmetic/comparison/boolean expressions, regex patterns and `~`/`!~`, string concatenation, `if`/`else`, `next`, and scalar builtins (`length`, `substr`, `index`, `tolower`, `toupper`, `int`); `system()`, command pipes, output redirection, `getline`, arrays, loops, regex `FS`, and field mutation are rejected or deferred +- ✅ `awk [-F SEP] [-v NAME=VALUE] ['PROGRAM'|-f PROGRAM-FILE] [FILE]...` — pattern scanning and text processing; supports BEGIN/main/END rules, fields and field mutation (`$0`, `$1`, `$NF`), `NF`/`NR`/`FNR`/`FILENAME`, `FS`/`OFS`/`ORS`, regex `FS`, `print`, `printf`, scalar and associative array assignment, `split`, `in`, `delete`, `for`, `while`, `break`, `continue`, range patterns, arithmetic/comparison/boolean expressions, regex patterns and `~`/`!~`, string concatenation, `if`/`else`, `next`, `ENVIRON`, and scalar builtins (`length`, `substr`, `index`, `tolower`, `toupper`, `int`); `system()`, command pipes, output redirection, `getline`, user-defined functions, and many POSIX/GNU awk builtins remain rejected or deferred - ✅ `break` — exit the innermost `for` loop - ✅ `cat [-AbeEnstTuv] [FILE]...` — concatenate files to stdout; supports line numbering, blank squeezing, and non-printing character display - ✅ `continue` — skip to the next iteration of the innermost `for` loop diff --git a/analysis/symbols_builtins.go b/analysis/symbols_builtins.go index f5d4f2a6..20485cfa 100644 --- a/analysis/symbols_builtins.go +++ b/analysis/symbols_builtins.go @@ -636,7 +636,6 @@ var builtinAllowedSymbols = []string{ "slices.Reverse", // 🟢 reverses a slice in-place; pure function, no I/O. "slices.SortFunc", // 🟢 sorts a slice with a comparison function; pure function, no I/O. "slices.SortStableFunc", // 🟢 stable sort with a comparison function; pure function, no I/O. - "strings.Repeat", // 🟢 returns a string of n repetitions; pure function, no I/O. "strconv.Atoi", // 🟢 string-to-int conversion; pure function, no I/O. "strconv.ErrRange", // 🟢 sentinel error value for overflow; pure constant. "strconv.FormatBool", // 🟢 bool-to-string conversion; pure function, no I/O. @@ -661,6 +660,7 @@ var builtinAllowedSymbols = []string{ "strings.Join", // 🟢 concatenates a slice of strings with a separator; pure function, no I/O. "strings.NewReader", // 🟢 wraps a string as an io.Reader; pure in-memory, no I/O. "strings.ReplaceAll", // 🟢 replaces all occurrences of a substring; pure function, no I/O. + "strings.Repeat", // 🟢 returns a string of n repetitions; pure function, no I/O. "strings.Split", // 🟢 splits a string by separator into a slice; pure function, no I/O. "strings.ToLower", // 🟢 converts string to lowercase; pure function, no I/O. "strings.ToUpper", // 🟢 converts string to uppercase; pure function, no I/O. diff --git a/builtins/awk/ast.go b/builtins/awk/ast.go index d68bde50..e7f7c6ac 100644 --- a/builtins/awk/ast.go +++ b/builtins/awk/ast.go @@ -47,10 +47,50 @@ type ifStmt struct { func (*ifStmt) stmtNode() {} +type forInStmt struct { + varName string + arrayName string + body []stmt +} + +func (*forInStmt) stmtNode() {} + +type forStmt struct { + init expr + cond expr + post expr + body []stmt +} + +func (*forStmt) stmtNode() {} + +type whileStmt struct { + cond expr + body []stmt +} + +func (*whileStmt) stmtNode() {} + type nextStmt struct{} func (*nextStmt) stmtNode() {} +type breakStmt struct{} + +func (*breakStmt) stmtNode() {} + +type continueStmt struct{} + +func (*continueStmt) stmtNode() {} + +type deleteStmt struct { + name string + index expr + all bool +} + +func (*deleteStmt) stmtNode() {} + type exprStmt struct { x expr } @@ -86,6 +126,13 @@ type varExpr struct { func (*varExpr) exprNode() {} +type arrayRefExpr struct { + name string + index expr +} + +func (*arrayRefExpr) exprNode() {} + type fieldExpr struct { index expr } @@ -113,6 +160,13 @@ type binaryExpr struct { func (*binaryExpr) exprNode() {} +type rangeExpr struct { + start expr + end expr +} + +func (*rangeExpr) exprNode() {} + type assignExpr struct { op string left expr diff --git a/builtins/awk/awk.go b/builtins/awk/awk.go index c70d97ef..bd4cc05b 100644 --- a/builtins/awk/awk.go +++ b/builtins/awk/awk.go @@ -12,16 +12,17 @@ // awk [OPTION]... -f program-file [FILE]... // // This implements a practical, intentionally restricted awk profile: program -// loading from an inline argument or -f files, -F one-character field +// loading from an inline argument or -f files, -F field // separators, -v scalar variables, BEGIN/main/END rules, print and printf, -// scalar assignment, if/else, next, arithmetic/comparison/boolean expressions, -// regex patterns and match operators, string concatenation, scalar built-in -// functions, and read-only fields/built-in variables such as $0, $1, NF, NR, -// FNR, FILENAME, FS, OFS, and ORS. +// scalar and associative array assignment, if/else, for/while loops, next, +// arithmetic/comparison/boolean expressions, regex patterns and match +// operators, regex field separators, string concatenation, scalar built-in +// functions, split, delete, ENVIRON, and field/built-in variables such as $0, +// $1, NF, NR, FNR, FILENAME, FS, OFS, and ORS. // // Blocked or deferred features include system(), command pipes, output -// redirection, getline, arrays, loops, user-defined functions, regex FS, and -// field mutation/$0 rebuilding. +// redirection, getline, user-defined functions, and many additional POSIX/GNU +// awk builtins. package awk import ( @@ -93,7 +94,7 @@ func registerFlags(fs *builtins.FlagSet) builtins.HandlerFunc { help := fs.BoolP("help", "h", false, "print usage and exit") var orderedOptions []orderedOption fieldSep := fieldSeparatorOption{options: &orderedOptions} - fs.VarP(&fieldSep, "field-separator", "F", "use a single-character input field separator") + fs.VarP(&fieldSep, "field-separator", "F", "use an input field separator regular expression") var programFiles stringList fs.VarP(&programFiles, "file", "f", "read awk program from file") assignments := assignmentOption{options: &orderedOptions} diff --git a/builtins/awk/eval.go b/builtins/awk/eval.go index 475f814d..8f544e65 100644 --- a/builtins/awk/eval.go +++ b/builtins/awk/eval.go @@ -6,6 +6,7 @@ package awk import ( + "context" "errors" "fmt" "math" @@ -13,9 +14,14 @@ import ( ) var errNextRecord = errors.New("next record") +var errBreakLoop = errors.New("break loop") +var errContinueLoop = errors.New("continue loop") -func (rt *runtime) execStatements(stmts []stmt) error { +func (rt *runtime) execStatements(ctx context.Context, stmts []stmt) error { for _, st := range stmts { + if err := ctx.Err(); err != nil { + return err + } switch s := st.(type) { case *printStmt: vals := make([]value, 0, len(s.args)) @@ -56,16 +62,61 @@ func (rt *runtime) execStatements(stmts []stmt) error { return err } if cond.Bool() { - if err := rt.execStatements(s.thenStmts); err != nil { + if err := rt.execStatements(ctx, s.thenStmts); err != nil { return err } } else if len(s.elseStmts) > 0 { - if err := rt.execStatements(s.elseStmts); err != nil { + if err := rt.execStatements(ctx, s.elseStmts); err != nil { + return err + } + } + case *forInStmt: + keys, err := rt.arrayKeys(s.arrayName) + if err != nil { + return err + } + for _, key := range keys { + if err := rt.setVar(s.varName, stringValue(key)); err != nil { return err } + if err := rt.execStatements(ctx, s.body); err != nil { + if errors.Is(err, errBreakLoop) { + break + } + if errors.Is(err, errContinueLoop) { + continue + } + return err + } + } + case *forStmt: + if err := rt.execFor(ctx, s); err != nil { + return err + } + case *whileStmt: + if err := rt.execWhile(ctx, s); err != nil { + return err } case *nextStmt: return errNextRecord + case *breakStmt: + return errBreakLoop + case *continueStmt: + return errContinueLoop + case *deleteStmt: + if s.all { + if err := rt.deleteArray(s.name); err != nil { + return err + } + continue + } + key, err := rt.eval(s.index) + if err != nil { + return err + } + if err := rt.deleteArrayElem(s.name, key.String()); err != nil { + return err + } case *exprStmt: if _, err := rt.eval(s.x); err != nil { return err @@ -77,6 +128,65 @@ func (rt *runtime) execStatements(stmts []stmt) error { return nil } +func (rt *runtime) execFor(ctx context.Context, s *forStmt) error { + if s.init != nil { + if _, err := rt.eval(s.init); err != nil { + return err + } + } + for { + if err := ctx.Err(); err != nil { + return err + } + if s.cond != nil { + cond, err := rt.eval(s.cond) + if err != nil { + return err + } + if !cond.Bool() { + return nil + } + } + err := rt.execStatements(ctx, s.body) + if errors.Is(err, errBreakLoop) { + return nil + } + if err != nil && !errors.Is(err, errContinueLoop) { + return err + } + if s.post != nil { + if _, postErr := rt.eval(s.post); postErr != nil { + return postErr + } + } + } +} + +func (rt *runtime) execWhile(ctx context.Context, s *whileStmt) error { + for { + if err := ctx.Err(); err != nil { + return err + } + cond, err := rt.eval(s.cond) + if err != nil { + return err + } + if !cond.Bool() { + return nil + } + err = rt.execStatements(ctx, s.body) + if errors.Is(err, errBreakLoop) { + return nil + } + if errors.Is(err, errContinueLoop) { + continue + } + if err != nil { + return err + } + } +} + func substrStart(n float64, length int) int { if n <= 1 || math.IsNaN(n) { return 0 @@ -120,7 +230,12 @@ func (rt *runtime) eval(x expr) (value, error) { } return boolValue(re.MatchString(rt.record)), nil case *varExpr: + if rt.isArray(e.name) || isBuiltinArrayName(e.name) { + return value{}, fmt.Errorf("cannot use array %s as scalar", e.name) + } return rt.getVar(e.name), nil + case *arrayRefExpr: + return rt.evalArrayRef(e) case *fieldExpr: v, err := rt.eval(e.index) if err != nil { @@ -165,6 +280,9 @@ func (rt *runtime) eval(x expr) (value, error) { } func (rt *runtime) evalCall(e *callExpr) (value, error) { + if e.name == "split" { + return rt.evalSplit(e) + } args := make([]value, 0, len(e.args)) for _, arg := range e.args { v, err := rt.eval(arg) @@ -219,6 +337,62 @@ func (rt *runtime) evalCall(e *callExpr) (value, error) { } } +func (rt *runtime) evalSplit(e *callExpr) (value, error) { + if err := validateBuiltinCallArity(e.name, len(e.args)); err != nil { + return value{}, err + } + target, ok := e.args[1].(*varExpr) + if !ok { + return value{}, fmt.Errorf("split destination must be an array variable") + } + input, err := rt.eval(e.args[0]) + if err != nil { + return value{}, err + } + sep := rt.getVar("FS").String() + charSplit := false + regexSplit := false + if len(e.args) == 3 { + if rx, ok := e.args[2].(*regexExpr); ok { + sep = rx.pattern + regexSplit = true + } else { + sepValue, err := rt.eval(e.args[2]) + if err != nil { + return value{}, err + } + sep = sepValue.String() + charSplit = sep == "" + } + } + var parts []string + if charSplit { + parts = splitAwkChars(input.String()) + } else if regexSplit || sep != " " { + if regexSplit { + parts, err = splitAwkRegex(input.String(), sep) + } else { + parts, err = splitAwkFields(input.String(), sep) + } + if err != nil { + return value{}, err + } + } else { + parts, err = splitAwkFields(input.String(), sep) + if err != nil { + return value{}, err + } + } + elems := make(map[string]value, len(parts)) + for i, part := range parts { + elems[fmt.Sprintf("%d", i+1)] = inputStringValue(part) + } + if err := rt.replaceArray(target.name, elems); err != nil { + return value{}, err + } + return numberValue(float64(len(parts))), nil +} + func (rt *runtime) evalBinary(e *binaryExpr) (value, error) { if e.op == "&&" { left, err := rt.eval(e.left) @@ -268,6 +442,16 @@ func (rt *runtime) evalBinary(e *binaryExpr) (value, error) { matched = !matched } return boolValue(matched), nil + case "in": + arrayName, ok := e.right.(*varExpr) + if !ok { + return value{}, fmt.Errorf("right side of in requires an array variable") + } + ok, err := rt.hasArrayElem(arrayName.name, left.String()) + if err != nil { + return value{}, err + } + return boolValue(ok), nil } right, err := rt.eval(e.right) if err != nil { @@ -319,16 +503,19 @@ func (rt *runtime) matchRegexExpr(left value, rightExpr expr) (bool, error) { } func (rt *runtime) evalAssign(e *assignExpr) (value, error) { - lhs, ok := e.left.(*varExpr) - if !ok { - return value{}, fmt.Errorf("assignment requires a scalar variable") + target, left, err := rt.resolveAssignable(e.left) + if err != nil { + return value{}, err } right, err := rt.eval(e.right) if err != nil { return value{}, err } if e.op != "=" { - left := rt.getVar(lhs.name) + left, err = rt.currentResolvedAssignable(target) + if err != nil { + return value{}, err + } switch e.op { case "+=": right = numberValue(left.Number() + right.Number()) @@ -350,18 +537,17 @@ func (rt *runtime) evalAssign(e *assignExpr) (value, error) { return value{}, fmt.Errorf("unknown assignment operator %s", e.op) } } - if err := rt.setVar(lhs.name, right); err != nil { + if err := rt.setResolvedAssignable(target, right); err != nil { return value{}, err } return right, nil } func (rt *runtime) evalIncDec(e *incDecExpr) (value, error) { - vref, ok := e.x.(*varExpr) - if !ok { - return value{}, fmt.Errorf("increment and decrement require scalar variables") + target, old, err := rt.resolveAssignable(e.x) + if err != nil { + return value{}, err } - old := rt.getVar(vref.name) next := old.Number() if e.op == "++" { next++ @@ -369,7 +555,7 @@ func (rt *runtime) evalIncDec(e *incDecExpr) (value, error) { next-- } nv := numberValue(next) - if err := rt.setVar(vref.name, nv); err != nil { + if err := rt.setResolvedAssignable(target, nv); err != nil { return value{}, err } if e.prefix { @@ -378,6 +564,75 @@ func (rt *runtime) evalIncDec(e *incDecExpr) (value, error) { return old, nil } +type assignTarget struct { + name string + key string + array bool + field bool + fieldIndex int +} + +func (rt *runtime) resolveAssignable(x expr) (assignTarget, value, error) { + switch v := x.(type) { + case *varExpr: + if rt.isArray(v.name) { + return assignTarget{}, value{}, fmt.Errorf("cannot use array %s as scalar", v.name) + } + return assignTarget{name: v.name}, rt.getVar(v.name), nil + case *arrayRefExpr: + key, err := rt.eval(v.index) + if err != nil { + return assignTarget{}, value{}, err + } + keyString := key.String() + current, err := rt.getArrayElem(v.name, keyString) + if err != nil { + return assignTarget{}, value{}, err + } + return assignTarget{name: v.name, key: keyString, array: true}, current, nil + case *fieldExpr: + index, err := rt.eval(v.index) + if err != nil { + return assignTarget{}, value{}, err + } + n := int(index.Number()) + if n < 0 { + return assignTarget{}, value{}, fmt.Errorf("invalid field index") + } + return assignTarget{field: true, fieldIndex: n}, rt.field(n), nil + default: + return assignTarget{}, value{}, fmt.Errorf("expected variable") + } +} + +func (rt *runtime) setResolvedAssignable(target assignTarget, v value) error { + if target.array { + return rt.setArrayElem(target.name, target.key, v) + } + if target.field { + return rt.setField(target.fieldIndex, v) + } + return rt.setVar(target.name, v) +} + +func (rt *runtime) currentResolvedAssignable(target assignTarget) (value, error) { + if target.array { + return rt.getArrayElem(target.name, target.key) + } + if target.field { + return rt.field(target.fieldIndex), nil + } + return rt.getVar(target.name), nil +} + +func (rt *runtime) evalArrayRef(ref *arrayRefExpr) (value, error) { + key, err := rt.eval(ref.index) + if err != nil { + return value{}, err + } + return rt.getArrayElem(ref.name, key.String()) +} + func boolValue(ok bool) value { if ok { return numberValue(1) diff --git a/builtins/awk/parser.go b/builtins/awk/parser.go index a787c85b..34ce6050 100644 --- a/builtins/awk/parser.go +++ b/builtins/awk/parser.go @@ -47,7 +47,6 @@ var unsupportedBuiltinFunctions = map[string]struct{}{ "rand": {}, "rshift": {}, "sin": {}, - "split": {}, "sprintf": {}, "sqrt": {}, "srand": {}, @@ -64,6 +63,7 @@ var supportedBuiltinFunctions = map[string]struct{}{ "index": {}, "int": {}, "length": {}, + "split": {}, "substr": {}, "tolower": {}, "toupper": {}, @@ -114,7 +114,12 @@ func (p *parser) parseRule() (rule, error) { return rule{}, err } if p.at(tokComma) { - return rule{}, fmt.Errorf("range patterns are not supported") + p.advance() + end, err := p.parseExpression(0) + if err != nil { + return rule{}, err + } + pattern = &rangeExpr{start: pattern, end: end} } if p.at(tokLBrace) { action, err := p.parseAction() @@ -155,23 +160,35 @@ func (p *parser) parseStatement() (stmt, error) { if p.atIdent("if") { return p.parseIf() } + if p.atIdent("for") { + return p.parseFor() + } + if p.atIdent("while") { + return p.parseWhile() + } if p.atIdent("next") { p.advance() return &nextStmt{}, nil } + if p.atIdent("break") { + p.advance() + return &breakStmt{}, nil + } + if p.atIdent("continue") { + p.advance() + return &continueStmt{}, nil + } if p.atIdent("print") { return p.parsePrint() } if p.atIdent("printf") { return p.parsePrintf() } - if p.atIdent("if") || p.atIdent("while") || p.atIdent("for") || - p.atIdent("nextfile") || p.atIdent("exit") || - p.atIdent("break") || p.atIdent("continue") { + if p.atIdent("if") || p.atIdent("nextfile") || p.atIdent("exit") { return nil, fmt.Errorf("control flow statements are not supported") } if p.atIdent("delete") { - return nil, fmt.Errorf("arrays are not supported") + return p.parseDelete() } if p.atIdent("getline") { return nil, fmt.Errorf("getline is not supported") @@ -183,6 +200,97 @@ func (p *parser) parseStatement() (stmt, error) { return &exprStmt{x: x}, nil } +func (p *parser) parseFor() (stmt, error) { + p.advance() + if !p.match(tokLParen) { + return nil, fmt.Errorf("expected ( after for") + } + p.skipNewlines() + if p.cur().kind == tokIdent && p.peek(1).kind == tokIdent && p.peek(1).lit == "in" { + varName := p.cur().lit + if err := validateIdentifierReference(varName); err != nil { + return nil, err + } + p.advance() + p.advance() + if p.cur().kind != tokIdent { + return nil, fmt.Errorf("expected array name in for loop") + } + arrayName := p.cur().lit + if err := validateIdentifierReference(arrayName); err != nil { + return nil, err + } + p.advance() + p.skipSeparators() + if !p.match(tokRParen) { + return nil, fmt.Errorf("expected ) after for loop") + } + body, err := p.parseStatementGroup() + if err != nil { + return nil, err + } + return &forInStmt{varName: varName, arrayName: arrayName, body: body}, nil + } + init, err := p.parseOptionalForExpr(tokSemicolon) + if err != nil { + return nil, err + } + if !p.match(tokSemicolon) { + return nil, fmt.Errorf("expected ; in for loop") + } + cond, err := p.parseOptionalForExpr(tokSemicolon) + if err != nil { + return nil, err + } + if !p.match(tokSemicolon) { + return nil, fmt.Errorf("expected ; in for loop") + } + post, err := p.parseOptionalForExpr(tokRParen) + if err != nil { + return nil, err + } + if !p.match(tokRParen) { + return nil, fmt.Errorf("expected ) after for loop") + } + body, err := p.parseStatementGroup() + if err != nil { + return nil, err + } + return &forStmt{init: init, cond: cond, post: post, body: body}, nil +} + +func (p *parser) parseOptionalForExpr(end tokenKind) (expr, error) { + p.skipNewlines() + if p.at(end) { + return nil, nil + } + x, err := p.parseExpression(0) + if err != nil { + return nil, err + } + p.skipNewlines() + return x, nil +} + +func (p *parser) parseWhile() (stmt, error) { + p.advance() + if !p.match(tokLParen) { + return nil, fmt.Errorf("expected ( after while") + } + cond, err := p.parseExpression(0) + if err != nil { + return nil, err + } + if !p.match(tokRParen) { + return nil, fmt.Errorf("expected ) after while condition") + } + body, err := p.parseStatementGroup() + if err != nil { + return nil, err + } + return &whileStmt{cond: cond, body: body}, nil +} + func (p *parser) parseIf() (stmt, error) { p.advance() if !p.match(tokLParen) { @@ -216,7 +324,7 @@ func (p *parser) parseIf() (stmt, error) { func (p *parser) parseStatementGroup() ([]stmt, error) { p.skipNewlines() - if p.match(tokSemicolon) { + if p.at(tokSemicolon) { return nil, nil } if p.match(tokLBrace) { @@ -229,6 +337,29 @@ func (p *parser) parseStatementGroup() ([]stmt, error) { return []stmt{st}, nil } +func (p *parser) parseDelete() (stmt, error) { + p.advance() + if p.cur().kind != tokIdent { + return nil, fmt.Errorf("delete requires an array name") + } + name := p.cur().lit + if err := validateIdentifierReference(name); err != nil { + return nil, err + } + p.advance() + if !p.match(tokLBracket) { + return &deleteStmt{name: name, all: true}, nil + } + index, err := p.parseExpression(0) + if err != nil { + return nil, err + } + if !p.match(tokRBracket) { + return nil, fmt.Errorf("expected ] after array index") + } + return &deleteStmt{name: name, index: index}, nil +} + func (p *parser) parsePrint() (stmt, error) { p.advance() ps := &printStmt{} @@ -314,8 +445,8 @@ func (p *parser) parseExpression(minPrec int) (expr, error) { } op := p.cur().lit p.advance() - if _, ok := left.(*varExpr); !ok { - return nil, fmt.Errorf("increment and decrement require scalar variables") + if !isAssignableExpr(left) { + return nil, fmt.Errorf("increment and decrement require variables") } left = &incDecExpr{op: op, x: left} continue @@ -342,11 +473,8 @@ func (p *parser) parseExpression(minPrec int) (expr, error) { } } if isAssignOp(op) { - if _, ok := left.(*fieldExpr); ok { - return nil, fmt.Errorf("field assignment is not supported") - } - if _, ok := left.(*varExpr); !ok { - return nil, fmt.Errorf("assignment requires a scalar variable") + if !isAssignableExpr(left) { + return nil, fmt.Errorf("assignment requires a variable") } left = &assignExpr{op: op, left: left, right: right} } else { @@ -395,7 +523,7 @@ func (p *parser) parsePrefix() (expr, error) { return nil, err } if p.at(tokLBracket) { - return nil, fmt.Errorf("arrays are not supported") + return p.parseArrayRef(tok.lit) } return &varExpr{name: tok.lit}, nil case tokDollar: @@ -426,8 +554,8 @@ func (p *parser) parsePrefix() (expr, error) { if err != nil { return nil, err } - if _, ok := x.(*varExpr); !ok { - return nil, fmt.Errorf("increment and decrement require scalar variables") + if !isAssignableExpr(x) { + return nil, fmt.Errorf("increment and decrement require variables") } return &incDecExpr{op: tok.lit, x: x, prefix: true}, nil default: @@ -435,6 +563,18 @@ func (p *parser) parsePrefix() (expr, error) { } } +func (p *parser) parseArrayRef(name string) (expr, error) { + p.advance() + index, err := p.parseExpression(0) + if err != nil { + return nil, err + } + if !p.match(tokRBracket) { + return nil, fmt.Errorf("expected ] after array index") + } + return &arrayRefExpr{name: name, index: index}, nil +} + func (p *parser) parseFunctionCall(name string) (expr, error) { if _, ok := supportedBuiltinFunctions[name]; !ok { if name == "system" { @@ -486,6 +626,10 @@ func validateBuiltinCallArity(name string, argc int) error { if argc != 2 { return fmt.Errorf("index expects 2 arguments") } + case "split": + if argc != 2 && argc != 3 { + return fmt.Errorf("split expects 2 or 3 arguments") + } case "tolower", "toupper", "int": if argc != 1 { return fmt.Errorf("%s expects 1 argument", name) @@ -494,6 +638,15 @@ func validateBuiltinCallArity(name string, argc int) error { return nil } +func isAssignableExpr(x expr) bool { + switch x.(type) { + case *varExpr, *arrayRefExpr, *fieldExpr: + return true + default: + return false + } +} + func validateIdentifierReference(name string) error { if msg, ok := unsupportedExpressionKeyword(name); ok { return fmt.Errorf("%s", msg) @@ -570,6 +723,9 @@ func (p *parser) parseFieldRef() (expr, error) { } func (p *parser) binaryOp() (string, int, string, bool) { + if p.atIdent("in") { + return "in", precCompare, "left", true + } switch p.cur().kind { case tokAssign: return "=", precAssign, "right", true @@ -653,6 +809,14 @@ func (p *parser) cur() token { return p.toks[p.pos] } +func (p *parser) peek(n int) token { + idx := p.pos + n + if idx >= len(p.toks) { + return token{kind: tokEOF} + } + return p.toks[idx] +} + func (p *parser) at(k tokenKind) bool { return p.cur().kind == k } diff --git a/builtins/awk/parser_test.go b/builtins/awk/parser_test.go index 32a72513..3330c2ec 100644 --- a/builtins/awk/parser_test.go +++ b/builtins/awk/parser_test.go @@ -26,7 +26,6 @@ func TestParseRejectsUnsafeFeatures(t *testing.T) { `{ system("sh") }`, `{ print $1 > "out" }`, `{ "cmd" | getline }`, - `{ $1 = "x" }`, `{ exit 1 }`, } { _, err := parseProgram(src) diff --git a/builtins/awk/runtime.go b/builtins/awk/runtime.go index 9b399772..31ede07d 100644 --- a/builtins/awk/runtime.go +++ b/builtins/awk/runtime.go @@ -191,11 +191,15 @@ func numericPrefix(s string) string { } type runtime struct { - callCtx *builtins.CallContext - prog *program - vars map[string]value - varSizes map[string]int - varBytes int + callCtx *builtins.CallContext + prog *program + vars map[string]value + arrays map[string]map[string]value + varSizes map[string]int + arraySizes map[arraySlot]int + varBytes int + rangeOn map[int]bool + environSet bool record string fields []string @@ -204,12 +208,20 @@ type runtime struct { fnr int } +type arraySlot struct { + name string + key string +} + func newRuntime(callCtx *builtins.CallContext, prog *program) *runtime { rt := &runtime{ - callCtx: callCtx, - prog: prog, - vars: make(map[string]value), - varSizes: make(map[string]int), + callCtx: callCtx, + prog: prog, + vars: make(map[string]value), + arrays: make(map[string]map[string]value), + varSizes: make(map[string]int), + arraySizes: make(map[arraySlot]int), + rangeOn: make(map[int]bool), } rt.vars["FS"] = stringValue(" ") rt.vars["OFS"] = stringValue(" ") @@ -256,6 +268,21 @@ func (rt *runtime) run(ctx context.Context, files []string) builtins.Result { return builtins.Result{} } +func (rt *runtime) ensureEnviron() { + if rt.environSet { + return + } + rt.environSet = true + elems := make(map[string]value) + if rt.callCtx.Env != nil { + rt.callCtx.Env(func(name, value string) bool { + elems[name] = inputStringValue(value) + return true + }) + } + rt.arrays["ENVIRON"] = elems +} + func (rt *runtime) applyOperandAssignment(arg string) (bool, error) { name, value, ok := strings.Cut(arg, "=") if !ok || !validIdentifierName(name) { @@ -346,7 +373,8 @@ func (rt *runtime) openInput(ctx context.Context, file string) (io.ReadCloser, e } func (rt *runtime) runRules(ctx context.Context, kind ruleKind) error { - for _, r := range rt.prog.rules { + for i := range rt.prog.rules { + r := &rt.prog.rules[i] if err := ctx.Err(); err != nil { return err } @@ -354,7 +382,7 @@ func (rt *runtime) runRules(ctx context.Context, kind ruleKind) error { continue } if kind == ruleNormal && r.pattern != nil { - ok, err := rt.matchPattern(r.pattern) + ok, err := rt.matchPattern(i, r.pattern) if err != nil { return err } @@ -368,7 +396,7 @@ func (rt *runtime) runRules(ctx context.Context, kind ruleKind) error { } continue } - if err := rt.execStatements(r.action); err != nil { + if err := rt.execStatements(ctx, r.action); err != nil { if errors.Is(err, errNextRecord) { if kind == ruleNormal { return err @@ -381,7 +409,42 @@ func (rt *runtime) runRules(ctx context.Context, kind ruleKind) error { return nil } -func (rt *runtime) matchPattern(x expr) (bool, error) { +func (rt *runtime) matchPattern(ruleIndex int, x expr) (bool, error) { + if rx, ok := x.(*rangeExpr); ok { + return rt.matchRangePattern(ruleIndex, rx) + } + return rt.matchSimplePattern(x) +} + +func (rt *runtime) matchRangePattern(ruleIndex int, x *rangeExpr) (bool, error) { + if rt.rangeOn[ruleIndex] { + end, err := rt.matchSimplePattern(x.end) + if err != nil { + return false, err + } + if end { + rt.rangeOn[ruleIndex] = false + } + return true, nil + } + start, err := rt.matchSimplePattern(x.start) + if err != nil { + return false, err + } + if !start { + return false, nil + } + end, err := rt.matchSimplePattern(x.end) + if err != nil { + return false, err + } + if !end { + rt.rangeOn[ruleIndex] = true + } + return true, nil +} + +func (rt *runtime) matchSimplePattern(x expr) (bool, error) { if rx, ok := x.(*regexExpr); ok { re, err := compileRegex(rx.pattern) if err != nil { @@ -397,26 +460,120 @@ func (rt *runtime) matchPattern(x expr) (bool, error) { } func (rt *runtime) setRecord(rec string) error { + if err := validateRecordSize(rec); err != nil { + return err + } rt.record = rec fs := rt.getVar("FS").String() - if fs == " " { - rt.fields = splitAwkWhitespaceFields(rec) - } else { - if err := validateFS(fs); err != nil { - return err - } - if rec == "" { - rt.fields = nil - } else { - rt.fields = strings.Split(rec, fs) - } + fields, err := splitAwkFields(rec, fs) + if err != nil { + return err } + rt.fields = fields if len(rt.fields) > MaxFields { return fmt.Errorf("record has too many fields") } return nil } +func validateRecordSize(rec string) error { + if len(rec) > MaxRecordBytes { + return fmt.Errorf("record exceeds %d bytes", MaxRecordBytes) + } + return nil +} + +func validateRebuiltRecordSize(fields []string, fieldCount, replacementIndex int, replacement, ofs string) error { + total := 0 + for i := 0; i < fieldCount; i++ { + if i > 0 { + total += len(ofs) + if total > MaxRecordBytes { + return fmt.Errorf("record exceeds %d bytes", MaxRecordBytes) + } + } + field := "" + if i < len(fields) { + field = fields[i] + } + if replacementIndex == i+1 { + field = replacement + } + total += len(field) + if total > MaxRecordBytes { + return fmt.Errorf("record exceeds %d bytes", MaxRecordBytes) + } + } + return nil +} + +func (rt *runtime) rebuildRecordFromFields() error { + ofs := rt.getVar("OFS").String() + if err := validateRebuiltRecordSize(rt.fields, len(rt.fields), 0, "", ofs); err != nil { + return err + } + rt.record = strings.Join(rt.fields, ofs) + return nil +} + +func (rt *runtime) setField(n int, v value) error { + if n < 0 { + return fmt.Errorf("invalid field index") + } + if n == 0 { + return rt.setRecord(v.String()) + } + if n > MaxFields { + return fmt.Errorf("record has too many fields") + } + s := v.String() + fieldCount := max(len(rt.fields), n) + if err := validateRebuiltRecordSize(rt.fields, fieldCount, n, s, rt.getVar("OFS").String()); err != nil { + return err + } + for len(rt.fields) < n { + rt.fields = append(rt.fields, "") + } + rt.fields[n-1] = s + return rt.rebuildRecordFromFields() +} + +func (rt *runtime) setNF(n int) error { + if n < 0 { + return fmt.Errorf("invalid NF value") + } + if n > MaxFields { + return fmt.Errorf("record has too many fields") + } + if err := validateRebuiltRecordSize(rt.fields, n, 0, "", rt.getVar("OFS").String()); err != nil { + return err + } + if n < len(rt.fields) { + rt.fields = rt.fields[:n] + } else { + for len(rt.fields) < n { + rt.fields = append(rt.fields, "") + } + } + return rt.rebuildRecordFromFields() +} + +func splitAwkFields(s, fs string) ([]string, error) { + if fs == " " { + return splitAwkWhitespaceFields(s), nil + } + if err := validateFS(fs); err != nil { + return nil, err + } + if s == "" { + return nil, nil + } + if isSingleRune(fs) { + return strings.Split(s, fs), nil + } + return splitAwkRegex(s, fs) +} + func splitAwkWhitespaceFields(rec string) []string { var fields []string for i := 0; i < len(rec); { @@ -438,6 +595,45 @@ func isAwkFieldBlank(b byte) bool { return b == ' ' || b == '\t' || b == '\n' } +func splitAwkChars(s string) []string { + if s == "" { + return nil + } + chars := make([]string, 0, len(s)) + for _, r := range s { + chars = append(chars, string(r)) + } + return chars +} + +func splitAwkRegex(s, pattern string) ([]string, error) { + if s == "" { + return nil, nil + } + if pattern == "" { + return splitAwkChars(s), nil + } + re, err := compileRegex(pattern) + if err != nil { + return nil, err + } + matches := re.FindAllStringIndex(s, -1) + fields := make([]string, 0, len(matches)+1) + last := 0 + for _, match := range matches { + if match[0] == match[1] { + continue + } + fields = append(fields, s[last:match[0]]) + last = match[1] + } + if len(fields) == 0 { + return []string{s}, nil + } + fields = append(fields, s[last:]) + return fields, nil +} + func (rt *runtime) field(n int) value { if n == 0 { return inputStringValue(rt.record) @@ -462,14 +658,25 @@ func (rt *runtime) getVar(name string) value { if v, ok := rt.vars[name]; ok { return v } - return unassignedValue() + if isBuiltinArrayName(name) { + return unassignedValue() + } + v := unassignedValue() + rt.vars[name] = v + return v } } func (rt *runtime) setVar(name string, v value) error { + if rt.isArray(name) { + return fmt.Errorf("cannot use array %s as scalar", name) + } + if isBuiltinArrayName(name) { + return fmt.Errorf("cannot use array %s as scalar", name) + } switch name { case "NF": - return fmt.Errorf("assignment to NF is not supported") + return rt.setNF(int(v.Number())) case "NR", "FNR", "FILENAME": return fmt.Errorf("assignment to %s is not supported", name) case "FS": @@ -491,6 +698,172 @@ func (rt *runtime) setVar(name string, v value) error { return nil } +func (rt *runtime) isArray(name string) bool { + _, ok := rt.arrays[name] + return ok +} + +func (rt *runtime) getArrayElem(name, key string) (value, error) { + rt.ensureBuiltinArray(name) + if err := rt.validateArrayName(name); err != nil { + return value{}, err + } + rt.markArrayName(name) + if v, ok := rt.arrays[name][key]; ok { + return v, nil + } + v := unassignedValue() + if err := rt.setArrayElem(name, key, v); err != nil { + return value{}, err + } + return v, nil +} + +func (rt *runtime) hasArrayElem(name, key string) (bool, error) { + rt.ensureBuiltinArray(name) + if err := rt.validateArrayName(name); err != nil { + return false, err + } + rt.markArrayName(name) + _, ok := rt.arrays[name][key] + return ok, nil +} + +func (rt *runtime) setArrayElem(name, key string, v value) error { + rt.ensureBuiltinArray(name) + if err := rt.validateArrayName(name); err != nil { + return err + } + rt.markArrayName(name) + size := len(key) + len(v.String()) + if size > MaxVariableBytes { + return fmt.Errorf("array element exceeds %d bytes", MaxVariableBytes) + } + slot := arraySlot{name: name, key: key} + old := rt.arraySizes[slot] + if rt.varBytes-old+size > MaxVariableBytes { + return fmt.Errorf("variable storage limit exceeded (%d bytes total)", rt.varBytes-old+size) + } + rt.varBytes = rt.varBytes - old + size + rt.arraySizes[slot] = size + rt.arrays[name][key] = v + return nil +} + +func (rt *runtime) replaceArray(name string, elems map[string]value) error { + if err := rt.deleteArray(name); err != nil { + return err + } + if rt.arrays[name] == nil { + rt.arrays[name] = make(map[string]value, len(elems)) + } + for key, v := range elems { + if err := rt.setArrayElem(name, key, v); err != nil { + return err + } + } + return nil +} + +func (rt *runtime) deleteArrayElem(name, key string) error { + rt.ensureBuiltinArray(name) + if err := rt.validateArrayName(name); err != nil { + return err + } + rt.markArrayName(name) + slot := arraySlot{name: name, key: key} + if old := rt.arraySizes[slot]; old > 0 { + rt.varBytes -= old + if rt.varBytes < 0 { + rt.varBytes = 0 + } + } + delete(rt.arraySizes, slot) + delete(rt.arrays[name], key) + return nil +} + +func (rt *runtime) deleteArray(name string) error { + rt.ensureBuiltinArray(name) + if err := rt.validateArrayName(name); err != nil { + return err + } + rt.markArrayName(name) + for slot, size := range rt.arraySizes { + if slot.name != name { + continue + } + rt.varBytes -= size + delete(rt.arraySizes, slot) + } + if rt.varBytes < 0 { + rt.varBytes = 0 + } + rt.arrays[name] = make(map[string]value) + return nil +} + +func (rt *runtime) arrayKeys(name string) ([]string, error) { + rt.ensureBuiltinArray(name) + if err := rt.validateArrayName(name); err != nil { + return nil, err + } + rt.markArrayName(name) + keys := make([]string, 0, len(rt.arrays[name])) + for key := range rt.arrays[name] { + keys = append(keys, key) + } + sortStringKeys(keys) + return keys, nil +} + +func sortStringKeys(keys []string) { + for i := 1; i < len(keys); i++ { + key := keys[i] + j := i - 1 + for j >= 0 && keys[j] > key { + keys[j+1] = keys[j] + j-- + } + keys[j+1] = key + } +} + +func (rt *runtime) ensureBuiltinArray(name string) { + if name == "ENVIRON" { + rt.ensureEnviron() + } +} + +func (rt *runtime) markArrayName(name string) { + if rt.arrays[name] == nil { + rt.arrays[name] = make(map[string]value) + } +} + +func (rt *runtime) validateArrayName(name string) error { + if isBuiltinScalarName(name) { + return fmt.Errorf("cannot use scalar %s as array", name) + } + if _, ok := rt.vars[name]; ok { + return fmt.Errorf("cannot use scalar %s as array", name) + } + return nil +} + +func isBuiltinScalarName(name string) bool { + switch name { + case "NF", "NR", "FNR", "FILENAME": + return true + default: + return false + } +} + +func isBuiltinArrayName(name string) bool { + return name == "ENVIRON" +} + func validateFS(fs string) error { if fs == " " { return nil @@ -498,22 +871,31 @@ func validateFS(fs string) error { if fs == "" { return fmt.Errorf("empty FS is not supported") } - r, size := utf8.DecodeRuneInString(fs) - if r == utf8.RuneError && size == 0 { - return fmt.Errorf("empty FS is not supported") + if isSingleRune(fs) { + return nil } - if size != len(fs) { - return fmt.Errorf("multi-character and regex FS values are not supported") + _, err := compileRegex(fs) + if err != nil { + return err } return nil } +func isSingleRune(s string) bool { + if s == "" { + return false + } + _, size := utf8.DecodeRuneInString(s) + return size == len(s) +} + func compileRegex(pattern string) (*regexp.Regexp, error) { normalized := normalizeAwkRegex(pattern) re, err := regexp.Compile(normalized) if err != nil { return nil, fmt.Errorf("invalid regular expression %q: %v", pattern, err) } + re.Longest() return re, nil } diff --git a/builtins/builtins.go b/builtins/builtins.go index 8eaeefe7..e0b1de90 100644 --- a/builtins/builtins.go +++ b/builtins/builtins.go @@ -117,6 +117,11 @@ type CallContext struct { // LastExitCode is the exit code from the previous command. LastExitCode uint8 + // Env iterates over the shell-visible environment snapshot for this + // command. It includes caller-provided Env values and shell variables, but + // not the host process environment unless the caller explicitly provided it. + Env func(func(name, value string) bool) + // OpenFile opens a file within the shell's path restrictions. OpenFile func(ctx context.Context, path string, flags int, mode os.FileMode) (io.ReadWriteCloser, error) diff --git a/builtins/tests/awk/awk_test.go b/builtins/tests/awk/awk_test.go index 5dc156bc..86bb0258 100644 --- a/builtins/tests/awk/awk_test.go +++ b/builtins/tests/awk/awk_test.go @@ -13,6 +13,7 @@ import ( "path/filepath" "strings" "testing" + "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -106,6 +107,106 @@ func TestAwkBeginEndAndAggregation(t *testing.T) { assert.Equal(t, "start\nsum 5\n", stdout) } +func TestAwkCompoundAssignmentReadsCurrentTargetAfterRightSide(t *testing.T) { + dir := t.TempDir() + stdout, stderr, code := cmdRun(t, `awk 'BEGIN { print b += b += 1; b = 6; print b += b++; print b }'`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "", stderr) + assert.Equal(t, "2\n13\n13\n", stdout) +} + +func TestAwkAssociativeArrayElements(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "input.txt", "api 200\napi 500\nworker 200\n") + stdout, stderr, code := cmdRun(t, `awk '{ count[$1]++; status[$2] += 1 } END { print count["api"], count["worker"], status[200], status[500], missing["x"] }' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "", stderr) + assert.Equal(t, "2 1 2 1 \n", stdout) +} + +func TestAwkArrayMembershipDeleteForInAndSplit(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "input.txt", "api 200\napi 500\nworker 200\n") + stdout, stderr, code := cmdRun(t, `awk '{ count[$1]++; split($0, fields); status[fields[2]]++ } END { delete status[500]; print ("api" in count), ("500" in status); for (k in count) print k, count[k]; delete count; print ("api" in count) }' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "", stderr) + assert.Equal(t, "1 0\napi 2\nworker 1\n0\n", stdout) +} + +func TestAwkSplitRegexAndCharacterSeparator(t *testing.T) { + dir := t.TempDir() + stdout, stderr, code := cmdRun(t, `awk 'BEGIN { n = split("a,b:c", fields, /[,:]/); print n, fields[1], fields[2], fields[3]; m = split("xy", chars, ""); print m, chars[1], chars[2]; print split("a b", special, " "), split("a b", literal, / /); print split("abc", dotLiteral, "."), split("a.b", dotted, "."), split("a|b", pipeLiteral, "|"), split("abc", dotRegex, /./); print split("abc", nullRegex, //), nullRegex[1], nullRegex[2], nullRegex[3]; print split(" a b ", starRegex, / */), "[" starRegex[1] "]", "[" starRegex[2] "]", "[" starRegex[3] "]", "[" starRegex[4] "]"; print split("aaa", longest, /a|aa/), "[" longest[1] "]", "[" longest[2] "]", "[" longest[3] "]" }'`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "", stderr) + assert.Equal(t, "3 a b c\n2 x y\n2 3\n1 2 2 4\n3 a b c\n4 [] [a] [b] []\n3 [] [] []\n", stdout) +} + +func TestAwkForWhileBreakAndContinue(t *testing.T) { + dir := t.TempDir() + stdout, stderr, code := cmdRun(t, `awk 'BEGIN { for (i = 1; i <= 5; i++) { if (i == 2) continue; if (i == 5) break; sum += i }; j = 0; while (j < 3) { j++; if (j == 2) continue; seen = seen j }; i = 0; for (; i < 3; i++) noinit = noinit i; for (i = 0; i < 3; i++); emptyFor = i; j = 0; while (j++ < 3); emptyWhile = j; print sum, seen, noinit, emptyFor, emptyWhile }'`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "", stderr) + assert.Equal(t, "8 13 012 3 4\n", stdout) +} + +func TestAwkLoopsObserveContextCancellation(t *testing.T) { + for _, script := range []string{ + `awk 'BEGIN { while (1) {} }'`, + `awk 'BEGIN { for (i = 1; 1; i++) {} }'`, + } { + t.Run(script, func(t *testing.T) { + parser := syntax.NewParser() + prog, err := parser.Parse(strings.NewReader(script), "") + require.NoError(t, err) + var outBuf, errBuf bytes.Buffer + runner, err := interp.New( + interp.StdIO(nil, &outBuf, &errBuf), + interpoption.AllowAllCommands().(interp.RunnerOption), + interp.MaxExecutionTime(500*time.Millisecond), + ) + require.NoError(t, err) + defer runner.Close() + + done := make(chan error, 1) + go func() { + done <- runner.Run(context.Background(), prog) + }() + + select { + case runErr := <-done: + var exitStatus interp.ExitStatus + if errors.As(runErr, &exitStatus) { + assert.NotEqual(t, 0, int(exitStatus)) + assert.Contains(t, errBuf.String(), "context deadline exceeded") + return + } + assert.ErrorIs(t, runErr, context.DeadlineExceeded) + case <-time.After(2 * time.Second): + t.Fatal("awk loop did not observe context cancellation") + } + }) + } +} + +func TestAwkRejectsScalarArrayNameConflicts(t *testing.T) { + dir := t.TempDir() + for _, script := range []string{ + `awk 'BEGIN { x = 1; print x[1] }'`, + `awk 'BEGIN { print x; x[1] = 1 }'`, + `awk 'BEGIN { a[1] = 2; print a }'`, + `awk 'BEGIN { for (k in a) {}; print a }'`, + `awk 'BEGIN { print ("x" in a); print a }'`, + `awk 'BEGIN { delete a; print a }'`, + `awk 'BEGIN { print ENVIRON }'`, + `awk 'BEGIN { FS[1] = 2 }'`, + `awk 'BEGIN { NF[1] = 2 }'`, + } { + _, stderr, code := cmdRun(t, script, dir) + assert.Equal(t, 1, code, script) + assert.Contains(t, stderr, "awk:", script) + } +} + func TestAwkExplicitEmptyActionDoesNothing(t *testing.T) { dir := t.TempDir() writeFile(t, dir, "input.txt", "alpha\n") @@ -204,6 +305,75 @@ func TestAwkLiteralFieldSeparatorBlankRecordNF(t *testing.T) { assert.Equal(t, "2\n0\n2\n", stdout) } +func TestAwkRegexFieldSeparator(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "input.txt", "a,b:c\n") + stdout, stderr, code := cmdRun(t, `awk -F '[,:]' '{ print NF, $1, $2, $3 }' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "", stderr) + assert.Equal(t, "3 a b c\n", stdout) +} + +func TestAwkSingleCharacterFieldSeparatorIsLiteral(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "plain.txt", "abc\n") + writeFile(t, dir, "pipe.txt", "a|b\n") + stdout, stderr, code := cmdRun(t, `awk -F . '{ print NF }' plain.txt; awk -F '|' '{ print NF, $1, $2 }' pipe.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "", stderr) + assert.Equal(t, "1\n2 a b\n", stdout) +} + +func TestAwkRangePatterns(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "input.txt", "ignore\nstart\nmiddle\nend\ntail\nstart end\nafter\n") + stdout, stderr, code := cmdRun(t, `awk '/start/,/end/ { print NR ":" $0 }' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "", stderr) + assert.Equal(t, "2:start\n3:middle\n4:end\n6:start end\n", stdout) +} + +func TestAwkFieldAssignmentAndRecordRebuild(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "input.txt", "a b c\n") + stdout, stderr, code := cmdRun(t, `awk 'BEGIN { OFS="|" } { $2 = toupper($2); print $0, NF; NF = 4; $4 = "z"; print $0, NF; $0 = "m n"; print $1, $2, NF }' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "", stderr) + assert.Equal(t, "a|B|c|3\na|B|c|z|4\nm|n|2\n", stdout) +} + +func TestAwkRecordAssignmentRespectsRecordLimit(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "large.txt", strings.Repeat("x", 1<<20)+"\n") + for _, script := range []string{ + `awk 'BEGIN { $0 = "x"; for (i = 0; i < 21; i++) $0 = $0 $0; print "unreachable" }'`, + `awk '{ $1 = $0; $2 = $0; print "unreachable" }' large.txt`, + `awk '{ $1 = $0; NF = 2; print "unreachable" }' large.txt`, + } { + stdout, stderr, code := cmdRun(t, script, dir) + assert.Equal(t, 1, code, script) + assert.Equal(t, "", stdout, script) + assert.Contains(t, stderr, "record exceeds 1048576 bytes", script) + } +} + +func TestAwkEnvironUsesRshellEnvironment(t *testing.T) { + dir := t.TempDir() + stdout, stderr, code := runScript(t, `FOO=script; awk 'BEGIN { print ENVIRON["FROM_ENV"], ENVIRON["FOO"], ("PATH" in ENVIRON), ("PWD" in ENVIRON); print ENVIRON["NUMERIC_ENV"] < 2, ENVIRON["NUMERIC_ENV"] + 0, ENVIRON["NUMERIC_ENV"] == 10 }'`, dir, interp.Env("FROM_ENV=provided", "NUMERIC_ENV=10")) + assert.Equal(t, 0, code) + assert.Equal(t, "", stderr) + assert.Equal(t, "provided script 0 1\n0 10 1\n", stdout) +} + +func TestAwkLargeEnvironDoesNotConsumeVariableBudget(t *testing.T) { + dir := t.TempDir() + big := strings.Repeat("x", 1<<20) + stdout, stderr, code := runScript(t, `awk 'BEGIN { print 1; print length(ENVIRON["BIG"]) }'`, dir, interp.Env("BIG="+big)) + assert.Equal(t, 0, code) + assert.Equal(t, "", stderr) + assert.Equal(t, "1\n1048576\n", stdout) +} + func TestAwkStringNumericSemantics(t *testing.T) { dir := t.TempDir() writeFile(t, dir, "input.txt", "0\n10\n123abc\n-4.5x\nabc123\n") @@ -213,6 +383,15 @@ func TestAwkStringNumericSemantics(t *testing.T) { assert.Equal(t, "1 1 1 0 124\n1 1 1 1\ntruthy 10\n11 0 0 1\ntruthy 123abc\n124 0 1 1\ntruthy -4.5x\n-3.5 0 1 1\ntruthy abc123\n1 0 0 0\n", stdout) } +func TestAwkRegexFieldSeparatorAllowsZeroWidthMatches(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "input.txt", "abc\naxb\n") + stdout, stderr, code := cmdRun(t, `awk -F 'x*' '{ print NF, "[" $1 "]", "[" $2 "]" }' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "", stderr) + assert.Equal(t, "1 [abc] []\n2 [a] [b]\n", stdout) +} + func TestAwkEmptyProgramIsNoOp(t *testing.T) { dir := t.TempDir() stdout, stderr, code := cmdRun(t, `awk '' missing.txt`, dir) @@ -329,7 +508,6 @@ func TestAwkRejectsUnsafeFeatures(t *testing.T) { `awk '{ system("sh") }' input.txt`, `awk '{ print $1 > "out" }' input.txt`, `awk '{ printf "%s", $1 > "out" }' input.txt`, - `awk '{ $1 = "x" }' input.txt`, `awk '{ print getline }' input.txt`, `awk '{ x = next }' input.txt`, `awk '{ exit 0 }' input.txt`, diff --git a/docs/AWK_IMPLEMENTATION_PLAN.md b/docs/AWK_IMPLEMENTATION_PLAN.md index 349daeef..eefce62f 100644 --- a/docs/AWK_IMPLEMENTATION_PLAN.md +++ b/docs/AWK_IMPLEMENTATION_PLAN.md @@ -358,29 +358,43 @@ Phase 2 started scope: - common scalar builtins: `length`, `substr`, `index`, `tolower`, `toupper`, `int` -Remaining Phase 2 candidates: +Phase 3 scope: -- range patterns -- regex `FS` -- field assignment and `$0` rebuilding -- `split`, once array support is available or a narrow safe representation is - chosen +Phase 3 should bundle the remaining practical awk features that make the +builtin useful for real aggregation and log processing. This includes the +leftover Phase 2 candidates where they naturally depend on the same runtime +machinery. -Phase 3 candidates: - -- arrays, including `count[$1]++` -- `ENVIRON`, populated from the rshell environment snapshot +- associative array element expressions, including `count[$1]++` +- `split`, writing into awk arrays - `in` - `delete` - `for (k in array)` -- `for` and `while` +- C-style `for` and `while` - `break` and `continue` +- range patterns such as `/start/,/end/` +- regex `FS`, including values from `-F` and `FS = value` +- field assignment and `$0` rebuilding: `$1 = value`, `$0 = value`, and + `NF = value` +- `ENVIRON`, populated from the rshell environment snapshot + +Implementation order used by `codex/awk-phase-3`: + +1. Add associative arrays and array element expressions. +2. Add `in`, `delete`, `for (k in array)`, and `split`. +3. Add `while`, C-style `for`, `break`, and `continue`. +4. Add range patterns. +5. Add regex `FS`. +6. Add field assignment and `$0`/field rebuilding. +7. Add `ENVIRON` using the rshell environment snapshot API. Phase 4 candidates: - user-defined functions - additional POSIX awk builtins - carefully restricted `getline`, only if a safe design is approved +- safe command pipes through rshell's controlled execution model, only if a + concrete non-host-escape design is approved - safe GNU awk compatibility extensions that do not violate rshell policy ## Open Design Questions diff --git a/interp/runner_exec.go b/interp/runner_exec.go index a9c5af04..c35a9e18 100644 --- a/interp/runner_exec.go +++ b/interp/runner_exec.go @@ -547,6 +547,14 @@ func (r *Runner) call(ctx context.Context, pos syntax.Pos, args []string) { if isKnown { r.dispatchedCount++ + envEach := func(fn func(name, value string) bool) { + r.writeEnv.Each(func(name string, vr expand.Variable) bool { + if !vr.IsSet() { + return true + } + return fn(name, vr.Str) + }) + } var runCmdWithStdin func(context.Context, string, string, []string, io.Reader) (uint8, error) runCmdWithStdin = func(ctx context.Context, dir string, cmdName string, cmdArgs []string, childStdin io.Reader) (uint8, error) { if !r.allowAllCommands && !r.allowedCommands[cmdName] { @@ -560,6 +568,7 @@ func (r *Runner) call(ctx context.Context, pos syntax.Pos, args []string) { Stdout: r.stdout, Stderr: r.stderr, WorkDir: func() string { return dir }, + Env: envEach, HostPrefix: func() string { // Return the sandbox's normalized prefix (filepath.Clean'd // in SetHostPrefix) rather than the raw user-supplied @@ -663,6 +672,7 @@ func (r *Runner) call(ctx context.Context, pos syntax.Pos, args []string) { Stderr: r.stderr, InLoop: r.inLoop, LastExitCode: r.lastExit.code, + Env: envEach, WorkDir: func() string { return HandlerCtx(r.handlerCtx(ctx, todoPos)).Dir }, diff --git a/tests/awk_scenarios/enabled.txt b/tests/awk_scenarios/enabled.txt index 32f784a2..338cf37a 100644 --- a/tests/awk_scenarios/enabled.txt +++ b/tests/awk_scenarios/enabled.txt @@ -1,6 +1,12 @@ +gawk/arrays/associative_count.yaml +gawk/arrays/delete_index.yaml +gawk/arrays/subscript_name_keeps_scalar_value.yaml +gawk/arrays/unassigned_subscript_empty_string.yaml gawk/basic/begin_end_records.yaml gawk/basic/field_separator.yaml +gawk/control/for_loop_fields.yaml gawk/control/if_else.yaml +gawk/control/while_break.yaml gawk/expressions/appended_numeric_string_reconverts.yaml gawk/expressions/arithmetic_comparison.yaml gawk/expressions/concat_literal_punctuation.yaml @@ -14,15 +20,24 @@ gawk/expressions/string_constant_numeric_comparison.yaml gawk/expressions/string_field_number_reference.yaml gawk/expressions/unary_minus_string_operand.yaml gawk/expressions/unary_plus_preserves_decimal_string_value.yaml +gawk/fields/assign_rebuilds_record.yaml +gawk/fields/empty_field_assignment_preserves_nf.yaml +gawk/fields/nf_assignment.yaml gawk/fields/numeric_field_terminator.yaml gawk/functions/printf_width_precision_mix.yaml +gawk/functions/split.yaml +gawk/functions/split_default_separator.yaml gawk/functions/string_core.yaml gawk/input/no_trailing_newline_regex.yaml gawk/input/nr_concat_builtin_records.yaml +gawk/misc/assign_extends_record.yaml gawk/misc/begin_print_hello.yaml +gawk/misc/compound_assignment_subscript_side_effect.yaml +gawk/misc/in_operator_assignment_value.yaml gawk/misc/last_field_concat_once.yaml gawk/misc/nested_self_compound_assignment.yaml gawk/misc/printf_plus_flag_decimal.yaml +gawk/misc/range_pattern_boundaries.yaml gawk/output/hex_input_numeric_conversion.yaml gawk/output/integer_precision_padding.yaml gawk/output/print_separators.yaml @@ -34,13 +49,40 @@ gawk/regex/dfa_nested_closure_alternation.yaml gawk/regex/escaped_left_brace_literal.yaml gawk/regex/pattern_match.yaml gawk/text/print_records_verbatim.yaml +onetrueawk/arrays/delete_current_key.yaml +onetrueawk/arrays/first_seen_totals.yaml +onetrueawk/arrays/record_storage_split.yaml +onetrueawk/arrays/regex_bucket_counts.yaml +onetrueawk/arrays/unique_field_counts.yaml onetrueawk/basic/begin_filename_and_end_nr.yaml onetrueawk/basic/comments_ignored.yaml onetrueawk/basic/pattern_action.yaml onetrueawk/basic/record_counter_nr.yaml +onetrueawk/control/division_loop_variants.yaml +onetrueawk/control/for_each_field_reverse.yaml +onetrueawk/control/infinite_for_next_record.yaml +onetrueawk/core/assign_existing_field_constant.yaml +onetrueawk/core/assign_first_field_from_nr.yaml +onetrueawk/core/assign_last_field_from_nr.yaml +onetrueawk/core/assign_record_from_second_field.yaml +onetrueawk/core/break_end_stored_records.yaml +onetrueawk/core/break_inner_loop_only.yaml +onetrueawk/core/break_preserves_matching_element.yaml +onetrueawk/core/continue_skips_numeric_fields.yaml onetrueawk/core/custom_ors_without_final_newline.yaml +onetrueawk/core/delete_numeric_and_string_keys.yaml +onetrueawk/core/delete_split_element_count.yaml +onetrueawk/core/dynamic_field_zero_or_one_assignment.yaml +onetrueawk/core/dynamic_first_field_division.yaml onetrueawk/core/end_record_count.yaml +onetrueawk/core/field_assignment_rebuild_marker.yaml onetrueawk/core/field_reference_order.yaml +onetrueawk/core/first_seen_amount_totals.yaml +onetrueawk/core/for_in_break_finds_record.yaml +onetrueawk/core/for_in_counts_and_total.yaml +onetrueawk/core/for_increment_expression_sums_fields.yaml +onetrueawk/core/for_loop_multiline_clauses.yaml +onetrueawk/core/for_loop_next_after_fields.yaml onetrueawk/core/if_truthy_fields.yaml onetrueawk/core/inline_comments_inside_action.yaml onetrueawk/core/next_skips_later_action.yaml @@ -49,25 +91,40 @@ onetrueawk/core/numeric_field_comparison_pattern.yaml onetrueawk/core/numeric_literal_regex_pattern.yaml onetrueawk/core/or_pattern_with_regex.yaml onetrueawk/core/prefix_postfix_increment_counters.yaml +onetrueawk/core/range_pattern_basic.yaml onetrueawk/core/regex_bracket_classes_dynamic.yaml onetrueawk/core/regex_bracket_classes_literal.yaml onetrueawk/core/regex_match_operator.yaml onetrueawk/core/running_sum_and_final_total.yaml +onetrueawk/core/same_regex_range_records.yaml +onetrueawk/core/split_fields_reordered.yaml +onetrueawk/core/split_reuses_source_array.yaml onetrueawk/core/tt01_print_records.yaml onetrueawk/core/tt02_nr_nf_record.yaml onetrueawk/core/tt03_sum_second_field_lengths.yaml +onetrueawk/core/tt05_reverse_fields_string.yaml +onetrueawk/core/tt06_group_lengths_for_in.yaml onetrueawk/core/tt07_even_field_count_pattern.yaml onetrueawk/core/tt08_even_record_length_pattern.yaml onetrueawk/core/tt09_empty_record_pattern.yaml onetrueawk/core/tt10_nonempty_end_pattern.yaml onetrueawk/core/tt11_fixed_substr.yaml +onetrueawk/core/tt12_field_string_and_decrement.yaml +onetrueawk/core/tt13_store_fields_in_array.yaml onetrueawk/core/uninitialized_concat_prefix.yaml onetrueawk/expressions/number_string_conversion.yaml onetrueawk/expressions/numeric_string_exclusions.yaml onetrueawk/expressions/string_range_comparisons.yaml onetrueawk/expressions/uninitialized_numeric_coercion.yaml +onetrueawk/fields/assign_high_field.yaml +onetrueawk/fields/chained_record_field_assignment.yaml onetrueawk/fields/colon_field_separator.yaml +onetrueawk/fields/field_assignment_numeric_record.yaml onetrueawk/fields/field_regex_condition.yaml +onetrueawk/fields/first_field_assignment_rebuild.yaml +onetrueawk/fields/nf_assignment_rebuild.yaml +onetrueawk/fields/regex_field_separator_tabs.yaml +onetrueawk/fields/set_record_from_field.yaml onetrueawk/fixtures/t_1_x_concatenated_assignment.yaml onetrueawk/fixtures/t_4_x_parenthesized_field_reference.yaml onetrueawk/fixtures/t_6_x_nf_and_record_printing.yaml @@ -83,11 +140,15 @@ onetrueawk/fixtures/t_x_regex_default_print.yaml onetrueawk/fixtures/tt_03a_third_field_sum.yaml onetrueawk/fixtures/tt_10a_dynamic_dot_end_regex.yaml onetrueawk/functions/index_substring_positions.yaml +onetrueawk/functions/split_default_fields.yaml +onetrueawk/functions/split_dynamic_separator.yaml +onetrueawk/functions/split_regex_separator.yaml onetrueawk/functions/substr_pattern_filters.yaml onetrueawk/output/custom_ofs.yaml onetrueawk/output/ofs_ors_print.yaml onetrueawk/output/printf_numeric_formats.yaml onetrueawk/programs/constant_string_concatenation.yaml +onetrueawk/programs/delete_element_and_array.yaml onetrueawk/programs/expression_result_numeric_conversion.yaml onetrueawk/programs/p01_print_records.yaml onetrueawk/programs/p02_print_selected_fields.yaml @@ -112,6 +173,7 @@ onetrueawk/programs/p20_compound_condition.yaml onetrueawk/programs/p21_field_or_continent.yaml onetrueawk/programs/p21a_record_regex_or.yaml onetrueawk/programs/p22_anchored_alternation_field_regex.yaml +onetrueawk/programs/p23_regex_range_pattern.yaml onetrueawk/programs/p25_ratio_printf.yaml onetrueawk/programs/p26_accumulate_asia_long_assignment.yaml onetrueawk/programs/p26a_accumulate_asia_compound_assignment.yaml @@ -119,13 +181,22 @@ onetrueawk/programs/p27_maximum_numeric_field.yaml onetrueawk/programs/p28_nr_colon_record_concat.yaml onetrueawk/programs/p30_length_builtin_current_record.yaml onetrueawk/programs/p31_longest_first_field.yaml +onetrueawk/programs/p32_substr_field_rebuild.yaml onetrueawk/programs/p33_concatenate_substrings_end.yaml +onetrueawk/programs/p34_divide_field_rebuild.yaml +onetrueawk/programs/p35_tab_fs_ofs_conditional_field_rewrite.yaml +onetrueawk/programs/p36_computed_field_with_ofs.yaml onetrueawk/programs/p37_concatenated_field_equality.yaml onetrueawk/programs/p38_block_if_maximum.yaml +onetrueawk/programs/p39_while_print_each_field.yaml +onetrueawk/programs/p40_for_print_each_field.yaml +onetrueawk/programs/p42_array_accumulate_regex_buckets.yaml +onetrueawk/programs/p43_area_by_group_for_in.yaml onetrueawk/programs/p45_ofs_ors_print.yaml onetrueawk/programs/p46_adjacent_field_concatenation.yaml onetrueawk/programs/p5a_tabular_header_printf.yaml onetrueawk/programs/regular_expression_operator_matrix.yaml +onetrueawk/programs/split_empty_separator_and_fs_reparse.yaml onetrueawk/records/longest_record.yaml onetrueawk/records/modulo_pattern_default_print.yaml onetrueawk/records/sum_count_average.yaml diff --git a/tests/scenarios/cmd/awk/basic/array_counts.yaml b/tests/scenarios/cmd/awk/basic/array_counts.yaml new file mode 100644 index 00000000..9d3ab575 --- /dev/null +++ b/tests/scenarios/cmd/awk/basic/array_counts.yaml @@ -0,0 +1,18 @@ +description: awk associative array elements support aggregation by field value. +oracle: gawk +setup: + files: + - path: input.txt + content: |+ + api 200 + api 500 + worker 200 +input: + allowed_paths: ["$DIR"] + script: |+ + awk '{ count[$1]++; status[$2] += 1 } END { print count["api"], count["worker"], status[200], status[500] }' input.txt +expect: + stdout: |+ + 2 1 2 1 + stderr: |+ + exit_code: 0 diff --git a/tests/scenarios/cmd/awk/basic/environ_numeric_string.yaml b/tests/scenarios/cmd/awk/basic/environ_numeric_string.yaml new file mode 100644 index 00000000..141140d5 --- /dev/null +++ b/tests/scenarios/cmd/awk/basic/environ_numeric_string.yaml @@ -0,0 +1,14 @@ +description: awk treats numeric ENVIRON values as numeric strings. +oracle: gawk +input: + envs: + NUMERIC_ENV: "10" + interpreter_env: + NUMERIC_ENV: "10" + script: |+ + awk 'BEGIN { print ENVIRON["NUMERIC_ENV"] < 2, ENVIRON["NUMERIC_ENV"] + 0, ENVIRON["NUMERIC_ENV"] == 10 }' +expect: + stdout: |+ + 0 10 1 + stderr: |+ + exit_code: 0 diff --git a/tests/scenarios/cmd/awk/basic/field_assignment.yaml b/tests/scenarios/cmd/awk/basic/field_assignment.yaml new file mode 100644 index 00000000..6dc60aad --- /dev/null +++ b/tests/scenarios/cmd/awk/basic/field_assignment.yaml @@ -0,0 +1,18 @@ +description: awk field assignment rebuilds records with OFS and $0 assignment resplits fields. +oracle: gawk +setup: + files: + - path: input.txt + content: |+ + a b c +input: + allowed_paths: ["$DIR"] + script: |+ + awk 'BEGIN { OFS="|" } { $2 = toupper($2); print $0, NF; NF = 4; $4 = "z"; print $0, NF; $0 = "m n"; print $1, $2, NF }' input.txt +expect: + stdout: |+ + a|B|c|3 + a|B|c|z|4 + m|n|2 + stderr: |+ + exit_code: 0 diff --git a/tests/scenarios/cmd/awk/basic/literal_single_char_fs.yaml b/tests/scenarios/cmd/awk/basic/literal_single_char_fs.yaml new file mode 100644 index 00000000..3ebe4d29 --- /dev/null +++ b/tests/scenarios/cmd/awk/basic/literal_single_char_fs.yaml @@ -0,0 +1,21 @@ +description: awk treats single-character field separators as literals even when they are regex metacharacters. +oracle: gawk +setup: + files: + - path: plain.txt + content: |+ + abc + - path: pipe.txt + content: |+ + a|b +input: + allowed_paths: ["$DIR"] + script: |+ + awk -F . '{ print NF }' plain.txt + awk -F '|' '{ print NF, $1, $2 }' pipe.txt +expect: + stdout: |+ + 1 + 2 a b + stderr: |+ + exit_code: 0 diff --git a/tests/scenarios/cmd/awk/basic/loops.yaml b/tests/scenarios/cmd/awk/basic/loops.yaml new file mode 100644 index 00000000..abf1ebca --- /dev/null +++ b/tests/scenarios/cmd/awk/basic/loops.yaml @@ -0,0 +1,10 @@ +description: awk supports practical while and for loops with break and continue. +oracle: gawk +input: + script: |+ + awk 'BEGIN { for (i = 1; i <= 5; i++) { if (i == 2) continue; if (i == 5) break; sum += i }; j = 0; while (j < 3) { j++; if (j == 2) continue; seen = seen j }; i = 0; for (; i < 3; i++) noinit = noinit i; for (i = 0; i < 3; i++); emptyFor = i; j = 0; while (j++ < 3); emptyWhile = j; print sum, seen, noinit, emptyFor, emptyWhile }' +expect: + stdout: |+ + 8 13 012 3 4 + stderr: |+ + exit_code: 0 diff --git a/tests/scenarios/cmd/awk/basic/range_patterns.yaml b/tests/scenarios/cmd/awk/basic/range_patterns.yaml new file mode 100644 index 00000000..0b89af1b --- /dev/null +++ b/tests/scenarios/cmd/awk/basic/range_patterns.yaml @@ -0,0 +1,25 @@ +description: awk range patterns include records from the start match through the end match. +oracle: gawk +setup: + files: + - path: input.txt + content: |+ + ignore + start + middle + end + tail + start end + after +input: + allowed_paths: ["$DIR"] + script: |+ + awk '/start/,/end/ { print NR ":" $0 }' input.txt +expect: + stdout: |+ + 2:start + 3:middle + 4:end + 6:start end + stderr: |+ + exit_code: 0 diff --git a/tests/scenarios/cmd/awk/basic/record_assignment_limit.yaml b/tests/scenarios/cmd/awk/basic/record_assignment_limit.yaml new file mode 100644 index 00000000..5445da0a --- /dev/null +++ b/tests/scenarios/cmd/awk/basic/record_assignment_limit.yaml @@ -0,0 +1,10 @@ +description: awk rejects record assignment that exceeds the record size limit. +skip_assert_against_bash: true # intentional divergence: rshell caps in-memory awk records. +input: + script: |+ + awk 'BEGIN { $0 = "x"; for (i = 0; i < 21; i++) $0 = $0 $0; print "unreachable" }' +expect: + stdout: |+ + stderr: |+ + awk: record exceeds 1048576 bytes + exit_code: 1 diff --git a/tests/scenarios/cmd/awk/basic/regex_fs.yaml b/tests/scenarios/cmd/awk/basic/regex_fs.yaml new file mode 100644 index 00000000..6a863c87 --- /dev/null +++ b/tests/scenarios/cmd/awk/basic/regex_fs.yaml @@ -0,0 +1,16 @@ +description: awk supports regex field separators. +oracle: gawk +setup: + files: + - path: input.txt + content: |+ + a::b,c +input: + allowed_paths: ["$DIR"] + script: |+ + awk -F '::|,' '{ print NF, $1, $2, $3 }' input.txt +expect: + stdout: |+ + 3 a b c + stderr: |+ + exit_code: 0 diff --git a/tests/scenarios/cmd/awk/basic/regex_fs_empty_matches.yaml b/tests/scenarios/cmd/awk/basic/regex_fs_empty_matches.yaml new file mode 100644 index 00000000..bc5313ab --- /dev/null +++ b/tests/scenarios/cmd/awk/basic/regex_fs_empty_matches.yaml @@ -0,0 +1,18 @@ +description: awk regex field separators ignore zero-width matches. +oracle: gawk +setup: + files: + - path: input.txt + content: |+ + abc + axb +input: + allowed_paths: ["$DIR"] + script: |+ + awk -F 'x*' '{ print NF, "[" $1 "]", "[" $2 "]" }' input.txt +expect: + stdout: |+ + 1 [abc] [] + 2 [a] [b] + stderr: |+ + exit_code: 0 diff --git a/tests/scenarios/cmd/awk/basic/regex_leftmost_longest.yaml b/tests/scenarios/cmd/awk/basic/regex_leftmost_longest.yaml new file mode 100644 index 00000000..2bdbf952 --- /dev/null +++ b/tests/scenarios/cmd/awk/basic/regex_leftmost_longest.yaml @@ -0,0 +1,11 @@ +description: awk regex separators use POSIX leftmost-longest matching. +oracle: gawk +input: + script: |+ + printf 'aaa\n' | awk -F 'a|aa' '{ print NF, "[" $1 "]", "[" $2 "]", "[" $3 "]"; n = split("aaa", parts, /a|aa/); print n, "[" parts[1] "]", "[" parts[2] "]", "[" parts[3] "]" }' +expect: + stdout: |+ + 3 [] [] [] + 3 [] [] [] + stderr: |+ + exit_code: 0 diff --git a/tests/scenarios/cmd/awk/basic/split_delete_in.yaml b/tests/scenarios/cmd/awk/basic/split_delete_in.yaml new file mode 100644 index 00000000..a86141be --- /dev/null +++ b/tests/scenarios/cmd/awk/basic/split_delete_in.yaml @@ -0,0 +1,10 @@ +description: awk split, delete, and in support array membership checks. +oracle: gawk +input: + script: |+ + awk 'BEGIN { split("a,b:c", f, /[,:]/); counts[f[1]]++; counts[f[2]]++; delete counts["b"]; print (f[1] in counts), ("b" in counts), f[3] }' +expect: + stdout: |+ + 1 0 c + stderr: |+ + exit_code: 0 diff --git a/tests/scenarios/cmd/awk/basic/split_null_regex.yaml b/tests/scenarios/cmd/awk/basic/split_null_regex.yaml new file mode 100644 index 00000000..33923f6e --- /dev/null +++ b/tests/scenarios/cmd/awk/basic/split_null_regex.yaml @@ -0,0 +1,10 @@ +description: awk split with a null regular expression separates characters. +oracle: gawk +input: + script: |+ + awk 'BEGIN { n = split("abc", chars, //); print n, chars[1], chars[2], chars[3] }' +expect: + stdout: |+ + 3 a b c + stderr: |+ + exit_code: 0 diff --git a/tests/scenarios/cmd/awk/basic/split_regex_empty_match.yaml b/tests/scenarios/cmd/awk/basic/split_regex_empty_match.yaml new file mode 100644 index 00000000..6311b158 --- /dev/null +++ b/tests/scenarios/cmd/awk/basic/split_regex_empty_match.yaml @@ -0,0 +1,11 @@ +description: awk split ignores empty regex separator matches while preserving non-empty separators. +oracle: gawk +input: + script: |+ + awk 'BEGIN { n = split(" a b ", parts, / */); print n, "[" parts[1] "]", "[" parts[2] "]", "[" parts[3] "]", "[" parts[4] "]"; print split("abc", none, /x*/), none[1] }' +expect: + stdout: |+ + 4 [] [a] [b] [] + 1 abc + stderr: |+ + exit_code: 0 diff --git a/tests/scenarios/cmd/awk/errors/multichar_fs_rejected.yaml b/tests/scenarios/cmd/awk/errors/multichar_fs_rejected.yaml deleted file mode 100644 index 7399c888..00000000 --- a/tests/scenarios/cmd/awk/errors/multichar_fs_rejected.yaml +++ /dev/null @@ -1,16 +0,0 @@ -description: awk rejects multi-character field separators in Phase 1. -skip_assert_against_bash: true -setup: - files: - - path: input.txt - content: |+ - a::b -input: - allowed_paths: ["$DIR"] - script: |+ - awk -F:: '{ print $1 }' input.txt -expect: - stdout: "" - stderr: |+ - awk: multi-character and regex FS values are not supported - exit_code: 1 diff --git a/tests/scenarios_test.go b/tests/scenarios_test.go index cd80188c..22e55c55 100644 --- a/tests/scenarios_test.go +++ b/tests/scenarios_test.go @@ -445,7 +445,7 @@ func scenarioUsesCommand(script, command string) (bool, error) { func isRshellSpecificAwkScenario(rel string) bool { switch rel { - case "cmd/awk/errors/multichar_fs_rejected.yaml", + case "cmd/awk/basic/record_assignment_limit.yaml", "cmd/awk/safety/print_redirect_rejected.yaml", "cmd/awk/safety/system_rejected.yaml": return true