diff --git a/.claude/skills/implement-awk/SKILL.md b/.claude/skills/implement-awk/SKILL.md index f9fd266ee..25b925efb 100644 --- a/.claude/skills/implement-awk/SKILL.md +++ b/.claude/skills/implement-awk/SKILL.md @@ -9,6 +9,13 @@ argument-hint: "[feature-or-failure-filter]" Use this skill when implementing, extending, or fixing the rshell `awk` builtin. +## Shared Implementation Plan + +Before starting or resuming implementation work, read +`docs/AWK_IMPLEMENTATION_PLAN.md`. That document captures the agreed rshell awk +profile, the long-lived parser strategy, Phase 1 Practical awk scope, safety +policy, test plan, and later-phase roadmap. + ## Compatibility Target The implementation target is GNU awk (`gawk`), not POSIX awk alone, One True diff --git a/SHELL_FEATURES.md b/SHELL_FEATURES.md index 4aa0f4db9..4e4b7a7ee 100644 --- a/SHELL_FEATURES.md +++ b/SHELL_FEATURES.md @@ -7,6 +7,7 @@ The in-shell `help` command mirrors these feature categories: run `help` for a c ## Builtins +- ✅ `awk [-F SEP] [-v NAME=VALUE] ['PROGRAM'|-f PROGRAM-FILE] [FILE]...` — pattern scanning and text processing; Phase 1 supports BEGIN/main/END rules, read-only fields (`$0`, `$1`, `$NF`), `NF`/`NR`/`FNR`/`FILENAME`, `FS`/`OFS`/`ORS`, `print`, scalar assignment, arithmetic/comparison/boolean expressions, regex patterns and `~`/`!~`, and string concatenation; `system()`, command pipes, output redirection, `getline`, arrays, control flow, `printf`, regex `FS`, and field mutation are rejected or deferred - ✅ `break` — exit the innermost `for` loop - ✅ `cat [-AbeEnstTuv] [FILE]...` — concatenate files to stdout; supports line numbering, blank squeezing, and non-printing character display - ✅ `continue` — skip to the next iteration of the innermost `for` loop diff --git a/analysis/symbols_builtins.go b/analysis/symbols_builtins.go index 6da0effbf..b39309523 100644 --- a/analysis/symbols_builtins.go +++ b/analysis/symbols_builtins.go @@ -27,6 +27,28 @@ package analysis // Every symbol listed 
here must also appear in builtinAllowedSymbols // (which acts as the global ceiling). var builtinPerCommandSymbols = map[string][]string{ + "awk": { + "bufio.NewScanner", // 🟢 line-by-line record reading; no write or exec capability. + "context.Context", // 🟢 deadline/cancellation plumbing; pure interface, no side effects. + "fmt.Errorf", // 🟢 error formatting; pure function, no I/O. + "io.EOF", // 🟢 sentinel error value; pure constant. + "io.NopCloser", // 🟢 wraps a Reader with a no-op Close; no side effects. + "io.ReadCloser", // 🟢 interface type; no side effects. + "math.Mod", // 🟢 pure arithmetic modulo for awk % operator; no side effects. + "os.O_RDONLY", // 🟢 read-only file flag constant; cannot open files by itself. + "regexp.Compile", // 🟢 compiles a regular expression; pure function, no I/O. Uses RE2 engine (linear-time, no backtracking). + "regexp.Regexp", // 🟢 compiled regular expression type; no I/O side effects. All matching methods are linear-time (RE2). + "strconv.FormatFloat", // 🟢 float-to-string conversion for awk numeric output; pure function. + "strconv.ParseFloat", // 🟢 string-to-float conversion; pure function, no I/O. + "strings.Builder", // 🟢 efficient string concatenation; pure in-memory buffer, no I/O. + "strings.Cut", // 🟢 splits a string around the first separator; pure function, no I/O. + "strings.Join", // 🟢 concatenates a slice of strings with a separator; pure function, no I/O. + "strings.NewReader", // 🟢 wraps a string as an io.Reader; pure in-memory, no I/O. + "strings.Split", // 🟢 splits a string by separator into a slice; pure function, no I/O. + "strings.TrimSpace", // 🟢 removes leading/trailing whitespace; pure function. + "unicode/utf8.DecodeRuneInString", // 🟢 decodes first UTF-8 rune from a string; pure function, no I/O. + "unicode/utf8.RuneError", // 🟢 replacement character returned for invalid UTF-8; constant, no I/O. + }, "break": { "context.Context", // 🟢 deadline/cancellation plumbing; pure interface, no side effects. 
}, @@ -561,6 +583,7 @@ var builtinAllowedSymbols = []string{ "math.MaxInt64", // 🟢 integer constant; no side effects. "math.MaxUint64", // 🟢 integer constant; no side effects. "math.MinInt64", // 🟢 integer constant; no side effects. + "math.Mod", // 🟢 pure arithmetic modulo; no side effects. "math.NaN", // 🟢 returns IEEE 754 NaN value; pure function, no I/O. "net.DefaultResolver", // 🔴 default system DNS resolver; used for context-aware address lookup; network I/O is the explicit purpose of the ping builtin. "net.FlagBroadcast", // 🟢 interface flag constant: broadcast capability; pure constant, no network connections. @@ -600,6 +623,7 @@ var builtinAllowedSymbols = []string{ "strconv.Atoi", // 🟢 string-to-int conversion; pure function, no I/O. "strconv.ErrRange", // 🟢 sentinel error value for overflow; pure constant. "strconv.FormatBool", // 🟢 bool-to-string conversion; pure function, no I/O. + "strconv.FormatFloat", // 🟢 float-to-string conversion; pure function, no I/O. "strconv.FormatInt", // 🟢 int-to-string conversion; pure function, no I/O. "strconv.FormatUint", // 🟢 uint-to-string conversion; pure function, no I/O. "strconv.IntSize", // 🟢 platform int size constant (32 or 64); pure constant, no I/O. @@ -612,10 +636,12 @@ var builtinAllowedSymbols = []string{ "strings.Builder", // 🟢 efficient string concatenation; pure in-memory buffer, no I/O. "strings.Contains", // 🟢 substring search; pure function, no I/O. "strings.ContainsRune", // 🟢 checks if a rune is in a string; pure function, no I/O. + "strings.Cut", // 🟢 splits a string around the first separator; pure function, no I/O. "strings.Fields", // 🟢 splits a string on whitespace into a slice; pure function, no I/O. "strings.HasPrefix", // 🟢 pure function for prefix matching; no I/O. "strings.IndexByte", // 🟢 finds byte in string; pure function, no I/O. "strings.Join", // 🟢 concatenates a slice of strings with a separator; pure function, no I/O. 
+ "strings.NewReader", // 🟢 wraps a string as an io.Reader; pure in-memory, no I/O. "strings.ReplaceAll", // 🟢 replaces all occurrences of a substring; pure function, no I/O. "strings.Split", // 🟢 splits a string by separator into a slice; pure function, no I/O. "strings.ToLower", // 🟢 converts string to lowercase; pure function, no I/O. diff --git a/analysis/symbols_verification_test.go b/analysis/symbols_verification_test.go index 8ac0c7a7e..b3496b0d6 100644 --- a/analysis/symbols_verification_test.go +++ b/analysis/symbols_verification_test.go @@ -147,13 +147,11 @@ func writeGoFile(t *testing.T, path, pkg string, imports []string, body string) func findFirstSubdirGoFile(t *testing.T, dir string) string { t.Helper() var found string + var fallback string err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error { if err != nil { return err } - if found != "" { - return filepath.SkipAll - } if info.IsDir() { return nil } @@ -162,13 +160,26 @@ func findFirstSubdirGoFile(t *testing.T, dir string) string { return nil } if strings.Contains(rel, string(filepath.Separator)) { - found = path + data, err := os.ReadFile(path) + if err != nil { + return err + } + if strings.Contains(string(data), "import (") { + found = path + return filepath.SkipAll + } + if fallback == "" { + fallback = path + } } return nil }) if err != nil { t.Fatal(err) } + if found == "" { + found = fallback + } if found == "" { t.Fatalf("no .go file found in subdirectories of %s", dir) } diff --git a/builtins/awk/ast.go b/builtins/awk/ast.go new file mode 100644 index 000000000..4afe00b56 --- /dev/null +++ b/builtins/awk/ast.go @@ -0,0 +1,112 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026-present Datadog, Inc. 
// File builtins/awk/ast.go — package awk.
//
// Syntax-tree node types for the awk builtin. The parser builds these and the
// evaluator walks them; none of the types carries behaviour beyond its marker
// method.

// program is a parsed awk program: its rules in the order the parser appended
// them.
type program struct {
	rules []rule
}

// ruleKind distinguishes BEGIN and END rules from ordinary rules.
type ruleKind int

const (
	ruleNormal ruleKind = iota // ordinary rule: pattern, action, or both
	ruleBegin                  // rule introduced by BEGIN
	ruleEnd                    // rule introduced by END
)

// rule is one pattern/action pair.
//
// The parser leaves pattern nil for BEGIN, END and bare `{ ... }` rules, and
// leaves action nil for a pattern that is not followed by `{`.
type rule struct {
	kind    ruleKind
	pattern expr
	action  []stmt
}

// stmt is implemented by every statement node; stmtNode is a marker method
// with no behaviour.
type stmt interface {
	stmtNode()
}

// printStmt is a `print` statement. With no args the evaluator prints field 0.
type printStmt struct {
	args []expr
}

func (*printStmt) stmtNode() {}

// exprStmt is an expression evaluated for its side effects; the evaluator
// discards the resulting value.
type exprStmt struct {
	x expr
}

func (*exprStmt) stmtNode() {}

// expr is implemented by every expression node; exprNode is a marker method
// with no behaviour.
type expr interface {
	exprNode()
}

// numberExpr is a numeric literal. The evaluator reads num only.
type numberExpr struct {
	text string // presumably the literal as written in the program — confirm against the parser
	num  float64
}

func (*numberExpr) exprNode() {}

// stringExpr is a string literal holding its value as the evaluator returns it.
type stringExpr struct {
	value string
}

func (*stringExpr) exprNode() {}

// regexExpr is a /regex/ literal. Evaluated on its own it is matched against
// the current record; as the right operand of ~ or !~ it is matched against
// the left operand.
type regexExpr struct {
	pattern string
}

func (*regexExpr) exprNode() {}

// varExpr is a reference to a variable by name.
type varExpr struct {
	name string
}

func (*varExpr) exprNode() {}

// fieldExpr is a `$index` field reference. The evaluator converts the index
// value to an int and rejects negative results.
type fieldExpr struct {
	index expr
}

func (*fieldExpr) exprNode() {}

// groupedExpr wraps an inner expression; evaluating it evaluates x.
// NOTE(review): presumably produced for parenthesised expressions — confirm
// against the parser.
type groupedExpr struct {
	x expr
}

func (*groupedExpr) exprNode() {}

// unaryExpr is a prefix operator applied to x. The evaluator accepts the ops
// "+", "-" and "!".
type unaryExpr struct {
	op string
	x  expr
}

func (*unaryExpr) exprNode() {}

// binaryExpr is a two-operand expression. The evaluator accepts the ops "&&",
// "||", "~", "!~", "concat", "+", "-", "*", "/", "%", "==", "!=", "<", "<=",
// ">" and ">=".
type binaryExpr struct {
	op    string
	left  expr
	right expr
}

func (*binaryExpr) exprNode() {}

// assignExpr is an assignment. The evaluator accepts the ops "=", "+=", "-=",
// "*=", "/=" and "%=" and requires left to be a *varExpr.
type assignExpr struct {
	op    string
	left  expr
	right expr
}

func (*assignExpr) exprNode() {}

// incDecExpr is an increment or decrement of x. The evaluator increments when
// op is "++" and decrements for any other op; prefix selects whether the
// expression yields the updated or the previous value.
type incDecExpr struct {
	op     string
	x      expr
	prefix bool
}

func (*incDecExpr) exprNode() {}

// diff --git a/builtins/awk/awk.go b/builtins/awk/awk.go new file mode 100644 index 000000000..3a41fdc0a --- /dev/null +++ b/builtins/awk/awk.go @@ -0,0 +1,237 @@
//
// Unless explicitly stated otherwise all files in this repository are licensed
// under the Apache License Version 2.0.
// This product includes software developed at Datadog (https://www.datadoghq.com/).
// Copyright 2026-present Datadog, Inc.

// Package awk implements the awk builtin command.
//
// awk - pattern scanning and text processing
//
// Usage: awk [OPTION]... 'program' [FILE]...
//
//	awk [OPTION]... -f program-file [FILE]...
+// +// Phase 1 implements a practical, intentionally restricted awk profile: +// program loading from an inline argument or -f files, -F one-character field +// separators, -v scalar variables, BEGIN/main/END rules, print, scalar +// assignment, arithmetic/comparison/boolean expressions, regex patterns and +// match operators, string concatenation, and read-only fields/built-in +// variables such as $0, $1, NF, NR, FNR, FILENAME, FS, OFS, and ORS. +// +// Blocked or deferred features include system(), command pipes, output +// redirection, getline, arrays, control flow statements, printf, user-defined +// functions, regex FS, and field mutation/$0 rebuilding. +package awk + +import ( + "context" + "fmt" + "io" + "os" + "strings" + + "github.com/DataDog/rshell/builtins" +) + +// Cmd is the awk builtin command descriptor. +var Cmd = builtins.Command{ + Name: "awk", + Description: "pattern scanning and text processing", + MakeFlags: registerFlags, +} + +type stringList []string + +func (s *stringList) String() string { return strings.Join(*s, ",") } +func (s *stringList) Set(v string) error { + *s = append(*s, v) + return nil +} +func (s *stringList) Type() string { return "string" } + +type orderedOptionKind int + +const ( + orderedOptionFieldSeparator orderedOptionKind = iota + orderedOptionAssignment +) + +type orderedOption struct { + kind orderedOptionKind + value string +} + +type fieldSeparatorOption struct { + options *[]orderedOption + value string +} + +func (f *fieldSeparatorOption) String() string { return f.value } +func (f *fieldSeparatorOption) Set(v string) error { + f.value = v + *f.options = append(*f.options, orderedOption{kind: orderedOptionFieldSeparator, value: v}) + return nil +} +func (f *fieldSeparatorOption) Type() string { return "string" } + +type assignmentOption struct { + options *[]orderedOption + values []string +} + +func (a *assignmentOption) String() string { return strings.Join(a.values, ",") } +func (a *assignmentOption) Set(v 
string) error { + a.values = append(a.values, v) + *a.options = append(*a.options, orderedOption{kind: orderedOptionAssignment, value: v}) + return nil +} +func (a *assignmentOption) Type() string { return "string" } + +func registerFlags(fs *builtins.FlagSet) builtins.HandlerFunc { + fs.SetInterspersed(false) + help := fs.BoolP("help", "h", false, "print usage and exit") + var orderedOptions []orderedOption + fieldSep := fieldSeparatorOption{options: &orderedOptions} + fs.VarP(&fieldSep, "field-separator", "F", "use a single-character input field separator") + var programFiles stringList + fs.VarP(&programFiles, "file", "f", "read awk program from file") + assignments := assignmentOption{options: &orderedOptions} + fs.VarP(&assignments, "assign", "v", "assign awk variable before execution") + + return func(ctx context.Context, callCtx *builtins.CallContext, args []string) builtins.Result { + if *help { + printHelp(callCtx, fs) + return builtins.Result{} + } + programText, files, err := loadProgram(ctx, callCtx, args, programFiles) + if err != nil { + callCtx.Errf("awk: %v\n", err) + return builtins.Result{Code: 1} + } + prog, err := parseProgram(programText) + if err != nil { + callCtx.Errf("awk: %v\n", err) + return builtins.Result{Code: 1} + } + rt := newRuntime(callCtx, prog) + for _, opt := range orderedOptions { + name := "FS" + value := opt.value + if opt.kind == orderedOptionAssignment { + var ok bool + name, value, ok = strings.Cut(opt.value, "=") + if !ok || !validVarName(name) { + callCtx.Errf("awk: invalid -v assignment %q\n", opt.value) + return builtins.Result{Code: 1} + } + } + if err := rt.setVar(name, inputStringValue(DecodeAwkEscapes(value))); err != nil { + callCtx.Errf("awk: %v\n", err) + return builtins.Result{Code: 1} + } + } + return rt.run(ctx, files) + } +} + +func printHelp(callCtx *builtins.CallContext, fs *builtins.FlagSet) { + callCtx.Out("Usage: awk [OPTION]... 
'program' [FILE]...\n") + callCtx.Out("Pattern scanning and text processing.\n") + callCtx.Out("With no FILE, or when FILE is -, read standard input.\n\n") + fs.SetOutput(callCtx.Stdout) + fs.PrintDefaults() +} + +func loadProgram(ctx context.Context, callCtx *builtins.CallContext, args []string, programFiles []string) (string, []string, error) { + var parts []string + var files []string + total := 0 + if len(programFiles) > 0 { + for _, path := range programFiles { + text, err := readProgramFile(ctx, callCtx, path, &total) + if err != nil { + return "", nil, err + } + parts = append(parts, text) + } + files = args + } else { + if len(args) == 0 { + return "", nil, fmt.Errorf("missing program") + } + total += len(args[0]) + if total > MaxProgramBytes { + return "", nil, fmt.Errorf("program exceeds %d bytes", MaxProgramBytes) + } + parts = append(parts, args[0]) + files = args[1:] + } + if len(parts) == 0 { + return "", nil, fmt.Errorf("missing program") + } + return strings.Join(parts, "\n"), files, nil +} + +func readProgramFile(ctx context.Context, callCtx *builtins.CallContext, path string, total *int) (string, error) { + if path == "-" { + if callCtx.Stdin == nil { + return "", nil + } + return readProgram(ctx, callCtx.Stdin, total) + } + rc, err := callCtx.OpenFile(ctx, path, os.O_RDONLY, 0) + if err != nil { + return "", err + } + defer rc.Close() + return readProgram(ctx, rc, total) +} + +type byteReader interface { + Read([]byte) (int, error) +} + +func readProgram(ctx context.Context, r byteReader, total *int) (string, error) { + var b strings.Builder + buf := make([]byte, 32*1024) + for { + if err := ctx.Err(); err != nil { + return "", err + } + n, err := r.Read(buf) + if n > 0 { + *total += n + if *total > MaxProgramBytes { + return "", fmt.Errorf("program exceeds %d bytes", MaxProgramBytes) + } + b.WriteString(string(buf[:n])) + } + if err == io.EOF { + break + } + if err != nil { + return "", err + } + } + return b.String(), nil +} + +func 
validVarName(name string) bool { + return validIdentifierName(name) && !isSpecialPatternName(name) +} + +func validIdentifierName(name string) bool { + if name == "" || !isIdentStart(rune(name[0])) { + return false + } + for _, ch := range name[1:] { + if !isIdentPart(ch) { + return false + } + } + return true +} + +func isSpecialPatternName(name string) bool { + return name == "BEGIN" || name == "END" +} diff --git a/builtins/awk/eval.go b/builtins/awk/eval.go new file mode 100644 index 000000000..41f069fd9 --- /dev/null +++ b/builtins/awk/eval.go @@ -0,0 +1,325 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026-present Datadog, Inc. + +package awk + +import ( + "fmt" + "math" + "strings" +) + +func (rt *runtime) execStatements(stmts []stmt) error { + for _, st := range stmts { + switch s := st.(type) { + case *printStmt: + vals := make([]value, 0, len(s.args)) + if len(s.args) == 0 { + vals = append(vals, rt.field(0)) + } else { + for _, arg := range s.args { + v, err := rt.eval(arg) + if err != nil { + return err + } + vals = append(vals, v) + } + } + if err := rt.printValues(vals); err != nil { + return err + } + case *exprStmt: + if _, err := rt.eval(s.x); err != nil { + return err + } + default: + return fmt.Errorf("unknown statement") + } + } + return nil +} + +func (rt *runtime) printValues(vals []value) error { + parts := make([]string, len(vals)) + for i, v := range vals { + parts[i] = v.String() + } + rt.callCtx.Out(strings.Join(parts, rt.getVar("OFS").String())) + rt.callCtx.Out(rt.getVar("ORS").String()) + return nil +} + +func (rt *runtime) eval(x expr) (value, error) { + switch e := x.(type) { + case *numberExpr: + return numberValue(e.num), nil + case *stringExpr: + return stringValue(e.value), nil + case *regexExpr: + re, err := compileRegex(e.pattern) + if err != nil { + 
return value{}, err + } + return boolValue(re.MatchString(rt.record)), nil + case *varExpr: + return rt.getVar(e.name), nil + case *fieldExpr: + v, err := rt.eval(e.index) + if err != nil { + return value{}, err + } + n := int(v.Number()) + if n < 0 { + return value{}, fmt.Errorf("invalid field index") + } + return rt.field(n), nil + case *groupedExpr: + return rt.eval(e.x) + case *unaryExpr: + v, err := rt.eval(e.x) + if err != nil { + return value{}, err + } + switch e.op { + case "+": + return numberValue(v.Number()), nil + case "-": + return numberValue(-v.Number()), nil + case "!": + if v.Bool() { + return numberValue(0), nil + } + return numberValue(1), nil + default: + return value{}, fmt.Errorf("unknown unary operator %s", e.op) + } + case *binaryExpr: + return rt.evalBinary(e) + case *assignExpr: + return rt.evalAssign(e) + case *incDecExpr: + return rt.evalIncDec(e) + default: + return value{}, fmt.Errorf("unknown expression") + } +} + +func (rt *runtime) evalBinary(e *binaryExpr) (value, error) { + if e.op == "&&" { + left, err := rt.eval(e.left) + if err != nil { + return value{}, err + } + if !left.Bool() { + return numberValue(0), nil + } + right, err := rt.eval(e.right) + if err != nil { + return value{}, err + } + if right.Bool() { + return numberValue(1), nil + } + return numberValue(0), nil + } + if e.op == "||" { + left, err := rt.eval(e.left) + if err != nil { + return value{}, err + } + if left.Bool() { + return numberValue(1), nil + } + right, err := rt.eval(e.right) + if err != nil { + return value{}, err + } + if right.Bool() { + return numberValue(1), nil + } + return numberValue(0), nil + } + left, err := rt.eval(e.left) + if err != nil { + return value{}, err + } + switch e.op { + case "~", "!~": + matched, err := rt.matchRegexExpr(left, e.right) + if err != nil { + return value{}, err + } + if e.op == "!~" { + matched = !matched + } + return boolValue(matched), nil + } + right, err := rt.eval(e.right) + if err != nil { + return value{}, 
err + } + switch e.op { + case "concat": + return stringValue(left.String() + right.String()), nil + case "+": + return numberValue(left.Number() + right.Number()), nil + case "-": + return numberValue(left.Number() - right.Number()), nil + case "*": + return numberValue(left.Number() * right.Number()), nil + case "/": + if right.Number() == 0 { + return value{}, fmt.Errorf("division by zero attempted") + } + return numberValue(left.Number() / right.Number()), nil + case "%": + if right.Number() == 0 { + return value{}, fmt.Errorf("division by zero attempted") + } + return numberValue(math.Mod(left.Number(), right.Number())), nil + case "==", "!=", "<", "<=", ">", ">=": + return boolValue(compareValues(left, right, e.op)), nil + default: + return value{}, fmt.Errorf("unknown binary operator %s", e.op) + } +} + +func (rt *runtime) matchRegexExpr(left value, rightExpr expr) (bool, error) { + if rx, ok := rightExpr.(*regexExpr); ok { + re, err := compileRegex(rx.pattern) + if err != nil { + return false, err + } + return re.MatchString(left.String()), nil + } + right, err := rt.eval(rightExpr) + if err != nil { + return false, err + } + re, err := compileRegex(right.String()) + if err != nil { + return false, err + } + return re.MatchString(left.String()), nil +} + +func (rt *runtime) evalAssign(e *assignExpr) (value, error) { + lhs, ok := e.left.(*varExpr) + if !ok { + return value{}, fmt.Errorf("assignment requires a scalar variable") + } + right, err := rt.eval(e.right) + if err != nil { + return value{}, err + } + if e.op != "=" { + left := rt.getVar(lhs.name) + switch e.op { + case "+=": + right = numberValue(left.Number() + right.Number()) + case "-=": + right = numberValue(left.Number() - right.Number()) + case "*=": + right = numberValue(left.Number() * right.Number()) + case "/=": + if right.Number() == 0 { + return value{}, fmt.Errorf("division by zero attempted") + } + right = numberValue(left.Number() / right.Number()) + case "%=": + if right.Number() == 0 
{ + return value{}, fmt.Errorf("division by zero attempted") + } + right = numberValue(math.Mod(left.Number(), right.Number())) + default: + return value{}, fmt.Errorf("unknown assignment operator %s", e.op) + } + } + if err := rt.setVar(lhs.name, right); err != nil { + return value{}, err + } + return right, nil +} + +func (rt *runtime) evalIncDec(e *incDecExpr) (value, error) { + vref, ok := e.x.(*varExpr) + if !ok { + return value{}, fmt.Errorf("increment and decrement require scalar variables") + } + old := rt.getVar(vref.name) + next := old.Number() + if e.op == "++" { + next++ + } else { + next-- + } + nv := numberValue(next) + if err := rt.setVar(vref.name, nv); err != nil { + return value{}, err + } + if e.prefix { + return nv, nil + } + return old, nil +} + +func boolValue(ok bool) value { + if ok { + return numberValue(1) + } + return numberValue(0) +} + +func compareValues(left, right value, op string) bool { + var cmp int + if valuesAreNumeric(left, right) { + ln, rn := left.Number(), right.Number() + switch { + case ln < rn: + cmp = -1 + case ln > rn: + cmp = 1 + default: + cmp = 0 + } + } else { + ls, rs := left.String(), right.String() + switch { + case ls < rs: + cmp = -1 + case ls > rs: + cmp = 1 + default: + cmp = 0 + } + } + switch op { + case "==": + return cmp == 0 + case "!=": + return cmp != 0 + case "<": + return cmp < 0 + case "<=": + return cmp <= 0 + case ">": + return cmp > 0 + case ">=": + return cmp >= 0 + default: + return false + } +} + +func valuesAreNumeric(left, right value) bool { + switch left.kind { + case valueNumber, valueStrNum: + switch right.kind { + case valueNumber, valueStrNum: + return true + } + } + return false +} diff --git a/builtins/awk/lexer.go b/builtins/awk/lexer.go new file mode 100644 index 000000000..b9d9c1ef6 --- /dev/null +++ b/builtins/awk/lexer.go @@ -0,0 +1,390 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. 
// This product includes software developed at Datadog (https://www.datadoghq.com/).
// Copyright 2026-present Datadog, Inc.
//
// File builtins/awk/lexer.go — package awk; imports fmt, strings,
// unicode/utf8.

// tokenKind identifies the lexical class of a token.
type tokenKind int

const (
	tokEOF tokenKind = iota
	tokNewline
	tokIdent
	tokNumber
	tokString
	tokRegex
	tokLBrace
	tokRBrace
	tokLParen
	tokRParen
	tokLBracket
	tokRBracket
	tokSemicolon
	tokComma
	tokDollar
	tokAssign
	tokPlus
	tokMinus
	tokStar
	tokSlash
	tokPercent
	tokBang
	tokTilde
	tokLT
	tokGT
	tokLE
	tokGE
	tokEQ
	tokNE
	tokAnd
	tokOr
	tokMatch
	tokNotMatch
	tokPlusAssign
	tokMinusAssign
	tokStarAssign
	tokSlashAssign
	tokPercentAssign
	tokInc
	tokDec
	tokAppend
	tokPipe
)

// token is one lexical element of an awk program.
//
// lit is the operator or identifier text, the decoded contents of a string
// literal, or the pattern between the slashes of a regex literal. pos is the
// rune offset of the token's first character in the source.
type token struct {
	kind tokenKind
	lit  string
	pos  int
}

// lexer scans an awk program held as runes. last and lastLit describe the
// previously emitted token; they decide whether a '/' starts a regex literal
// or is the division operator.
type lexer struct {
	src     []rune
	pos     int
	last    tokenKind
	lastLit string
}

// lex tokenizes src. On success the returned slice always ends with a tokEOF
// token; on failure it returns the first scanning error.
func lex(src string) ([]token, error) {
	l := &lexer{src: []rune(src), last: tokEOF}
	var toks []token
	for {
		tok, err := l.next()
		if err != nil {
			return nil, err
		}
		toks = append(toks, tok)
		if tok.kind == tokEOF {
			return toks, nil
		}
		l.last, l.lastLit = tok.kind, tok.lit
	}
}

// next skips blanks and '#' comments and returns the next token. Newlines are
// not skipped: they are tokens in their own right.
func (l *lexer) next() (token, error) {
	for l.pos < len(l.src) {
		ch := l.src[l.pos]
		if ch == ' ' || ch == '\t' || ch == '\r' {
			l.pos++
			continue
		}
		if ch == '#' {
			// A comment runs up to, but not including, the newline.
			for l.pos < len(l.src) && l.src[l.pos] != '\n' {
				l.pos++
			}
			continue
		}
		break
	}
	start := l.pos
	if l.pos >= len(l.src) {
		return token{kind: tokEOF, pos: start}, nil
	}
	ch := l.src[l.pos]
	l.pos++
	switch ch {
	case '\n':
		return token{kind: tokNewline, lit: "\n", pos: start}, nil
	case '{':
		return token{kind: tokLBrace, lit: "{", pos: start}, nil
	case '}':
		return token{kind: tokRBrace, lit: "}", pos: start}, nil
	case '(':
		return token{kind: tokLParen, lit: "(", pos: start}, nil
	case ')':
		return token{kind: tokRParen, lit: ")", pos: start}, nil
	case '[':
		return token{kind: tokLBracket, lit: "[", pos: start}, nil
	case ']':
		return token{kind: tokRBracket, lit: "]", pos: start}, nil
	case ';':
		return token{kind: tokSemicolon, lit: ";", pos: start}, nil
	case ',':
		return token{kind: tokComma, lit: ",", pos: start}, nil
	case '$':
		return token{kind: tokDollar, lit: "$", pos: start}, nil
	case '~':
		return token{kind: tokMatch, lit: "~", pos: start}, nil
	case '+':
		if l.match('+') {
			return token{kind: tokInc, lit: "++", pos: start}, nil
		}
		if l.match('=') {
			return token{kind: tokPlusAssign, lit: "+=", pos: start}, nil
		}
		return token{kind: tokPlus, lit: "+", pos: start}, nil
	case '-':
		if l.match('-') {
			return token{kind: tokDec, lit: "--", pos: start}, nil
		}
		if l.match('=') {
			return token{kind: tokMinusAssign, lit: "-=", pos: start}, nil
		}
		return token{kind: tokMinus, lit: "-", pos: start}, nil
	case '*':
		if l.match('=') {
			return token{kind: tokStarAssign, lit: "*=", pos: start}, nil
		}
		return token{kind: tokStar, lit: "*", pos: start}, nil
	case '/':
		// "/=" is tried first, so it always lexes as the assignment operator.
		if l.match('=') {
			return token{kind: tokSlashAssign, lit: "/=", pos: start}, nil
		}
		if canStartRegex(l.last, l.lastLit) {
			return l.scanRegex(start)
		}
		return token{kind: tokSlash, lit: "/", pos: start}, nil
	case '%':
		if l.match('=') {
			return token{kind: tokPercentAssign, lit: "%=", pos: start}, nil
		}
		return token{kind: tokPercent, lit: "%", pos: start}, nil
	case '!':
		if l.match('=') {
			return token{kind: tokNE, lit: "!=", pos: start}, nil
		}
		if l.match('~') {
			return token{kind: tokNotMatch, lit: "!~", pos: start}, nil
		}
		return token{kind: tokBang, lit: "!", pos: start}, nil
	case '<':
		if l.match('=') {
			return token{kind: tokLE, lit: "<=", pos: start}, nil
		}
		return token{kind: tokLT, lit: "<", pos: start}, nil
	case '>':
		if l.match('=') {
			return token{kind: tokGE, lit: ">=", pos: start}, nil
		}
		if l.match('>') {
			return token{kind: tokAppend, lit: ">>", pos: start}, nil
		}
		return token{kind: tokGT, lit: ">", pos: start}, nil
	case '=':
		if l.match('=') {
			return token{kind: tokEQ, lit: "==", pos: start}, nil
		}
		return token{kind: tokAssign, lit: "=", pos: start}, nil
	case '&':
		if l.match('&') {
			return token{kind: tokAnd, lit: "&&", pos: start}, nil
		}
		// A lone '&' is not a token; it is reported as unexpected below.
	case '|':
		if l.match('|') {
			return token{kind: tokOr, lit: "||", pos: start}, nil
		}
		return token{kind: tokPipe, lit: "|", pos: start}, nil
	case '"':
		return l.scanString(start)
	}
	if isDigit(ch) || (ch == '.' && l.pos < len(l.src) && isDigit(l.src[l.pos])) {
		return l.scanNumber(start)
	}
	if isIdentStart(ch) {
		return l.scanIdent(start), nil
	}
	return token{}, fmt.Errorf("unexpected character %q", ch)
}

// match consumes the next rune if it equals ch and reports whether it did.
func (l *lexer) match(ch rune) bool {
	if l.pos >= len(l.src) || l.src[l.pos] != ch {
		return false
	}
	l.pos++
	return true
}

// scanIdent scans the rest of an identifier whose first rune is at start.
func (l *lexer) scanIdent(start int) token {
	for l.pos < len(l.src) && isIdentPart(l.src[l.pos]) {
		l.pos++
	}
	lit := string(l.src[start:l.pos])
	return token{kind: tokIdent, lit: lit, pos: start}
}

// scanNumber scans a decimal literal beginning at start: digits with an
// optional fraction, or a leading '.' followed by digits, plus an optional
// exponent. An 'e'/'E' that is not followed by an optionally signed digit is
// left for the next token. The error result is always nil.
func (l *lexer) scanNumber(start int) (token, error) {
	if l.src[start] == '.' {
		for l.pos < len(l.src) && isDigit(l.src[l.pos]) {
			l.pos++
		}
	} else {
		for l.pos < len(l.src) && isDigit(l.src[l.pos]) {
			l.pos++
		}
		if l.pos < len(l.src) && l.src[l.pos] == '.' {
			l.pos++
			for l.pos < len(l.src) && isDigit(l.src[l.pos]) {
				l.pos++
			}
		}
	}
	if l.pos < len(l.src) && (l.src[l.pos] == 'e' || l.src[l.pos] == 'E') {
		save := l.pos
		l.pos++
		if l.pos < len(l.src) && (l.src[l.pos] == '+' || l.src[l.pos] == '-') {
			l.pos++
		}
		if l.pos >= len(l.src) || !isDigit(l.src[l.pos]) {
			// Not an exponent after all; back up to the 'e'.
			l.pos = save
		} else {
			for l.pos < len(l.src) && isDigit(l.src[l.pos]) {
				l.pos++
			}
		}
	}
	return token{kind: tokNumber, lit: string(l.src[start:l.pos]), pos: start}, nil
}

// scanString scans a double-quoted string whose opening quote is at start and
// returns its contents with backslash escapes decoded. A raw newline or the
// end of input before the closing quote is an error.
func (l *lexer) scanString(start int) (token, error) {
	var b strings.Builder
	for l.pos < len(l.src) {
		ch := l.src[l.pos]
		l.pos++
		if ch == '"' {
			return token{kind: tokString, lit: b.String(), pos: start}, nil
		}
		if ch == '\n' {
			return token{}, fmt.Errorf("unterminated string")
		}
		if ch == '\\' {
			if l.pos >= len(l.src) {
				return token{}, fmt.Errorf("unterminated string escape")
			}
			esc := l.src[l.pos]
			l.pos++
			b.WriteRune(decodeSimpleEscape(esc))
			continue
		}
		b.WriteRune(ch)
	}
	return token{}, fmt.Errorf("unterminated string")
}

// scanRegex scans a /regex/ literal whose opening slash is at start.
//
// Backslash escapes are kept verbatim, backslash included, so the pattern
// reaches the regex compiler unchanged. A '/' inside a bracket expression
// does not end the literal.
func (l *lexer) scanRegex(start int) (token, error) {
	var b strings.Builder
	inClass := false
	for l.pos < len(l.src) {
		ch := l.src[l.pos]
		l.pos++
		if ch == '/' && !inClass {
			return token{kind: tokRegex, lit: b.String(), pos: start}, nil
		}
		if ch == '\n' {
			return token{}, fmt.Errorf("unterminated regular expression")
		}
		if ch == '\\' {
			if l.pos >= len(l.src) {
				return token{}, fmt.Errorf("unterminated regular expression escape")
			}
			next := l.src[l.pos]
			l.pos++
			b.WriteRune('\\')
			b.WriteRune(next)
			continue
		}
		if ch == '[' {
			inClass = true
		} else if ch == ']' && inClass {
			inClass = false
		}
		b.WriteRune(ch)
	}
	return token{}, fmt.Errorf("unterminated regular expression")
}

// canStartRegex reports whether a '/' that follows the given token begins a
// regex literal rather than a division: at the start of input, after `print`,
// or after a separator, an opening bracket, '}' or an operator.
func canStartRegex(prev tokenKind, prevLit string) bool {
	if prev == tokIdent && prevLit == "print" {
		return true
	}
	switch prev {
	case tokEOF, tokNewline, tokLBrace, tokRBrace, tokLParen, tokComma, tokSemicolon,
		tokAssign, tokPlus, tokMinus, tokStar, tokSlash, tokPercent, tokBang,
		tokLT, tokGT, tokLE, tokGE, tokEQ, tokNE, tokAnd, tokOr, tokMatch,
		tokNotMatch, tokPlusAssign, tokMinusAssign, tokStarAssign,
		tokSlashAssign, tokPercentAssign:
		return true
	default:
		return false
	}
}

// DecodeAwkEscapes returns s with each backslash escape replaced by the
// character it denotes (see decodeSimpleEscape).
//
// A backslash before any other character yields that character, and a
// trailing lone backslash is kept. Bytes that are not valid UTF-8 are copied
// through unchanged instead of being rewritten to U+FFFD, so arbitrary -v and
// -F values are not corrupted.
func DecodeAwkEscapes(s string) string {
	var b strings.Builder
	b.Grow(len(s))
	for len(s) > 0 {
		r, size := utf8.DecodeRuneInString(s)
		if r == utf8.RuneError && size <= 1 {
			// Invalid encoding: keep the raw byte. A genuine U+FFFD in the
			// input decodes with size 3 and takes the normal path.
			b.WriteByte(s[0])
			s = s[1:]
			continue
		}
		s = s[size:]
		if r != '\\' || len(s) == 0 {
			b.WriteRune(r)
			continue
		}
		esc, escSize := utf8.DecodeRuneInString(s)
		if esc == utf8.RuneError && escSize <= 1 {
			// Backslash before an invalid byte: drop the backslash, keep the byte.
			b.WriteByte(s[0])
			s = s[1:]
			continue
		}
		s = s[escSize:]
		b.WriteRune(decodeSimpleEscape(esc))
	}
	return b.String()
}

// decodeSimpleEscape maps the character after a backslash to the character
// the escape denotes: n, t, r, b, f, a and v give the usual control
// characters; any other character stands for itself.
func decodeSimpleEscape(esc rune) rune {
	switch esc {
	case 'n':
		return '\n'
	case 't':
		return '\t'
	case 'r':
		return '\r'
	case 'b':
		return '\b'
	case 'f':
		return '\f'
	case 'a':
		return '\a'
	case 'v':
		return '\v'
	default:
		return esc
	}
}

// isIdentStart reports whether ch may begin an identifier: an ASCII letter or
// an underscore.
func isIdentStart(ch rune) bool {
	return ch == '_' || (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z')
}

// isIdentPart reports whether ch may continue an identifier: an ASCII letter,
// an underscore or an ASCII digit.
func isIdentPart(ch rune) bool {
	return isIdentStart(ch) || isDigit(ch)
}

// isDigit reports whether ch is an ASCII decimal digit.
func isDigit(ch rune) bool {
	return ch >= '0' && ch <= '9'
}

// diff --git a/builtins/awk/parser.go b/builtins/awk/parser.go new file mode 100644 index 000000000..16aeae1bd --- /dev/null +++ b/builtins/awk/parser.go @@ -0,0 +1,504 @@
//
// Unless explicitly stated otherwise all files in this repository are licensed
// under the Apache License Version 2.0.
// This product includes software developed at Datadog (https://www.datadoghq.com/).
// Copyright 2026-present Datadog, Inc.
+ +package awk + +import ( + "fmt" + "strconv" +) + +const ( + precAssign = 10 + precOr = 20 + precAnd = 30 + precCompare = 40 + precConcat = 50 + precAdd = 60 + precMul = 70 + precPrefix = 80 + precPostfix = 90 +) + +var unsupportedBuiltinFunctions = map[string]struct{}{ + "and": {}, + "asort": {}, + "asorti": {}, + "atan2": {}, + "bindtextdomain": {}, + "close": {}, + "compl": {}, + "cos": {}, + "dcgettext": {}, + "dcngettext": {}, + "exp": {}, + "fflush": {}, + "gensub": {}, + "gsub": {}, + "index": {}, + "int": {}, + "isarray": {}, + "length": {}, + "log": {}, + "lshift": {}, + "match": {}, + "mktime": {}, + "or": {}, + "patsplit": {}, + "rand": {}, + "rshift": {}, + "sin": {}, + "split": {}, + "sprintf": {}, + "sqrt": {}, + "srand": {}, + "strftime": {}, + "strtonum": {}, + "sub": {}, + "substr": {}, + "system": {}, + "systime": {}, + "tolower": {}, + "toupper": {}, + "typeof": {}, + "xor": {}, +} + +type parser struct { + toks []token + pos int + stopPrintRedirect bool +} + +func parseProgram(src string) (*program, error) { + toks, err := lex(src) + if err != nil { + return nil, err + } + p := &parser{toks: toks} + prog := &program{} + p.skipSeparators() + for !p.at(tokEOF) { + r, err := p.parseRule() + if err != nil { + return nil, err + } + prog.rules = append(prog.rules, r) + p.skipSeparators() + } + return prog, nil +} + +func (p *parser) parseRule() (rule, error) { + if p.atIdent("BEGIN") { + p.advance() + action, err := p.parseAction() + return rule{kind: ruleBegin, action: action}, err + } + if p.atIdent("END") { + p.advance() + action, err := p.parseAction() + return rule{kind: ruleEnd, action: action}, err + } + if p.at(tokLBrace) { + action, err := p.parseAction() + return rule{kind: ruleNormal, action: action}, err + } + pattern, err := p.parseExpression(0) + if err != nil { + return rule{}, err + } + if p.at(tokComma) { + return rule{}, fmt.Errorf("range patterns are not supported") + } + if p.at(tokLBrace) { + action, err := p.parseAction() + 
return rule{kind: ruleNormal, pattern: pattern, action: action}, err + } + return rule{kind: ruleNormal, pattern: pattern}, nil +} + +func (p *parser) parseAction() ([]stmt, error) { + if !p.match(tokLBrace) { + return nil, fmt.Errorf("expected action") + } + stmts := []stmt{} + p.skipSeparators() + for !p.at(tokRBrace) { + if p.at(tokEOF) { + return nil, fmt.Errorf("unterminated action") + } + st, err := p.parseStatement() + if err != nil { + return nil, err + } + stmts = append(stmts, st) + if !p.at(tokRBrace) && !p.at(tokEOF) && !isSeparator(p.cur().kind) { + return nil, fmt.Errorf("expected statement separator") + } + p.skipSeparators() + } + p.advance() + return stmts, nil +} + +func (p *parser) parseStatement() (stmt, error) { + if p.atIdent("print") { + return p.parsePrint() + } + if p.atIdent("printf") { + return nil, fmt.Errorf("printf is not supported") + } + if p.atIdent("if") || p.atIdent("while") || p.atIdent("for") || + p.atIdent("next") || p.atIdent("nextfile") || p.atIdent("exit") || + p.atIdent("break") || p.atIdent("continue") { + return nil, fmt.Errorf("control flow statements are not supported") + } + if p.atIdent("delete") { + return nil, fmt.Errorf("arrays are not supported") + } + if p.atIdent("getline") { + return nil, fmt.Errorf("getline is not supported") + } + x, err := p.parseExpression(0) + if err != nil { + return nil, err + } + return &exprStmt{x: x}, nil +} + +func (p *parser) parsePrint() (stmt, error) { + p.advance() + ps := &printStmt{} + if p.at(tokRBrace) || p.at(tokEOF) || isSeparator(p.cur().kind) { + return ps, nil + } + old := p.stopPrintRedirect + p.stopPrintRedirect = true + defer func() { p.stopPrintRedirect = old }() + for { + x, err := p.parseExpression(0) + if err != nil { + return nil, err + } + ps.args = append(ps.args, x) + if p.at(tokGT) || p.at(tokAppend) || p.at(tokPipe) { + return nil, fmt.Errorf("print redirection and command pipes are not supported") + } + if !p.match(tokComma) { + break + } + 
p.skipSeparators() + } + return ps, nil +} + +func (p *parser) parseExpression(minPrec int) (expr, error) { + left, err := p.parsePrefix() + if err != nil { + return nil, err + } + for { + if p.at(tokInc) || p.at(tokDec) { + if precPostfix < minPrec { + break + } + op := p.cur().lit + p.advance() + if _, ok := left.(*varExpr); !ok { + return nil, fmt.Errorf("increment and decrement require scalar variables") + } + left = &incDecExpr{op: op, x: left} + continue + } + if p.stopPrintRedirect && (p.at(tokGT) || p.at(tokAppend) || p.at(tokPipe)) { + break + } + if op, prec, assoc, ok := p.binaryOp(); ok { + if prec < minPrec { + break + } + p.advance() + nextMin := prec + 1 + if assoc == "right" { + nextMin = prec + } + right, err := p.parseExpression(nextMin) + if err != nil { + return nil, err + } + if isComparisonOp(op) { + if b, ok := left.(*binaryExpr); ok && isComparisonOp(b.op) { + return nil, fmt.Errorf("chained comparisons are not supported") + } + } + if isAssignOp(op) { + if _, ok := left.(*fieldExpr); ok { + return nil, fmt.Errorf("field assignment is not supported") + } + if _, ok := left.(*varExpr); !ok { + return nil, fmt.Errorf("assignment requires a scalar variable") + } + left = &assignExpr{op: op, left: left, right: right} + } else { + left = &binaryExpr{op: op, left: left, right: right} + } + continue + } + if minPrec <= precConcat && p.canStartConcatenation() { + right, err := p.parseExpression(precConcat + 1) + if err != nil { + return nil, err + } + left = &binaryExpr{op: "concat", left: left, right: right} + continue + } + break + } + return left, nil +} + +func (p *parser) parsePrefix() (expr, error) { + tok := p.cur() + switch tok.kind { + case tokNumber: + p.advance() + n, err := strconv.ParseFloat(tok.lit, 64) + if err != nil { + return nil, fmt.Errorf("invalid number %q", tok.lit) + } + return &numberExpr{text: tok.lit, num: n}, nil + case tokString: + p.advance() + return &stringExpr{value: tok.lit}, nil + case tokRegex: + p.advance() + 
return ®exExpr{pattern: tok.lit}, nil + case tokIdent: + p.advance() + if err := validateIdentifierReference(tok.lit); err != nil { + return nil, err + } + if p.at(tokLParen) { + return nil, fmt.Errorf("function calls are not supported") + } + if p.at(tokLBracket) { + return nil, fmt.Errorf("arrays are not supported") + } + return &varExpr{name: tok.lit}, nil + case tokDollar: + return p.parseFieldRef() + case tokLParen: + p.advance() + old := p.stopPrintRedirect + p.stopPrintRedirect = false + x, err := p.parseExpression(0) + p.stopPrintRedirect = old + if err != nil { + return nil, err + } + if !p.match(tokRParen) { + return nil, fmt.Errorf("expected )") + } + return &groupedExpr{x: x}, nil + case tokPlus, tokMinus, tokBang: + p.advance() + x, err := p.parseExpression(precPrefix) + if err != nil { + return nil, err + } + return &unaryExpr{op: tok.lit, x: x}, nil + case tokInc, tokDec: + p.advance() + x, err := p.parseExpression(precPrefix) + if err != nil { + return nil, err + } + if _, ok := x.(*varExpr); !ok { + return nil, fmt.Errorf("increment and decrement require scalar variables") + } + return &incDecExpr{op: tok.lit, x: x, prefix: true}, nil + default: + return nil, fmt.Errorf("expected expression") + } +} + +func validateIdentifierReference(name string) error { + if msg, ok := unsupportedExpressionKeyword(name); ok { + return fmt.Errorf("%s", msg) + } + if name == "system" { + return fmt.Errorf("system() is not supported") + } + if _, ok := unsupportedBuiltinFunctions[name]; ok { + return fmt.Errorf("function calls are not supported") + } + return nil +} + +func unsupportedExpressionKeyword(name string) (string, bool) { + switch name { + case "BEGIN", "END": + return "BEGIN and END are reserved patterns", true + case "if", "while", "for", "next", "nextfile", "exit", "break", "continue": + return "control flow statements are not supported", true + case "delete": + return "arrays are not supported", true + case "getline": + return "getline is not 
supported", true + case "printf": + return "printf is not supported", true + case "print": + return "print is not supported in expressions", true + default: + return "", false + } +} + +func isComparisonOp(op string) bool { + switch op { + case "==", "!=", "<", ">", "<=", ">=": + return true + default: + return false + } +} + +func (p *parser) parseFieldRef() (expr, error) { + p.advance() + switch tok := p.cur(); tok.kind { + case tokNumber: + p.advance() + n, err := strconv.ParseFloat(tok.lit, 64) + if err != nil { + return nil, fmt.Errorf("invalid field number %q", tok.lit) + } + return &fieldExpr{index: &numberExpr{text: tok.lit, num: n}}, nil + case tokIdent: + p.advance() + if err := validateIdentifierReference(tok.lit); err != nil { + return nil, err + } + return &fieldExpr{index: &varExpr{name: tok.lit}}, nil + case tokLParen: + p.advance() + x, err := p.parseExpression(0) + if err != nil { + return nil, err + } + if !p.match(tokRParen) { + return nil, fmt.Errorf("expected ) after field expression") + } + return &fieldExpr{index: x}, nil + default: + return nil, fmt.Errorf("expected field reference") + } +} + +func (p *parser) binaryOp() (string, int, string, bool) { + switch p.cur().kind { + case tokAssign: + return "=", precAssign, "right", true + case tokPlusAssign: + return "+=", precAssign, "right", true + case tokMinusAssign: + return "-=", precAssign, "right", true + case tokStarAssign: + return "*=", precAssign, "right", true + case tokSlashAssign: + return "/=", precAssign, "right", true + case tokPercentAssign: + return "%=", precAssign, "right", true + case tokOr: + return "||", precOr, "left", true + case tokAnd: + return "&&", precAnd, "left", true + case tokEQ: + return "==", precCompare, "left", true + case tokNE: + return "!=", precCompare, "left", true + case tokLT: + return "<", precCompare, "left", true + case tokGT: + return ">", precCompare, "left", true + case tokLE: + return "<=", precCompare, "left", true + case tokGE: + return ">=", 
// isAssignOp reports whether op is the plain assignment operator or one of
// the compound assignment operators (+=, -=, *=, /=, %=).
func isAssignOp(op string) bool {
	return op == "=" ||
		op == "+=" ||
		op == "-=" ||
		op == "*=" ||
		op == "/=" ||
		op == "%="
}
+ +package awk + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestParsePracticalAwkProgram(t *testing.T) { + prog, err := parseProgram(`BEGIN { label = "sum=" } $2 > 1 { total += $2; print label total } END { print total }`) + require.NoError(t, err) + require.Len(t, prog.rules, 3) + assert.Equal(t, ruleBegin, prog.rules[0].kind) + assert.Equal(t, ruleNormal, prog.rules[1].kind) + assert.Equal(t, ruleEnd, prog.rules[2].kind) +} + +func TestParseRejectsUnsafeFeatures(t *testing.T) { + for _, src := range []string{ + `{ system("sh") }`, + `{ print $1 > "out" }`, + `{ "cmd" | getline }`, + `{ $1 = "x" }`, + `{ next; print $1 }`, + `{ exit 1 }`, + } { + _, err := parseProgram(src) + require.Error(t, err, src) + } +} diff --git a/builtins/awk/runtime.go b/builtins/awk/runtime.go new file mode 100644 index 000000000..250c48906 --- /dev/null +++ b/builtins/awk/runtime.go @@ -0,0 +1,553 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2026-present Datadog, Inc. 
+ +package awk + +import ( + "bufio" + "context" + "fmt" + "io" + "os" + "regexp" + "strconv" + "strings" + "unicode/utf8" + + "github.com/DataDog/rshell/builtins" +) + +const ( + MaxProgramBytes = 256 << 10 + MaxRecordBytes = 1 << 20 + MaxFields = 16_384 + MaxVariableBytes = 1 << 20 + maxFiniteFloat64 = 1.79769313486231570814527423731704357e+308 +) + +type valueKind int + +const ( + valueString valueKind = iota + valueNumber + valueStrNum + valueRegex +) + +type value struct { + kind valueKind + s string + n float64 + pattern string +} + +func stringValue(s string) value { + return value{kind: valueString, s: s} +} + +func inputStringValue(s string) value { + if n, ok := parseFullNumericString(s); ok { + return value{kind: valueStrNum, s: s, n: n} + } + return value{kind: valueString, s: s} +} + +func unassignedValue() value { + return value{kind: valueStrNum, s: "", n: 0} +} + +func numberValue(n float64) value { + return value{kind: valueNumber, n: n} +} + +func regexValue(pattern string) value { + return value{kind: valueRegex, pattern: pattern} +} + +func (v value) String() string { + switch v.kind { + case valueNumber: + return formatAwkNumber(v.n) + case valueRegex: + return v.pattern + default: + return v.s + } +} + +func formatAwkNumber(n float64) string { + if n == 0 { + n = 0 + } + fixed := strconv.FormatFloat(n, 'f', -1, 64) + for i := 0; i < len(fixed); i++ { + if fixed[i] == '.' 
{ + return strconv.FormatFloat(n, 'g', 6, 64) + } + } + return fixed +} + +func (v value) Number() float64 { + switch v.kind { + case valueNumber, valueStrNum: + return v.n + case valueRegex: + if n, ok := parseNumericPrefix(v.pattern); ok { + return n + } + default: + if n, ok := parseNumericPrefix(v.s); ok { + return n + } + } + return 0 +} + +func (v value) Bool() bool { + switch v.kind { + case valueNumber: + return v.n != 0 + case valueStrNum: + return v.n != 0 + case valueRegex: + return v.pattern != "" + default: + return v.s != "" + } +} + +func parseFullNumericString(s string) (float64, bool) { + if s == "" { + return 0, false + } + n, err := strconv.ParseFloat(strings.TrimSpace(s), 64) + if err != nil || n != n || n > maxFiniteFloat64 || n < -maxFiniteFloat64 { + return 0, false + } + return n, true +} + +func parseNumericPrefix(s string) (float64, bool) { + prefix := numericPrefix(trimLeadingAwkSpace(s)) + if prefix == "" { + return 0, false + } + n, err := strconv.ParseFloat(prefix, 64) + return n, err == nil +} + +func trimLeadingAwkSpace(s string) string { + for len(s) > 0 { + switch s[0] { + case ' ', '\t', '\n', '\r', '\f', '\v': + s = s[1:] + default: + return s + } + } + return s +} + +func numericPrefix(s string) string { + i := 0 + if i < len(s) && (s[i] == '+' || s[i] == '-') { + i++ + } + digits := 0 + for i < len(s) && s[i] >= '0' && s[i] <= '9' { + i++ + digits++ + } + if i < len(s) && s[i] == '.' 
{ + i++ + for i < len(s) && s[i] >= '0' && s[i] <= '9' { + i++ + digits++ + } + } + if digits == 0 { + return "" + } + end := i + if i < len(s) && (s[i] == 'e' || s[i] == 'E') { + j := i + 1 + if j < len(s) && (s[j] == '+' || s[j] == '-') { + j++ + } + expDigits := 0 + for j < len(s) && s[j] >= '0' && s[j] <= '9' { + j++ + expDigits++ + } + if expDigits > 0 { + end = j + } + } + return s[:end] +} + +type runtime struct { + callCtx *builtins.CallContext + prog *program + vars map[string]value + varSizes map[string]int + varBytes int + + record string + fields []string + filename string + nr int + fnr int +} + +func newRuntime(callCtx *builtins.CallContext, prog *program) *runtime { + rt := &runtime{ + callCtx: callCtx, + prog: prog, + vars: make(map[string]value), + varSizes: make(map[string]int), + } + rt.vars["FS"] = stringValue(" ") + rt.vars["OFS"] = stringValue(" ") + rt.vars["ORS"] = stringValue("\n") + return rt +} + +func (rt *runtime) run(ctx context.Context, files []string) builtins.Result { + if err := rt.runRules(ctx, ruleBegin); err != nil { + rt.callCtx.Errf("awk: %v\n", err) + return builtins.Result{Code: 1} + } + if rt.needsInput() { + if len(files) == 0 { + files = []string{"-"} + } + ranInput := false + for _, file := range files { + assigned, err := rt.applyOperandAssignment(file) + if err != nil { + rt.callCtx.Errf("awk: %v\n", err) + return builtins.Result{Code: 1} + } + if assigned { + continue + } + ranInput = true + if err := rt.runFile(ctx, file); err != nil { + rt.callCtx.Errf("awk: %s: %v\n", file, err) + return builtins.Result{Code: 1} + } + } + if !ranInput { + if err := rt.runFile(ctx, "-"); err != nil { + rt.callCtx.Errf("awk: -: %v\n", err) + return builtins.Result{Code: 1} + } + } + } + if err := rt.runRules(ctx, ruleEnd); err != nil { + rt.callCtx.Errf("awk: %v\n", err) + return builtins.Result{Code: 1} + } + return builtins.Result{} +} + +func (rt *runtime) applyOperandAssignment(arg string) (bool, error) { + name, value, ok := 
// scanAwkRecord is a bufio.SplitFunc that yields newline-terminated records.
// Only the '\n' byte is stripped; a preceding '\r' stays part of the record.
// At end of input a final record without a trailing newline is still
// returned; with no data left it returns no token.
func scanAwkRecord(data []byte, atEOF bool) (int, []byte, error) {
	end := 0
	for end < len(data) && data[end] != '\n' {
		end++
	}
	switch {
	case end < len(data):
		// Found a newline at index end: consume it, exclude it from the token.
		return end + 1, data[:end], nil
	case atEOF && len(data) > 0:
		return len(data), data, nil
	default:
		// Need more data, or nothing left at EOF.
		return 0, nil, nil
	}
}
// splitAwkWhitespaceFields splits rec into fields separated by runs of
// blanks (see isAwkFieldBlank). Leading and trailing blanks produce no empty
// fields. The result is nil when rec contains no field at all.
func splitAwkWhitespaceFields(rec string) []string {
	var out []string
	begin := -1 // start index of the field being scanned, or -1 between fields
	for pos := 0; pos < len(rec); pos++ {
		blank := isAwkFieldBlank(rec[pos])
		switch {
		case blank && begin >= 0:
			out = append(out, rec[begin:pos])
			begin = -1
		case !blank && begin < 0:
			begin = pos
		}
	}
	if begin >= 0 {
		out = append(out, rec[begin:])
	}
	return out
}

// isAwkFieldBlank reports whether b separates fields under the default
// field separator: space, tab, or newline.
func isAwkFieldBlank(b byte) bool {
	switch b {
	case ' ', '\t', '\n':
		return true
	}
	return false
}
// compileRegex translates an awk regular expression into Go (RE2) syntax and
// compiles it. Errors mention the original awk pattern, not the translated
// one.
func compileRegex(pattern string) (*regexp.Regexp, error) {
	normalized := normalizeAwkRegex(pattern)
	re, err := regexp.Compile(normalized)
	if err != nil {
		return nil, fmt.Errorf("invalid regular expression %q: %v", pattern, err)
	}
	return re, nil
}

// normalizeAwkRegex rewrites the backslash escapes in pattern so that Go's
// regexp package interprets them the way awk does (see writeAwkRegexEscape).
// Everything else is copied through byte for byte. A lone trailing backslash
// is kept, which makes the subsequent compile fail.
func normalizeAwkRegex(pattern string) string {
	var b strings.Builder
	for i := 0; i < len(pattern); i++ {
		ch := pattern[i]
		if ch != '\\' {
			b.WriteByte(ch)
			continue
		}
		if i+1 >= len(pattern) {
			b.WriteByte(ch)
			continue
		}
		i++
		writeAwkRegexEscape(&b, pattern[i])
	}
	return b.String()
}

// writeAwkRegexEscape writes the Go-regexp equivalent of the awk escape
// `\<esc>` to b:
//
//   - control-character escapes (\n, \t, \r, \f, and \b, \a, \v, which in
//     awk mean backspace, bell and vertical tab) become escapes for the same
//     characters;
//   - escaped regex metacharacters stay escaped, including '-', so that
//     `[a\-z]` keeps a literal hyphen instead of turning into the range a-z;
//   - \w, \W, \s and \S are passed through;
//   - any other escaped character stands for itself (`\d` matches "d").
func writeAwkRegexEscape(b *strings.Builder, esc byte) {
	switch esc {
	case 'n':
		b.WriteString(`\n`)
	case 't':
		b.WriteString(`\t`)
	case 'r':
		b.WriteString(`\r`)
	case 'b':
		b.WriteString(`\x08`)
	case 'f':
		b.WriteString(`\f`)
	case 'a':
		b.WriteString(`\x07`)
	case 'v':
		b.WriteString(`\x0b`)
	case '.', '[', ']', '(', ')', '{', '}', '*', '+', '?', '|', '^', '$', '\\', '-':
		b.WriteByte('\\')
		b.WriteByte(esc)
	case 'w', 'W', 's', 'S':
		b.WriteByte('\\')
		b.WriteByte(esc)
	default:
		b.WriteByte(esc)
	}
}
+ require.NoError(t, err) + defer runner.Close() + if dir != "" { + runner.Dir = dir + } + err = runner.Run(ctx, prog) + exitCode := 0 + if err != nil { + var es interp.ExitStatus + if errors.As(err, &es) { + exitCode = int(es) + } else if ctx.Err() == nil { + t.Fatalf("unexpected error: %v", err) + } + } + return outBuf.String(), errBuf.String(), exitCode +} + +func cmdRun(t *testing.T, script, dir string) (stdout, stderr string, exitCode int) { + t.Helper() + return runScript(t, script, dir, interp.AllowedPaths([]string{dir})) +} + +func writeFile(t *testing.T, dir, name, content string) { + t.Helper() + require.NoError(t, os.WriteFile(filepath.Join(dir, name), []byte(content), 0644)) +} + +func TestAwkPrintFields(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "input.txt", "alpha beta gamma\none two three\n") + stdout, stderr, code := cmdRun(t, `awk '{ print $1, $3 }' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "", stderr) + assert.Equal(t, "alpha gamma\none three\n", stdout) +} + +func TestAwkFieldSeparatorAndConcat(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "input.txt", "root:x:0\nagent:x:42\n") + stdout, _, code := cmdRun(t, `awk -F: '{ print "user=" $1 ":" $3 }' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "user=root:0\nuser=agent:42\n", stdout) +} + +func TestAwkFieldIndexTruncatesTowardZero(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "input.txt", "alpha beta gamma\n") + stdout, stderr, code := cmdRun(t, `awk '{ print $(NF/2), $(-0.5) }' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "", stderr) + assert.Equal(t, "alpha alpha beta gamma\n", stdout) +} + +func TestAwkStopsOptionParsingAfterProgram(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "-F,", "a,b c\n") + stdout, stderr, code := cmdRun(t, `awk '{ print $1 }' -F,`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "", stderr) + assert.Equal(t, "a,b\n", stdout) +} + +func TestAwkBeginEndAndAggregation(t 
*testing.T) { + dir := t.TempDir() + writeFile(t, dir, "input.txt", "a 2\nb 3\n") + stdout, _, code := cmdRun(t, `awk 'BEGIN { print "start" } { sum += $2 } END { print "sum", sum }' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "start\nsum 5\n", stdout) +} + +func TestAwkExplicitEmptyActionDoesNothing(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "input.txt", "alpha\n") + stdout, stderr, code := cmdRun(t, `awk 'BEGIN {} 1 {}' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "", stderr) + assert.Equal(t, "", stdout) +} + +func TestAwkPatternsAndRegexMatch(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "input.txt", "ok 1\nerror 2\nwarn 3\n") + stdout, _, code := cmdRun(t, `awk '$2 > 1 && $1 !~ /warn/ { print $1 }' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "error\n", stdout) +} + +func TestAwkRegexRuleAfterNewline(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "input.txt", "bar\nfoo\n") + stdout, stderr, code := cmdRun(t, "awk 'BEGIN { print \"h\" }\n/foo/ { print $1 }' input.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "", stderr) + assert.Equal(t, "h\nfoo\n", stdout) +} + +func TestAwkPrintSkipsNewlineAfterComma(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "input.txt", "a b\n") + stdout, stderr, code := cmdRun(t, "awk '{ print $1,\n$2 }' input.txt", dir) + assert.Equal(t, 0, code) + assert.Equal(t, "", stderr) + assert.Equal(t, "a b\n", stdout) +} + +func TestAwkPreservesCarriageReturnInRecords(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "input.txt", "a\r\n") + stdout, stderr, code := cmdRun(t, `awk '{ print $0 == "a", $0 == "a\r", $1 == "a", $1 == "a\r" }' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "", stderr) + assert.Equal(t, "0 1 0 1\n", stdout) +} + +func TestAwkPrintParenthesizedComparison(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "input.txt", "1\n2\n") + stdout, stderr, code := cmdRun(t, `awk '{ print ($1 > 1) 
}' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "", stderr) + assert.Equal(t, "0\n1\n", stdout) +} + +func TestAwkRegexLiteralExpression(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "input.txt", "bar\nfoo\n") + stdout, stderr, code := cmdRun(t, `awk '{ print (/foo/) } /foo/ == 1 { print "matched", $0 }' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "", stderr) + assert.Equal(t, "0\n1\nmatched foo\n", stdout) +} + +func TestAwkPrintRegexLiteralExpression(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "input.txt", "foo\nbar\n") + stdout, stderr, code := cmdRun(t, `awk '{ print /foo/ }' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "", stderr) + assert.Equal(t, "1\n0\n", stdout) +} + +func TestAwkRegexBracketClassCanContainSlash(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "input.txt", "/\nx\n") + stdout, stderr, code := cmdRun(t, `awk '/[/]/ { print }' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "", stderr) + assert.Equal(t, "/\n", stdout) +} + +func TestAwkRegexUnknownEscapesBecomeLiterals(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "input.txt", "5\nd\n") + stdout, stderr, code := cmdRun(t, `awk '/\d/ { print }' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "", stderr) + assert.Equal(t, "d\n", stdout) +} + +func TestAwkLiteralFieldSeparatorBlankRecordNF(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "input.txt", "a:b\n\n:\n") + stdout, stderr, code := cmdRun(t, `awk -F: '{ print NF }' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "", stderr) + assert.Equal(t, "2\n0\n2\n", stdout) +} + +func TestAwkStringNumericSemantics(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "input.txt", "0\n10\n123abc\n-4.5x\nabc123\n") + stdout, stderr, code := cmdRun(t, `awk 'BEGIN { print ("10" < "2"), (x == 0), (x == ""), !"0", "123abc" + 1 } $1 { print "truthy", $1 } { print $1 + 1, ($1 == 0), ($1 < 2), ($1 < "2") }' 
input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "", stderr) + assert.Equal(t, "1 1 1 0 124\n1 1 1 1\ntruthy 10\n11 0 0 1\ntruthy 123abc\n124 0 1 1\ntruthy -4.5x\n-3.5 0 1 1\ntruthy abc123\n1 0 0 0\n", stdout) +} + +func TestAwkEmptyProgramIsNoOp(t *testing.T) { + dir := t.TempDir() + stdout, stderr, code := cmdRun(t, `awk '' missing.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "", stderr) + assert.Equal(t, "", stdout) +} + +func TestAwkIntegerNumberFormatting(t *testing.T) { + stdout, stderr, code := cmdRun(t, `awk 'BEGIN { print 999999, 1000000, 123456789, 1000000.5; x=-0; print x, -1e-400 }'`, t.TempDir()) + assert.Equal(t, 0, code) + assert.Equal(t, "", stderr) + assert.Equal(t, "999999 1000000 123456789 1e+06\n0 0\n", stdout) +} + +func TestAwkBeginOnlySkipsInputFiles(t *testing.T) { + dir := t.TempDir() + stdout, stderr, code := cmdRun(t, `awk 'BEGIN { print "x" }' missing.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "", stderr) + assert.Equal(t, "x\n", stdout) +} + +func TestAwkProgramFileAndDashStdin(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "prog.awk", `{ print NR ":" $2 }`) + stdout, _, code := runScript(t, `printf 'a b\nc d\n' | awk -f prog.awk -`, dir, interp.AllowedPaths([]string{dir})) + assert.Equal(t, 0, code) + assert.Equal(t, "1:b\n2:d\n", stdout) +} + +func TestAwkDashProgramFileReadsStdin(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "prog.awk", `{ print $1 + 1 }`) + writeFile(t, dir, "input.txt", "123abc\n") + stdout, stderr, code := runScript(t, `awk -f - input.txt < prog.awk`, dir, interp.AllowedPaths([]string{dir})) + assert.Equal(t, 0, code) + assert.Equal(t, "", stderr) + assert.Equal(t, "124\n", stdout) +} + +func TestAwkVariablesTabFSAndMultipleFiles(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "one.tsv", "a\t1\n") + writeFile(t, dir, "two.tsv", "b\t2\n") + stdout, _, code := cmdRun(t, `awk -F '\t' -v prefix=row 'BEGIN { OFS=":" } { print prefix, FILENAME, FNR, NR, 
$2 }' one.tsv two.tsv`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "row:one.tsv:1:1:1\nrow:two.tsv:1:2:2\n", stdout) +} + +func TestAwkOperandAssignments(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "one.txt", "a\n") + writeFile(t, dir, "two.txt", "b\n") + stdout, stderr, code := cmdRun(t, `awk '{ print x ":" $0 }' one.txt x=foo two.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "", stderr) + assert.Equal(t, ":a\nfoo:b\n", stdout) + + stdout, stderr, code = runScript(t, `awk '{ print x, $0 }' x=foo < one.txt`, dir, interp.AllowedPaths([]string{dir})) + assert.Equal(t, 0, code) + assert.Equal(t, "", stderr) + assert.Equal(t, "foo a\n", stdout) + + writeFile(t, dir, "1=x", "c\n") + stdout, stderr, code = cmdRun(t, `awk '{ print $0 }' 1=x`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "", stderr) + assert.Equal(t, "c\n", stdout) +} + +func TestAwkAppliesFieldSeparatorOptionsInOrder(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "input.txt", "a:b,c\n") + stdout, stderr, code := cmdRun(t, `awk -v FS=: -F, '{ print $1, $2 }' input.txt; awk -F, -v FS=: '{ print $1, $2 }' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "", stderr) + assert.Equal(t, "a:b c\na b,c\n", stdout) +} + +func TestAwkRejectsNaNAndInfNumericStrings(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "input.txt", "NaN Inf Infinity\n") + stdout, stderr, code := cmdRun(t, `awk '{ print $1 + 1, $2 + 1, $3 + 1, ($1 == $1), ($2 == $2) }' input.txt`, dir) + assert.Equal(t, 0, code) + assert.Equal(t, "", stderr) + assert.Equal(t, "1 1 1 1 1\n", stdout) +} + +func TestAwkRejectsUnsafeFeatures(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "input.txt", "a b\n") + for _, script := range []string{ + `awk '{ system("sh") }' input.txt`, + `awk '{ print $1 > "out" }' input.txt`, + `awk '{ $1 = "x" }' input.txt`, + `awk '{ next; print $1 }' input.txt`, + `awk '{ print getline }' input.txt`, + `awk '{ x = next }' input.txt`, + `awk '{ exit 0 }' 
input.txt`, + `awk 'BEGIN { BEGIN=1; print BEGIN }' input.txt`, + `awk 'BEGIN { END=1; print END }' input.txt`, + `awk '{ print $BEGIN }' input.txt`, + `awk '{ print $if }' input.txt`, + `awk '{ print $length }' input.txt`, + `awk -v BEGIN=x 'BEGIN { print 1 }' input.txt`, + `awk '{ print $0 }' BEGIN=x input.txt`, + `awk 'BEGIN { print 1 < 2 < 3 }' input.txt`, + `awk '{ print 1 / 0 }' input.txt`, + `awk -F '' '{ print $1 }' input.txt`, + } { + _, stderr, code := cmdRun(t, script, dir) + assert.Equal(t, 1, code, script) + assert.Contains(t, stderr, "awk:", script) + } +} + +func TestAwkRejectsUnsupportedBuiltinWithoutParens(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "input.txt", "abc\n") + _, stderr, code := cmdRun(t, `awk '{ print length }' input.txt`, dir) + assert.Equal(t, 1, code) + assert.Contains(t, stderr, "awk: function calls are not supported") +} diff --git a/docs/AWK_IMPLEMENTATION_PLAN.md b/docs/AWK_IMPLEMENTATION_PLAN.md new file mode 100644 index 000000000..2a13a9591 --- /dev/null +++ b/docs/AWK_IMPLEMENTATION_PLAN.md @@ -0,0 +1,385 @@ +# AWK Implementation Plan + +This document is the shared plan for implementing the rshell `awk` builtin +across multiple sessions. It complements the external harness in +`tools/awk-harness/` and the repo-local `.claude/skills/implement-awk` skill. + +The goal is not to implement every GNU awk feature in one pass. The goal is to +define a useful, safe rshell awk profile, use GNU awk as the compatibility +oracle for features we intentionally support, and reject or defer features that +conflict with rshell's safety model. + +## Compatibility Model + +- The reference behavior for supported features is pinned GNU awk, as installed + by `tools/awk-harness/run.sh install-gawk`. +- GNU awk is an oracle for supported behavior, not a requirement to implement + every extension. +- One True Awk remains a supporting regression suite, but GNU awk wins when + behavior differs. 
+- Upstream gawk and One True Awk tests, logs, and generated outputs are external + data. Do not vendor them or copy their test bodies into this repository. +- Write original rshell tests for every behavior that lands in the builtin. + +## Parser Strategy + +Build one long-lived parser and extend it over time. + +The first implementation should not be a throwaway parser or a collection of +ad hoc string splits. It should be a real, intentionally small awk parser with: + +- a hand-written lexer, +- a recursive-descent parser for programs, rules, and statements, +- a Pratt parser for expressions and operator precedence, +- a small AST shared by parser tests and the interpreter, +- clear syntax errors for unsupported grammar. + +The parser should initially accept only the Practical awk subset below, but its +shape should be ready for later additions such as arrays, control flow, +functions, `printf`, and string builtins. + +Important lexer/parser detail: `/.../` is context-sensitive in awk. It can be a +regular expression literal where an expression or pattern can begin, or division +after an operand. Handle this through parser-aware lexing or parser state rather +than a global "slash always starts regex" rule. + +## Phase 1: Practical Awk + +This is the recommended first implementation PR, ideally on a branch based on +the harness branch, for example `codex/awk-phase-1`. + +### CLI And Program Loading + +Support: + +- `awk 'program' [FILE]...` +- `awk -f program.awk [FILE]...` +- multiple `-f` program files, read in order through `callCtx.OpenFile` +- `awk -F SEP 'program' [FILE]...` +- `awk -v name=value 'program' [FILE]...` +- `awk --help` +- `-` as an input operand for stdin +- implicit stdin when no input files are given + +Reject malformed CLI input with an exit code of 1 and a clear `awk:` error. + +### Program Structure + +Support: + +- `BEGIN { ... }` +- record rules: `pattern { ... }`, `{ ... }`, and pattern-only rules +- `END { ... 
}` +- multiple rules, executed in source order +- semicolons and newlines as statement separators where awk accepts them + +Default actions: + +- pattern-only rule: print `$0` +- omitted pattern with action: run action for every record +- omitted action with omitted pattern is invalid + +### Records, Fields, And Built-In Variables + +Support: + +- `$0` +- `$1`, `$2`, ... and expression-based field references such as `$NF` +- `NF` +- `NR` +- `FNR` +- `FILENAME` +- `FS` +- `OFS` +- `ORS` + +Initial defaults: + +- `FS = " "` +- `OFS = " "` +- `ORS = "\n"` + +For Phase 1, prioritize correct field splitting for `FS = " "` and +single-character literal separators supplied by `-F` or `FS = value`. The +single-character separator should be interpreted after awk string escape +decoding, so tab is supported via values such as `-F '\t'` or `FS = "\t"`. +Reject empty separators, multi-character separators, and regex-looking +separators with a clear unsupported-feature error. + +Defer regex `FS` support to Phase 2. In awk, `FS` is normally a regular +expression, so values like `-F'[,:]'` or `FS = "[[:space:]]+"` split on regex +matches rather than a literal separator. That is useful, but Phase 1 only needs +the core extractor behavior: default whitespace splitting and one-character +literal separators such as `:`, `,`, and tab. Regex `FS` should land after the +record runtime and regex matcher are stable. + +Defer field mutation and `$0` rebuilding to Phase 2. Phase 1 may read `$0`, +`$1`, `$NF`, and `NF`, but assignments such as `$1 = "new"`, `$0 = value`, or +`NF = 2` should be rejected with a clear unsupported-feature error. Correct awk +field mutation requires tracking whether `$0` or the fields are authoritative, +rebuilding `$0` with `OFS`, and recomputing fields when `$0` changes; that state +machine deserves its own focused PR. 
+ +### Statements + +Support: + +- `print` +- expression statements +- scalar assignment: `name = expr` +- scalar compound assignment: `+=`, `-=`, `*=`, `/=`, `%=` +- scalar increment/decrement: `name++`, `name--`, `++name`, `--name` + +Defer: + +- `printf` +- `if`, `while`, `for` +- `break`, `continue`, `next`, `nextfile` +- arrays and `delete` +- user-defined functions + +### Expressions + +Support: + +- numeric literals +- string literals with common awk escapes +- variable references +- field references +- parenthesized expressions +- unary `+`, unary `-`, and `!` +- arithmetic: `+`, `-`, `*`, `/`, `%` +- comparisons: `==`, `!=`, `<`, `<=`, `>`, `>=` +- boolean operators: `&&`, `||` +- regex match operators: `~`, `!~` +- regex literals in patterns and match expressions +- string concatenation by adjacent expressions, such as `"user=" $1` and + `$1 ":" $2` + +Use awk-style truthiness: + +- numeric zero is false, +- empty string is false, +- nonzero numbers and non-empty strings are true. + +String concatenation is part of Phase 1 even though it has no explicit operator +in awk. Model adjacency as an implicit binary operator in the Pratt parser, with +tests around ambiguous cases. Without concatenation, common formatting one-liners +feel broken. + +### Resource Limits + +Use explicit caps so malformed or adversarial awk programs cannot grow parser or +runtime memory without bound: + +- `MaxProgramBytes = 256 KiB`: maximum combined source size from inline programs + and all `-f` files. This is much larger than normal one-liners and small awk + scripts while keeping parse-time memory predictable. +- `MaxRecordBytes = 1 MiB`: maximum input record size. This matches the existing + line-oriented builtin pattern and should be enforced before a record is stored + or split into fields. +- `MaxFields = 16,384`: maximum fields per record. This is generous for logs and + tabular data but prevents separator-heavy records from creating unbounded field + slices. 
+- `MaxVariableBytes = 1 MiB`: maximum aggregate storage for awk scalar variable + string values. This mirrors rshell's shell variable storage cap. Built-in + record/field storage is governed by the record and field caps above; later + array support should share this aggregate variable-storage budget or introduce + an equally explicit array budget. + +Also reject a single scalar assignment whose string value exceeds +`MaxVariableBytes`, even if the aggregate budget is otherwise empty. + +### Patterns + +Support: + +- empty pattern for "all records" +- expression patterns such as `$2 > 10` +- regex patterns such as `/error/`, tested against `$0` +- `BEGIN` and `END` + +Defer: + +- range patterns: `pat1, pat2` + +### Regular Expressions + +Use Go's `regexp` package for linear-time matching. Document and test any +intentional difference from GNU awk regex behavior that comes from rshell's +safety or implementation constraints. + +Avoid backtracking engines. + +Regex field separators are deferred even though regex patterns and `~`/`!~` are +in Phase 1. This keeps the initial field-splitting semantics small and +reviewable. + +### ENVIRON + +Do not populate `ENVIRON` from the host process environment. + +When `ENVIRON` is implemented, populate it from rshell's script-visible +environment only: caller-provided `interp.Env(...)` values, current shell +variables, and interpreter-provided variables such as `PWD`, `IFS`, `OPTIND`, +`RSHELL_VERSION`, and `ALLOWED_PATHS` when present. This matches rshell's +documented environment model: no host environment is inherited by default. + +Defer `ENVIRON` until array/indexing support lands. In awk, `ENVIRON` is an +associative array (`ENVIRON["NAME"]`), so supporting it in Phase 1 would require +a one-off special array before the language supports arrays generally. The +reason to defer is implementation shape, not safety; the source of truth is +clear and should be rshell's environment snapshot. 
+ +Do not add a `builtins.CallContext` environment iterator in Phase 1 solely for +future `ENVIRON` support. Phase 1 has no feature that needs access to rshell's +environment beyond explicit awk variables supplied by `-v`. When `ENVIRON` +lands with arrays, add the necessary builtin capability then, either as a +read-only environment iterator/snapshot on `CallContext` or an equivalent +narrow API. + +## Safety Policy + +The builtin must preserve rshell's no-write, no-host-exec safety model. + +Reject or defer: + +- `system()` +- command pipes: `print | "cmd"` and `"cmd" | getline` +- coprocesses +- output redirection to files: `print > "file"` and `print >> "file"` +- `getline` in all forms for Phase 1 +- dynamic extension loading +- network special files +- any feature that executes host commands +- any feature that writes, creates, modifies, or deletes files + +All file reads must go through `callCtx.OpenFile`. + +## Implementation Files + +Expected production files: + +- `builtins/awk/awk.go`: command registration, flags, help, top-level runner +- `builtins/awk/lexer.go`: tokenization +- `builtins/awk/parser.go`: program, rules, statements, expression parser +- `builtins/awk/ast.go`: AST node definitions +- `builtins/awk/runtime.go`: records, fields, variables, built-in variables +- `builtins/awk/eval.go`: statement and expression evaluation +- `builtins/awk/regex.go`: regex compilation and matching helpers + +Expected integration edits: + +- `interp/register_builtins.go` +- `SHELL_FEATURES.md` +- `README.md`, if the builtin list or examples need updating +- `analysis/symbols_builtins.go`, with narrow symbol allowlist entries and + safety comments for any stdlib symbols used by `builtins/awk` + +## Testing Plan + +Prefer original scenario tests for externally visible behavior. 
+ +Expected test locations: + +- `tests/scenarios/cmd/awk/` +- `builtins/tests/awk/` +- parser and runtime unit tests under `builtins/awk/` where scenario tests are + too coarse + +Scenario coverage should include: + +- help output +- inline program argument +- `-f` program loading +- `-F` +- `-v` +- stdin and `-` +- multiple input files, including `FNR`, `NR`, and `FILENAME` +- `BEGIN`, main rules, and `END` +- field extraction with `$0`, `$1`, `$NF`, and `NF` +- `print` with `OFS` and `ORS` +- numeric comparisons and arithmetic +- aggregation with `sum += $2` +- regex patterns and `~` / `!~` +- missing files and malformed programs +- rejected unsafe features + +For upstream-derived failures, write new original rshell tests. Do not copy +gawk test data, fixture bodies, comments, helper scripts, expected output, or +generated files into this repository. + +## Verification Loop + +After each coherent implementation step: + +```bash +make fmt +go test ./... +``` + +When `./rshell` is needed and does not exist: + +```bash +make build +``` + +For AWK compatibility work, run focused harness filters first, then expand: + +```bash +RSHELL_BIN=./rshell AWK_UNDER_TEST=tools/awk-harness/rshell-awk GAWK_TEST_FILTER= tools/awk-harness/run.sh gawk +RSHELL_BIN=./rshell AWK_UNDER_TEST=tools/awk-harness/rshell-awk ONETRUEAWK_SUITE=t tools/awk-harness/run.sh onetrueawk +``` + +Before considering an implementation PR complete, run the full required awk +sequence from `.claude/skills/implement-awk/SKILL.md`: + +```bash +make fmt +go test ./... 
+RSHELL_BIN=./rshell AWK_UNDER_TEST=tools/awk-harness/rshell-awk tools/awk-harness/run.sh gawk +RSHELL_BIN=./rshell AWK_UNDER_TEST=tools/awk-harness/rshell-awk tools/awk-harness/run.sh onetrueawk +``` + +If scenarios or builtin implementations changed, also run the bash comparison +suite when Docker is available: + +```bash +RSHELL_BASH_TEST=1 go test ./tests/ -run TestShellScenariosAgainstBash -timeout 120s +``` + +## Later Phases + +Phase 2 candidates: + +- `printf` +- `if` +- `next` +- range patterns +- regex `FS` +- field assignment and `$0` rebuilding +- common string builtins: `length`, `substr`, `index`, `split`, `tolower`, + `toupper`, `int` + +Phase 3 candidates: + +- arrays, including `count[$1]++` +- `ENVIRON`, populated from the rshell environment snapshot +- `in` +- `delete` +- `for (k in array)` +- `for` and `while` +- `break` and `continue` + +Phase 4 candidates: + +- user-defined functions +- additional POSIX awk builtins +- carefully restricted `getline`, only if a safe design is approved +- safe GNU awk compatibility extensions that do not violate rshell policy + +## Open Design Questions + +There are no unresolved Phase 1 scope questions at the time of writing. Add new +questions here if implementation uncovers behavior that needs explicit product +or safety direction before proceeding. 
diff --git a/interp/register_builtins.go b/interp/register_builtins.go index 0f8458878..50addc149 100644 --- a/interp/register_builtins.go +++ b/interp/register_builtins.go @@ -9,6 +9,7 @@ import ( "sync" "github.com/DataDog/rshell/builtins" + "github.com/DataDog/rshell/builtins/awk" breakcmd "github.com/DataDog/rshell/builtins/break" "github.com/DataDog/rshell/builtins/cat" continuecmd "github.com/DataDog/rshell/builtins/continue" @@ -48,6 +49,7 @@ var registerOnce sync.Once func registerBuiltins() { registerOnce.Do(func() { for _, cmd := range []builtins.Command{ + awk.Cmd, breakcmd.Cmd, cat.Cmd, cut.Cmd, diff --git a/tests/awk_scenarios/enabled.txt b/tests/awk_scenarios/enabled.txt index e69de29bb..d237b1777 100644 --- a/tests/awk_scenarios/enabled.txt +++ b/tests/awk_scenarios/enabled.txt @@ -0,0 +1,107 @@ +gawk/basic/begin_end_records.yaml +gawk/basic/field_separator.yaml +gawk/expressions/appended_numeric_string_reconverts.yaml +gawk/expressions/concat_literal_punctuation.yaml +gawk/expressions/leading_digit_exponent_fragment.yaml +gawk/expressions/nondecimal_string_parameter.yaml +gawk/expressions/numeric_string_division.yaml +gawk/expressions/string_constant_numeric_comparison.yaml +gawk/expressions/string_field_number_reference.yaml +gawk/expressions/unary_minus_string_operand.yaml +gawk/expressions/unary_plus_preserves_decimal_string_value.yaml +gawk/fields/numeric_field_terminator.yaml +gawk/input/no_trailing_newline_regex.yaml +gawk/input/nr_concat_builtin_records.yaml +gawk/misc/begin_print_hello.yaml +gawk/misc/last_field_concat_once.yaml +gawk/misc/nested_self_compound_assignment.yaml +gawk/output/hex_input_numeric_conversion.yaml +gawk/output/print_separators.yaml +gawk/records/fs_single_backslash.yaml +gawk/regex/dfa_nested_closure_alternation.yaml +gawk/regex/escaped_left_brace_literal.yaml +gawk/regex/pattern_match.yaml +gawk/text/print_records_verbatim.yaml +onetrueawk/basic/begin_filename_and_end_nr.yaml +onetrueawk/basic/comments_ignored.yaml 
+onetrueawk/basic/pattern_action.yaml +onetrueawk/basic/record_counter_nr.yaml +onetrueawk/core/custom_ors_without_final_newline.yaml +onetrueawk/core/end_record_count.yaml +onetrueawk/core/field_reference_order.yaml +onetrueawk/core/inline_comments_inside_action.yaml +onetrueawk/core/not_operator_patterns.yaml +onetrueawk/core/numeric_field_comparison_pattern.yaml +onetrueawk/core/numeric_literal_regex_pattern.yaml +onetrueawk/core/or_pattern_with_regex.yaml +onetrueawk/core/prefix_postfix_increment_counters.yaml +onetrueawk/core/regex_bracket_classes_dynamic.yaml +onetrueawk/core/regex_bracket_classes_literal.yaml +onetrueawk/core/regex_match_operator.yaml +onetrueawk/core/running_sum_and_final_total.yaml +onetrueawk/core/tt01_print_records.yaml +onetrueawk/core/tt02_nr_nf_record.yaml +onetrueawk/core/tt07_even_field_count_pattern.yaml +onetrueawk/core/tt09_empty_record_pattern.yaml +onetrueawk/core/tt10_nonempty_end_pattern.yaml +onetrueawk/core/uninitialized_concat_prefix.yaml +onetrueawk/expressions/number_string_conversion.yaml +onetrueawk/expressions/numeric_string_exclusions.yaml +onetrueawk/expressions/string_range_comparisons.yaml +onetrueawk/fields/colon_field_separator.yaml +onetrueawk/fields/field_regex_condition.yaml +onetrueawk/fixtures/t_1_x_concatenated_assignment.yaml +onetrueawk/fixtures/t_4_x_parenthesized_field_reference.yaml +onetrueawk/fixtures/t_6_x_nf_and_record_printing.yaml +onetrueawk/fixtures/t_d_x_colon_separator_nf.yaml +onetrueawk/fixtures/t_monotone_optional_regex_chain.yaml +onetrueawk/fixtures/t_quote_field_with_literal_quotes.yaml +onetrueawk/fixtures/t_sep_digit_field_separator.yaml +onetrueawk/fixtures/t_seqno_record_numbers.yaml +onetrueawk/fixtures/t_stately_grouped_alternation_repetition.yaml +onetrueawk/fixtures/t_vf_dynamic_field_read.yaml +onetrueawk/fixtures/t_x_regex_default_print.yaml +onetrueawk/fixtures/tt_03a_third_field_sum.yaml +onetrueawk/fixtures/tt_10a_dynamic_dot_end_regex.yaml 
+onetrueawk/output/custom_ofs.yaml +onetrueawk/output/ofs_ors_print.yaml +onetrueawk/programs/constant_string_concatenation.yaml +onetrueawk/programs/expression_result_numeric_conversion.yaml +onetrueawk/programs/p01_print_records.yaml +onetrueawk/programs/p02_print_selected_fields.yaml +onetrueawk/programs/p04_record_numbers.yaml +onetrueawk/programs/p06_end_record_count.yaml +onetrueawk/programs/p07_numeric_pattern_default_print.yaml +onetrueawk/programs/p08_field_equality_action.yaml +onetrueawk/programs/p09_lexicographic_pattern.yaml +onetrueawk/programs/p10_field_equality_between_columns.yaml +onetrueawk/programs/p11_regex_default_print.yaml +onetrueawk/programs/p12_field_regex_action.yaml +onetrueawk/programs/p13_negated_field_regex.yaml +onetrueawk/programs/p14_literal_dollar_regex.yaml +onetrueawk/programs/p15_literal_backslash_regex.yaml +onetrueawk/programs/p16_single_character_regex.yaml +onetrueawk/programs/p17_non_numeric_field_regex.yaml +onetrueawk/programs/p18_grouped_alternation_regex.yaml +onetrueawk/programs/p19_variable_regex_numeric_field.yaml +onetrueawk/programs/p20_compound_condition.yaml +onetrueawk/programs/p21_field_or_continent.yaml +onetrueawk/programs/p21a_record_regex_or.yaml +onetrueawk/programs/p22_anchored_alternation_field_regex.yaml +onetrueawk/programs/p26_accumulate_asia_long_assignment.yaml +onetrueawk/programs/p26a_accumulate_asia_compound_assignment.yaml +onetrueawk/programs/p27_maximum_numeric_field.yaml +onetrueawk/programs/p28_nr_colon_record_concat.yaml +onetrueawk/programs/p37_concatenated_field_equality.yaml +onetrueawk/programs/p45_ofs_ors_print.yaml +onetrueawk/programs/p46_adjacent_field_concatenation.yaml +onetrueawk/programs/regular_expression_operator_matrix.yaml +onetrueawk/records/modulo_pattern_default_print.yaml +onetrueawk/records/sum_count_average.yaml +onetrueawk/regex/compound_pattern_conditions.yaml +onetrueawk/regex/dynamic_regex_from_field.yaml +onetrueawk/regex/dynamic_regex_literals.yaml 
+onetrueawk/regex/empty_group_and_nonempty_patterns.yaml +onetrueawk/regex/negated_class_vowel_shape.yaml +onetrueawk/regex/or_pattern_only.yaml +onetrueawk/regex/ordered_class_chain.yaml diff --git a/tests/scenarios/cmd/awk/basic/begin_end_sum.yaml b/tests/scenarios/cmd/awk/basic/begin_end_sum.yaml new file mode 100644 index 000000000..536420b8e --- /dev/null +++ b/tests/scenarios/cmd/awk/basic/begin_end_sum.yaml @@ -0,0 +1,19 @@ +description: awk runs BEGIN, per-record actions, and END. +oracle: gawk +setup: + files: + - path: input.txt + content: |+ + a 2 + b 3 +input: + allowed_paths: ["$DIR"] + script: |+ + awk 'BEGIN { print "start" } { sum += $2 } END { print "sum", sum }' input.txt +expect: + stdout: |+ + start + sum 5 + stderr: "" + exit_code: 0 + diff --git a/tests/scenarios/cmd/awk/basic/begin_only_skips_input.yaml b/tests/scenarios/cmd/awk/basic/begin_only_skips_input.yaml new file mode 100644 index 000000000..18b472288 --- /dev/null +++ b/tests/scenarios/cmd/awk/basic/begin_only_skips_input.yaml @@ -0,0 +1,11 @@ +description: awk does not read input files for BEGIN-only programs. +oracle: gawk +input: + allowed_paths: ["$DIR"] + script: |+ + awk 'BEGIN { print "x" }' missing.txt +expect: + stdout: |+ + x + stderr: |+ + exit_code: 0 diff --git a/tests/scenarios/cmd/awk/basic/crlf_records.yaml b/tests/scenarios/cmd/awk/basic/crlf_records.yaml new file mode 100644 index 000000000..f54a04163 --- /dev/null +++ b/tests/scenarios/cmd/awk/basic/crlf_records.yaml @@ -0,0 +1,10 @@ +description: awk preserves carriage returns before newline record separators. 
+oracle: gawk +input: + script: |+ + printf 'a\r\n' | awk '{ print $0 == "a", $0 == "a\r", $1 == "a", $1 == "a\r" }' +expect: + stdout: |+ + 0 1 0 1 + stderr: |+ + exit_code: 0 diff --git a/tests/scenarios/cmd/awk/basic/empty_action.yaml b/tests/scenarios/cmd/awk/basic/empty_action.yaml new file mode 100644 index 000000000..6fa61a848 --- /dev/null +++ b/tests/scenarios/cmd/awk/basic/empty_action.yaml @@ -0,0 +1,15 @@ +description: awk explicit empty actions do not run the default print action. +oracle: gawk +setup: + files: + - path: input.txt + content: |+ + alpha +input: + allowed_paths: ["$DIR"] + script: |+ + awk 'BEGIN {} 1 {}' input.txt +expect: + stdout: |+ + stderr: |+ + exit_code: 0 diff --git a/tests/scenarios/cmd/awk/basic/empty_program.yaml b/tests/scenarios/cmd/awk/basic/empty_program.yaml new file mode 100644 index 000000000..540722ec2 --- /dev/null +++ b/tests/scenarios/cmd/awk/basic/empty_program.yaml @@ -0,0 +1,10 @@ +description: awk treats an empty program as a no-op. +oracle: gawk +input: + allowed_paths: ["$DIR"] + script: |+ + awk '' missing.txt +expect: + stdout: |+ + stderr: |+ + exit_code: 0 diff --git a/tests/scenarios/cmd/awk/basic/integer_number_formatting.yaml b/tests/scenarios/cmd/awk/basic/integer_number_formatting.yaml new file mode 100644 index 000000000..f6ed1ed49 --- /dev/null +++ b/tests/scenarios/cmd/awk/basic/integer_number_formatting.yaml @@ -0,0 +1,11 @@ +description: awk prints integer-valued numbers without exponent notation. 
+oracle: gawk +input: + script: |+ + awk 'BEGIN { print 999999, 1000000, 123456789, 1000000.5; x=-0; print x, -1e-400 }' +expect: + stdout: |+ + 999999 1000000 123456789 1e+06 + 0 0 + stderr: |+ + exit_code: 0 diff --git a/tests/scenarios/cmd/awk/basic/operand_assignment.yaml b/tests/scenarios/cmd/awk/basic/operand_assignment.yaml new file mode 100644 index 000000000..a87ece4c2 --- /dev/null +++ b/tests/scenarios/cmd/awk/basic/operand_assignment.yaml @@ -0,0 +1,20 @@ +description: awk applies name=value operands in input-stream order. +oracle: gawk +setup: + files: + - path: one.txt + content: |+ + a + - path: two.txt + content: |+ + b +input: + allowed_paths: ["$DIR"] + script: |+ + awk '{ print x ":" $0 }' one.txt x=foo two.txt +expect: + stdout: |+ + :a + foo:b + stderr: |+ + exit_code: 0 diff --git a/tests/scenarios/cmd/awk/basic/options_stop_after_program.yaml b/tests/scenarios/cmd/awk/basic/options_stop_after_program.yaml new file mode 100644 index 000000000..f75be90f6 --- /dev/null +++ b/tests/scenarios/cmd/awk/basic/options_stop_after_program.yaml @@ -0,0 +1,16 @@ +description: awk treats flag-like operands after the program as file names. +oracle: gawk +setup: + files: + - path: "-F," + content: |+ + a,b c +input: + allowed_paths: ["$DIR"] + script: |+ + awk '{ print $1 }' -F, +expect: + stdout: |+ + a,b + stderr: |+ + exit_code: 0 diff --git a/tests/scenarios/cmd/awk/basic/print_fields.yaml b/tests/scenarios/cmd/awk/basic/print_fields.yaml new file mode 100644 index 000000000..2accd93c7 --- /dev/null +++ b/tests/scenarios/cmd/awk/basic/print_fields.yaml @@ -0,0 +1,19 @@ +description: awk prints selected whitespace-separated fields. 
+oracle: gawk +setup: + files: + - path: input.txt + content: |+ + alpha beta gamma + one two three +input: + allowed_paths: ["$DIR"] + script: |+ + awk '{ print $1, $3 }' input.txt +expect: + stdout: |+ + alpha gamma + one three + stderr: "" + exit_code: 0 + diff --git a/tests/scenarios/cmd/awk/basic/print_newline_after_comma.yaml b/tests/scenarios/cmd/awk/basic/print_newline_after_comma.yaml new file mode 100644 index 000000000..c2a57cb03 --- /dev/null +++ b/tests/scenarios/cmd/awk/basic/print_newline_after_comma.yaml @@ -0,0 +1,17 @@ +description: awk print accepts a newline after an argument comma. +oracle: gawk +setup: + files: + - path: input.txt + content: |+ + a b +input: + allowed_paths: ["$DIR"] + script: |+ + awk '{ print $1, + $2 }' input.txt +expect: + stdout: |+ + a b + stderr: |+ + exit_code: 0 diff --git a/tests/scenarios/cmd/awk/basic/print_parenthesized_comparison.yaml b/tests/scenarios/cmd/awk/basic/print_parenthesized_comparison.yaml new file mode 100644 index 000000000..57756489b --- /dev/null +++ b/tests/scenarios/cmd/awk/basic/print_parenthesized_comparison.yaml @@ -0,0 +1,18 @@ +description: awk print supports parenthesized comparison expressions. +oracle: gawk +setup: + files: + - path: input.txt + content: |+ + 1 + 2 +input: + allowed_paths: ["$DIR"] + script: |+ + awk '{ print ($1 > 1) }' input.txt +expect: + stdout: |+ + 0 + 1 + stderr: |+ + exit_code: 0 diff --git a/tests/scenarios/cmd/awk/basic/string_numeric_semantics.yaml b/tests/scenarios/cmd/awk/basic/string_numeric_semantics.yaml new file mode 100644 index 000000000..6a3c10ece --- /dev/null +++ b/tests/scenarios/cmd/awk/basic/string_numeric_semantics.yaml @@ -0,0 +1,29 @@ +description: awk preserves GNU awk string and numeric-string comparison semantics. 
+oracle: gawk +setup: + files: + - path: input.txt + content: |+ + 0 + 10 + 123abc + -4.5x + abc123 +input: + allowed_paths: ["$DIR"] + script: |+ + awk 'BEGIN { print ("10" < "2"), (x == 0), (x == ""), !"0", "123abc" + 1 } $1 { print "truthy", $1 } { print $1 + 1, ($1 == 0), ($1 < 2), ($1 < "2") }' input.txt +expect: + stdout: |+ + 1 1 1 0 124 + 1 1 1 1 + truthy 10 + 11 0 0 1 + truthy 123abc + 124 0 1 1 + truthy -4.5x + -3.5 0 1 1 + truthy abc123 + 1 0 0 0 + stderr: |+ + exit_code: 0 diff --git a/tests/scenarios/cmd/awk/errors/multichar_fs_rejected.yaml b/tests/scenarios/cmd/awk/errors/multichar_fs_rejected.yaml new file mode 100644 index 000000000..7399c888d --- /dev/null +++ b/tests/scenarios/cmd/awk/errors/multichar_fs_rejected.yaml @@ -0,0 +1,16 @@ +description: awk rejects multi-character field separators in Phase 1. +skip_assert_against_bash: true +setup: + files: + - path: input.txt + content: |+ + a::b +input: + allowed_paths: ["$DIR"] + script: |+ + awk -F:: '{ print $1 }' input.txt +expect: + stdout: "" + stderr: |+ + awk: multi-character and regex FS values are not supported + exit_code: 1 diff --git a/tests/scenarios/cmd/awk/fields/field_separator.yaml b/tests/scenarios/cmd/awk/fields/field_separator.yaml new file mode 100644 index 000000000..961781526 --- /dev/null +++ b/tests/scenarios/cmd/awk/fields/field_separator.yaml @@ -0,0 +1,19 @@ +description: awk -F uses a single-character field separator. 
+oracle: gawk +setup: + files: + - path: input.txt + content: |+ + root:x:0 + agent:x:42 +input: + allowed_paths: ["$DIR"] + script: |+ + awk -F: '{ print $1, $3 }' input.txt +expect: + stdout: |+ + root 0 + agent 42 + stderr: "" + exit_code: 0 + diff --git a/tests/scenarios/cmd/awk/fields/field_separator_option_order.yaml b/tests/scenarios/cmd/awk/fields/field_separator_option_order.yaml new file mode 100644 index 000000000..6090c9f3c --- /dev/null +++ b/tests/scenarios/cmd/awk/fields/field_separator_option_order.yaml @@ -0,0 +1,18 @@ +description: awk applies -F and -v FS options in command-line order. +oracle: gawk +setup: + files: + - path: input.txt + content: |+ + a:b,c +input: + allowed_paths: ["$DIR"] + script: |+ + awk -v FS=: -F, '{ print $1, $2 }' input.txt + awk -F, -v FS=: '{ print $1, $2 }' input.txt +expect: + stdout: |+ + a:b c + a b,c + stderr: |+ + exit_code: 0 diff --git a/tests/scenarios/cmd/awk/fields/fractional_field_index.yaml b/tests/scenarios/cmd/awk/fields/fractional_field_index.yaml new file mode 100644 index 000000000..e858518b0 --- /dev/null +++ b/tests/scenarios/cmd/awk/fields/fractional_field_index.yaml @@ -0,0 +1,16 @@ +description: awk truncates fractional field indexes toward zero. +oracle: gawk +setup: + files: + - path: input.txt + content: |+ + alpha beta gamma +input: + allowed_paths: ["$DIR"] + script: |+ + awk '{ print $(NF/2), $(-0.5) }' input.txt +expect: + stdout: |+ + alpha alpha beta gamma + stderr: |+ + exit_code: 0 diff --git a/tests/scenarios/cmd/awk/fields/literal_fs_blank_record.yaml b/tests/scenarios/cmd/awk/fields/literal_fs_blank_record.yaml new file mode 100644 index 000000000..a057c3a13 --- /dev/null +++ b/tests/scenarios/cmd/awk/fields/literal_fs_blank_record.yaml @@ -0,0 +1,20 @@ +description: awk reports NF 0 for blank records with a literal field separator. 
+oracle: gawk +setup: + files: + - path: input.txt + content: |+ + a:b + + : +input: + allowed_paths: ["$DIR"] + script: |+ + awk -F: '{ print NF }' input.txt +expect: + stdout: |+ + 2 + 0 + 2 + stderr: |+ + exit_code: 0 diff --git a/tests/scenarios/cmd/awk/fields/tab_separator_and_vars.yaml b/tests/scenarios/cmd/awk/fields/tab_separator_and_vars.yaml new file mode 100644 index 000000000..b9a658afa --- /dev/null +++ b/tests/scenarios/cmd/awk/fields/tab_separator_and_vars.yaml @@ -0,0 +1,21 @@ +description: awk supports tab field separators, -v variables, and multi-file counters. +oracle: gawk +setup: + files: + - path: one.tsv + content: |+ + a 1 + - path: two.tsv + content: |+ + b 2 +input: + allowed_paths: ["$DIR"] + script: |+ + awk -F '\t' -v prefix=row 'BEGIN { OFS=":" } { print prefix, FILENAME, FNR, NR, $2 }' one.tsv two.tsv +expect: + stdout: |+ + row:one.tsv:1:1:1 + row:two.tsv:1:2:2 + stderr: "" + exit_code: 0 + diff --git a/tests/scenarios/cmd/awk/patterns/print_regex_literal.yaml b/tests/scenarios/cmd/awk/patterns/print_regex_literal.yaml new file mode 100644 index 000000000..02f6afd0d --- /dev/null +++ b/tests/scenarios/cmd/awk/patterns/print_regex_literal.yaml @@ -0,0 +1,18 @@ +description: awk print accepts a regex literal as an expression. +oracle: gawk +setup: + files: + - path: input.txt + content: |+ + foo + bar +input: + allowed_paths: ["$DIR"] + script: |+ + awk '{ print /foo/ }' input.txt +expect: + stdout: |+ + 1 + 0 + stderr: |+ + exit_code: 0 diff --git a/tests/scenarios/cmd/awk/patterns/regex_and_comparison.yaml b/tests/scenarios/cmd/awk/patterns/regex_and_comparison.yaml new file mode 100644 index 000000000..a700a3b71 --- /dev/null +++ b/tests/scenarios/cmd/awk/patterns/regex_and_comparison.yaml @@ -0,0 +1,19 @@ +description: awk filters records with comparisons and regex match operators. 
+oracle: gawk +setup: + files: + - path: input.txt + content: |+ + ok 1 + error 2 + warn 3 +input: + allowed_paths: ["$DIR"] + script: |+ + awk '$2 > 1 && $1 !~ /warn/ { print $1 }' input.txt +expect: + stdout: |+ + error + stderr: "" + exit_code: 0 + diff --git a/tests/scenarios/cmd/awk/patterns/regex_bracket_slash.yaml b/tests/scenarios/cmd/awk/patterns/regex_bracket_slash.yaml new file mode 100644 index 000000000..3773021fc --- /dev/null +++ b/tests/scenarios/cmd/awk/patterns/regex_bracket_slash.yaml @@ -0,0 +1,17 @@ +description: awk regex bracket classes may contain slash literals. +oracle: gawk +setup: + files: + - path: input.txt + content: |+ + / + x +input: + allowed_paths: ["$DIR"] + script: |+ + awk '/[/]/ { print }' input.txt +expect: + stdout: |+ + / + stderr: |+ + exit_code: 0 diff --git a/tests/scenarios/cmd/awk/patterns/regex_literal_expression.yaml b/tests/scenarios/cmd/awk/patterns/regex_literal_expression.yaml new file mode 100644 index 000000000..b496426b7 --- /dev/null +++ b/tests/scenarios/cmd/awk/patterns/regex_literal_expression.yaml @@ -0,0 +1,19 @@ +description: awk regexp literals in expressions match the current record. +oracle: gawk +setup: + files: + - path: input.txt + content: |+ + bar + foo +input: + allowed_paths: ["$DIR"] + script: |+ + awk '{ print (/foo/) } /foo/ == 1 { print "matched", $0 }' input.txt +expect: + stdout: |+ + 0 + 1 + matched foo + stderr: |+ + exit_code: 0 diff --git a/tests/scenarios/cmd/awk/patterns/regex_rule_after_newline.yaml b/tests/scenarios/cmd/awk/patterns/regex_rule_after_newline.yaml new file mode 100644 index 000000000..a9b1f86ed --- /dev/null +++ b/tests/scenarios/cmd/awk/patterns/regex_rule_after_newline.yaml @@ -0,0 +1,19 @@ +description: awk treats regex after a newline rule separator as a pattern. 
+oracle: gawk +setup: + files: + - path: input.txt + content: |+ + bar + foo +input: + allowed_paths: ["$DIR"] + script: |+ + awk 'BEGIN { print "h" } + /foo/ { print $1 }' input.txt +expect: + stdout: |+ + h + foo + stderr: |+ + exit_code: 0 diff --git a/tests/scenarios/cmd/awk/safety/print_redirect_rejected.yaml b/tests/scenarios/cmd/awk/safety/print_redirect_rejected.yaml new file mode 100644 index 000000000..03b4dd4fe --- /dev/null +++ b/tests/scenarios/cmd/awk/safety/print_redirect_rejected.yaml @@ -0,0 +1,16 @@ +description: awk rejects output redirection from print. +skip_assert_against_bash: true +setup: + files: + - path: input.txt + content: |+ + a b +input: + allowed_paths: ["$DIR"] + script: |+ + awk '{ print $1 > "out" }' input.txt +expect: + stdout: "" + stderr: |+ + awk: print redirection and command pipes are not supported + exit_code: 1 diff --git a/tests/scenarios/cmd/awk/safety/system_rejected.yaml b/tests/scenarios/cmd/awk/safety/system_rejected.yaml new file mode 100644 index 000000000..6f86aa3cf --- /dev/null +++ b/tests/scenarios/cmd/awk/safety/system_rejected.yaml @@ -0,0 +1,16 @@ +description: awk rejects system command execution. +skip_assert_against_bash: true +setup: + files: + - path: input.txt + content: |+ + a b +input: + allowed_paths: ["$DIR"] + script: |+ + awk '{ system("sh") }' input.txt +expect: + stdout: "" + stderr: |+ + awk: system() is not supported + exit_code: 1 diff --git a/tests/scenarios/cmd/awk/stdin/dash.yaml b/tests/scenarios/cmd/awk/stdin/dash.yaml new file mode 100644 index 000000000..5d1c2f01c --- /dev/null +++ b/tests/scenarios/cmd/awk/stdin/dash.yaml @@ -0,0 +1,18 @@ +description: awk reads standard input when the file operand is dash. 
+oracle: gawk +setup: + files: + - path: prog.awk + content: |+ + { print NR ":" $2 } +input: + allowed_paths: ["$DIR"] + script: |+ + printf 'a b\nc d\n' | awk -f prog.awk - +expect: + stdout: |+ + 1:b + 2:d + stderr: "" + exit_code: 0 + diff --git a/tests/scenarios/cmd/awk/stdin/dash_program_file.yaml b/tests/scenarios/cmd/awk/stdin/dash_program_file.yaml new file mode 100644 index 000000000..a072c0502 --- /dev/null +++ b/tests/scenarios/cmd/awk/stdin/dash_program_file.yaml @@ -0,0 +1,19 @@ +description: awk -f dash reads the program from standard input. +oracle: gawk +setup: + files: + - path: prog.awk + content: |+ + { print $1 + 1 } + - path: input.txt + content: |+ + 123abc +input: + allowed_paths: ["$DIR"] + script: |+ + awk -f - input.txt < prog.awk +expect: + stdout: |+ + 124 + stderr: |+ + exit_code: 0