Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 14 additions & 6 deletions docs/bootstrap-inventory.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# Bootstrap inventory (vm.php path)

Auto-generated by `script/bootstrap-inventory.php`. Tracks **Phase A** of [#212](https://github.com/PurHur/php-compiler/issues/212) (self-host bootstrap).
Expand All @@ -8,9 +10,9 @@ Regenerate: `php script/bootstrap-inventory.php`

| Metric | Count |
|--------|------:|
| PHP files on vm.php path | 251 |
| PHP files on vm.php path | 252 |
| Source constructs flagged (blockers) | 10 |
| Source constructs flagged (warnings) | 656 |
| Source constructs flagged (warnings) | 657 |

## Compiler CFG gaps (`lib/Compiler.php`)

Expand Down Expand Up @@ -182,6 +184,7 @@ These `LogicException` messages indicate CFG ops or expressions not yet lowered:
| `ext/standard/web_bool.php` | 0 | 1 |
| `ext/standard/web_int.php` | 0 | 1 |
| `ext/standard/web_string.php` | 0 | 1 |
| `ext/types/JitMbStrlen.php` | 0 | 1 |
| `ext/types/Module.php` | 0 | 13 |
| `ext/types/is_type.php` | 0 | 1 |
| `ext/types/mb_strlen.php` | 0 | 1 |
Expand Down Expand Up @@ -591,9 +594,9 @@ These `LogicException` messages indicate CFG ops or expressions not yet lowered:
### `ext/standard/VmString.php`

**Warnings** (review for bootstrap subset):
- new Exception (line 182)
- new Exception (line 190)
- 56 class method(s) — PHPCfg Op\Stmt\ClassMethod not lowered in Compiler
- new Exception (line 207)
- new Exception (line 215)
- 57 class method(s) — PHPCfg Op\Stmt\ClassMethod not lowered in Compiler
- 1 closure(s)

### `ext/standard/abs.php`
Expand Down Expand Up @@ -1185,6 +1188,11 @@ These `LogicException` messages indicate CFG ops or expressions not yet lowered:
**Warnings** (review for bootstrap subset):
- 2 class method(s) — PHPCfg Op\Stmt\ClassMethod not lowered in Compiler

### `ext/types/JitMbStrlen.php`

**Warnings** (review for bootstrap subset):
- 1 class method(s) — PHPCfg Op\Stmt\ClassMethod not lowered in Compiler

### `ext/types/Module.php`

**Warnings** (review for bootstrap subset):
Expand All @@ -1210,7 +1218,7 @@ These `LogicException` messages indicate CFG ops or expressions not yet lowered:
### `ext/types/mb_strlen.php`

**Warnings** (review for bootstrap subset):
- 2 class method(s) — PHPCfg Op\Stmt\ClassMethod not lowered in Compiler
- 3 class method(s) — PHPCfg Op\Stmt\ClassMethod not lowered in Compiler

### `ext/types/strlen.php`

Expand Down
6 changes: 4 additions & 2 deletions docs/bootstrap-profile.json
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
{
"phase": "B",
"issue": 212,
Expand Down Expand Up @@ -181,6 +182,7 @@
"ext/standard/web_bool.php",
"ext/standard/web_int.php",
"ext/standard/web_string.php",
"ext/types/JitMbStrlen.php",
"ext/types/Module.php",
"ext/types/is_type.php",
"ext/types/mb_strlen.php",
Expand Down Expand Up @@ -285,9 +287,9 @@
"test/bootstrap-aot/echo_hello.php"
],
"totals": {
"inventory_files": 251,
"inventory_files": 252,
"excluded": 2,
"eligible": 249,
"eligible": 250,
"aot_lint_targets": 2
}
}
25 changes: 25 additions & 0 deletions ext/standard/VmString.php
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,31 @@ public static function byteLength(string $string): int
return $len;
}

/**
 * Count the characters of a UTF-8 encoded string (issue #158).
 *
 * The lead byte of each sequence decides how many bytes are consumed (1 to 4).
 * Any byte that does not start a sequence, or a lead byte whose sequence would
 * run past the end of the string, is consumed on its own and counted as one
 * character. Continuation bytes are not inspected.
 */
public static function utf8CharLength(string $string): int
{
    $total = self::byteLength($string);
    $chars = 0;
    $pos = 0;

    while ($pos < $total) {
        $lead = \ord($string[$pos]);

        // Sequence width implied by the lead byte pattern.
        if (($lead & 0xE0) === 0xC0) {
            $width = 2;
        } elseif (($lead & 0xF0) === 0xE0) {
            $width = 3;
        } elseif (($lead & 0xF8) === 0xF0) {
            $width = 4;
        } else {
            $width = 1;
        }

        // Not enough bytes left for the announced width: count the lead byte alone.
        if ($width > $total - $pos) {
            $width = 1;
        }

        $pos += $width;
        ++$chars;
    }

    return $chars;
}

public static function byteSlice(string $string, int $offset, ?int $length = null): string
{
$len = self::byteLength($string);
Expand Down
38 changes: 38 additions & 0 deletions ext/types/JitMbStrlen.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
<?php

declare(strict_types=1);

namespace PHPCompiler\ext\types;

use PHPCompiler\ext\standard\VmString;
use PHPCompiler\JIT\Context;
use PHPCompiler\JIT\Variable as JITVariable;
use PHPLLVM\Value;

/**
 * Code-generation helpers used by mb_strlen() to count UTF-8 characters
 * in the LLVM JIT/AOT backends (issue #158).
 */
final class JitMbStrlen
{
    /**
     * Build the value holding the UTF-8 character count of a string operand.
     *
     * If the operand carries a compile-time string, the count is computed right
     * away with VmString::utf8CharLength() and emitted as an 'int64' constant.
     * Otherwise a call to the registered `__compiler_utf8_strlen` function is
     * emitted with the loaded operand as its argument.
     *
     * @throws \LogicException when the operand is not of string type
     */
    public static function utf8Length(Context $context, JITVariable $arg): Value
    {
        if (JITVariable::TYPE_STRING !== $arg->type) {
            throw new \LogicException('mb_strlen() only supports strings in this compiler build');
        }

        $known = $arg->compileTimeString ?? null;

        if (null === $known) {
            // Length is only known at run time: load the operand, then call the helper.
            $operand = $context->helper->loadValue($arg);
            $runtimeFn = $context->lookupFunction('__compiler_utf8_strlen');

            return $context->builder->call($runtimeFn, $operand);
        }

        // Literal operand: fold the count into a constant.
        $charCount = VmString::utf8CharLength($known);

        return $context->constantFromInteger($charCount, 'int64');
    }
}
79 changes: 64 additions & 15 deletions ext/types/mb_strlen.php
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,11 @@
use PHPCompiler\Func\Internal;
use PHPCompiler\JIT\Context;
use PHPCompiler\JIT\Variable;
use PHPCompiler\VM\Variable as VMVariable;
use PHPLLVM\Value;

/**
* mb_strlen() — byte length of string (UTF-8 safe for ASCII subset in this compiler).
* mb_strlen() — UTF-8 character count for web forms (issue #158).
*/
final class mb_strlen extends Internal
{
Expand All @@ -23,31 +24,79 @@ public function __construct()

/**
 * VM implementation of mb_strlen(): writes the computed length into the frame's return variable.
 *
 * NOTE(review): this span is a rendered diff without +/- markers, so superseded lines
 * (the "exactly one argument" check and the VmString::byteLength() return) appear next to
 * their replacements and the braces do not balance as shown. The description below follows
 * the replacement lines; confirm against the actual file.
 *
 * Replacement logic: accepts one or two called arguments; the first must resolve to a
 * string-typed variable; the optional second is the encoding and must also be string-typed
 * (defaults to 'UTF-8'). Nothing is computed when the frame has no return variable.
 *
 * @throws \LogicException on a wrong argument count or a non-string argument
 */
public function execute(Frame $frame): void
{
if (1 !== \count($frame->calledArgs)) {
throw new \LogicException('mb_strlen() requires exactly one argument');
$argc = \count($frame->calledArgs);
if ($argc < 1 || $argc > 2) {
throw new \LogicException('mb_strlen() requires one or two arguments');
}
$var = $frame->calledArgs[0];
if (null !== $frame->returnVar) {
$frame->returnVar->int(VmString::byteLength($var->resolveIndirect()->toString()));
$strVar = $frame->calledArgs[0]->resolveIndirect();
if (VMVariable::TYPE_STRING !== $strVar->type) {
throw new \LogicException('mb_strlen() only supports strings in this compiler build');
}
// Type validation above runs even when the result is discarded; only the length computation is skipped.
if (null === $frame->returnVar) {
return;
}
$str = $strVar->toString();
$encoding = 'UTF-8';
if (2 === $argc) {
$encVar = $frame->calledArgs[1]->resolveIndirect();
if (VMVariable::TYPE_STRING !== $encVar->type) {
throw new \LogicException('mb_strlen() encoding must be a string in this compiler build');
}
$encoding = $encVar->toString();
}
$frame->returnVar->int(self::lengthForEncoding($str, $encoding));
}

public Context $context;

/**
 * JIT/AOT lowering of an mb_strlen() call.
 *
 * NOTE(review): this span is a rendered diff without +/- markers, so superseded lines
 * (the "exactly one argument" check and the `$this->context` based length lookup) appear
 * next to their replacements. The description below follows the replacement lines; confirm
 * against the actual file.
 *
 * Replacement logic: accepts one or two arguments. With one argument, or with a
 * compile-time encoding string equal to 'UTF-8', the work is delegated to
 * JitMbStrlen::utf8Length(). A compile-time encoding string other than 'ASCII' or '8BIT'
 * is rejected. In the remaining cases the result is the value loaded from the `length`
 * field of the string struct.
 *
 * NOTE(review): when the encoding argument has no compile-time string ($encoding is null)
 * neither the 'UTF-8' branch nor the rejection branch fires, so the `length` field is
 * returned whatever the encoding turns out to be at run time — confirm this is intended.
 * The comparisons are case-sensitive, so a literal such as 'utf-8' is rejected.
 *
 * @throws \LogicException on a wrong argument count, a non-string argument, or an
 *                         unsupported encoding literal
 */
public function call(Context $context, Variable ...$args): Value
{
$this->context = $context;
if (1 !== \count($args)) {
throw new \LogicException('mb_strlen() requires exactly one argument');
$argc = \count($args);
if ($argc < 1 || $argc > 2) {
throw new \LogicException('mb_strlen() requires one or two arguments');
}
if (1 === $argc) {
return JitMbStrlen::utf8Length($context, $args[0]);
}
if (Variable::TYPE_STRING !== $args[1]->type) {
throw new \LogicException('mb_strlen() encoding must be a string in this compiler build');
}
$encoding = $args[1]->compileTimeString ?? null;
if ('UTF-8' === $encoding) {
return JitMbStrlen::utf8Length($context, $args[0]);
}
if (null !== $encoding && 'ASCII' !== $encoding && '8BIT' !== $encoding) {
throw new \LogicException(
'mb_strlen() JIT only supports UTF-8, ASCII, or 8BIT encoding literals in this compiler build'
);
}
$argValue = $context->helper->loadValue($args[0]);
if (Variable::TYPE_STRING !== $args[0]->type) {
throw new \LogicException('mb_strlen() only supports strings in this compiler build');
}
$offset = $this->context->structFieldMap[$argValue->typeOf()->getElementType()->getName()]['length'];
$argValue = $context->helper->loadValue($args[0]);
$offset = $context->structFieldMap[$argValue->typeOf()->getElementType()->getName()]['length'];

return $context->builder->load(
$context->builder->structGep($argValue, $offset)
);
}

/**
 * Compute the length of $str under the given encoding name for the VM path.
 *
 * - 'UTF-8': native mb_strlen() when that function exists, otherwise VmString::utf8CharLength().
 * - 'ASCII' or '8BIT': VmString::byteLength().
 * - anything else: native mb_strlen() when that function exists, otherwise a LogicException.
 *
 * The encoding name is compared with ===, so only these exact spellings take the first
 * two branches; e.g. 'utf-8' goes to the generic branch.
 *
 * NOTE(review): this span is a rendered diff without +/- markers; the two
 * `$this->context->builder` lines before the final throw look like superseded lines from
 * the old call() body (they use `$this` inside a static method) — confirm against the
 * actual file.
 *
 * @throws \LogicException when the encoding is not handled and mb_strlen() is unavailable
 */
private static function lengthForEncoding(string $str, string $encoding): int
{
if ('UTF-8' === $encoding) {
if (\function_exists('mb_strlen')) {
return (int) \mb_strlen($str, 'UTF-8');
}

return VmString::utf8CharLength($str);
}
if ('ASCII' === $encoding || '8BIT' === $encoding) {
return VmString::byteLength($str);
}
if (\function_exists('mb_strlen')) {
return (int) \mb_strlen($str, $encoding);
}

return $this->context->builder->load(
$this->context->builder->structGep($argValue, $offset)
throw new \LogicException(
'mb_strlen() requires mbstring for encoding '.$encoding.' in this compiler build'
);
}
}
34 changes: 34 additions & 0 deletions lib/AOT/runtime/superglobals_refresh.c
Original file line number Diff line number Diff line change
Expand Up @@ -1114,6 +1114,40 @@ __string__ *__compiler_strip_tags(__string__ *input, __string__ *allowed)
}
}

/*
 * Runtime helper behind mb_strlen() in JIT/AOT builds (issue #158).
 *
 * Returns the number of UTF-8 characters in `input`, or 0 when `input` is NULL.
 * The lead byte of each sequence selects how many bytes are consumed (1 to 4);
 * a byte that starts no sequence, or a sequence that would run past the end of
 * the data, is consumed as a single byte and counted as one character.
 * Continuation bytes are not inspected.
 */
long long __compiler_utf8_strlen(__string__ *input)
{
    const char *data;
    size_t total;
    size_t pos;
    long long chars = 0;

    if (input == NULL) {
        return 0;
    }

    data = nf_strdata(input);
    total = nf_strlen(input);

    for (pos = 0; pos < total; chars++) {
        const unsigned char lead = (unsigned char) data[pos];
        size_t width = 1;

        /* Sequence width implied by the lead byte pattern. */
        if ((lead & 0xE0) == 0xC0) {
            width = 2;
        } else if ((lead & 0xF0) == 0xE0) {
            width = 3;
        } else if ((lead & 0xF8) == 0xF0) {
            width = 4;
        }

        /* Not enough bytes left for the announced width: count the lead byte alone. */
        if (width > total - pos) {
            width = 1;
        }

        pos += width;
    }

    return chars;
}

/*
* Zend parity for missing array string keys (issue #273).
* Called from JIT __hashtable__readStringKeyValue when lookup returns NULL.
Expand Down
8 changes: 8 additions & 0 deletions lib/JIT/Builtin/Type.php
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,14 @@ public function register(): void {
);
$fnStripTags = $this->context->module->addFunction('__compiler_strip_tags', $fntypeStripTags);
$this->context->registerFunction('__compiler_strip_tags', $fnStripTags);
$i64 = $this->context->getTypeFromString('int64');
$fntypeUtf8Strlen = $this->context->context->functionType(
$i64,
false,
$this->context->getTypeFromString('__string__*')
);
$fnUtf8Strlen = $this->context->module->addFunction('__compiler_utf8_strlen', $fntypeUtf8Strlen);
$this->context->registerFunction('__compiler_utf8_strlen', $fnUtf8Strlen);
HttpResponseCode::implement($this->context);
$i8p = $this->context->getTypeFromString('int8*');
$i32 = $this->context->getTypeFromString('int32');
Expand Down
13 changes: 13 additions & 0 deletions test/compliance/cases/stdlib/mb_strlen.phpt
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
--TEST--
stdlib mb_strlen() UTF-8 character count (VM)
--FILE--
<?php
echo mb_strlen('é', 'UTF-8'), "\n";
echo mb_strlen('hello', 'UTF-8'), "\n";
echo mb_strlen(''), "\n";
echo mb_strlen('abc'), "\n";
--EXPECT--
1
5
0
3
8 changes: 5 additions & 3 deletions test/compliance/cases/stdlib/mb_strlen_jit.phpt
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
--TEST--
stdlib mb_strlen() JIT byte length
stdlib mb_strlen() JIT UTF-8 character count
--FILE--
<?php
echo mb_strlen('é', 'UTF-8'), "\n";
echo mb_strlen('hello', 'UTF-8'), "\n";
echo mb_strlen(''), "\n";
echo mb_strlen('abc'), "\n";
echo mb_strlen("hello"), "\n";
--EXPECT--
1
5
0
3
5
Loading