diff --git a/src/coreclr/jit/assertionprop.cpp b/src/coreclr/jit/assertionprop.cpp index 2375158e7d11b1..25f6d163f20e32 100644 --- a/src/coreclr/jit/assertionprop.cpp +++ b/src/coreclr/jit/assertionprop.cpp @@ -228,6 +228,31 @@ bool IntegralRange::Contains(int64_t value) const rangeType = compiler->lvaGetDesc(node->AsLclVar())->TypeGet(); } + // If the local's conservative VN identifies it as the result of a CAST + // to a small type, the value stored is bounded by that type's range. + // + // This refinement is only sound when the local's storage is fully + // normalized to the cast's destination type. For "normalize on load" + // small-type locals the store writes only the low bits, leaving the + // upper bits stale; tightening the range here would let downstream + // optimizations (e.g. fgOptimizeCast) drop a required sign/zero + // extending load and read those stale bits. + if ((compiler->vnStore != nullptr) && (!varTypeIsSmall(varDsc->TypeGet()) || varDsc->lvNormalizeOnStore())) + { + ValueNum vn = compiler->vnStore->VNConservativeNormalValue(node->gtVNPair); + VNFuncApp funcApp; + if (compiler->vnStore->GetVNFunc(vn, &funcApp) && (funcApp.m_func == VNF_Cast)) + { + var_types castToType; + bool srcIsUnsigned; + compiler->vnStore->GetCastOperFromVN(funcApp.m_args[1], &castToType, &srcIsUnsigned); + if (varTypeIsSmall(castToType)) + { + return ForType(castToType); + } + } + } + if (varDsc->IsNeverNegative()) { return {SymbolicIntegerValue::Zero, UpperBoundForType(rangeType)}; @@ -532,12 +557,21 @@ bool IntegralRange::Contains(int64_t value) const // CAST(ulong/long <- int) - [INT_MIN..INT_MAX] if (!cast->gtOverflow()) { - if ((fromType == TYP_INT) && fromUnsigned) + IntegralRange typeRange = (fromType == TYP_INT) && fromUnsigned + ? IntegralRange{SymbolicIntegerValue::Zero, SymbolicIntegerValue::UIntMax} + : IntegralRange{SymbolicIntegerValue::IntMin, SymbolicIntegerValue::IntMax}; + + // Refine using the operand's known range: when the operand fits entirely + // inside what the cast can represent, the cast preserves the value, so we + // can tighten the output range to the operand's range. This handles cases + // like CAST(int <- ulong) of a value already known to be in uint16 range. + IntegralRange operandRange = ForNode(cast->CastOp(), compiler); + if (typeRange.Contains(operandRange)) { - return {SymbolicIntegerValue::Zero, SymbolicIntegerValue::UIntMax}; + return operandRange; } - return {SymbolicIntegerValue::IntMin, SymbolicIntegerValue::IntMax}; + return typeRange; } SymbolicIntegerValue lowerBound; @@ -4044,6 +4078,7 @@ GenTree* Compiler::optAssertionProp_AddMulSub(ASSERT_VALARG_TP assertions, GenTr // 1) Convert DIV/MOD to UDIV/UMOD if both operands are proven to be never negative // 2) Marks DIV/UDIV/MOD/UMOD with GTF_DIV_MOD_NO_BY_ZERO if divisor is proven to be never zero // 3) Marks DIV/UDIV/MOD/UMOD with GTF_DIV_MOD_NO_OVERFLOW if both operands are proven to be never negative +// 4) Marks UMOD with GTF_UMOD_UINT16_OPERANDS if both operands are proven to be in uint16 range // // Arguments: // assertions - set of live assertions @@ -4091,6 +4126,22 @@ GenTree* Compiler::optAssertionProp_ModDiv(ASSERT_VALARG_TP assertions, changed = true; } +#ifdef TARGET_64BIT + // Detect "uint16 % const-uint16" patterns so lowering can emit a cheaper FastMod sequence. + if (((tree->gtFlags & GTF_UMOD_UINT16_OPERANDS) == 0) && tree->OperIs(GT_UMOD) && !opts.MinOpts() && + op2->IsCnsIntOrI()) + { + ssize_t divisorValue = op2->AsIntCon()->IconValue(); + if ((divisorValue > 0) && FitsIn(divisorValue) && + IntegralRange::ForType(TYP_USHORT).Contains(IntegralRange::ForNode(op1, this))) + { + JITDUMP("Both operands for UMOD are in uint16 range...\n") + tree->gtFlags |= GTF_UMOD_UINT16_OPERANDS; + changed = true; + } + } +#endif // TARGET_64BIT + return changed ? optAssertionProp_Update(tree, tree, stmt) : nullptr; } diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 1946ace685bc22..162c5f9f828582 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -2672,8 +2672,8 @@ bool GenTree::Compare(GenTree* op1, GenTree* op2, bool swapOK) } if (op1->OperIs(GT_MOD, GT_UMOD, GT_DIV, GT_UDIV)) { - if ((op1->gtFlags & (GTF_DIV_MOD_NO_BY_ZERO | GTF_DIV_MOD_NO_OVERFLOW)) != - (op2->gtFlags & (GTF_DIV_MOD_NO_BY_ZERO | GTF_DIV_MOD_NO_OVERFLOW))) + if ((op1->gtFlags & (GTF_DIV_MOD_NO_BY_ZERO | GTF_DIV_MOD_NO_OVERFLOW | GTF_UMOD_UINT16_OPERANDS)) != + (op2->gtFlags & (GTF_DIV_MOD_NO_BY_ZERO | GTF_DIV_MOD_NO_OVERFLOW | GTF_UMOD_UINT16_OPERANDS))) { return false; } diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index a71ed68351c9b4..bec522a8efe539 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -528,6 +528,8 @@ enum GenTreeFlags : unsigned GTF_DIV_MOD_NO_OVERFLOW = 0x40000000, // GT_DIV, GT_MOD -- Div or mod definitely does not overflow. + GTF_UMOD_UINT16_OPERANDS = 0x80000000, // UMOD -- Both operands to a mod are in uint16 range. The divisor is non-zero constant. + GTF_CHK_INDEX_INBND = 0x80000000, // GT_BOUNDS_CHECK -- have proven this check is always in-bounds GTF_ARRLEN_NONFAULTING = 0x20000000, // GT_ARR_LENGTH -- An array length operation that cannot fault. Same as GT_IND_NONFAULTING. diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 6e7f6824ecea3a..6ab398c96ee9c1 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -8088,7 +8088,10 @@ bool Lowering::TryLowerConstIntUDivOrUMod(GenTreeOp* divMod) assert(varTypeIsFloating(divMod->TypeGet())); #endif // USE_HELPERS_FOR_INT_DIV #if defined(TARGET_ARM64) - assert(!divMod->OperIs(GT_UMOD)); + // ARM64 has no remainder instruction. Morph rewrites every non-pow2 MOD/UMOD + // into a SUB-MUL-DIV form except when the FastMod path here can apply, which + // requires a constant uint16 divisor (and a uint16-range dividend). + assert(!divMod->OperIs(GT_UMOD) || divMod->gtGetOp2()->IsCnsIntOrI()); #endif // TARGET_ARM64 GenTree* dividend = divMod->gtGetOp1(); @@ -8170,10 +8173,78 @@ bool Lowering::TryLowerConstIntUDivOrUMod(GenTreeOp* divMod) } } + assert(divisorValue >= 3); + // TODO-ARM-CQ: Currently there's no GT_MULHI for ARM32 #if defined(TARGET_XARCH) || defined(TARGET_ARM64) || defined(TARGET_LOONGARCH64) || defined(TARGET_RISCV64) - if (!m_compiler->opts.MinOpts() && (divisorValue >= 3)) + if (!m_compiler->opts.MinOpts()) { +#ifdef TARGET_64BIT + // Replace (uint16 % uint16) with a cheaper variant of FastMod, specialized for 16-bit operands. + // + // uint multiplier = uint.MaxValue / divisor + 1; + // ulong result = ((ulong)(dividend * multiplier) * divisor) >> 32; + // return (int)result; + // + // The dividend is first truncated to TYP_INT (safe because we know it fits in + // uint16), and the final value (always in [0, divisor)) is widened back to the + // mod's result type. The dividend's range may have been recovered either + // statically by IntegralRange::ForNode at this point, or earlier by assertion + // propagation (which leaves GTF_UMOD_UINT16_OPERANDS for us). + if (!isDiv && FitsIn(divisorValue) && + (((divMod->gtFlags & GTF_UMOD_UINT16_OPERANDS) != 0) || + IntegralRange::ForType(TYP_USHORT).Contains(IntegralRange::ForNode(dividend, m_compiler)))) + { + GenTree* dividendInt = dividend; + if (genActualType(dividend) != TYP_INT) + { + assert(genActualType(dividend) == TYP_LONG); + dividendInt = m_compiler->gtNewCastNode(TYP_INT, dividend, /* unsigned */ true, TYP_INT); + BlockRange().InsertBefore(divMod, dividendInt); + } + + GenTree* multiplier = + m_compiler->gtNewIconNode(static_cast((UINT32_MAX / divisorValue) + 1), TYP_INT); + GenTree* mul1 = m_compiler->gtNewOperNode(GT_MUL, TYP_INT, dividendInt, multiplier); + mul1->SetUnsigned(); + GenTree* castUp = m_compiler->gtNewCastNode(TYP_LONG, mul1, /* unsigned */ true, TYP_LONG); + BlockRange().InsertBefore(divMod, multiplier, mul1, castUp); + + // Reuse the existing constant divisor as a TYP_LONG operand for the second multiply. + divisor->BashToConst(static_cast(divisorValue)); + + GenTree* mul2 = m_compiler->gtNewOperNode(GT_MUL, TYP_LONG, castUp, divisor); + mul2->SetUnsigned(); + GenTree* shiftAmount = m_compiler->gtNewIconNode(32); + BlockRange().InsertBefore(divMod, mul2, shiftAmount); + + GenTree* firstNode = (dividendInt == dividend) ? multiplier : dividendInt; + + if (type == TYP_LONG) + { + // The shift result is already TYP_LONG; turn divMod itself into the shift. + divMod->ChangeOper(GT_RSZ); + divMod->gtOp1 = mul2; + divMod->gtOp2 = shiftAmount; + } + else + { + assert(type == TYP_INT); + GenTree* shift = m_compiler->gtNewOperNode(GT_RSZ, TYP_LONG, mul2, shiftAmount); + BlockRange().InsertBefore(divMod, shift); + + divMod->ChangeOper(GT_CAST); + divMod->AsCast()->gtCastType = TYP_INT; + divMod->gtOp1 = shift; + divMod->gtOp2 = nullptr; + divMod->SetUnsigned(); + } + + ContainCheckRange(firstNode, divMod); + return true; + } +#endif // TARGET_64BIT + size_t magic; bool increment; int preShift; diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp index f5dd9fe9bef06a..b7aceaf6119f0d 100644 --- a/src/coreclr/jit/morph.cpp +++ b/src/coreclr/jit/morph.cpp @@ -7424,6 +7424,26 @@ GenTree* Compiler::fgMorphSmpOp(GenTree* tree, bool* optAssertionPropDone) else if (tree->OperIs(GT_MOD) && op2->IsIntegralConst() && !op2->IsIntegralConstAbsPow2()) #endif { +#if defined(TARGET_64BIT) + // Skip this transform when lowering will be able to apply the + // cheaper uint16 FastMod sequence. This requires the divisor to + // be a non-zero constant in uint16 range and the dividend's + // static IntegralRange to already fit in uint16. + if (!opts.MinOpts() && op2->IsCnsIntOrI()) + { + ssize_t modDivisor = op2->AsIntCon()->IconValue(); + if ((modDivisor > 0) && FitsIn(modDivisor) && + IntegralRange::ForType(TYP_USHORT).Contains(IntegralRange::ForNode(op1, this))) + { + if (tree->OperIs(GT_MOD)) + { + tree->SetOper(GT_UMOD); + } + break; + } + } +#endif // TARGET_64BIT + // Transformation: a % b = a - (a / b) * b; tree = fgMorphModToSubMulDiv(tree->AsOp()); op1 = tree->AsOp()->gtOp1; diff --git a/src/tests/JIT/opt/Divide/Regressions/Regression4_Divide.cs b/src/tests/JIT/opt/Divide/Regressions/Regression4_Divide.cs new file mode 100644 index 00000000000000..eec5af4a8dd5bc --- /dev/null +++ b/src/tests/JIT/opt/Divide/Regressions/Regression4_Divide.cs @@ -0,0 +1,169 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System; +using System.Numerics; +using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics.X86; +using Xunit; + +public class Program +{ + private static ushort s_field1; + private static ulong s_field2; + + [MethodImpl(MethodImplOptions.NoInlining)] + public static uint Umod_U4_CharByZero(char c) + { + return (uint)c % 0; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + public static ushort Umod_U2_CharByConst(char c) + { + return (ushort)(c % 42); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + public static int Umod_I4_CharByConst(char c) + { + return c % 42; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + public static uint Umod_U4_CharByConst(char c) + { + return (uint)c % 42; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + public static long Umod_I8_CharByConst(char c) + { + return (long)c % 42; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + public static ulong Umod_U8_CharByConst(char c) + { + return (ulong)c % 42; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + public static bool TestIsWhiteSpace(char c) + { + ReadOnlySpan HashEntries = [' ', ' ', '\u00A0', ' ', ' ', ' ', ' ', ' ', ' ', '\t', '\n', '\v', '\f', '\r', ' ', ' ', '\u2028', '\u2029', ' ', ' ', ' ', ' ', ' ', '\u202F', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '\u3000', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '\u0085', '\u2000', '\u2001', '\u2002', '\u2003', '\u2004', '\u2005', '\u2006', '\u2007', '\u2008', '\u2009', '\u200A', ' ', ' ', ' ', ' ', ' ', '\u205F', '\u1680', ' ', ' ', ' ', ' ', ' ', ' ']; + return HashEntries[c % HashEntries.Length] == c; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + public static ushort Umod_TZC(ulong value) + { + return (ushort)(BitOperations.TrailingZeroCount(value) % 42); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + public static ushort Umod_TZC_Intrinsic(ulong value) + { + return (ushort)(Bmi1.X64.TrailingZeroCount(value) % 42); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + public static int Umod_UInt16Range_ByConst(int value) + { + if (value is > 0 and < 1234) + { + return value % 123; + } + + return -1; + } + + [MethodImpl(MethodImplOptions.AggressiveOptimization)] + private static void Test1() + { + for (int i = 0; i < 2; i++) + { + Core(); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static void Core() + { + s_field1 = (ushort)(Bmi1.X64.TrailingZeroCount(s_field2) % 42); + } + } + + [Fact] + public static int TestEntryPoint() + { + try + { + Umod_U4_CharByZero('a'); + return 0; + } + catch (DivideByZeroException) { } + + if (Umod_U2_CharByConst('a') != 13) + return 0; + + if (Umod_I4_CharByConst('a') != 13) + return 0; + + if (Umod_U4_CharByConst('a') != 13) + return 0; + + if (Umod_I8_CharByConst('a') != 13) + return 0; + + if (Umod_U8_CharByConst('a') != 13) + return 0; + + if (!TestIsWhiteSpace(' ')) + return 0; + + if (!TestIsWhiteSpace('\u2029')) + return 0; + + if (TestIsWhiteSpace('\0')) + return 0; + + if (TestIsWhiteSpace('a')) + return 0; + + if (Umod_TZC(1L << 40) != 40) + return 0; + + if (Umod_TZC(1L << 50) != 8) + return 0; + + if (Bmi1.X64.IsSupported) + { + if (Umod_TZC_Intrinsic(1L << 40) != 40) + return 0; + + if (Umod_TZC_Intrinsic(1L << 50) != 8) + return 0; + } + + if (Umod_UInt16Range_ByConst(0) != -1) + return 0; + + if (Umod_UInt16Range_ByConst(42) != 42) + return 0; + + if (Umod_UInt16Range_ByConst(123) != 0) + return 0; + + if (Bmi1.X64.IsSupported) + { + s_field1 = 1; + s_field2 = 1L << 50; + Test1(); + + if (s_field1 != 8) + return 0; + } + + return 100; + } +} diff --git a/src/tests/JIT/opt/Divide/Regressions/Regression4_Divide.csproj b/src/tests/JIT/opt/Divide/Regressions/Regression4_Divide.csproj new file mode 100644 index 00000000000000..501217e4d86892 --- /dev/null +++ b/src/tests/JIT/opt/Divide/Regressions/Regression4_Divide.csproj @@ -0,0 +1,9 @@ + + + None + True + + + + +