Skip to content

Commit 5c2e34b

Browse files
kubafloCopilot
andcommitted
Fix DataFrame.OrderBy to perform stable sorting
Replace the unstable IntrospectiveSort (introsort) with a stable merge sort implementation for per-buffer sorting in both PrimitiveDataFrameColumn and StringDataFrameColumn. Also fix the multi-buffer merge phase to always write left-to-right using a direction-aware comparer, ensuring stability for both ascending and descending sorts. Fixes #6443 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent df690c2 commit 5c2e34b

4 files changed

Lines changed: 326 additions & 15 deletions

File tree

src/Microsoft.Data.Analysis/DataFrameColumn.cs

Lines changed: 163 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -469,16 +469,15 @@ public string ToString(long rowsToShow)
469469
protected static void PopulateColumnSortIndicesWithHeap<T>(SortedDictionary<T, List<ValueTuple<int, int>>> heapOfValueAndListOfTupleOfSortAndBufferIndex,
470470
PrimitiveDataFrameColumn<long> columnSortIndices,
471471
PrimitiveDataFrameColumn<long> columnNullIndices,
472-
bool ascending,
473472
bool putNullValuesLast,
474473
GetBufferSortIndex getBufferSortIndex,
475474
GetValueAndBufferSortIndexAtBuffer<T> getValueAndBufferSortIndexAtBuffer,
476475
GetBufferLengthAtIndex getBufferLengthAtIndex)
477476
{
478-
long i = ascending ? columnNullIndices.Length : columnSortIndices.Length - 1;
479-
480-
if (putNullValuesLast)
481-
i -= columnNullIndices.Length;
477+
// Always write left-to-right. For descending order, callers pass a SortedDictionary
478+
// with a reversed comparer so that ElementAt(0) yields the largest value first.
479+
// This ensures stable sorting: equal elements preserve their original relative order.
480+
long i = putNullValuesLast ? 0 : columnNullIndices.Length;
482481

483482
while (heapOfValueAndListOfTupleOfSortAndBufferIndex.Count > 0)
484483
{
@@ -499,7 +498,7 @@ protected static void PopulateColumnSortIndicesWithHeap<T>(SortedDictionary<T, L
499498
int bufferIndex = sortAndBufferIndex.bufferIndex;
500499
long bufferSortIndex = getBufferSortIndex(bufferIndex, sortIndex);
501500

502-
columnSortIndices[ascending ? i++ : i--] = bufferSortIndex;
501+
columnSortIndices[i++] = bufferSortIndex;
503502

504503
if (sortIndex + 1 < getBufferLengthAtIndex(bufferIndex))
505504
{
@@ -758,5 +757,163 @@ private static void Sort2<TKey>(
758757
sortIndices[j] = temp;
759758
}
760759
}
760+
761+
/// <summary>
762+
/// Stable merge sort that reorders <paramref name="sortIndices"/> so that
763+
/// the values in <paramref name="span"/> are visited in sorted order.
764+
/// </summary>
765+
protected static void MergeSortIndices<TKey>(
766+
ReadOnlySpan<TKey> span,
767+
int length,
768+
Span<int> sortIndices,
769+
IComparer<TKey> comparer)
770+
{
771+
if (length <= 1)
772+
return;
773+
774+
int[] auxiliary = System.Buffers.ArrayPool<int>.Shared.Rent(length);
775+
try
776+
{
777+
MergeSortIndicesRecursive(span, sortIndices, auxiliary, 0, length - 1, comparer);
778+
}
779+
finally
780+
{
781+
System.Buffers.ArrayPool<int>.Shared.Return(auxiliary);
782+
}
783+
}
784+
785+
/// <summary>
786+
/// Stable merge sort overload that accepts an <see cref="IList{TKey}"/> instead of a span,
787+
/// avoiding the need to copy to an array first.
788+
/// </summary>
789+
protected static void MergeSortIndices<TKey>(
790+
IList<TKey> list,
791+
int length,
792+
Span<int> sortIndices,
793+
IComparer<TKey> comparer)
794+
{
795+
if (length <= 1)
796+
return;
797+
798+
int[] auxiliary = System.Buffers.ArrayPool<int>.Shared.Rent(length);
799+
try
800+
{
801+
MergeSortIndicesRecursive(list, sortIndices, auxiliary, 0, length - 1, comparer);
802+
}
803+
finally
804+
{
805+
System.Buffers.ArrayPool<int>.Shared.Return(auxiliary);
806+
}
807+
}
808+
809+
private static void MergeSortIndicesRecursive<TKey>(
810+
ReadOnlySpan<TKey> span,
811+
Span<int> sortIndices,
812+
int[] auxiliary,
813+
int lo,
814+
int hi,
815+
IComparer<TKey> comparer)
816+
{
817+
if (lo >= hi)
818+
return;
819+
820+
int mid = lo + (hi - lo) / 2;
821+
MergeSortIndicesRecursive(span, sortIndices, auxiliary, lo, mid, comparer);
822+
MergeSortIndicesRecursive(span, sortIndices, auxiliary, mid + 1, hi, comparer);
823+
Merge(span, sortIndices, auxiliary, lo, mid, hi, comparer);
824+
}
825+
826+
private static void MergeSortIndicesRecursive<TKey>(
827+
IList<TKey> list,
828+
Span<int> sortIndices,
829+
int[] auxiliary,
830+
int lo,
831+
int hi,
832+
IComparer<TKey> comparer)
833+
{
834+
if (lo >= hi)
835+
return;
836+
837+
int mid = lo + (hi - lo) / 2;
838+
MergeSortIndicesRecursive(list, sortIndices, auxiliary, lo, mid, comparer);
839+
MergeSortIndicesRecursive(list, sortIndices, auxiliary, mid + 1, hi, comparer);
840+
MergeList(list, sortIndices, auxiliary, lo, mid, hi, comparer);
841+
}
842+
843+
private static void Merge<TKey>(
844+
ReadOnlySpan<TKey> span,
845+
Span<int> sortIndices,
846+
int[] auxiliary,
847+
int lo,
848+
int mid,
849+
int hi,
850+
IComparer<TKey> comparer)
851+
{
852+
for (int k = lo; k <= hi; k++)
853+
{
854+
auxiliary[k] = sortIndices[k];
855+
}
856+
857+
int i = lo;
858+
int j = mid + 1;
859+
860+
for (int k = lo; k <= hi; k++)
861+
{
862+
if (i > mid)
863+
{
864+
sortIndices[k] = auxiliary[j++];
865+
}
866+
else if (j > hi)
867+
{
868+
sortIndices[k] = auxiliary[i++];
869+
}
870+
else if (comparer.Compare(span[auxiliary[i]], span[auxiliary[j]]) <= 0)
871+
{
872+
sortIndices[k] = auxiliary[i++];
873+
}
874+
else
875+
{
876+
sortIndices[k] = auxiliary[j++];
877+
}
878+
}
879+
}
880+
881+
private static void MergeList<TKey>(
882+
IList<TKey> list,
883+
Span<int> sortIndices,
884+
int[] auxiliary,
885+
int lo,
886+
int mid,
887+
int hi,
888+
IComparer<TKey> comparer)
889+
{
890+
for (int k = lo; k <= hi; k++)
891+
{
892+
auxiliary[k] = sortIndices[k];
893+
}
894+
895+
int i = lo;
896+
int j = mid + 1;
897+
898+
for (int k = lo; k <= hi; k++)
899+
{
900+
if (i > mid)
901+
{
902+
sortIndices[k] = auxiliary[j++];
903+
}
904+
else if (j > hi)
905+
{
906+
sortIndices[k] = auxiliary[i++];
907+
}
908+
else if (comparer.Compare(list[auxiliary[i]], list[auxiliary[j]]) <= 0)
909+
{
910+
sortIndices[k] = auxiliary[i++];
911+
}
912+
else
913+
{
914+
sortIndices[k] = auxiliary[j++];
915+
}
916+
}
917+
}
761918
}
762919
}

src/Microsoft.Data.Analysis/DataFrameColumns/StringDataFrameColumn.cs

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,7 @@ public IEnumerator<string> GetEnumerator()
205205
protected internal override PrimitiveDataFrameColumn<long> GetSortIndices(bool ascending, bool putNullValuesLast)
206206
{
207207
var comparer = Comparer<string>.Default;
208+
IComparer<string> sortComparer = ascending ? comparer : Comparer<string>.Create((a, b) => comparer.Compare(b, a));
208209

209210
List<int[]> bufferSortIndices = new List<int[]>(_stringBuffers.Count);
210211
var columnNullIndices = new Int64DataFrameColumn("NullIndices", NullCount);
@@ -221,9 +222,7 @@ protected internal override PrimitiveDataFrameColumn<long> GetSortIndices(bool a
221222
nullIndicesSlot++;
222223
}
223224
}
224-
// TODO: Refactor the sort routine to also work with IList?
225-
string[] array = buffer.ToArray();
226-
IntrospectiveSort(array, array.Length, sortIndices, comparer);
225+
MergeSortIndices(buffer, buffer.Count, sortIndices, sortComparer);
227226
bufferSortIndices.Add(sortIndices);
228227
}
229228
// Simple merge sort to build the full column's sort indices
@@ -237,7 +236,7 @@ ValueTuple<string, int> GetFirstNonNullValueStartingAtIndex(int stringBufferInde
237236
return (value, startIndex);
238237
}
239238

240-
SortedDictionary<string, List<ValueTuple<int, int>>> heapOfValueAndListOfTupleOfSortAndBufferIndex = new SortedDictionary<string, List<ValueTuple<int, int>>>(comparer);
239+
SortedDictionary<string, List<ValueTuple<int, int>>> heapOfValueAndListOfTupleOfSortAndBufferIndex = new SortedDictionary<string, List<ValueTuple<int, int>>>(sortComparer);
241240
List<List<string>> buffers = _stringBuffers;
242241
for (int i = 0; i < buffers.Count; i++)
243242
{
@@ -266,7 +265,6 @@ ValueTuple<string, int> GetFirstNonNullValueStartingAtIndex(int stringBufferInde
266265
PopulateColumnSortIndicesWithHeap(heapOfValueAndListOfTupleOfSortAndBufferIndex,
267266
columnSortIndices,
268267
columnNullIndices,
269-
ascending,
270268
putNullValuesLast,
271269
getBufferSortIndex,
272270
getValueAtBuffer,

src/Microsoft.Data.Analysis/PrimitiveDataFrameColumn.Sort.cs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ public partial class PrimitiveDataFrameColumn<T> : DataFrameColumn
2121
protected internal override PrimitiveDataFrameColumn<long> GetSortIndices(bool ascending = true, bool putNullValuesLast = true)
2222
{
2323
var comparer = Comparer<T>.Default;
24+
IComparer<T> sortComparer = ascending ? comparer : Comparer<T>.Create((a, b) => comparer.Compare(b, a));
2425

2526
List<List<int>> bufferSortIndices = new List<List<int>>(_columnContainer.Buffers.Count);
2627
var columnNullIndices = new Int64DataFrameColumn("NullIndices", NullCount);
@@ -35,8 +36,8 @@ protected internal override PrimitiveDataFrameColumn<long> GetSortIndices(bool a
3536
{
3637
sortIndices[i] = i;
3738
}
38-
IntrospectiveSort(buffer.ReadOnlySpan, buffer.Length, sortIndices, comparer);
39-
// Bug fix: QuickSort is not stable. When PrimitiveDataFrameColumn has null values and default values, they move around
39+
MergeSortIndices(buffer.ReadOnlySpan, buffer.Length, sortIndices, sortComparer);
40+
// Stable sort preserves relative order of equal elements and null/default values
4041
List<int> nonNullSortIndices = new List<int>();
4142
for (int i = 0; i < sortIndices.Length; i++)
4243
{
@@ -78,7 +79,7 @@ ValueTuple<T, int> GetFirstNonNullValueAndBufferIndexStartingAtIndex(int bufferI
7879
return (value, startIndex);
7980
}
8081

81-
SortedDictionary<T, List<ValueTuple<int, int>>> heapOfValueAndListOfTupleOfSortAndBufferIndex = new SortedDictionary<T, List<ValueTuple<int, int>>>(comparer);
82+
SortedDictionary<T, List<ValueTuple<int, int>>> heapOfValueAndListOfTupleOfSortAndBufferIndex = new SortedDictionary<T, List<ValueTuple<int, int>>>(sortComparer);
8283
IList<ReadOnlyDataFrameBuffer<T>> buffers = _columnContainer.Buffers;
8384
for (int i = 0; i < buffers.Count; i++)
8485
{
@@ -107,7 +108,6 @@ ValueTuple<T, int> GetFirstNonNullValueAndBufferIndexStartingAtIndex(int bufferI
107108
PopulateColumnSortIndicesWithHeap(heapOfValueAndListOfTupleOfSortAndBufferIndex,
108109
columnSortIndices,
109110
columnNullIndices,
110-
ascending,
111111
putNullValuesLast,
112112
getBufferSortIndex,
113113
getValueAndBufferSortIndexAtBuffer,

0 commit comments

Comments
 (0)