From d3feaf4a7484f0a249ad3874c8b8054441d354de Mon Sep 17 00:00:00 2001 From: hieuddo Date: Thu, 21 May 2026 16:53:02 +0800 Subject: [PATCH 1/3] dataset: add session-aware datasets: Diginetica, RetailRocket, and Cosmetics --- cornac/datasets/cosmetics.py | 88 ++++++++++++++++++++++ cornac/datasets/diginetica.py | 88 ++++++++++++++++++++++ cornac/datasets/retailrocket.py | 88 ++++++++++++++++++++++ tests/cornac/datasets/test_cosmetics.py | 38 ++++++++++ tests/cornac/datasets/test_diginetica.py | 38 ++++++++++ tests/cornac/datasets/test_retailrocket.py | 38 ++++++++++ 6 files changed, 378 insertions(+) create mode 100644 cornac/datasets/cosmetics.py create mode 100644 cornac/datasets/diginetica.py create mode 100644 cornac/datasets/retailrocket.py create mode 100644 tests/cornac/datasets/test_cosmetics.py create mode 100644 tests/cornac/datasets/test_diginetica.py create mode 100644 tests/cornac/datasets/test_retailrocket.py diff --git a/cornac/datasets/cosmetics.py b/cornac/datasets/cosmetics.py new file mode 100644 index 000000000..2cf0d7ae8 --- /dev/null +++ b/cornac/datasets/cosmetics.py @@ -0,0 +1,88 @@ +# Copyright 2026 The Cornac Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +""" +Cosmetics dataset: an e-commerce dataset containing the event history of a cosmetics shop for five months. 
+""" + +from typing import List + +from ..data import Reader +from ..utils import cache + + +def load_train(fmt="USIT", reader: Reader = None) -> List: + """Load train data + + Parameters + ---------- + reader: `obj:cornac.data.Reader`, default: None + Reader object used to read the data. + + Returns + ------- + data: array-like + Data as returned by `reader.read` called with `fmt`; for the default "USIT", a list of tuples (user, session, item, timestamp). + """ + fpath = cache( + url="https://static.preferred.ai/cornac/datasets/cosmetics/train.zip", + unzip=True, + relative_path="cosmetics/train.csv", + ) + reader = Reader() if reader is None else reader + return reader.read(fpath, fmt=fmt, sep=",") + + +def load_val(fmt="USIT", reader: Reader = None) -> List: + """Load validation data + + Parameters + ---------- + reader: `obj:cornac.data.Reader`, default: None + Reader object used to read the data. + + Returns + ------- + data: array-like + Data as returned by `reader.read` called with `fmt`; for the default "USIT", a list of tuples (user, session, item, timestamp). + """ + fpath = cache( + url="https://static.preferred.ai/cornac/datasets/cosmetics/val.zip", + unzip=True, + relative_path="cosmetics/val.csv", + ) + reader = Reader() if reader is None else reader + return reader.read(fpath, fmt=fmt, sep=",") + + +def load_test(fmt="USIT", reader: Reader = None) -> List: + """Load test data + + Parameters + ---------- + reader: `obj:cornac.data.Reader`, default: None + Reader object used to read the data. + + Returns + ------- + data: array-like + Data as returned by `reader.read` called with `fmt`; for the default "USIT", a list of tuples (user, session, item, timestamp). 
+ """ + fpath = cache( + url="https://static.preferred.ai/cornac/datasets/cosmetics/test.zip", + unzip=True, + relative_path="cosmetics/test.csv", + ) + reader = Reader() if reader is None else reader + return reader.read(fpath, fmt=fmt, sep=",") diff --git a/cornac/datasets/diginetica.py b/cornac/datasets/diginetica.py new file mode 100644 index 000000000..23daf6334 --- /dev/null +++ b/cornac/datasets/diginetica.py @@ -0,0 +1,88 @@ +# Copyright 2026 The Cornac Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +""" +Diginetica dataset is originally from the CIKM 2016 competition. +""" + +from typing import List + +from ..data import Reader +from ..utils import cache + + +def load_train(fmt="USIT", reader: Reader = None) -> List: + """Load train data + + Parameters + ---------- + reader: `obj:cornac.data.Reader`, default: None + Reader object used to read the data. + + Returns + ------- + data: array-like + Data as returned by `reader.read` called with `fmt`; for the default "USIT", a list of tuples (user, session, item, timestamp). 
+ """ + fpath = cache( + url="https://static.preferred.ai/cornac/datasets/diginetica/train.zip", + unzip=True, + relative_path="diginetica/train.csv", + ) + reader = Reader() if reader is None else reader + return reader.read(fpath, fmt=fmt, sep=",") + + +def load_val(fmt="USIT", reader: Reader = None) -> List: + """Load validation data + + Parameters + ---------- + reader: `obj:cornac.data.Reader`, default: None + Reader object used to read the data. + + Returns + ------- + data: array-like + Data as returned by `reader.read` called with `fmt`; for the default "USIT", a list of tuples (user, session, item, timestamp). + """ + fpath = cache( + url="https://static.preferred.ai/cornac/datasets/diginetica/val.zip", + unzip=True, + relative_path="diginetica/val.csv", + ) + reader = Reader() if reader is None else reader + return reader.read(fpath, fmt=fmt, sep=",") + + +def load_test(fmt="USIT", reader: Reader = None) -> List: + """Load test data + + Parameters + ---------- + reader: `obj:cornac.data.Reader`, default: None + Reader object used to read the data. + + Returns + ------- + data: array-like + Data as returned by `reader.read` called with `fmt`; for the default "USIT", a list of tuples (user, session, item, timestamp). + """ + fpath = cache( + url="https://static.preferred.ai/cornac/datasets/diginetica/test.zip", + unzip=True, + relative_path="diginetica/test.csv", + ) + reader = Reader() if reader is None else reader + return reader.read(fpath, fmt=fmt, sep=",") diff --git a/cornac/datasets/retailrocket.py b/cornac/datasets/retailrocket.py new file mode 100644 index 000000000..4fa4794f7 --- /dev/null +++ b/cornac/datasets/retailrocket.py @@ -0,0 +1,88 @@ +# Copyright 2026 The Cornac Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +""" +RetailRocket dataset: e-commerce web events (clicks, add to carts, transactions) data for 4.5 months. +""" + +from typing import List + +from ..data import Reader +from ..utils import cache + + +def load_train(fmt="USIT", reader: Reader = None) -> List: + """Load train data + + Parameters + ---------- + reader: `obj:cornac.data.Reader`, default: None + Reader object used to read the data. + + Returns + ------- + data: array-like + Data as returned by `reader.read` called with `fmt`; for the default "USIT", a list of tuples (user, session, item, timestamp). + """ + fpath = cache( + url="https://static.preferred.ai/cornac/datasets/retailrocket/train.zip", + unzip=True, + relative_path="retailrocket/train.csv", + ) + reader = Reader() if reader is None else reader + return reader.read(fpath, fmt=fmt, sep=",") + + +def load_val(fmt="USIT", reader: Reader = None) -> List: + """Load validation data + + Parameters + ---------- + reader: `obj:cornac.data.Reader`, default: None + Reader object used to read the data. + + Returns + ------- + data: array-like + Data as returned by `reader.read` called with `fmt`; for the default "USIT", a list of tuples (user, session, item, timestamp). 
+ """ + fpath = cache( + url="https://static.preferred.ai/cornac/datasets/retailrocket/val.zip", + unzip=True, + relative_path="retailrocket/val.csv", + ) + reader = Reader() if reader is None else reader + return reader.read(fpath, fmt=fmt, sep=",") + + +def load_test(fmt="USIT", reader: Reader = None) -> List: + """Load test data + + Parameters + ---------- + reader: `obj:cornac.data.Reader`, default: None + Reader object used to read the data. + + Returns + ------- + data: array-like + Data as returned by `reader.read` called with `fmt`; for the default "USIT", a list of tuples (user, session, item, timestamp). + """ + fpath = cache( + url="https://static.preferred.ai/cornac/datasets/retailrocket/test.zip", + unzip=True, + relative_path="retailrocket/test.csv", + ) + reader = Reader() if reader is None else reader + return reader.read(fpath, fmt=fmt, sep=",") diff --git a/tests/cornac/datasets/test_cosmetics.py b/tests/cornac/datasets/test_cosmetics.py new file mode 100644 index 000000000..2a1333e99 --- /dev/null +++ b/tests/cornac/datasets/test_cosmetics.py @@ -0,0 +1,38 @@ +# Copyright 2026 The Cornac Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +import random +import time +import unittest + +from cornac.datasets import cosmetics + + +class TestCosmetics(unittest.TestCase): + + def test_load_train_val_test(self): + random.seed(time.time()) + if random.random() > 0.8: + train = cosmetics.load_train() + val = cosmetics.load_val() + test = cosmetics.load_test() + + self.assertEqual(len(train), 2069928) + self.assertEqual(len(val), 2302227) + self.assertEqual(len(test), 2300963) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/cornac/datasets/test_diginetica.py b/tests/cornac/datasets/test_diginetica.py new file mode 100644 index 000000000..142255906 --- /dev/null +++ b/tests/cornac/datasets/test_diginetica.py @@ -0,0 +1,38 @@ +# Copyright 2026 The Cornac Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +import random +import time +import unittest + +from cornac.datasets import diginetica + + +class TestDiginetica(unittest.TestCase): + + def test_load_train_val_test(self): + random.seed(time.time()) + if random.random() > 0.8: + train = diginetica.load_train() + val = diginetica.load_val() + test = diginetica.load_test() + + self.assertEqual(len(train), 7273) + self.assertEqual(len(val), 9733) + self.assertEqual(len(test), 9686) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/cornac/datasets/test_retailrocket.py b/tests/cornac/datasets/test_retailrocket.py new file mode 100644 index 000000000..af6e123cf --- /dev/null +++ b/tests/cornac/datasets/test_retailrocket.py @@ -0,0 +1,38 @@ +# Copyright 2026 The Cornac Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +import random +import time +import unittest + +from cornac.datasets import retailrocket + + +class TestRetailRocket(unittest.TestCase): + + def test_load_train_val_test(self): + random.seed(time.time()) + if random.random() > 0.8: + train = retailrocket.load_train() + val = retailrocket.load_val() + test = retailrocket.load_test() + + self.assertEqual(len(train), 172626) + self.assertEqual(len(val), 202035) + self.assertEqual(len(test), 201408) + + +if __name__ == "__main__": + unittest.main() From 9bc47222ba3770640d076d9c5e1f77eabd41e2c0 Mon Sep 17 00:00:00 2001 From: hieuddo Date: Thu, 21 May 2026 17:20:32 +0800 Subject: [PATCH 2/3] fix: import datasets in `cornac/datasets/__init__.py` --- cornac/datasets/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cornac/datasets/__init__.py b/cornac/datasets/__init__.py index e46a967bb..2f955bb69 100644 --- a/cornac/datasets/__init__.py +++ b/cornac/datasets/__init__.py @@ -18,11 +18,14 @@ from . import amazon_office from . import amazon_toy from . import citeulike +from . import cosmetics +from . import diginetica from . import epinions from . import filmtrust from . import gowalla from . import movielens from . import netflix +from . import retailrocket from . import tafeng from . import tradesy from . import yoochoose From 09a170ffe5026725ebf93773b2f2cf70280849d5 Mon Sep 17 00:00:00 2001 From: hieuddo Date: Sat, 23 May 2026 19:41:28 +0800 Subject: [PATCH 3/3] docs: add dataset stats --- cornac/datasets/README.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/cornac/datasets/README.md b/cornac/datasets/README.md index ae0bcb676..438fa6896 100644 --- a/cornac/datasets/README.md +++ b/cornac/datasets/README.md @@ -268,3 +268,14 @@ Our dataset is now ready to use for model training and evaluation. 
A concrete ex N/A + +--- + +## Session-aware Datasets +Session-aware recommendation extends next-item (session-based) recommendation by associating sessions with identified users. While next-item datasets rely on session-level sequences (e.g., `SIT` format), session-aware datasets incorporate user identities (e.g., `USIT` format), allowing models to capture both long-term user preferences across multiple sessions and short-term session-level dynamics. + +| Dataset | #Users | #Items | #Sessions | #Interactions | #Sessions per User | #Interactions per Item | #Interactions per Session | Density | +| :-------------------------------- | -----: | -----: | --------: | ------------: | -----------------: | ---------------------: | ------------------------: | ------: | +| [Diginetica](./diginetica.py) | 571 | 6,008 | 2,670 | 12,146 | 4.68 | 2.02 | 4.55 | 0.354% | +| [RetailRocket](./retailrocket.py) | 4,249 | 36,658 | 24,732 | 230,817 | 5.82 | 6.30 | 9.33 | 0.148% | +| [Cosmetics](./cosmetics.py) | 17,268 | 42,367 | 172,242 | 2,533,262 | 9.97 | 59.79 | 14.71 | 0.346% |