Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions docs/superbench-config.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -370,6 +370,7 @@ proc_num: int
node_num: int
env: dict
mca: dict
bind_to: string
prefix: str
parallel: bool
```
Expand Down Expand Up @@ -403,6 +404,7 @@ Some attributes may only be suitable for specific mode.
| `prefix` | ✓ | ✘ | ✘ |
| `env` | ✓ | ✓ | ✓ |
| `mca` | ✘ | ✘ | ✓ |
| `bind_to` | ✘ | ✘ | ✓ |
| `parallel` | ✓ | ✘ | ✘ |
| `pattern` | ✘ | ✘ | ✓ |

Expand Down Expand Up @@ -452,6 +454,16 @@ MCA (Modular Component Architecture) frameworks, components, or modules to use i
in a flatten key-value dictionary.
Only available for `mpi` mode.

### `bind_to`

Process binding policy passed to `mpirun -bind-to`.
Accepted values are `slot`, `hwthread`, `core`, `l1cache`, `l2cache`, `l3cache`, `package`, `numa`, and `none`; any other value is rejected at config validation time.
Only available for `mpi` mode.

Use this option when a benchmark needs to override the runner's default MPI binding behavior,
for example when the benchmark implements its own topology-aware CPU/NUMA affinity logic.

* default value: `numa`

### `parallel`

Whether run benchmarks in parallel (all ranks at the same time) or in sequence (one rank at a time).
Expand Down
13 changes: 12 additions & 1 deletion superbench/runner/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,12 @@ def __set_logger(self, filename):
"""
SuperBenchLogger.add_handler(logger.logger, filename=str(self._output_path / filename))

def __validate_mpi_bind_to(self, bind_to):
    """Check that an mpi mode's bind_to value is acceptable for ``mpirun -bind-to``.

    Args:
        bind_to (str): binding policy taken from the mode config.

    Raises:
        ValueError: If ``bind_to`` is not one of the recognized binding policies.
    """
    allowed = ('slot', 'hwthread', 'core', 'l1cache', 'l2cache', 'l3cache', 'package', 'numa', 'none')
    if bind_to in allowed:
        return
    raise ValueError('Invalid bind_to value {}. Must be one of: {}'.format(bind_to, sorted(allowed)))

def __validate_sb_config(self): # noqa: C901
"""Validate SuperBench config object.

Expand Down Expand Up @@ -91,6 +97,10 @@ def __validate_sb_config(self): # noqa: C901
'btl_tcp_if_exclude': 'lo,docker0',
'coll_hcoll_enable': 0,
}
if 'bind_to' not in mode:
self._sb_benchmarks[name].modes[idx].bind_to = 'numa'
Comment thread
alephpiece marked this conversation as resolved.
Comment thread
alephpiece marked this conversation as resolved.
else:
self.__validate_mpi_bind_to(mode.bind_to)
for key in ['PATH', 'LD_LIBRARY_PATH', 'SB_MICRO_PATH', 'SB_WORKSPACE']:
self._sb_benchmarks[name].modes[idx].env.setdefault(key, None)
if 'pattern' in mode:
Expand Down Expand Up @@ -182,13 +192,14 @@ def __get_mode_command(self, benchmark_name, mode, timeout=None):
'-tag-output ' # tag mpi output with [jobid,rank]<stdout/stderr> prefix
'-allow-run-as-root ' # allow mpirun to run when executed by root user
'{host_list} ' # use prepared hostfile or specify nodes and launch {proc_num} processes on each node
'-bind-to numa ' # bind processes to numa
'-bind-to {bind_to} ' # bind processes according to mode config
'{mca_list} {env_list} {command}'
).format(
trace=trace_command,
host_list=f'-host localhost:{mode.proc_num}' if 'node_num' in mode and mode.node_num == 1 else
f'-hostfile hostfile -map-by ppr:{mode.proc_num}:node' if 'host_list' not in mode else '-host ' +
','.join(f'{host}:{mode.proc_num}' for host in mode.host_list),
bind_to=mode.bind_to,
Comment thread
alephpiece marked this conversation as resolved.
mca_list=' '.join(f'-mca {k} {v}' for k, v in mode.mca.items()),
env_list=' '.join(
f'-x {k}={str(v).format(proc_rank=mode.proc_rank, proc_num=mode.proc_num)}'
Expand Down
59 changes: 59 additions & 0 deletions tests/runner/test_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ def test_validate_sb_config(self):
self.assertIn('proc_num', mode)
if mode.name == 'mpi':
self.assertIn('mca', mode)
self.assertIn('bind_to', mode)
self.assertEqual('numa', mode.bind_to)

def test_get_failure_count(self):
"""Test get_failure_count."""
Expand Down Expand Up @@ -153,6 +155,7 @@ def test_get_mode_command(self):
'name': 'mpi',
'proc_num': 8,
'proc_rank': 1,
'bind_to': 'numa',
'mca': {},
'env': {
'PATH': None,
Expand All @@ -172,6 +175,7 @@ def test_get_mode_command(self):
'name': 'mpi',
'proc_num': 8,
'proc_rank': 2,
'bind_to': 'numa',
'mca': {
'coll_hcoll_enable': 0,
},
Expand All @@ -196,6 +200,7 @@ def test_get_mode_command(self):
'node_num': 1,
'proc_num': 8,
'proc_rank': 2,
'bind_to': 'numa',
'mca': {
'coll_hcoll_enable': 0,
},
Expand All @@ -219,6 +224,7 @@ def test_get_mode_command(self):
'name': 'mpi',
'proc_num': 8,
'proc_rank': 1,
'bind_to': 'numa',
'mca': {},
'pattern': {
'type': 'all-nodes',
Expand All @@ -234,6 +240,44 @@ def test_get_mode_command(self):
f'sb exec --output-dir {self.sb_output_dir} -c sb.config.yaml -C superbench.enable=foo'
),
},
{
'benchmark_name':
'foo',
'mode': {
'name': 'mpi',
'proc_num': 8,
'proc_rank': 0,
'bind_to': 'core',
'mca': {},
'env': {
'PATH': None,
},
},
'expected_command': (
'mpirun -tag-output -allow-run-as-root -hostfile hostfile -map-by ppr:8:node -bind-to core '
' -x PATH '
f'sb exec --output-dir {self.sb_output_dir} -c sb.config.yaml -C superbench.enable=foo'
),
},
{
'benchmark_name':
'foo',
'mode': {
'name': 'mpi',
'proc_num': 8,
'proc_rank': 0,
'bind_to': 'none',
'mca': {},
'env': {
'PATH': None,
},
},
'expected_command': (
'mpirun -tag-output -allow-run-as-root -hostfile hostfile -map-by ppr:8:node -bind-to none '
' -x PATH '
f'sb exec --output-dir {self.sb_output_dir} -c sb.config.yaml -C superbench.enable=foo'
),
},
]

for test_case in test_cases:
Expand Down Expand Up @@ -264,6 +308,21 @@ def test_get_mode_command(self):
), expected_command
)

def test_validate_sb_config_invalid_mpi_bind_to(self):
    """Test validate_sb_config rejects unsupported mpi bind_to values."""
    # Load the stock test configuration, then corrupt one mpi mode's bind_to.
    config_path = Path(__file__).parent / '../../tests/data/test.yaml'
    with config_path.open() as config_fp:
        config = OmegaConf.create(yaml.load(config_fp, Loader=yaml.SafeLoader))
    # 'socket' is deliberately outside the runner's accepted bind_to set.
    config.superbench.benchmarks['nccl-bw:all-nodes'].modes[0].bind_to = 'socket'

    # Constructing the runner runs config validation, which must raise.
    with self.assertRaisesRegex(ValueError, 'Invalid bind_to value'):
        SuperBenchRunner(config, OmegaConf.create({}), OmegaConf.create({}), self.sb_output_dir)

def test_run_empty_benchmarks(self):
"""Test run empty benchmarks, nothing should happen."""
self.runner._sb_enabled_benchmarks = []
Expand Down