# ampere_research/pytorch/batch.py
from data_stat import Cpu, Format, MatrixType
2024-12-05 12:20:05 -05:00
2024-12-02 23:32:33 -05:00
import argparse
import glob
import os
import subprocess
2024-12-03 08:53:39 -05:00
import random
2024-12-02 23:32:33 -05:00
# Command-line interface for the batch launcher.
parser = argparse.ArgumentParser()

# Positional arguments, in the order they must be given on the command line.
parser.add_argument('cpu', choices=[member.name.lower() for member in Cpu])
parser.add_argument('output_dir')
parser.add_argument(
    'matrix_type',
    choices=[member.name.lower() for member in MatrixType],
)
parser.add_argument(
    'format',
    choices=[member.name.lower() for member in Format],
)
for int_positional in (
    'base_iterations',
    'min_time_s',
    'baseline_time_s',
    'baseline_delay_s',
):
    parser.add_argument(int_positional, type=int)

# Optional arguments.
# NOTE: a '--perf' pass-through flag used to be declared here; it is disabled.
parser.add_argument('-m', '--matrix_dir')
parser.add_argument('-ss', '--synthetic_size', type=int, nargs='+')
parser.add_argument('-sd', '--synthetic_density', type=float, nargs='+')
parser.add_argument('-c', '--cores', type=int)
# When given, args.power holds the literal flag text so that it can be
# appended unchanged to the child command line.
parser.add_argument('--power', action='store_const', const='--power')
parser.add_argument('--distribute', action='store_true')

args = parser.parse_args()

# Replace the lower-case names accepted on the command line with the
# corresponding enum members.
args.cpu = Cpu[args.cpu.upper()]
args.matrix_type = MatrixType[args.matrix_type.upper()]
args.format = Format[args.format.upper()]
# Options placed between 'srun' and the interpreter, keyed by target CPU.
# Each option name is immediately followed by its value as a separate list
# element, so the list can be spliced directly into an argv.
srun_args = {
    Cpu.ALTRA: [
        '--account', 'oasis',
        '--partition', 'oasis',
        '--qos', 'oasis-exempt',
        '--cpus-per-task', '160',
        # Previously tried and disabled: '--mem 28114'.
        '--mem', '16G',
        '--ntasks-per-node', '1',
        '--time', '1-00:00:00',
        # Previously tried and disabled: '--exclusive',
        # '--output /dev/null', '--error /dev/null'.
    ],
    Cpu.EPYC_7313P: [
        '--account', 'nexus',
        '--partition', 'tron',
        '--qos', 'high',
        '--cpus-per-task', '16',
        '--ntasks-per-node', '1',
        '--time', '1-00:00:00',
        '--prefer', 'EPYC-7313P',
    ],
    Cpu.XEON_4216: [
        '--account', 'nexus',
        '--partition', 'tron',
        '--qos', 'tron-exempt',
        '--cpus-per-task', '32',
        '--ntasks-per-node', '1',
        '--time', '1-00:00:00',
        '--prefer', 'Xeon,4216',
    ],
}
# Name of the Python interpreter executable to invoke for each target CPU.
python = {
    Cpu.ALTRA: 'python3',
    Cpu.EPYC_7313P: 'python3.11',
    Cpu.XEON_4216: 'python3.11',
}
def run(
    run_args,
    matrix_file: str,
    synthetic_size: int,
    synthetic_density: float,
    srun_args_list: list = None
) -> list:
    """Build (and print) the argv that launches ``run.py`` for one matrix.

    Args:
        run_args: Parsed command-line namespace. The fields read are ``cpu``,
            ``matrix_type``, ``format``, ``base_iterations``, ``min_time_s``,
            ``baseline_time_s``, ``baseline_delay_s``, ``cores`` and
            ``power``.
        matrix_file: Path passed via ``-m``; only used for SuiteSparse runs.
        synthetic_size: Value passed via ``-ss``; only used for synthetic
            runs.
        synthetic_density: Value passed via ``-sd``; only used for synthetic
            runs.
        srun_args_list: Extra options for ``srun``. When ``None`` the command
            invokes the interpreter directly instead of going through
            ``srun``.

    Returns:
        The command as a list of strings, suitable for ``subprocess``.

    Raises:
        SystemExit: If ``run_args.matrix_type`` is neither SuiteSparse nor
            synthetic.
    """
    # Read every setting from the namespace that was passed in rather than
    # from the module-level ``args``, so the parameter is actually honoured.
    run_args_list = [
        run_args.cpu.name.lower(),
        run_args.matrix_type.name.lower(),
        run_args.format.name.lower(),
        str(run_args.base_iterations),
        str(run_args.min_time_s),
        str(run_args.baseline_time_s),
        str(run_args.baseline_delay_s),
        '-d',  # always forwarded to run.py
    ]

    if run_args.matrix_type == MatrixType.SUITESPARSE:
        run_args_list += ['-m', matrix_file]
    elif run_args.matrix_type == MatrixType.SYNTHETIC:
        run_args_list += [
            '-ss', str(synthetic_size),
            '-sd', str(synthetic_density),
        ]
    else:
        # Same effect as the site-provided exit(), without depending on the
        # ``site`` module having injected that helper.
        raise SystemExit("Unrecognized matrix type!")

    if run_args.cores is not None:
        run_args_list += ['-c', str(run_args.cores)]

    # ``power`` is either None or the literal '--power' flag text.
    if run_args.power is not None:
        run_args_list += [run_args.power]

    command = [python[run_args.cpu], 'run.py'] + run_args_list
    if srun_args_list is not None:
        command = ['srun'] + srun_args_list + command

    print(command)
    return command
# Child processes started in --distribute mode that have not been waited on.
processes = []

# Build the list of work items: matrix file paths for SuiteSparse runs, or
# (size, density) pairs for synthetic runs.
if args.matrix_type == MatrixType.SUITESPARSE:
    if args.matrix_dir is None:
        parser.error(
            f'-m/--matrix_dir is required for '
            f'{args.matrix_type.name.lower()} matrices')
    parameter_list = glob.glob(f'{args.matrix_dir.rstrip("/")}/*.mtx')
elif args.matrix_type == MatrixType.SYNTHETIC:
    if args.synthetic_size is None or args.synthetic_density is None:
        parser.error(
            f'-ss/--synthetic_size and -sd/--synthetic_density are required '
            f'for {args.matrix_type.name.lower()} matrices')
    # Skip combinations where size^2 * density exceeds 1e8 (presumably the
    # expected number of non-zero elements -- confirm against run.py).
    parameter_list = [
        (size, density)
        for size in args.synthetic_size
        for density in args.synthetic_density
        if size ** 2 * density <= 100000000
    ]
else:
    raise SystemExit("Unrecognized matrix type!")

# NOTE: pinning distributed jobs to specific nodes via '--nodelist' was tried
# previously and is disabled; every job uses the same per-CPU srun options.
for parameter in parameter_list:
    output_filename_list = [
        args.cpu.name.lower(),
        'max' if args.cores is None else str(args.cores),
        args.format.name.lower(),
        str(args.min_time_s),
        str(args.baseline_time_s),
        str(args.baseline_delay_s),
    ]

    # Split the work item into the arguments run() expects; the ones that do
    # not apply to this matrix type are left as None.
    if args.matrix_type == MatrixType.SUITESPARSE:
        matrix_file = parameter
        synthetic_size = None
        synthetic_density = None
        output_filename_list += [
            os.path.splitext(os.path.basename(matrix_file))[0]]
    else:
        matrix_file = None
        synthetic_size, synthetic_density = parameter
        output_filename_list += [
            'synthetic', str(synthetic_size), str(synthetic_density)]

    output_filename = '_'.join(output_filename_list)
    json_filepath = f'{args.output_dir.rstrip("/")}/{output_filename}.json'
    raw_filepath = f'{args.output_dir.rstrip("/")}/{output_filename}.output'

    # Never overwrite results from an earlier invocation.
    if os.path.isfile(json_filepath) or os.path.isfile(raw_filepath):
        print("File " + json_filepath + " already exists! Skipping...")
        continue

    # The child's stdout goes to the .json file and its stderr to the
    # .output file.
    with open(json_filepath, 'w') as json_file, \
            open(raw_filepath, 'w') as raw_file:
        print(json_filepath)
        print(raw_filepath)

        if args.distribute:
            # Launch through srun without blocking.
            processes.append(subprocess.Popen(
                run(args,
                    matrix_file,
                    synthetic_size,
                    synthetic_density,
                    srun_args[args.cpu]),
                stdout=json_file,
                stderr=raw_file))
        else:
            # Run directly and wait for completion before the next item.
            subprocess.run(
                run(args, matrix_file, synthetic_size, synthetic_density),
                stdout=json_file,
                stderr=raw_file)

    # Wait on every 10 jobs to avoid socket timeout.  Count the jobs that
    # were actually launched rather than the loop index, so that skipped
    # outputs cannot let more than 10 jobs pile up.
    if args.distribute and len(processes) >= 10:
        print("Waiting on 10 jobs")
        for process in processes:
            process.wait()
        processes = []

# Wait for whatever is left of the final, partial batch (the list is empty
# when not running in --distribute mode).
for process in processes:
    process.wait()