From ee01fadf14e00f5e81fdf7ab24684c41df11da7c Mon Sep 17 00:00:00 2001 From: cephi Date: Thu, 5 Dec 2024 12:20:05 -0500 Subject: [PATCH] update --- analysis/data_stat.py | 8 +++- pytorch/batch.py | 85 +++++++++++++++++++++++++------------------ pytorch/run.py | 10 +++-- 3 files changed, 62 insertions(+), 41 deletions(-) diff --git a/analysis/data_stat.py b/analysis/data_stat.py index 0ee5c19..391bf87 100644 --- a/analysis/data_stat.py +++ b/analysis/data_stat.py @@ -5,6 +5,7 @@ from enum import Enum class Stat(Enum): CPU = 'cpu' + THREADS = 'threads' ITERATIONS = 'iterations' BASELINE_TIME_S = 'baseline time (sec)' BASELINE_DELAY_S = 'baseline delay (sec)' @@ -16,9 +17,12 @@ class Stat(Enum): MATRIX_SIZE = 'matrix size' MATRIX_NNZ = 'matrix nnz' MATRIX_DENSITY = 'matrix density %' + + POWER_BEFORE = 'power before' + POWER = 'power' + POWER_AFTER = 'power after' TIME_S = 'time (sec)' - - POWER_DELTA = 'Δ watt' + DELTA_WATT = 'Δ watt' JOULES = 'joules' TASK_CLK = 'task clock (msec)' diff --git a/pytorch/batch.py b/pytorch/batch.py index b71eeb6..db938cd 100755 --- a/pytorch/batch.py +++ b/pytorch/batch.py @@ -1,5 +1,7 @@ #! /bin/python3 +from data_stat import Cpu + import argparse import glob import os @@ -7,7 +9,7 @@ import subprocess import random parser = argparse.ArgumentParser() -parser.add_argument('arch') +parser.add_argument('cpu', choices=[x.name.lower() for x in Cpu]) parser.add_argument('output_dir') parser.add_argument('matrix_dir') parser.add_argument('iterations', type=int) @@ -17,24 +19,27 @@ parser.add_argument('--perf', action='store_const', const='--perf') parser.add_argument('--power', action='store_const', const='--power') parser.add_argument('--distribute', action='store_true') args = parser.parse_args() +args.cpu = Cpu[args.cpu.upper()] -srun_args_altra = [ - '--account', 'oasis', - '--partition', 'oasis', - '--qos', 'oasis-exempt', - #'--cpus-per-task 160', - '--cpus-per-task', '160', - #'--mem 28114', - '--mem', '16G', - '--ntasks-per-node', '1'#, - #'--exclusive', - #'--output', '/dev/null', - #'--error', '/dev/null' +srun_args = { + Cpu.ALTRA: [ + '--account', 'oasis', + '--partition', 'oasis', + '--qos', 'oasis-exempt', + #'--cpus-per-task 160', + '--cpus-per-task', '160', + #'--mem 28114', + '--mem', '16G', + '--ntasks-per-node', '1'#, + #'--exclusive', + #'--output', '/dev/null', + #'--error', '/dev/null' ] +} def srun(srun_args_list: list, run_args, matrix_file: str) -> list: run_args_list = [ - args.arch, + args.cpu.name.lower(), matrix_file, str(args.iterations), str(args.baseline_time_s), @@ -48,31 +53,41 @@ def srun(srun_args_list: list, run_args, matrix_file: str) -> list: processes = list() for i, matrix in enumerate(glob.glob(f'{args.matrix_dir.rstrip("/")}/*.mtx')): - if args.arch == 'altra': - if args.distribute: + if args.distribute: + if args.cpu == Cpu.ALTRA: i = i % 40 - srun_args = srun_args_altra + ['--nodelist', f'oasis{i:02}'] - else: - srun_args = srun_args_altra + srun_args_temp = srun_args[args.cpu] + ['--nodelist', f'oasis{i:02}'] + elif args.cpu == Cpu.EPYC_7313P: + srun_args_temp = srun_args[args.cpu] + else: + srun_args_temp = srun_args[args.cpu] - output_filename = '_'.join([ - args.arch, - str(args.baseline_time_s), - str(args.baseline_delay_s), - os.path.splitext(os.path.basename(matrix))[0], - str(args.iterations)]) + output_filename = '_'.join([ + args.cpu.name.lower(), + str(args.baseline_time_s), + str(args.baseline_delay_s), + os.path.splitext(os.path.basename(matrix))[0], + str(args.iterations)]) - json_filepath = f'{args.output_dir.rstrip("/")}/{output_filename}.json' - raw_filepath = f'{args.output_dir.rstrip("/")}/{output_filename}.output' - with open(json_filepath, 'w') as json_file, open(raw_filepath, 'w') as raw_file: - print(srun(srun_args, args, matrix)) - print(json_filepath) - print(raw_filepath) + json_filepath = f'{args.output_dir.rstrip("/")}/{output_filename}.json' + raw_filepath = f'{args.output_dir.rstrip("/")}/{output_filename}.output' + with open(json_filepath, 'w') as json_file, open(raw_filepath, 'w') as raw_file: + print(srun(srun_args_temp, args, matrix)) + print(json_filepath) + print(raw_filepath) - processes.append(subprocess.Popen( - srun(srun_args_altra, args, matrix), - stdout=json_file, - stderr=raw_file)) + processes.append(subprocess.Popen( + srun(srun_args_temp, args, matrix), + stdout=json_file, + stderr=raw_file)) + + # Wait on every 10 jobs to avoid socket timeout. + if i % 10 == 9: + print("Waiting on 10 jobs") + for process in processes: + process.wait() + + processes = list() for process in processes: process.wait() diff --git a/pytorch/run.py b/pytorch/run.py index 34aa811..682de75 100755 --- a/pytorch/run.py +++ b/pytorch/run.py @@ -58,11 +58,10 @@ result[Stat.BASELINE_DELAY_S.value] = args.baseline_delay_s if args.power: time.sleep(args.baseline_delay_s) - result['power_before'] = baseline_power(args.baseline_time_s) + result[Stat.POWER_BEFORE.value] = baseline_power(args.baseline_time_s) if args.debug: print(result) - print(program[args.cpu]) run_program(program[args.cpu]) # Warmup power_process = subprocess.Popen(['./power.py'], stdout=subprocess.PIPE, stderr=subprocess.DEVNULL, text=True) @@ -73,14 +72,17 @@ if args.power: if args.debug: print(result) - result['power'] = [float(x) for x in power_process.communicate()[0].strip().split('\n')] + result[Stat.POWER.value] = [float(x) for x in power_process.communicate()[0].strip().split('\n')] + # Riemann Sum + from math import ceil + result[Stat.JOULES.value] = sum(result[Stat.POWER.value][-ceil(result[Stat.TIME_S.value]):-1]) + (result[Stat.POWER.value][-1] * (result[Stat.TIME_S.value] % 1)) if args.debug: print(result) #print(len(result['power'])) #print(sum(result['power']) / len(result['power'])) time.sleep(args.baseline_delay_s) - result['power_after'] = baseline_power(args.baseline_time_s) + result[Stat.POWER_AFTER.value] = baseline_power(args.baseline_time_s) if args.debug: print(result)