update

2024-12-05 12:20:05 -05:00 · 2024-12-05 12:20:05 -05:00 · ee01fadf14
commit ee01fadf14
parent 10934046f7
3 changed files with 62 additions and 41 deletions
--- a/analysis/data_stat.py
+++ b/analysis/data_stat.py
@ -5,6 +5,7 @@ from enum import Enum
 class Stat(Enum):
    CPU = 'cpu'

+    THREADS = 'threads'
    ITERATIONS = 'iterations'
    BASELINE_TIME_S = 'baseline time (sec)'
    BASELINE_DELAY_S = 'baseline delay (sec)'
@ -16,9 +17,12 @@ class Stat(Enum):
    MATRIX_SIZE = 'matrix size'
    MATRIX_NNZ = 'matrix nnz'
    MATRIX_DENSITY = 'matrix density %'
-    TIME_S = 'time (sec)'
 
-    POWER_DELTA = 'Δ watt'
+    POWER_BEFORE = 'power before'
+    POWER = 'power'
+    POWER_AFTER = 'power after'
+    TIME_S = 'time (sec)'
+    DELTA_WATT = 'Δ watt'
    JOULES = 'joules'

    TASK_CLK = 'task clock (msec)'
--- a/pytorch/batch.py
+++ b/pytorch/batch.py
@ -1,5 +1,7 @@
 #! /bin/python3

+from data_stat import Cpu
+
 import argparse
 import glob
 import os
@ -7,7 +9,7 @@ import subprocess
 import random

 parser = argparse.ArgumentParser()
-parser.add_argument('arch')
+parser.add_argument('cpu', choices=[x.name.lower() for x in Cpu])
 parser.add_argument('output_dir')
 parser.add_argument('matrix_dir')
 parser.add_argument('iterations', type=int)
@ -17,24 +19,27 @@ parser.add_argument('--perf', action='store_const', const='--perf')
 parser.add_argument('--power', action='store_const', const='--power')
 parser.add_argument('--distribute', action='store_true')
 args = parser.parse_args()
+args.cpu = Cpu[args.cpu.upper()]

-srun_args_altra = [
-        '--account', 'oasis',
-        '--partition', 'oasis',
-        '--qos', 'oasis-exempt',
-        #'--cpus-per-task 160',
-        '--cpus-per-task', '160',
-        #'--mem 28114',
-        '--mem', '16G',
-        '--ntasks-per-node', '1'#,
-        #'--exclusive',
-        #'--output', '/dev/null',
-        #'--error', '/dev/null'
+srun_args = {
+        Cpu.ALTRA: [
+            '--account', 'oasis',
+            '--partition', 'oasis',
+            '--qos', 'oasis-exempt',
+            #'--cpus-per-task 160',
+            '--cpus-per-task', '160',
+            #'--mem 28114',
+            '--mem', '16G',
+            '--ntasks-per-node', '1'#,
+            #'--exclusive',
+            #'--output', '/dev/null',
+            #'--error', '/dev/null'
        ]
+}

 def srun(srun_args_list: list, run_args, matrix_file: str) -> list:
    run_args_list = [
-            args.arch,
+            args.cpu.name.lower(),
            matrix_file,
            str(args.iterations),
            str(args.baseline_time_s),
@ -48,31 +53,41 @@ def srun(srun_args_list: list, run_args, matrix_file: str) -> list:
 processes = list()

 for i, matrix in enumerate(glob.glob(f'{args.matrix_dir.rstrip("/")}/*.mtx')):
-    if args.arch == 'altra':
-        if args.distribute:
+    if args.distribute:
+        if args.cpu == Cpu.ALTRA:
            i = i % 40
-            srun_args = srun_args_altra + ['--nodelist', f'oasis{i:02}']
-        else:
-            srun_args = srun_args_altra
+            srun_args_temp = srun_args[args.cpu] + ['--nodelist', f'oasis{i:02}']
+        elif args.cpu == Cpu.EPYC_7313P:
+            srun_args_temp = srun_args[args.cpu]
+    else:
+        srun_args_temp = srun_args[args.cpu]

-        output_filename = '_'.join([
-                args.arch,
-                str(args.baseline_time_s),
-                str(args.baseline_delay_s),
-                os.path.splitext(os.path.basename(matrix))[0],
-                str(args.iterations)])
+    output_filename = '_'.join([
+            args.cpu.name.lower(),
+            str(args.baseline_time_s),
+            str(args.baseline_delay_s),
+            os.path.splitext(os.path.basename(matrix))[0],
+            str(args.iterations)])

-        json_filepath = f'{args.output_dir.rstrip("/")}/{output_filename}.json'
-        raw_filepath = f'{args.output_dir.rstrip("/")}/{output_filename}.output'
-        with open(json_filepath, 'w') as json_file, open(raw_filepath, 'w') as raw_file:
-            print(srun(srun_args, args, matrix))
-            print(json_filepath)
-            print(raw_filepath)
+    json_filepath = f'{args.output_dir.rstrip("/")}/{output_filename}.json'
+    raw_filepath = f'{args.output_dir.rstrip("/")}/{output_filename}.output'
+    with open(json_filepath, 'w') as json_file, open(raw_filepath, 'w') as raw_file:
+        print(srun(srun_args_temp, args, matrix))
+        print(json_filepath)
+        print(raw_filepath)

-            processes.append(subprocess.Popen(
-                    srun(srun_args_altra, args, matrix),
-                    stdout=json_file,
-                    stderr=raw_file))
+        processes.append(subprocess.Popen(
+                srun(srun_args_temp, args, matrix),
+                stdout=json_file,
+                stderr=raw_file))
+
+    # After every 10th launched job, block until all outstanding jobs finish before launching more, to avoid socket timeouts.
+    if i % 10 == 9:
+        print("Waiting on 10 jobs")
+        for process in processes:
+            process.wait()
+
+        processes = list()

 for process in processes:
    process.wait()
--- a/pytorch/run.py
+++ b/pytorch/run.py
@ -58,11 +58,10 @@ result[Stat.BASELINE_DELAY_S.value] = args.baseline_delay_s

 if args.power:
    time.sleep(args.baseline_delay_s)
-    result['power_before'] = baseline_power(args.baseline_time_s)
+    result[Stat.POWER_BEFORE.value] = baseline_power(args.baseline_time_s)
    if args.debug:
        print(result)

-    print(program[args.cpu])
    run_program(program[args.cpu]) # Warmup
    power_process = subprocess.Popen(['./power.py'],
            stdout=subprocess.PIPE, stderr=subprocess.DEVNULL, text=True)
@ -73,14 +72,17 @@ if args.power:
    if args.debug:
        print(result)

-    result['power'] = [float(x) for x in power_process.communicate()[0].strip().split('\n')]
+    result[Stat.POWER.value] = [float(x) for x in power_process.communicate()[0].strip().split('\n')]
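+    # Riemann sum of the trailing power samples over the run time to get joules: each sample counts as one second, the last one is weighted by the fractional second.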
+    from math import ceil
+    result[Stat.JOULES.value] = sum(result[Stat.POWER.value][-ceil(result[Stat.TIME_S.value]):-1]) + (result[Stat.POWER.value][-1] * (result[Stat.TIME_S.value] % 1))
    if args.debug:
        print(result)
        #print(len(result['power']))
        #print(sum(result['power']) / len(result['power']))

    time.sleep(args.baseline_delay_s)
-    result['power_after'] = baseline_power(args.baseline_time_s)
+    result[Stat.POWER_AFTER.value] = baseline_power(args.baseline_time_s)
    if args.debug:
        print(result)