6 changed files with 145 additions and 178 deletions
--- a/analysis/perf_stat.py
+++ b/analysis/perf_stat.py
@ -5,25 +5,13 @@ from enum import Enum
 class Stat(Enum):
    CPU = 'cpu'

-    THREADS = 'threads'
-    ITERATIONS = 'iterations'
-    BASELINE_TIME_S = 'baseline time (sec)'
-    BASELINE_DELAY_S = 'baseline delay (sec)'
-
    SOLVER = 'solver'
+    LIN_ALG = 'linear algebra'
+    INPUT_FILE = 'input file'
+    MAXWELL_SIZE = 'maxwell size'
+    MATRIX_COLS = 'matrix columns'

-    MATRIX_FILE = 'matrix file'
-    MATRIX_SHAPE = 'matrix shape'
-    MATRIX_SIZE = 'matrix size'
-    MATRIX_NNZ = 'matrix nnz'
-    MATRIX_DENSITY = 'matrix density %'
- 
-    POWER_BEFORE = 'power before'
-    POWER = 'power'
-    POWER_AFTER = 'power after'
-    TIME_S = 'time (sec)'
-    DELTA_WATT = 'Δ watt'
-    JOULES = 'joules'
+    POWER_DELTA = 'Δ watt'

    TASK_CLK = 'task clock (msec)'
    PAGE_FAULTS = 'page faults'
@ -57,57 +45,54 @@ class Stat(Enum):
    L2D_CACHE_MISS_RATE = 'L2D cache miss rate'
    LL_CACHE_MISS_RATE = 'LL cache miss rate'

-class Cpu(Enum):
-    #ALTRA = altra_names
-    #XEON = xeon_names
-    ALTRA = 'Altra'
-    EPYC_7313P = 'Epyc 7313P'
+altra_names = {
+    Stat.TASK_CLK: 'task-clock:u',
+    Stat.PAGE_FAULTS: 'page-faults:u',
+    Stat.CYCLES: 'cycles:u',
+    Stat.INSTS: 'instructions:u',

-names = {
-        Cpu.ALTRA: {
-            Stat.TASK_CLK: 'task-clock:u',
-            Stat.PAGE_FAULTS: 'page-faults:u',
-            Stat.CYCLES: 'cycles:u',
-            Stat.INSTS: 'instructions:u',
-
-            Stat.BR: 'BR_RETIRED:u',
-            Stat.BR_MISS: 'BR_MIS_PRED_RETIRED:u',
-            Stat.ITLB: 'L1I_TLB:u',
-            Stat.ITLB_MISS: 'ITLB_WALK:u',
-            Stat.DTLB: 'L1D_TLB:u',
-            Stat.DTLB_MISS: 'DTLB_WALK:u',
-            Stat.L2D_TLB: 'L2D_TLB:u',
-            Stat.L2D_TLB_MISS: 'L2D_TLB_REFILL:u',
-            Stat.L1I_CACHE: 'L1I_CACHE:u',
-            Stat.L1I_CACHE_MISS: 'L1I_CACHE_REFILL:u',
-            Stat.L1D_CACHE: 'L1D_CACHE:u',
-            Stat.L1D_CACHE_MISS: 'L1D_CACHE_REFILL:u',
-            Stat.L2D_CACHE: 'L2D_CACHE:u',
-            Stat.L2D_CACHE_MISS: 'L2D_CACHE_REFILL:u',
-            Stat.LL_CACHE: 'LL_CACHE_RD:u',
-            Stat.LL_CACHE_MISS: 'LL_CACHE_MISS_RD:u',
-        },
-        Cpu.EPYC_7313P: {
-            Stat.TASK_CLK: 'task-clock:u',
-            Stat.PAGE_FAULTS: 'page-faults:u',
-            Stat.CYCLES: 'cycles:u',
-            Stat.INSTS: 'instructions:u',
-
-            Stat.BR: 'branches:u',
-            Stat.BR_MISS: 'branch-misses:u',
-            Stat.ITLB: 'iTLB-loads:u',
-            Stat.ITLB_MISS: 'iTLB-load-misses:u',
-            Stat.DTLB: 'dTLB-loads:u',
-            Stat.DTLB_MISS: 'dTLB-load-misses:u',
-            Stat.L1I_CACHE: 'L1-icache-loads:u',
-            Stat.L1I_CACHE_MISS: 'L1-icache-load-misses:u',
-            Stat.L1D_CACHE: 'L1-dcache-loads:u',
-            Stat.L1D_CACHE_MISS: 'L1-dcache-load-misses:u',
-            Stat.LL_CACHE: 'LLC-loads:u',
-            Stat.LL_CACHE_MISS: 'LLC-load-misses:u',
-        }
+    Stat.BR: 'BR_RETIRED:u',
+    Stat.BR_MISS: 'BR_MIS_PRED_RETIRED:u',
+    Stat.ITLB: 'L1I_TLB:u',
+    Stat.ITLB_MISS: 'ITLB_WALK:u',
+    Stat.DTLB: 'L1D_TLB:u',
+    Stat.DTLB_MISS: 'DTLB_WALK:u',
+    Stat.L2D_TLB: 'L2D_TLB:u',
+    Stat.L2D_TLB_MISS: 'L2D_TLB_REFILL:u',
+    Stat.L1I_CACHE: 'L1I_CACHE:u',
+    Stat.L1I_CACHE_MISS: 'L1I_CACHE_REFILL:u',
+    Stat.L1D_CACHE: 'L1D_CACHE:u',
+    Stat.L1D_CACHE_MISS: 'L1D_CACHE_REFILL:u',
+    Stat.L2D_CACHE: 'L2D_CACHE:u',
+    Stat.L2D_CACHE_MISS: 'L2D_CACHE_REFILL:u',
+    Stat.LL_CACHE: 'LL_CACHE_RD:u',
+    Stat.LL_CACHE_MISS: 'LL_CACHE_MISS_RD:u',
 }

+xeon_names = {
+    Stat.TASK_CLK: 'task-clock:u',
+    Stat.PAGE_FAULTS: 'page-faults:u',
+    Stat.CYCLES: 'cycles:u',
+    Stat.INSTS: 'instructions:u',
+
+    Stat.BR: 'branches:u',
+    Stat.BR_MISS: 'branch-misses:u',
+    Stat.ITLB: 'iTLB-loads:u',
+    Stat.ITLB_MISS: 'iTLB-load-misses:u',
+    Stat.DTLB: 'dTLB-loads:u',
+    Stat.DTLB_MISS: 'dTLB-load-misses:u',
+    Stat.L1I_CACHE: 'L1-icache-loads:u',
+    Stat.L1I_CACHE_MISS: 'L1-icache-load-misses:u',
+    Stat.L1D_CACHE: 'L1-dcache-loads:u',
+    Stat.L1D_CACHE_MISS: 'L1-dcache-load-misses:u',
+    Stat.LL_CACHE: 'LLC-loads:u',
+    Stat.LL_CACHE_MISS: 'LLC-load-misses:u',
+}
+
+class CPU(Enum):
+    ALTRA = altra_names
+    XEON = xeon_names
+
 def parse_output_old(filename: str, data: dict[str, str]) -> dict:
    result: dict[str, int | float] = dict()
    cpu: CPU = CPU[data['cpu'].upper()]
@ -127,12 +112,12 @@ def parse_output_old(filename: str, data: dict[str, str]) -> dict:

    return result | parse_power(filename, cpu)

-def parse_output(output: str, cpu: Cpu) -> dict:
+def parse_output(output: str, cpu: CPU) -> dict:
    result = dict()

    for line in output.split('\n'):
-        for stat in [x for x in Stat if x in names[cpu]]:
-            regex = r'^\W*([\d+(,|\.)?]+)\W*.*' + names[cpu][stat]
+        for stat in [x for x in Stat if x in cpu.value]:
+            regex = r'^\W*([\d+(,|\.)?]+)\W*.*' + cpu.value[stat]
            value = re.search(regex, line)

            if value is None:
--- a/pytorch/batch.py
+++ b/pytorch/batch.py
@ -1,7 +1,5 @@
 #! /bin/python3

-from data_stat import Cpu
-
 import argparse
 import glob
 import os
@ -9,7 +7,7 @@ import subprocess
 import random

 parser = argparse.ArgumentParser()
-parser.add_argument('cpu', choices=[x.name.lower() for x in Cpu])
+parser.add_argument('arch')
 parser.add_argument('output_dir')
 parser.add_argument('matrix_dir')
 parser.add_argument('iterations', type=int)
@ -19,27 +17,24 @@ parser.add_argument('--perf', action='store_const', const='--perf')
 parser.add_argument('--power', action='store_const', const='--power')
 parser.add_argument('--distribute', action='store_true')
 args = parser.parse_args()
-args.cpu = Cpu[args.cpu.upper()]

-srun_args = {
-        Cpu.ALTRA: [
-            '--account', 'oasis',
-            '--partition', 'oasis',
-            '--qos', 'oasis-exempt',
-            #'--cpus-per-task 160',
-            '--cpus-per-task', '160',
-            #'--mem 28114',
-            '--mem', '16G',
-            '--ntasks-per-node', '1'#,
-            #'--exclusive',
-            #'--output', '/dev/null',
-            #'--error', '/dev/null'
+srun_args_altra = [
+        '--account', 'oasis',
+        '--partition', 'oasis',
+        '--qos', 'oasis-exempt',
+        #'--cpus-per-task 160',
+        '--cpus-per-task', '160',
+        #'--mem 28114',
+        '--mem', '16G',
+        '--ntasks-per-node', '1'#,
+        #'--exclusive',
+        #'--output', '/dev/null',
+        #'--error', '/dev/null'
        ]
-}

 def srun(srun_args_list: list, run_args, matrix_file: str) -> list:
    run_args_list = [
-            args.cpu.name.lower(),
+            args.arch,
            matrix_file,
            str(args.iterations),
            str(args.baseline_time_s),
@ -53,41 +48,31 @@ def srun(srun_args_list: list, run_args, matrix_file: str) -> list:
 processes = list()

 for i, matrix in enumerate(glob.glob(f'{args.matrix_dir.rstrip("/")}/*.mtx')):
-    if args.distribute:
-        if args.cpu == Cpu.ALTRA:
+    if args.arch == 'altra':
+        if args.distribute:
            i = i % 40
-            srun_args_temp = srun_args[args.cpu] + ['--nodelist', f'oasis{i:02}']
-        elif args.cpu == Cpu.EPYC_7313P:
-            srun_args_temp = srun_args[args.cpu]
-    else:
-        srun_args_temp = srun_args[args.cpu]
+            srun_args = srun_args_altra + ['--nodelist', f'oasis{i:02}']
+        else:
+            srun_args = srun_args_altra

-    output_filename = '_'.join([
-            args.cpu.name.lower(),
-            str(args.baseline_time_s),
-            str(args.baseline_delay_s),
-            os.path.splitext(os.path.basename(matrix))[0],
-            str(args.iterations)])
+        output_filename = '_'.join([
+                args.arch,
+                str(args.baseline_time_s),
+                str(args.baseline_delay_s),
+                os.path.splitext(os.path.basename(matrix))[0],
+                str(args.iterations)])

-    json_filepath = f'{args.output_dir.rstrip("/")}/{output_filename}.json'
-    raw_filepath = f'{args.output_dir.rstrip("/")}/{output_filename}.output'
-    with open(json_filepath, 'w') as json_file, open(raw_filepath, 'w') as raw_file:
-        print(srun(srun_args_temp, args, matrix))
-        print(json_filepath)
-        print(raw_filepath)
+        json_filepath = f'{args.output_dir.rstrip("/")}/{output_filename}.json'
+        raw_filepath = f'{args.output_dir.rstrip("/")}/{output_filename}.output'
+        with open(json_filepath, 'w') as json_file, open(raw_filepath, 'w') as raw_file:
+            print(srun(srun_args, args, matrix))
+            print(json_filepath)
+            print(raw_filepath)

-        processes.append(subprocess.Popen(
-                srun(srun_args_temp, args, matrix),
-                stdout=json_file,
-                stderr=raw_file))
-
-    # Wait on every 10 jobs to avoid socket timeout.
-    if i % 10 == 9:
-        print("Waiting on 10 jobs")
-        for process in processes:
-            process.wait()
-
-        processes = list()
+            processes.append(subprocess.Popen(
+                    srun(srun_args, args, matrix),  # launch exactly the command printed above, so --distribute keeps its --nodelist pin
+                    stdout=json_file,
+                    stderr=raw_file))

 for process in processes:
    process.wait()
--- a/pytorch/data_stat.py
+++ b/pytorch/data_stat.py
@ -1 +0,0 @@
-../analysis/data_stat.py
--- a/pytorch/perf_stat.py
+++ b/pytorch/perf_stat.py
@ -0,0 +1 @@
+../analysis/perf_stat.py
--- a/pytorch/run.py
+++ b/pytorch/run.py
@ -1,16 +1,14 @@
 #! /bin/python3

-import data_stat
-from data_stat import Stat, Cpu
+import perf_stat

 import argparse
 import os, sys
 import subprocess, signal
 import json
-import time

 parser = argparse.ArgumentParser()
-parser.add_argument('cpu', choices=[x.name.lower() for x in Cpu])
+parser.add_argument('arch')
 parser.add_argument('matrix_file')
 parser.add_argument('iterations', type=int)
 parser.add_argument('baseline_time_s', type=int)
@ -19,23 +17,17 @@ parser.add_argument('--perf', action='store_true')
 parser.add_argument('--power', action='store_true')
 parser.add_argument('-d', '--debug', action='store_true')
 args = parser.parse_args()
-args.cpu = Cpu[args.cpu.upper()]

-program = {
-        Cpu.ALTRA: [
-            'apptainer', 'run', 'pytorch-altra.sif', '-c',
-            'numactl --cpunodebind=0 --membind=0 '
-            + f'python spmv.py {args.matrix_file} {args.iterations}']
-}
+program_altra = [
+        'apptainer', 'run', 'pytorch-altra.sif', '-c',
+        'numactl --cpunodebind=0 --membind=0 '
+        + f'python spmv.py {args.matrix_file} {args.iterations}']
 perf = ['perf', 'stat']
-perf_args = {
-        Cpu.ALTRA: [
-            ['-d', '-d'],
-            ['-M', 'branch_misprediction_ratio'],
-            ['-M', 'dtlb_walk_ratio,itlb_walk_ratio'],
-            ['-M', 'l1d_cache_miss_ratio,l1i_cache_miss_ratio'],
-            ['-M', 'l2_cache_miss_ratio,l2_tlb_miss_ratio,ll_cache_read_miss_ratio']]
-}
+perf_altra = [['-d', '-d'],
+        ['-M', 'branch_misprediction_ratio'],
+        ['-M', 'dtlb_walk_ratio,itlb_walk_ratio'],
+        ['-M', 'l1d_cache_miss_ratio,l1i_cache_miss_ratio'],
+        ['-M', 'l2_cache_miss_ratio,l2_tlb_miss_ratio,ll_cache_read_miss_ratio']]

 def baseline_power(baseline_time_s: int) -> list:
    power_process = subprocess.Popen(['./power.py', '-s', str(baseline_time_s)],
@ -45,56 +37,66 @@ def baseline_power(baseline_time_s: int) -> list:
 def run_program(program: list[str]) -> tuple[dict, str]:
    process = subprocess.run(program,
            stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
-    if args.debug:
-        print(process.stdout)
-        print(process.stderr)
+    #print(json.loads(process.stdout))
+    #print(process.stderr)
    return (json.loads(process.stdout), process.stderr)

 result = dict()
-result[Stat.CPU.value] = args.cpu.value
-result[Stat.ITERATIONS.value] = args.iterations
-result[Stat.BASELINE_TIME_S.value] = args.baseline_time_s
-result[Stat.BASELINE_DELAY_S.value] = args.baseline_delay_s
+result['architecture'] = args.arch
+result['iterations'] = args.iterations
+result['baseline_time_s'] = args.baseline_time_s
+result['baseline_delay_s'] = args.baseline_delay_s

-if args.power:
-    time.sleep(args.baseline_delay_s)
-    result[Stat.POWER_BEFORE.value] = baseline_power(args.baseline_time_s)
+if args.power is True:
+    result['power_before'] = baseline_power(args.baseline_time_s)
    if args.debug:
        print(result)

-    run_program(program[args.cpu]) # Warmup
+    run_program(program_altra) # Warmup
    power_process = subprocess.Popen(['./power.py'],
            stdout=subprocess.PIPE, stderr=subprocess.DEVNULL, text=True)

-    result = result | run_program(program[args.cpu])[0]
+    if args.arch == 'altra':  # only the Altra program is wired up so far
+        result = result | run_program(program_altra)[0]
+    elif args.arch == 'x86':
+        print("Arch not implemented yet!", file=sys.stderr)  # diagnostics go to stderr, not stdout
+        sys.exit(1)  # sys.exit instead of the site-provided exit() helper
+    else:
+        print("Unrecognized arch!", file=sys.stderr)
+        sys.exit(1)

    power_process.send_signal(signal.SIGINT)
    if args.debug:
        print(result)

-    result[Stat.POWER.value] = [float(x) for x in power_process.communicate()[0].strip().split('\n')]
-    # Riemann Sum
-    from math import ceil
-    result[Stat.JOULES.value] = sum(result[Stat.POWER.value][-ceil(result[Stat.TIME_S.value]):-1]) + (result[Stat.POWER.value][-1] * (result[Stat.TIME_S.value] % 1))
+    result['power'] = [float(x) for x in power_process.communicate()[0].strip().split('\n')]
    if args.debug:
        print(result)
        #print(len(result['power']))
        #print(sum(result['power']) / len(result['power']))

+    import time
    time.sleep(args.baseline_delay_s)
-    result[Stat.POWER_AFTER.value] = baseline_power(args.baseline_time_s)
+    result['power_after'] = baseline_power(args.baseline_time_s)
    if args.debug:
        print(result)

-if args.perf:
-    for perf_arg in perf_args[args.cpu]:
-        output = run_program(perf + perf_arg + program[args.cpu])[1]
-        print(output, file=sys.stderr)
-        result = result | data_stat.parse_output(output, args.cpu)
-        if args.debug:
-            print(result)
+if args.perf:  # store_true flag, so plain truthiness is sufficient
+    if args.arch == 'altra':
+        for perf_args in perf_altra:  # one perf stat run per option group in perf_altra
+            output = run_program(perf + perf_args + program_altra)[1]
+            print(output, file=sys.stderr)
+            result = result | perf_stat.parse_output(output, perf_stat.CPU.ALTRA)
+            if args.debug:
+                print(result)
+    elif args.arch == 'x86':
+        print("Arch not implemented yet!", file=sys.stderr)  # same wording as the power branch; diagnostics go to stderr
+        sys.exit(1)  # sys.exit instead of the site-provided exit() helper
+    else:
+        print("Unrecognized arch!", file=sys.stderr)
+        sys.exit(1)

-    result = result | data_stat.derive_stats(result)
+    result = result | perf_stat.derive_stats(result)

    if args.debug:
        print(result)
--- a/pytorch/spmv.py
+++ b/pytorch/spmv.py
@ -1,5 +1,3 @@
-from data_stat import Stat
-
 import torch, scipy
 import numpy as np
 import argparse
@ -34,22 +32,19 @@ end = time.time()

 result = dict()

-result[Stat.MATRIX_FILE.value] = os.path.splitext(os.path.basename(args.matrix_file))[0]
-print(f"Matrix: {result[Stat.MATRIX_FILE.value]}", file=sys.stderr)
+result['matrix'] = os.path.splitext(os.path.basename(args.matrix_file))[0]
+print(f"Matrix: {result['matrix']}", file=sys.stderr)

-result[Stat.MATRIX_SHAPE.value] = matrix.shape
-print(f"Shape: {result[Stat.MATRIX_SHAPE.value]}", file=sys.stderr)
+result['shape'] = matrix.shape
+print(f"Shape: {result['shape']}", file=sys.stderr)

-result[Stat.MATRIX_SIZE.value] = matrix.shape[0] * matrix.shape[1]
-print(f"Size: {result[Stat.MATRIX_SIZE.value]}", file=sys.stderr)
+result['nnz'] = matrix.values().shape[0]
+print(f"NNZ: {result['nnz']}", file=sys.stderr)

-result[Stat.MATRIX_NNZ.value] = matrix.values().shape[0]
-print(f"NNZ: {result[Stat.MATRIX_NNZ.value]}", file=sys.stderr)
+result['% density'] = matrix.values().shape[0] / (matrix.shape[0] * matrix.shape[1])
+print(f"Density: {result['% density']}", file=sys.stderr)

-result[Stat.MATRIX_DENSITY.value] = matrix.values().shape[0] / (matrix.shape[0] * matrix.shape[1])
-print(f"Density: {result[Stat.MATRIX_DENSITY.value]}", file=sys.stderr)
-
-result[Stat.TIME_S.value] = end - start
-print(f"Time: {result[Stat.TIME_S.value]} seconds", file=sys.stderr)
+result['time_s'] = end - start
+print(f"Time: {result['time_s']} seconds", file=sys.stderr)

 print(json.dumps(result))