ampere_research/analysis/analyze.py

#!/usr/bin/env python3
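"""Parse and visualize perf/power statistics collected from mini_em runs.

Example invocations (file and directory names here are illustrative, not taken
from the repository):
    python3 analyze.py parse mini_em stats.json -i results/
    python3 analyze.py visualize mini_em stats.json -p box -x solver -y cycles
    python3 analyze.py visualize mini_em stats.json -p line -x maxwell_size -y dtlb_miss_rate -c solver
"""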
import argparse
import json
import math
import os
import re
from enum import Enum

import matplotlib.pyplot as plt
import numpy as np


class Stat(Enum):
    CPU = 'cpu'
    SOLVER = 'solver'
    LIN_ALG = 'linear algebra'
    INPUT_FILE = 'input file'
    MAXWELL_SIZE = 'maxwell size'
    MATRIX_COLS = 'matrix columns'
    POWER_DELTA = 'Δ watt'
    TASK_CLK = 'task clock (msec)'
    PAGE_FAULTS = 'page faults'
    CYCLES = 'cycles'
    INST = 'instructions'
    BR = 'branches'
    BR_MISS = 'branch mispredictions'
    ITLB = 'ITLB accesses'
    ITLB_MISS = 'ITLB misses'
    DTLB = 'DTLB accesses'
    DTLB_MISS = 'DTLB misses'
    L2D_TLB = 'L2D TLB accesses'
    L2D_TLB_MISS = 'L2D TLB misses'
    L1I_CACHE = 'L1I cache accesses'
    L1I_CACHE_MISS = 'L1I cache misses'
    L1D_CACHE = 'L1D cache accesses'
    L1D_CACHE_MISS = 'L1D cache misses'
    L2D_CACHE = 'L2D cache accesses'
    L2D_CACHE_MISS = 'L2D cache misses'
    LL_CACHE = 'LL cache accesses'
    LL_CACHE_MISS = 'LL cache misses'
    BRANCH_MISS_RATE = 'branch miss rate'
    ITLB_MISS_RATE = 'ITLB miss rate'
    DTLB_MISS_RATE = 'DTLB miss rate'
    L1I_CACHE_MISS_RATE = 'L1I cache miss rate'
    L1D_CACHE_MISS_RATE = 'L1D cache miss rate'
    L2D_CACHE_MISS_RATE = 'L2D cache miss rate'
    LL_CACHE_MISS_RATE = 'LL cache miss rate'


# perf event names recorded on the Ampere Altra runs, keyed by Stat.
altra_names = {
    Stat.TASK_CLK: 'task-clock:u',
    Stat.PAGE_FAULTS: 'page-faults:u',
    Stat.CYCLES: 'cycles:u',
    Stat.INST: 'instructions:u',
    Stat.BR: 'BR_RETIRED:u',
    Stat.BR_MISS: 'BR_MIS_PRED_RETIRED:u',
    Stat.ITLB: 'L1I_TLB:u',
    Stat.ITLB_MISS: 'ITLB_WALK:u',
    Stat.DTLB: 'L1D_TLB:u',
    Stat.DTLB_MISS: 'DTLB_WALK:u',
    Stat.L2D_TLB: 'L2D_TLB:u',
    Stat.L2D_TLB_MISS: 'L2D_TLB_REFILL:u',
    Stat.L1I_CACHE: 'L1I_CACHE:u',
    Stat.L1I_CACHE_MISS: 'L1I_CACHE_REFILL:u',
    Stat.L1D_CACHE: 'L1D_CACHE:u',
    Stat.L1D_CACHE_MISS: 'L1D_CACHE_REFILL:u',
    Stat.L2D_CACHE: 'L2D_CACHE:u',
    Stat.L2D_CACHE_MISS: 'L2D_CACHE_REFILL:u',
    Stat.LL_CACHE: 'LL_CACHE_RD:u',
    Stat.LL_CACHE_MISS: 'LL_CACHE_MISS_RD:u',
}


# perf event names recorded on the Intel Xeon runs, keyed by Stat.
xeon_names = {
    Stat.TASK_CLK: 'task-clock:u',
    Stat.PAGE_FAULTS: 'page-faults:u',
    Stat.CYCLES: 'cycles:u',
    Stat.INST: 'instructions:u',
    Stat.BR: 'branches:u',
    Stat.BR_MISS: 'branch-misses:u',
    Stat.ITLB: 'iTLB-loads:u',
    Stat.ITLB_MISS: 'iTLB-load-misses:u',
    Stat.DTLB: 'dTLB-loads:u',
    Stat.DTLB_MISS: 'dTLB-load-misses:u',
    Stat.L1I_CACHE: 'L1-icache-loads:u',
    Stat.L1I_CACHE_MISS: 'L1-icache-load-misses:u',
    Stat.L1D_CACHE: 'L1-dcache-loads:u',
    Stat.L1D_CACHE_MISS: 'L1-dcache-load-misses:u',
    Stat.LL_CACHE: 'LLC-loads:u',
    Stat.LL_CACHE_MISS: 'LLC-load-misses:u',
}
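
# Note: the Xeon event list has no L2D TLB or L2D cache counters, so the
# corresponding miss rates produced by derive_stats() below are left as None
# for Xeon results.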


class Workload(Enum):
    MINI_EM = 'mini_em'


class CPU(Enum):
    ALTRA = altra_names
    XEON = xeon_names


class Plot(Enum):
    BOX = 'box'
    LINE = 'line'


def parse_input(filename: str, workload: Workload) -> dict[str, str | int]:
    """Extract the run parameters (CPU, solver, etc.) encoded in a result filename."""
    # Split filename into each input parameter.
    filename_split = os.path.splitext(filename)[0].split('_')
    i = 0
    data = {Stat.CPU.value: filename_split[i]}
    i += 1
    if workload == Workload.MINI_EM:
        data[Stat.SOLVER.value] = filename_split[i]
        i += 1
        data[Stat.LIN_ALG.value] = filename_split[i]
        i += 1
        data[Stat.INPUT_FILE.value] = filename_split[i]
        regex = r'^maxwell(\d+)'
        data[Stat.MAXWELL_SIZE.value] = int(re.search(regex, filename_split[i]).group(1))
        x = data[Stat.MAXWELL_SIZE.value]
        # 3x^3 + 6x^2 + 3x = 3x(x + 1)^2, the edge count of an x*x*x hexahedral grid.
        data[Stat.MATRIX_COLS.value] = 3 * x ** 3 + 6 * x ** 2 + 3 * x
        i += 1
    return data
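
# Illustrative example (the filename is hypothetical, but follows the
# '<cpu>_<solver>_<linalg>_<maxwellN>_...' layout parsed above): for a mini_em
# result file named altra_gmres_tpetra_maxwell40_output.txt, parse_input()
# would return
#     {'cpu': 'altra', 'solver': 'gmres', 'linear algebra': 'tpetra',
#      'input file': 'maxwell40', 'maxwell size': 40, 'matrix columns': 201720}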


def parse_output(filename: str, data: dict[str, str]) -> dict[str, str | int | float]:
    """Parse the perf counters (and, where present, the power log) for one result file."""
    result: dict[str, int | float] = dict()
    cpu: CPU = CPU[data['cpu'].upper()]
    with open(filename, 'r') as file:
        for line in file:
            for stat in [x for x in Stat if x in cpu.value]:
                # Leading counter value (digits with ',' or '.' separators),
                # followed later in the line by the event name for this CPU.
                regex = r'^\W*([\d,.]+)\W*.*' + cpu.value[stat]
                value = re.search(regex, line)
                if value is None:
                    continue
                elif stat == Stat.TASK_CLK:
                    result[stat.value] = float(value.group(1).replace(',', ''))
                else:
                    result[stat.value] = int(value.group(1).replace(',', ''))
    return result | parse_power(filename, cpu)
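
# For reference, a `perf stat` output line such as (illustrative values)
#     1,234,567,890      cycles:u
# is captured here as result['cycles'] = 1234567890.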


# TODO CHANGE THIS
def parse_power(filename: str, cpu: CPU) -> dict[str, float]:
    """Return the peak power rise over the idle baseline from the matching *_power log."""
    match cpu:
        case CPU.ALTRA:
            class Socket(Enum):
                SOCKET1 = 'Socket1'
                SOCKET2 = 'Socket2'
            data: dict[Socket, list[float]] = {socket: list() for socket in Socket}
            baseline_data: dict[Socket, float] = dict()
            filename = os.path.splitext(filename)[0] + "_power" + os.path.splitext(filename)[1]
            with open(filename, 'r') as file:
                for line in file:
                    # A 'Start' marker separates the idle baseline from the run itself.
                    regex = r'Start'
                    value = re.search(regex, line)
                    if value is not None:
                        for socket in Socket:
                            baseline_data[socket] = np.average(np.array(data[socket]))
                            data[socket] = list()
                        continue
                    for socket in Socket:
                        regex = r'^' + socket.value + r' (\d+\.\d+)'
                        value = re.search(regex, line)
                        if value is not None:
                            data[socket].append(float(value.group(1)))
                            break
            power_deltas: dict[Socket, float] = {
                socket: np.max(np.array(data[socket])) - baseline_data[socket]
                for socket in Socket
            }
            return {Stat.POWER_DELTA.value: max(power_deltas.values())}
        case CPU.XEON:
            # Power logs are not parsed for the Xeon runs.
            return {}
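
# The power log layout assumed above (readings are illustrative): repeated lines of
#     Socket1 167.50
#     Socket2 102.25
# with a line containing 'Start' marking where the idle baseline ends and the
# measured run begins. The reported 'Δ watt' is the peak post-'Start' reading
# minus the average baseline, for whichever socket rose the most.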


def derive_stats(data: dict[str, str | int | float]) -> dict[str, float | None]:
    """Compute miss rates from the raw counters, where both counters were collected."""
    result: dict[str, float | None] = dict()
    result[Stat.BRANCH_MISS_RATE.value] = data[Stat.BR_MISS.value] / data[Stat.BR.value]
    result[Stat.ITLB_MISS_RATE.value] = data[Stat.ITLB_MISS.value] / data[Stat.ITLB.value]
    result[Stat.DTLB_MISS_RATE.value] = data[Stat.DTLB_MISS.value] / data[Stat.DTLB.value]
    result[Stat.L1I_CACHE_MISS_RATE.value] = (
        data[Stat.L1I_CACHE_MISS.value] / data[Stat.L1I_CACHE.value]
        if Stat.L1I_CACHE_MISS.value in data and Stat.L1I_CACHE.value in data
        else None)
    result[Stat.L1D_CACHE_MISS_RATE.value] = (
        data[Stat.L1D_CACHE_MISS.value] / data[Stat.L1D_CACHE.value])
    result[Stat.L2D_CACHE_MISS_RATE.value] = (
        data[Stat.L2D_CACHE_MISS.value] / data[Stat.L2D_CACHE.value]
        if Stat.L2D_CACHE_MISS.value in data and Stat.L2D_CACHE.value in data
        else None)
    result[Stat.LL_CACHE_MISS_RATE.value] = (
        data[Stat.LL_CACHE_MISS.value] / data[Stat.LL_CACHE.value])
    return result
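
# Example: with 1,000,000 L1D cache accesses and 20,000 L1D misses (illustrative
# numbers), the derived 'L1D cache miss rate' is 0.02, i.e. 2% of accesses missed.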


def accumulate(stats_list: list[dict[str, str | int | float]], category: Stat, value: Stat):
    """Group the values of `value` by `category`, returning one array per category."""
    category_list = np.array([stats[category.value] for stats in stats_list if value.value in stats])
    value_list = np.array([stats[value.value] for stats in stats_list if value.value in stats])
    result: dict[str | int, np.ndarray] = dict()
    for key in np.sort(np.unique(category_list)):
        result[key] = value_list[category_list == key]
    return result
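
# For instance, accumulate(stats_list, Stat.SOLVER, Stat.CYCLES) groups the cycle
# counts by solver, e.g. {'gmres': array([...]), 'cg': array([...])}; the solver
# names here are placeholders for whatever appears in the parsed filenames.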


def box_plot(ax, stats_list: list[dict[str, str | int | float]], x: Stat, y: Stat):
    data: dict[str, np.ndarray] = accumulate(stats_list, x, y)
    print("Plotted data: " + str(data))
    ax.boxplot(data.values(), tick_labels=data.keys())
    ax.set_ylabel(y.value)


def line_plot(
    ax, stats_list: list[dict[str, str | int | float]],
    x: Stat, y: Stat, color: Stat
):
    x_data: dict[str, np.ndarray] = accumulate(stats_list, color, x)
    y_data: dict[str, np.ndarray] = accumulate(stats_list, color, y)
    for category in x_data.keys():
        # Sort each series by x so the line is drawn left to right.
        sorted_indices = np.argsort(x_data[category])
        x_data[category] = x_data[category][sorted_indices]
        y_data[category] = y_data[category][sorted_indices]
        ax.plot(x_data[category], y_data[category], label=category)
        print("Plotted x data: " + str(x_data[category]))
        print("Plotted y data: " + str(y_data[category]))
    ax.set_ylabel(y.value)
    ax.grid(True)


def visualize(
    stats_list: list[dict[str, str | int | float]],
    plot: Plot,
    rows: int,
    size_multiplier: int,
    font_size: int,
    x: Stat,
    y: Stat | None,
    color: Stat | None,
    filter_list: list[str] = []
):
    """Plot y (or every available stat when y is None) against x and save the figure."""
    # Remove stats entries containing undesired values (like a specific CPU).
    stats_list = [stats for stats in stats_list
                  if len([stats[key] for key in stats.keys()
                          if stats[key] in filter_list]) == 0]
    handles, labels = [], []
    if y is None:
        # No y given: plot every stat present in the data on a grid of subplots.
        ys = [stat for stat in Stat if stat.value in stats_list[0].keys()
              and stat is not x
              and stat.value not in filter_list]
        fig, axes = plt.subplots(rows, int(math.ceil(len(ys) / rows)),
                                 figsize=(16 * size_multiplier, 9 * size_multiplier))
        match plot:
            case Plot.BOX:
                for i, y in enumerate(ys):
                    box_plot(axes[i % rows][int(i / rows)], stats_list, x, y)
            case Plot.LINE:
                for i, y in enumerate(ys):
                    line_plot(axes[i % rows][int(i / rows)], stats_list, x, y, color)
                    handles, labels = axes[i % rows][int(i / rows)].get_legend_handles_labels()
    else:
        fig, ax = plt.subplots()
        match plot:
            case Plot.BOX:
                box_plot(ax, stats_list, x, y)
            case Plot.LINE:
                line_plot(ax, stats_list, x, y, color)
                handles, labels = ax.get_legend_handles_labels()
    match plot:
        case Plot.BOX:
            title = f"{plot.value}_plot_of_{y.value.replace(' ', '_')}_vs_{x.value.replace(' ', '_')}_excluding_{filter_list}"
        case Plot.LINE:
            title = f"{plot.value}_plot_of_{y.value.replace(' ', '_')}_vs_{x.value.replace(' ', '_')}_by_{color.value.replace(' ', '_')}_excluding_{filter_list}"
    fig.suptitle(title, fontsize=font_size)
    # Box plots produce no legend entries, so only add a legend when handles exist.
    if handles:
        fig.legend(handles, labels, fontsize=font_size)
    fig.supxlabel(x.value, fontsize=font_size)
    plt.savefig(title + ".png", dpi=100)
    plt.show()


def main():
    class Command(Enum):
        PARSE = 'parse'
        VISUALIZE = 'visualize'
    parser = argparse.ArgumentParser()
    parser.add_argument('command', choices=[x.value for x in Command])
    parser.add_argument('workload',
                        choices=[x.name.lower() for x in Workload],
                        help='the workload to analyze')
    parser.add_argument('filepath',
                        help='the output for the ' + Command.PARSE.value + ' command or the input for the ' + Command.VISUALIZE.value + ' command')
    parser.add_argument('-i', '--input',
                        help='the input directory for the parse command')
    parser.add_argument('-p', '--plot',
                        choices=[x.name.lower() for x in Plot],
                        help='the type of plot')
    parser.add_argument('-r', '--rows', type=int,
                        help='the number of rows to display when -y is not specified',
                        default=5)
    parser.add_argument('-s', '--size', type=int,
                        help='figure size multiplier',
                        default=4)
    parser.add_argument('-fs', '--font_size', type=int,
                        help='font size',
                        default=40)
    parser.add_argument('-x',
                        choices=[x.name.lower() for x in Stat],
                        help='the name of the x axis')
    parser.add_argument('-y',
                        choices=[x.name.lower() for x in Stat],
                        help='the name of the y axis')
    parser.add_argument('-c', '--color',
                        choices=[x.name.lower() for x in Stat],
                        help='the stat used to color/group line plots')
    parser.add_argument('-f', '--filter', nargs='+',
                        help='a space-separated list of values to filter out (e.g. a specific CPU or solver)',
                        default=[])
    args = parser.parse_args()
    stats_list: list[dict] = list()
    if args.command == Command.PARSE.value:
        if args.input is None:
            print("An input directory is required with -i")
            exit(-1)
        original_dir = os.getcwd()
        os.chdir(args.input)
        for filename in os.listdir(os.getcwd()):
            # Only parse perf output files; the *_power logs are read alongside them.
            if "output" not in filename:
                continue
            if "power" in filename:
                continue
            stats = parse_input(filename, Workload[args.workload.upper()])
            stats = stats | parse_output(filename, stats)
            stats = stats | derive_stats(stats)
            stats_list.append(stats)
            print(filename + " parsed.")
        os.chdir(original_dir)
        with open(args.filepath, 'w') as file:
            json.dump(stats_list, file, indent=2)
    elif args.command == Command.VISUALIZE.value:
        with open(args.filepath, 'r') as file:
            stats_list = json.load(file)
        x = Stat[args.x.upper()] if args.x is not None else None
        y = Stat[args.y.upper()] if args.y is not None else None
        color = Stat[args.color.upper()] if args.color is not None else None
        visualize(stats_list, Plot[args.plot.upper()], args.rows, args.size, args.font_size, x, y, color, args.filter)


if __name__ == '__main__':
    main()