ignored matrices

new output
2024-12-03 08:54:48 -05:00 · 2024-12-03 08:53:39 -05:00
84 changed files with 3744 additions and 971 deletions
--- a/.gitignore
+++ b/.gitignore
@ -3,3 +3,4 @@
 *.swp
 *.sif
 __pycache__
+matrices/
--- a/pytorch/batch.py
+++ b/pytorch/batch.py
@ -4,6 +4,7 @@ import argparse
 import glob
 import os
 import subprocess
+import random

 parser = argparse.ArgumentParser()
 parser.add_argument('arch')
@ -14,7 +15,7 @@ parser.add_argument('baseline_time_s', type=int)
 parser.add_argument('baseline_delay_s', type=int)
 parser.add_argument('--perf', action='store_const', const='--perf')
 parser.add_argument('--power', action='store_const', const='--power')
-parser.add_argument('--distribute', type=bool)
+parser.add_argument('--distribute', action='store_true')
 args = parser.parse_args()

 srun_args_altra = [
@ -42,32 +43,36 @@ def srun(srun_args_list: list, run_args, matrix_file: str) -> list:
        run_args_list += [args.perf]
    if args.power is not None:
        run_args_list += [args.power]
-    return ['srun'] + srun_args_list + ['run.py'] + run_args_list
+    return ['srun'] + srun_args_list + ['./run.py'] + run_args_list
+
+processes = list()

 for i, matrix in enumerate(glob.glob(f'{args.matrix_dir.rstrip("/")}/*.mtx')):
    if args.arch == 'altra':
-        if args.distribute == True:
+        if args.distribute:
            i = i % 40
-            srun_args_altra += [f'--nodelist oasis{i:02}']
+            srun_args = srun_args_altra + ['--nodelist', f'oasis{i:02}']
+        else:
+            srun_args = srun_args_altra

        output_filename = '_'.join([
                args.arch,
-                str(args.iterations),
-                os.path.splitext(os.path.basename(matrix))[0],
                str(args.baseline_time_s),
-                str(args.baseline_delay_s)])
+                str(args.baseline_delay_s),
+                os.path.splitext(os.path.basename(matrix))[0],
+                str(args.iterations)])

        json_filepath = f'{args.output_dir.rstrip("/")}/{output_filename}.json'
        raw_filepath = f'{args.output_dir.rstrip("/")}/{output_filename}.output'
        with open(json_filepath, 'w') as json_file, open(raw_filepath, 'w') as raw_file:
-            print(srun(srun_args_altra, args, matrix))
-            proc = subprocess.run(
+            print(srun(srun_args, args, matrix))
+            print(json_filepath)
+            print(raw_filepath)
+
+            processes.append(subprocess.Popen(
                    srun(srun_args_altra, args, matrix),
                    stdout=json_file,
-                    stderr=raw_file,
-                    text=True)
-            #output = proc.communicate()
-            #print(output[0])
-            #print(output[1])
+                    stderr=raw_file))

-        break;
+for process in processes:
+    process.wait()
--- a/pytorch/output/altra_10_30_Oregon-2_1000.json
+++ b/pytorch/output/altra_10_30_Oregon-2_1000.json
@ -0,0 +1 @@
+{"architecture": "altra", "iterations": 1000, "baseline_time_s": 10, "baseline_delay_s": 30, "power_before": [37.36, 22.88, 22.36, 22.72, 22.52, 22.2, 21.96, 21.8, 21.48, 21.48], "matrix": "Oregon-2", "shape": [11806, 11806], "nnz": 65460, "% density": 0.0004696458003979807, "time_s": 1.5312557220458984, "power": [26.68, 27.84, 28.48, 29.92, 30.0], "power_after": [21.16, 21.32, 21.16, 21.16, 21.16, 20.88, 20.92, 20.76, 20.96, 21.2], "task clock (msec)": 64.81, "page faults": 3244, "cycles": 82069432, "instructions": 78292700, "branch mispredictions": 319703, "branches": 19996903, "ITLB accesses": 26988315, "ITLB misses": 5988, "DTLB misses": 14570, "DTLB accesses": 36879854, "L1I cache accesses": 30465174, "L1I cache misses": 293085, "L1D cache misses": 487330, "L1D cache accesses": 31932249, "LL cache misses": 545501, "LL cache accesses": 558084, "L2D TLB accesses": 204746, "L2D TLB misses": 25302, "L2D cache misses": 314594, "L2D cache accesses": 1828047, "instructions per cycle": 0.9539812582107307, "branch miss rate": 0.01598762568383714, "ITLB miss rate": 0.00022187379982781437, "DTLB miss rate": 0.0003950666399058955, "L2D TLB miss rate": 0.12357750578765886, "L1I cache miss rate": 0.009620329101025322, "L1D cache miss rate": 0.015261374167538278, "L2D cache miss rate": 0.17209294947011755, "LL cache miss rate": 0.9774532149282187}
--- a/pytorch/output/altra_10_30_Oregon-2_1000.output
+++ b/pytorch/output/altra_10_30_Oregon-2_1000.output
@ -5,45 +5,46 @@ srun: # All submission nodes and all other compute nodes have x86_64 architectur
 srun: # CPUs. Programs, environments, or other software that was built on x86_64     #
 srun: # nodes may need to be rebuilt to properly execute on these nodes.             #
 srun: ################################################################################
-srun: job 3394148 queued and waiting for resources
-srun: job 3394148 has been allocated resources
+srun: job 3394980 queued and waiting for resources
+srun: job 3394980 has been allocated resources
 /nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
  ).to_sparse_csr().type(torch.float)
 tensor(crow_indices=tensor([    0,   583,   584,  ..., 65459, 65460, 65460]),
       col_indices=tensor([   2,   23,   27,  ..., 3324,  958,  841]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(11806, 11806),
       nnz=65460, layout=torch.sparse_csr)
-tensor([0.3190, 0.2829, 0.6210,  ..., 0.9278, 0.7514, 0.5737])
+tensor([0.9231, 0.7723, 0.0509,  ..., 0.0839, 0.6982, 0.3459])
+Matrix: Oregon-2
 Shape: torch.Size([11806, 11806])
 NNZ: 65460
 Density: 0.0004696458003979807
-Time: 0.22389841079711914 seconds
+Time: 1.5677142143249512 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/Oregon-2.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/Oregon-2.mtx 1000':

-             42.01 msec task-clock:u                     #    0.012 CPUs utilized             
+             64.81 msec task-clock:u                     #    0.013 CPUs utilized             
                 0      context-switches:u               #    0.000 /sec                      
                 0      cpu-migrations:u                 #    0.000 /sec                      
-             3,263      page-faults:u                    #   77.672 K/sec                     
-        47,084,933      cycles:u                         #    1.121 GHz                         (65.90%)
-        77,895,119      instructions:u                   #    1.65  insn per cycle              (85.49%)
+             3,244      page-faults:u                    #   50.056 K/sec                     
+        82,069,432      cycles:u                         #    1.266 GHz                         (59.04%)
+        78,292,700      instructions:u                   #    0.95  insn per cycle              (76.75%)
   <not supported>      branches:u                                                            
-           352,740      branch-misses:u                                                       
-        30,958,922      L1-dcache-loads:u                #  736.946 M/sec                     
-           442,351      L1-dcache-load-misses:u          #    1.43% of all L1-dcache accesses 
+           341,509      branch-misses:u                                                         (90.97%)
+        33,032,555      L1-dcache-loads:u                #  509.704 M/sec                     
+           478,674      L1-dcache-load-misses:u          #    1.45% of all L1-dcache accesses 
   <not supported>      LLC-loads:u                                                           
   <not supported>      LLC-load-misses:u                                                     
-        29,506,648      L1-icache-loads:u                #  702.376 M/sec                     
-           272,063      L1-icache-load-misses:u          #    0.92% of all L1-icache accesses 
-        51,646,382      dTLB-loads:u                     #    1.229 G/sec                       (15.87%)
-     <not counted>      dTLB-load-misses:u                                                      (0.00%)
+        31,508,310      L1-icache-loads:u                #  486.184 M/sec                     
+           297,528      L1-icache-load-misses:u          #    0.94% of all L1-icache accesses 
+        49,358,091      dTLB-loads:u                     #  761.613 M/sec                       (27.83%)
+            88,514      dTLB-load-misses:u               #    0.18% of all dTLB cache accesses  (14.82%)
     <not counted>      iTLB-loads:u                                                            (0.00%)
     <not counted>      iTLB-load-misses:u                                                      (0.00%)

-       3.513156571 seconds time elapsed
+       5.016393105 seconds time elapsed

-      15.150380000 seconds user
-      32.922923000 seconds sys
+      16.759527000 seconds user
+      31.429551000 seconds sys



@ -53,21 +54,22 @@ tensor(crow_indices=tensor([    0,   583,   584,  ..., 65459, 65460, 65460]),
       col_indices=tensor([   2,   23,   27,  ..., 3324,  958,  841]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(11806, 11806),
       nnz=65460, layout=torch.sparse_csr)
-tensor([0.0741, 0.5476, 0.1060,  ..., 0.8459, 0.8270, 0.8313])
+tensor([0.8423, 0.9339, 0.8037,  ..., 0.5953, 0.0649, 0.1559])
+Matrix: Oregon-2
 Shape: torch.Size([11806, 11806])
 NNZ: 65460
 Density: 0.0004696458003979807
-Time: 0.20610284805297852 seconds
+Time: 1.516484022140503 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/Oregon-2.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/Oregon-2.mtx 1000':

-           330,923      BR_MIS_PRED_RETIRED:u            #      0.0 per branch  branch_misprediction_ratio
-        19,740,519      BR_RETIRED:u                                                          
+           319,703      BR_MIS_PRED_RETIRED:u            #      0.0 per branch  branch_misprediction_ratio
+        19,996,903      BR_RETIRED:u                                                          

-       3.639725976 seconds time elapsed
+       4.945699041 seconds time elapsed

-      15.493122000 seconds user
-      27.617441000 seconds sys
+      16.431978000 seconds user
+      29.752452000 seconds sys



@ -77,23 +79,24 @@ tensor(crow_indices=tensor([    0,   583,   584,  ..., 65459, 65460, 65460]),
       col_indices=tensor([   2,   23,   27,  ..., 3324,  958,  841]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(11806, 11806),
       nnz=65460, layout=torch.sparse_csr)
-tensor([0.9699, 0.9368, 0.7284,  ..., 0.7182, 0.5308, 0.9833])
+tensor([0.8058, 0.2922, 0.1227,  ..., 0.2176, 0.9496, 0.8838])
+Matrix: Oregon-2
 Shape: torch.Size([11806, 11806])
 NNZ: 65460
 Density: 0.0004696458003979807
-Time: 0.15960955619812012 seconds
+Time: 1.6458909511566162 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/Oregon-2.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/Oregon-2.mtx 1000':

-        27,761,239      L1I_TLB:u                        #      0.0 per TLB access  itlb_walk_ratio
-             6,471      ITLB_WALK:u                                                           
-            17,268      DTLB_WALK:u                      #      0.0 per TLB access  dtlb_walk_ratio
-        36,993,265      L1D_TLB:u                                                             
+        26,988,315      L1I_TLB:u                        #      0.0 per TLB access  itlb_walk_ratio
+             5,988      ITLB_WALK:u                                                           
+            14,570      DTLB_WALK:u                      #      0.0 per TLB access  dtlb_walk_ratio
+        36,879,854      L1D_TLB:u                                                             

-       3.455602215 seconds time elapsed
+       5.011871473 seconds time elapsed

-      15.015027000 seconds user
-      27.930709000 seconds sys
+      16.529942000 seconds user
+      30.438432000 seconds sys



@ -103,23 +106,24 @@ tensor(crow_indices=tensor([    0,   583,   584,  ..., 65459, 65460, 65460]),
       col_indices=tensor([   2,   23,   27,  ..., 3324,  958,  841]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(11806, 11806),
       nnz=65460, layout=torch.sparse_csr)
-tensor([0.5851, 0.3425, 0.8120,  ..., 0.0829, 0.5823, 0.2256])
+tensor([0.7728, 0.1182, 0.3337,  ..., 0.2555, 0.2523, 0.5746])
+Matrix: Oregon-2
 Shape: torch.Size([11806, 11806])
 NNZ: 65460
 Density: 0.0004696458003979807
-Time: 0.15697884559631348 seconds
+Time: 1.529954433441162 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/Oregon-2.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/Oregon-2.mtx 1000':

-        31,834,980      L1I_CACHE:u                      #      0.0 per cache access  l1i_cache_miss_ratio
-           298,333      L1I_CACHE_REFILL:u                                                    
-           466,901      L1D_CACHE_REFILL:u               #      0.0 per cache access  l1d_cache_miss_ratio
-        33,528,976      L1D_CACHE:u                                                           
+        30,465,174      L1I_CACHE:u                      #      0.0 per cache access  l1i_cache_miss_ratio
+           293,085      L1I_CACHE_REFILL:u                                                    
+           487,330      L1D_CACHE_REFILL:u               #      0.0 per cache access  l1d_cache_miss_ratio
+        31,932,249      L1D_CACHE:u                                                           

-       3.452279902 seconds time elapsed
+       4.954100105 seconds time elapsed

-      14.635240000 seconds user
-      28.262858000 seconds sys
+      16.282966000 seconds user
+      28.926724000 seconds sys



@ -129,25 +133,26 @@ tensor(crow_indices=tensor([    0,   583,   584,  ..., 65459, 65460, 65460]),
       col_indices=tensor([   2,   23,   27,  ..., 3324,  958,  841]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(11806, 11806),
       nnz=65460, layout=torch.sparse_csr)
-tensor([0.0772, 0.9112, 0.0293,  ..., 0.4016, 0.4357, 0.5368])
+tensor([0.5613, 0.3211, 0.1739,  ..., 0.5461, 0.1391, 0.8387])
+Matrix: Oregon-2
 Shape: torch.Size([11806, 11806])
 NNZ: 65460
 Density: 0.0004696458003979807
-Time: 0.20962285995483398 seconds
+Time: 1.5726752281188965 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/Oregon-2.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/Oregon-2.mtx 1000':

-           525,505      LL_CACHE_MISS_RD:u               #      1.0 per cache access  ll_cache_read_miss_ratio
-           546,521      LL_CACHE_RD:u                                                         
-           184,884      L2D_TLB:u                        #      0.1 per TLB access  l2_tlb_miss_ratio
-            22,933      L2D_TLB_REFILL:u                                                      
-           292,367      L2D_CACHE_REFILL:u               #      0.2 per cache access  l2_cache_miss_ratio
-         1,706,226      L2D_CACHE:u                                                           
+           545,501      LL_CACHE_MISS_RD:u               #      1.0 per cache access  ll_cache_read_miss_ratio
+           558,084      LL_CACHE_RD:u                                                         
+           204,746      L2D_TLB:u                        #      0.1 per TLB access  l2_tlb_miss_ratio
+            25,302      L2D_TLB_REFILL:u                                                      
+           314,594      L2D_CACHE_REFILL:u               #      0.2 per cache access  l2_cache_miss_ratio
+         1,828,047      L2D_CACHE:u                                                           

-       3.566096255 seconds time elapsed
+       4.866549675 seconds time elapsed

-      15.763579000 seconds user
-      28.620423000 seconds sys
+      16.609257000 seconds user
+      31.381282000 seconds sys



--- a/pytorch/output/altra_10_30_as-caida_1000.json
+++ b/pytorch/output/altra_10_30_as-caida_1000.json
@ -0,0 +1 @@
+{"architecture": "altra", "iterations": 1000, "baseline_time_s": 10, "baseline_delay_s": 30, "power_before": [21.6, 21.64, 21.88, 22.08, 22.2, 22.32, 22.36, 22.04, 22.0, 21.96], "matrix": "as-caida", "shape": [31379, 31379], "nnz": 106762, "% density": 0.00010842726485909405, "time_s": 2.6254467964172363, "power": [30.92, 29.2, 29.52, 29.72, 29.72, 31.72], "power_after": [21.04, 21.28, 21.04, 21.16, 21.16, 20.96, 21.04, 20.88, 20.56, 20.84], "task clock (msec)": 61.4, "page faults": 3507, "cycles": 78967021, "instructions": 94334531, "branch mispredictions": 325893, "branches": 19069753, "ITLB accesses": 27181279, "ITLB misses": 5995, "DTLB misses": 17412, "DTLB accesses": 37016930, "L1I cache accesses": 31535482, "L1I cache misses": 292676, "L1D cache misses": 471752, "L1D cache accesses": 33119145, "LL cache misses": 540894, "LL cache accesses": 554700, "L2D TLB accesses": 191772, "L2D TLB misses": 23711, "L2D cache misses": 306195, "L2D cache accesses": 1755986, "instructions per cycle": 1.1946066827061894, "branch miss rate": 0.017089523917797993, "ITLB miss rate": 0.00022055621444450792, "DTLB miss rate": 0.00047037936425305935, "L2D TLB miss rate": 0.12364161608576851, "L1I cache miss rate": 0.009280847522799873, "L1D cache miss rate": 0.01424408752097918, "L2D cache miss rate": 0.17437211913990203, "LL cache miss rate": 0.975110870740941}
--- a/pytorch/output/altra_10_30_as-caida_1000.output
+++ b/pytorch/output/altra_10_30_as-caida_1000.output
@ -5,8 +5,8 @@ srun: # All submission nodes and all other compute nodes have x86_64 architectur
 srun: # CPUs. Programs, environments, or other software that was built on x86_64     #
 srun: # nodes may need to be rebuilt to properly execute on these nodes.             #
 srun: ################################################################################
-srun: job 3394150 queued and waiting for resources
-srun: job 3394150 has been allocated resources
+srun: job 3394983 queued and waiting for resources
+srun: job 3394983 has been allocated resources
 /nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
  ).to_sparse_csr().type(torch.float)
 tensor(crow_indices=tensor([     0,      0,      0,  ..., 106761, 106761,
@ -14,37 +14,38 @@ tensor(crow_indices=tensor([     0,      0,      0,  ..., 106761, 106761,
       col_indices=tensor([  106,   329,  1040,  ...,   155,   160, 12170]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(31379, 31379),
       nnz=106762, layout=torch.sparse_csr)
-tensor([0.7672, 0.5818, 0.6775,  ..., 0.1052, 0.2539, 0.4347])
+tensor([0.4886, 0.3652, 0.5691,  ..., 0.6466, 0.4355, 0.8397])
+Matrix: as-caida
 Shape: torch.Size([31379, 31379])
 NNZ: 106762
 Density: 0.00010842726485909405
-Time: 0.28373050689697266 seconds
+Time: 2.6297245025634766 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/as-caida.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/as-caida.mtx 1000':

-             60.78 msec task-clock:u                     #    0.017 CPUs utilized             
+             61.40 msec task-clock:u                     #    0.010 CPUs utilized             
                 0      context-switches:u               #    0.000 /sec                      
                 0      cpu-migrations:u                 #    0.000 /sec                      
-             3,300      page-faults:u                    #   54.293 K/sec                     
-        66,733,059      cycles:u                         #    1.098 GHz                         (58.34%)
-        87,889,334      instructions:u                   #    1.32  insn per cycle              (93.45%)
+             3,507      page-faults:u                    #   57.117 K/sec                     
+        78,967,021      cycles:u                         #    1.286 GHz                         (61.13%)
+        94,334,531      instructions:u                   #    1.19  insn per cycle              (95.16%)
   <not supported>      branches:u                                                            
-           369,909      branch-misses:u                                                       
-        31,872,708      L1-dcache-loads:u                #  524.386 M/sec                     
-           465,719      L1-dcache-load-misses:u          #    1.46% of all L1-dcache accesses 
+           365,239      branch-misses:u                                                       
+        33,334,312      L1-dcache-loads:u                #  542.906 M/sec                     
+           457,950      L1-dcache-load-misses:u          #    1.37% of all L1-dcache accesses 
   <not supported>      LLC-loads:u                                                           
   <not supported>      LLC-load-misses:u                                                     
-        30,443,353      L1-icache-loads:u                #  500.870 M/sec                     
-           292,371      L1-icache-load-misses:u          #    0.96% of all L1-icache accesses 
-        34,702,735      dTLB-loads:u                     #  570.947 M/sec                       (6.96%)
+        31,725,851      L1-icache-loads:u                #  516.709 M/sec                     
+           297,720      L1-icache-load-misses:u          #    0.94% of all L1-icache accesses 
+        25,188,580      dTLB-loads:u                     #  410.239 M/sec                       (5.16%)
     <not counted>      dTLB-load-misses:u                                                      (0.00%)
     <not counted>      iTLB-loads:u                                                            (0.00%)
     <not counted>      iTLB-load-misses:u                                                      (0.00%)

-       3.683429807 seconds time elapsed
+       6.049042045 seconds time elapsed

-      15.161162000 seconds user
-      31.335288000 seconds sys
+      17.649315000 seconds user
+      29.335859000 seconds sys



@ -55,21 +56,22 @@ tensor(crow_indices=tensor([     0,      0,      0,  ..., 106761, 106761,
       col_indices=tensor([  106,   329,  1040,  ...,   155,   160, 12170]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(31379, 31379),
       nnz=106762, layout=torch.sparse_csr)
-tensor([0.2708, 0.2455, 0.7615,  ..., 0.1172, 0.4072, 0.8970])
+tensor([0.8344, 0.2588, 0.2246,  ..., 0.5607, 0.8141, 0.9893])
+Matrix: as-caida
 Shape: torch.Size([31379, 31379])
 NNZ: 106762
 Density: 0.00010842726485909405
-Time: 0.32511067390441895 seconds
+Time: 2.6495532989501953 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/as-caida.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/as-caida.mtx 1000':

-           326,300      BR_MIS_PRED_RETIRED:u            #      0.0 per branch  branch_misprediction_ratio
-        19,832,700      BR_RETIRED:u                                                          
+           325,893      BR_MIS_PRED_RETIRED:u            #      0.0 per branch  branch_misprediction_ratio
+        19,069,753      BR_RETIRED:u                                                          

-       3.755497210 seconds time elapsed
+       6.023780447 seconds time elapsed

-      14.681699000 seconds user
-      29.413955000 seconds sys
+      17.654658000 seconds user
+      28.848805000 seconds sys



@ -80,23 +82,24 @@ tensor(crow_indices=tensor([     0,      0,      0,  ..., 106761, 106761,
       col_indices=tensor([  106,   329,  1040,  ...,   155,   160, 12170]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(31379, 31379),
       nnz=106762, layout=torch.sparse_csr)
-tensor([0.9417, 0.0965, 0.8551,  ..., 0.6665, 0.0164, 0.5102])
+tensor([0.0814, 0.1132, 0.8515,  ..., 0.8987, 0.5912, 0.5002])
+Matrix: as-caida
 Shape: torch.Size([31379, 31379])
 NNZ: 106762
 Density: 0.00010842726485909405
-Time: 0.33124780654907227 seconds
+Time: 2.5444185733795166 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/as-caida.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/as-caida.mtx 1000':

-        27,233,629      L1I_TLB:u                        #      0.0 per TLB access  itlb_walk_ratio
-             5,868      ITLB_WALK:u                                                           
-            16,893      DTLB_WALK:u                      #      0.0 per TLB access  dtlb_walk_ratio
-        36,409,508      L1D_TLB:u                                                             
+        27,181,279      L1I_TLB:u                        #      0.0 per TLB access  itlb_walk_ratio
+             5,995      ITLB_WALK:u                                                           
+            17,412      DTLB_WALK:u                      #      0.0 per TLB access  dtlb_walk_ratio
+        37,016,930      L1D_TLB:u                                                             

-       3.751203540 seconds time elapsed
+       5.790360666 seconds time elapsed

-      14.849342000 seconds user
-      27.706396000 seconds sys
+      17.919315000 seconds user
+      30.569858000 seconds sys



@ -107,23 +110,24 @@ tensor(crow_indices=tensor([     0,      0,      0,  ..., 106761, 106761,
       col_indices=tensor([  106,   329,  1040,  ...,   155,   160, 12170]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(31379, 31379),
       nnz=106762, layout=torch.sparse_csr)
-tensor([0.9215, 0.4139, 0.1789,  ..., 0.0245, 0.0029, 0.2129])
+tensor([0.0439, 0.1884, 0.3342,  ..., 0.2027, 0.5532, 0.7245])
+Matrix: as-caida
 Shape: torch.Size([31379, 31379])
 NNZ: 106762
 Density: 0.00010842726485909405
-Time: 0.3386805057525635 seconds
+Time: 2.620804786682129 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/as-caida.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/as-caida.mtx 1000':

-        30,924,532      L1I_CACHE:u                      #      0.0 per cache access  l1i_cache_miss_ratio
-           288,199      L1I_CACHE_REFILL:u                                                    
-           462,816      L1D_CACHE_REFILL:u               #      0.0 per cache access  l1d_cache_miss_ratio
-        32,428,375      L1D_CACHE:u                                                           
+        31,535,482      L1I_CACHE:u                      #      0.0 per cache access  l1i_cache_miss_ratio
+           292,676      L1I_CACHE_REFILL:u                                                    
+           471,752      L1D_CACHE_REFILL:u               #      0.0 per cache access  l1d_cache_miss_ratio
+        33,119,145      L1D_CACHE:u                                                           

-       3.628443937 seconds time elapsed
+       6.002311801 seconds time elapsed

-      15.430937000 seconds user
-      30.878583000 seconds sys
+      17.427887000 seconds user
+      30.063688000 seconds sys



@ -134,25 +138,26 @@ tensor(crow_indices=tensor([     0,      0,      0,  ..., 106761, 106761,
       col_indices=tensor([  106,   329,  1040,  ...,   155,   160, 12170]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(31379, 31379),
       nnz=106762, layout=torch.sparse_csr)
-tensor([0.4983, 0.0268, 0.1695,  ..., 0.6987, 0.7224, 0.8577])
+tensor([0.1495, 0.5856, 0.8600,  ..., 0.2101, 0.6229, 0.2019])
+Matrix: as-caida
 Shape: torch.Size([31379, 31379])
 NNZ: 106762
 Density: 0.00010842726485909405
-Time: 0.3289623260498047 seconds
+Time: 2.561279296875 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/as-caida.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/as-caida.mtx 1000':

-           551,997      LL_CACHE_MISS_RD:u               #      1.0 per cache access  ll_cache_read_miss_ratio
-           568,528      LL_CACHE_RD:u                                                         
-           193,991      L2D_TLB:u                        #      0.1 per TLB access  l2_tlb_miss_ratio
-            24,353      L2D_TLB_REFILL:u                                                      
-           312,207      L2D_CACHE_REFILL:u               #      0.2 per cache access  l2_cache_miss_ratio
-         1,821,196      L2D_CACHE:u                                                           
+           540,894      LL_CACHE_MISS_RD:u               #      1.0 per cache access  ll_cache_read_miss_ratio
+           554,700      LL_CACHE_RD:u                                                         
+           191,772      L2D_TLB:u                        #      0.1 per TLB access  l2_tlb_miss_ratio
+            23,711      L2D_TLB_REFILL:u                                                      
+           306,195      L2D_CACHE_REFILL:u               #      0.2 per cache access  l2_cache_miss_ratio
+         1,755,986      L2D_CACHE:u                                                           

-       3.698790384 seconds time elapsed
+       5.946428572 seconds time elapsed

-      15.745189000 seconds user
-      31.063512000 seconds sys
+      17.396567000 seconds user
+      32.141235000 seconds sys



--- a/pytorch/output/altra_10_30_dc2_1000.json
+++ b/pytorch/output/altra_10_30_dc2_1000.json
@ -0,0 +1 @@
+{"architecture": "altra", "iterations": 1000, "baseline_time_s": 10, "baseline_delay_s": 30, "power_before": [83.04, 78.44, 65.92, 53.76, 38.68, 38.68, 25.68, 22.6, 22.52, 22.32], "matrix": "dc2", "shape": [116835, 116835], "nnz": 766396, "% density": 5.614451099680581e-05, "time_s": 14.128849267959595, "power": [89.84, 89.4, 82.8, 71.32, 57.72, 51.92, 53.0, 63.8, 78.24, 78.24, 90.2, 90.36, 90.08, 88.64, 88.64, 87.64, 87.68, 87.24], "power_after": [21.4, 21.2, 21.08, 21.08, 21.28, 21.04, 20.92, 21.12, 21.08, 21.0], "task clock (msec)": 58.45, "page faults": 3471, "cycles": 76691414, "instructions": 89547095, "branch mispredictions": 329725, "branches": 19946857, "ITLB accesses": 27648951, "ITLB misses": 6857, "DTLB misses": 18047, "DTLB accesses": 37225736, "L1I cache accesses": 32434686, "L1I cache misses": 293072, "L1D cache misses": 483557, "L1D cache accesses": 34059722, "LL cache misses": 561480, "LL cache accesses": 578369, "L2D TLB accesses": 192306, "L2D TLB misses": 25364, "L2D cache misses": 317121, "L2D cache accesses": 1812330, "instructions per cycle": 1.16762868656979, "branch miss rate": 0.01653017314958442, "ITLB miss rate": 0.00024800217556174194, "DTLB miss rate": 0.00048479901109275584, "L2D TLB miss rate": 0.13189396066685385, "L1I cache miss rate": 0.00903575881696527, "L1D cache miss rate": 0.014197326683993487, "L2D cache miss rate": 0.17497972223601663, "LL cache miss rate": 0.9707989190292011}
--- a/pytorch/output/altra_10_30_dc2_1000.output
+++ b/pytorch/output/altra_10_30_dc2_1000.output
@ -5,8 +5,8 @@ srun: # All submission nodes and all other compute nodes have x86_64 architectur
 srun: # CPUs. Programs, environments, or other software that was built on x86_64     #
 srun: # nodes may need to be rebuilt to properly execute on these nodes.             #
 srun: ################################################################################
-srun: job 3394149 queued and waiting for resources
-srun: job 3394149 has been allocated resources
+srun: job 3394982 queued and waiting for resources
+srun: job 3394982 has been allocated resources
 /nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
  ).to_sparse_csr().type(torch.float)
 tensor(crow_indices=tensor([     0,      1,      2,  ..., 766390, 766394,
@ -16,37 +16,38 @@ tensor(crow_indices=tensor([     0,      1,      2,  ..., 766390, 766394,
       values=tensor([-1.0000e+00, -1.0000e+00, -1.0000e+00,  ...,
                       1.0331e+01, -1.0000e-03,  1.0000e-03]),
       size=(116835, 116835), nnz=766396, layout=torch.sparse_csr)
-tensor([0.4749, 0.3788, 0.8812,  ..., 0.8281, 0.8889, 0.4945])
+tensor([0.0986, 0.6504, 0.0132,  ..., 0.6525, 0.3337, 0.7557])
+Matrix: dc2
 Shape: torch.Size([116835, 116835])
 NNZ: 766396
 Density: 5.614451099680581e-05
-Time: 2.2480316162109375 seconds
+Time: 18.46260714530945 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/dc2.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/dc2.mtx 1000':

-             50.43 msec task-clock:u                     #    0.009 CPUs utilized             
+             58.45 msec task-clock:u                     #    0.003 CPUs utilized             
                 0      context-switches:u               #    0.000 /sec                      
                 0      cpu-migrations:u                 #    0.000 /sec                      
-             3,285      page-faults:u                    #   65.135 K/sec                     
-        54,118,679      cycles:u                         #    1.073 GHz                         (60.92%)
-        77,692,421      instructions:u                   #    1.44  insn per cycle              (82.73%)
+             3,471      page-faults:u                    #   59.382 K/sec                     
+        76,691,414      cycles:u                         #    1.312 GHz                         (41.20%)
+        89,547,095      instructions:u                   #    1.17  insn per cycle              (73.16%)
   <not supported>      branches:u                                                            
-           367,999      branch-misses:u                                                       
-        32,182,371      L1-dcache-loads:u                #  638.112 M/sec                     
-           491,960      L1-dcache-load-misses:u          #    1.53% of all L1-dcache accesses 
+           382,362      branch-misses:u                                                         (96.21%)
+        33,271,433      L1-dcache-loads:u                #  569.211 M/sec                     
+           488,730      L1-dcache-load-misses:u          #    1.47% of all L1-dcache accesses 
   <not supported>      LLC-loads:u                                                           
   <not supported>      LLC-load-misses:u                                                     
-        30,682,258      L1-icache-loads:u                #  608.367 M/sec                     
-           300,874      L1-icache-load-misses:u          #    0.98% of all L1-icache accesses 
-        55,244,523      dTLB-loads:u                     #    1.095 G/sec                       (19.09%)
-     <not counted>      dTLB-load-misses:u                                                      (0.00%)
+        31,926,596      L1-icache-loads:u                #  546.204 M/sec                     
+           304,792      L1-icache-load-misses:u          #    0.95% of all L1-icache accesses 
+        36,392,791      dTLB-loads:u                     #  622.612 M/sec                       (31.21%)
+                 0      dTLB-load-misses:u                                                      (5.35%)
     <not counted>      iTLB-loads:u                                                            (0.00%)
     <not counted>      iTLB-load-misses:u                                                      (0.00%)

-       5.813837947 seconds time elapsed
+      22.126601025 seconds time elapsed

-      28.815118000 seconds user
-     213.749674000 seconds sys
+     103.642372000 seconds user
+    1434.131491000 seconds sys



@ -59,21 +60,22 @@ tensor(crow_indices=tensor([     0,      1,      2,  ..., 766390, 766394,
       values=tensor([-1.0000e+00, -1.0000e+00, -1.0000e+00,  ...,
                       1.0331e+01, -1.0000e-03,  1.0000e-03]),
       size=(116835, 116835), nnz=766396, layout=torch.sparse_csr)
-tensor([0.9715, 0.3920, 0.0297,  ..., 0.1819, 0.5744, 0.8105])
+tensor([0.5605, 0.9374, 0.4444,  ..., 0.5937, 0.3099, 0.2252])
+Matrix: dc2
 Shape: torch.Size([116835, 116835])
 NNZ: 766396
 Density: 5.614451099680581e-05
-Time: 2.2333595752716064 seconds
+Time: 13.607120752334595 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/dc2.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/dc2.mtx 1000':

-           325,039      BR_MIS_PRED_RETIRED:u            #      0.0 per branch  branch_misprediction_ratio
-        19,383,216      BR_RETIRED:u                                                          
+           329,725      BR_MIS_PRED_RETIRED:u            #      0.0 per branch  branch_misprediction_ratio
+        19,946,857      BR_RETIRED:u                                                          

-       5.973132269 seconds time elapsed
+      17.131143957 seconds time elapsed

-      29.719778000 seconds user
-     213.706315000 seconds sys
+      96.945305000 seconds user
+    1045.242697000 seconds sys



@ -86,23 +88,24 @@ tensor(crow_indices=tensor([     0,      1,      2,  ..., 766390, 766394,
       values=tensor([-1.0000e+00, -1.0000e+00, -1.0000e+00,  ...,
                       1.0331e+01, -1.0000e-03,  1.0000e-03]),
       size=(116835, 116835), nnz=766396, layout=torch.sparse_csr)
-tensor([0.3371, 0.4985, 0.9905,  ..., 0.6075, 0.1568, 0.3782])
+tensor([0.8954, 0.9777, 0.8042,  ..., 0.2069, 0.7063, 0.8479])
+Matrix: dc2
 Shape: torch.Size([116835, 116835])
 NNZ: 766396
 Density: 5.614451099680581e-05
-Time: 1.9790923595428467 seconds
+Time: 17.22396969795227 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/dc2.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/dc2.mtx 1000':

-        26,060,519      L1I_TLB:u                        #      0.0 per TLB access  itlb_walk_ratio
-             4,749      ITLB_WALK:u                                                           
-            16,865      DTLB_WALK:u                      #      0.0 per TLB access  dtlb_walk_ratio
-        34,819,729      L1D_TLB:u                                                             
+        27,648,951      L1I_TLB:u                        #      0.0 per TLB access  itlb_walk_ratio
+             6,857      ITLB_WALK:u                                                           
+            18,047      DTLB_WALK:u                      #      0.0 per TLB access  dtlb_walk_ratio
+        37,225,736      L1D_TLB:u                                                             

-       5.575020445 seconds time elapsed
+      20.911480243 seconds time elapsed

-      26.769391000 seconds user
-     188.138935000 seconds sys
+     107.392462000 seconds user
+    1329.272154000 seconds sys



@ -115,23 +118,24 @@ tensor(crow_indices=tensor([     0,      1,      2,  ..., 766390, 766394,
       values=tensor([-1.0000e+00, -1.0000e+00, -1.0000e+00,  ...,
                       1.0331e+01, -1.0000e-03,  1.0000e-03]),
       size=(116835, 116835), nnz=766396, layout=torch.sparse_csr)
-tensor([0.6806, 0.8858, 0.7035,  ..., 0.6007, 0.0880, 0.4550])
+tensor([0.9293, 0.9606, 0.8914,  ..., 0.2407, 0.2843, 0.5174])
+Matrix: dc2
 Shape: torch.Size([116835, 116835])
 NNZ: 766396
 Density: 5.614451099680581e-05
-Time: 1.5306556224822998 seconds
+Time: 13.233965873718262 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/dc2.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/dc2.mtx 1000':

-        30,777,115      L1I_CACHE:u                      #      0.0 per cache access  l1i_cache_miss_ratio
-           293,980      L1I_CACHE_REFILL:u                                                    
-           461,522      L1D_CACHE_REFILL:u               #      0.0 per cache access  l1d_cache_miss_ratio
-        32,216,597      L1D_CACHE:u                                                           
+        32,434,686      L1I_CACHE:u                      #      0.0 per cache access  l1i_cache_miss_ratio
+           293,072      L1I_CACHE_REFILL:u                                                    
+           483,557      L1D_CACHE_REFILL:u               #      0.0 per cache access  l1d_cache_miss_ratio
+        34,059,722      L1D_CACHE:u                                                           

-       4.961298684 seconds time elapsed
+      16.956477005 seconds time elapsed

-      23.946357000 seconds user
-     156.598674000 seconds sys
+      88.393687000 seconds user
+    1037.101858000 seconds sys



@ -144,25 +148,26 @@ tensor(crow_indices=tensor([     0,      1,      2,  ..., 766390, 766394,
       values=tensor([-1.0000e+00, -1.0000e+00, -1.0000e+00,  ...,
                       1.0331e+01, -1.0000e-03,  1.0000e-03]),
       size=(116835, 116835), nnz=766396, layout=torch.sparse_csr)
-tensor([0.3029, 0.1908, 0.9816,  ..., 0.0418, 0.8182, 0.5474])
+tensor([0.8850, 0.9552, 0.7029,  ..., 0.3357, 0.0248, 0.5395])
+Matrix: dc2
 Shape: torch.Size([116835, 116835])
 NNZ: 766396
 Density: 5.614451099680581e-05
-Time: 2.28926944732666 seconds
+Time: 13.873224973678589 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/dc2.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/dc2.mtx 1000':

-           567,700      LL_CACHE_MISS_RD:u               #      1.0 per cache access  ll_cache_read_miss_ratio
-           588,689      LL_CACHE_RD:u                                                         
-           189,417      L2D_TLB:u                        #      0.1 per TLB access  l2_tlb_miss_ratio
-            22,360      L2D_TLB_REFILL:u                                                      
-           328,306      L2D_CACHE_REFILL:u               #      0.2 per cache access  l2_cache_miss_ratio
-         1,908,607      L2D_CACHE:u                                                           
+           561,480      LL_CACHE_MISS_RD:u               #      1.0 per cache access  ll_cache_read_miss_ratio
+           578,369      LL_CACHE_RD:u                                                         
+           192,306      L2D_TLB:u                        #      0.1 per TLB access  l2_tlb_miss_ratio
+            25,364      L2D_TLB_REFILL:u                                                      
+           317,121      L2D_CACHE_REFILL:u               #      0.2 per cache access  l2_cache_miss_ratio
+         1,812,330      L2D_CACHE:u                                                           

-       5.710829283 seconds time elapsed
+      17.467787426 seconds time elapsed

-      28.671301000 seconds user
-     213.960421000 seconds sys
+      92.463054000 seconds user
+    1072.584062000 seconds sys



--- a/pytorch/output/altra_10_30_de2010_1000.json
+++ b/pytorch/output/altra_10_30_de2010_1000.json
@ -0,0 +1 @@
+{"architecture": "altra", "iterations": 1000, "baseline_time_s": 10, "baseline_delay_s": 30, "power_before": [28.56, 28.04, 23.8, 23.08, 22.12, 21.16, 21.16, 21.0, 20.96, 20.72], "matrix": "de2010", "shape": [24115, 24115], "nnz": 116056, "% density": 0.0001995689928120616, "time_s": 2.713265895843506, "power": [33.24, 30.84, 29.96, 27.68, 25.8, 25.8, 31.16], "power_after": [20.6, 20.48, 20.24, 20.32, 20.2, 20.36, 20.4, 20.4, 20.36, 20.36], "task clock (msec)": 48.96, "page faults": 3285, "cycles": 48563060, "instructions": 73465190, "branch mispredictions": 326361, "branches": 19599354, "ITLB accesses": 26666488, "ITLB misses": 6643, "DTLB misses": 17347, "DTLB accesses": 35986736, "L1I cache accesses": 32502068, "L1I cache misses": 302739, "L1D cache misses": 480619, "L1D cache accesses": 34031072, "LL cache misses": 552815, "LL cache accesses": 567373, "L2D TLB accesses": 188248, "L2D TLB misses": 23165, "L2D cache misses": 308211, "L2D cache accesses": 1787647, "instructions per cycle": 1.5127792606149613, "branch miss rate": 0.016651620252381788, "ITLB miss rate": 0.0002491141690649327, "DTLB miss rate": 0.0004820387155978803, "L2D TLB miss rate": 0.12305575623645404, "L1I cache miss rate": 0.00931445346800702, "L1D cache miss rate": 0.014122946229845479, "L2D cache miss rate": 0.17241155552522394, "LL cache miss rate": 0.9743413944618443}
--- a/pytorch/output/altra_10_30_de2010_1000.output
+++ b/pytorch/output/altra_10_30_de2010_1000.output
@ -0,0 +1,168 @@
+srun: Job time limit was unset; set to partition default of 60 minutes
+srun: ################################################################################
+srun: # Please note that the oasis compute nodes have aarch64 architecture CPUs.     #
+srun: # All submission nodes and all other compute nodes have x86_64 architecture    #
+srun: # CPUs. Programs, environments, or other software that was built on x86_64     #
+srun: # nodes may need to be rebuilt to properly execute on these nodes.             #
+srun: ################################################################################
+srun: job 3394985 queued and waiting for resources
+srun: job 3394985 has been allocated resources
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([     0,     13,     21,  ..., 116047, 116051,
+                            116056]),
+       col_indices=tensor([  250,   251,   757,  ..., 23334, 23553, 24050]),
+       values=tensor([ 14900.,  33341.,  20255.,  ..., 164227.,  52413.,
+                       16949.]), size=(24115, 24115), nnz=116056,
+       layout=torch.sparse_csr)
+tensor([0.6055, 0.8789, 0.0482,  ..., 0.0736, 0.1316, 0.6744])
+Matrix: de2010
+Shape: torch.Size([24115, 24115])
+NNZ: 116056
+Density: 0.0001995689928120616
+Time: 2.6956887245178223 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/de2010.mtx 1000':
+
+             48.96 msec task-clock:u                     #    0.008 CPUs utilized             
+                 0      context-switches:u               #    0.000 /sec                      
+                 0      cpu-migrations:u                 #    0.000 /sec                      
+             3,285      page-faults:u                    #   67.090 K/sec                     
+        48,563,060      cycles:u                         #    0.992 GHz                         (59.76%)
+        73,465,190      instructions:u                   #    1.51  insn per cycle              (78.23%)
+   <not supported>      branches:u                                                            
+           369,314      branch-misses:u                                                         (98.16%)
+        31,769,641      L1-dcache-loads:u                #  648.836 M/sec                     
+           479,594      L1-dcache-load-misses:u          #    1.51% of all L1-dcache accesses 
+   <not supported>      LLC-loads:u                                                           
+   <not supported>      LLC-load-misses:u                                                     
+        30,338,929      L1-icache-loads:u                #  619.616 M/sec                     
+           282,162      L1-icache-load-misses:u          #    0.93% of all L1-icache accesses 
+        55,516,925      dTLB-loads:u                     #    1.134 G/sec                       (23.54%)
+            12,345      dTLB-load-misses:u               #    0.02% of all dTLB cache accesses  (3.47%)
+     <not counted>      iTLB-loads:u                                                            (0.00%)
+     <not counted>      iTLB-load-misses:u                                                      (0.00%)
+
+       6.017085179 seconds time elapsed
+
+      17.484355000 seconds user
+      28.678064000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([     0,     13,     21,  ..., 116047, 116051,
+                            116056]),
+       col_indices=tensor([  250,   251,   757,  ..., 23334, 23553, 24050]),
+       values=tensor([ 14900.,  33341.,  20255.,  ..., 164227.,  52413.,
+                       16949.]), size=(24115, 24115), nnz=116056,
+       layout=torch.sparse_csr)
+tensor([0.2815, 0.8196, 0.3706,  ..., 0.1328, 0.4062, 0.9113])
+Matrix: de2010
+Shape: torch.Size([24115, 24115])
+NNZ: 116056
+Density: 0.0001995689928120616
+Time: 2.7908551692962646 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/de2010.mtx 1000':
+
+           326,361      BR_MIS_PRED_RETIRED:u            #      0.0 per branch  branch_misprediction_ratio
+        19,599,354      BR_RETIRED:u                                                          
+
+       6.215591535 seconds time elapsed
+
+      18.097112000 seconds user
+      27.831633000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([     0,     13,     21,  ..., 116047, 116051,
+                            116056]),
+       col_indices=tensor([  250,   251,   757,  ..., 23334, 23553, 24050]),
+       values=tensor([ 14900.,  33341.,  20255.,  ..., 164227.,  52413.,
+                       16949.]), size=(24115, 24115), nnz=116056,
+       layout=torch.sparse_csr)
+tensor([0.9002, 0.0843, 0.5558,  ..., 0.3931, 0.8070, 0.7414])
+Matrix: de2010
+Shape: torch.Size([24115, 24115])
+NNZ: 116056
+Density: 0.0001995689928120616
+Time: 2.819589376449585 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/de2010.mtx 1000':
+
+        26,666,488      L1I_TLB:u                        #      0.0 per TLB access  itlb_walk_ratio
+             6,643      ITLB_WALK:u                                                           
+            17,347      DTLB_WALK:u                      #      0.0 per TLB access  dtlb_walk_ratio
+        35,986,736      L1D_TLB:u                                                             
+
+       6.243883495 seconds time elapsed
+
+      17.783312000 seconds user
+      31.714619000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([     0,     13,     21,  ..., 116047, 116051,
+                            116056]),
+       col_indices=tensor([  250,   251,   757,  ..., 23334, 23553, 24050]),
+       values=tensor([ 14900.,  33341.,  20255.,  ..., 164227.,  52413.,
+                       16949.]), size=(24115, 24115), nnz=116056,
+       layout=torch.sparse_csr)
+tensor([0.9109, 0.6392, 0.7899,  ..., 0.0945, 0.3298, 0.6865])
+Matrix: de2010
+Shape: torch.Size([24115, 24115])
+NNZ: 116056
+Density: 0.0001995689928120616
+Time: 2.747800827026367 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/de2010.mtx 1000':
+
+        32,502,068      L1I_CACHE:u                      #      0.0 per cache access  l1i_cache_miss_ratio
+           302,739      L1I_CACHE_REFILL:u                                                    
+           480,619      L1D_CACHE_REFILL:u               #      0.0 per cache access  l1d_cache_miss_ratio
+        34,031,072      L1D_CACHE:u                                                           
+
+       6.126767063 seconds time elapsed
+
+      17.702029000 seconds user
+      29.137072000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([     0,     13,     21,  ..., 116047, 116051,
+                            116056]),
+       col_indices=tensor([  250,   251,   757,  ..., 23334, 23553, 24050]),
+       values=tensor([ 14900.,  33341.,  20255.,  ..., 164227.,  52413.,
+                       16949.]), size=(24115, 24115), nnz=116056,
+       layout=torch.sparse_csr)
+tensor([0.7083, 0.6766, 0.7649,  ..., 0.3027, 0.9885, 0.8086])
+Matrix: de2010
+Shape: torch.Size([24115, 24115])
+NNZ: 116056
+Density: 0.0001995689928120616
+Time: 2.795116901397705 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/de2010.mtx 1000':
+
+           552,815      LL_CACHE_MISS_RD:u               #      1.0 per cache access  ll_cache_read_miss_ratio
+           567,373      LL_CACHE_RD:u                                                         
+           188,248      L2D_TLB:u                        #      0.1 per TLB access  l2_tlb_miss_ratio
+            23,165      L2D_TLB_REFILL:u                                                      
+           308,211      L2D_CACHE_REFILL:u               #      0.2 per cache access  l2_cache_miss_ratio
+         1,787,647      L2D_CACHE:u                                                           
+
+       6.041792624 seconds time elapsed
+
+      17.791735000 seconds user
+      29.790006000 seconds sys
+
+
+
--- a/pytorch/output/altra_10_30_email-Enron_1000.json
+++ b/pytorch/output/altra_10_30_email-Enron_1000.json
@ -0,0 +1 @@
+{"architecture": "altra", "iterations": 1000, "baseline_time_s": 10, "baseline_delay_s": 30, "power_before": [28.96, 27.92, 27.24, 23.0, 22.28, 22.28, 21.6, 20.8, 20.68, 20.76], "matrix": "email-Enron", "shape": [36692, 36692], "nnz": 367662, "% density": 0.0002730901120626302, "time_s": 12.818164587020874, "power": [84.24, 82.72, 82.72, 72.0, 60.2, 51.88, 52.4, 59.36, 72.08, 83.88, 86.48, 84.28, 82.28, 81.12, 80.96, 80.96, 81.16], "power_after": [20.92, 20.92, 20.92, 20.92, 21.0, 20.96, 20.88, 20.84, 20.88, 20.68], "task clock (msec)": 48.76, "page faults": 3281, "cycles": 45495589, "instructions": 79104832, "branch mispredictions": 335574, "branches": 20121415, "ITLB accesses": 26011880, "ITLB misses": 5842, "DTLB misses": 16448, "DTLB accesses": 35000292, "L1I cache accesses": 32193112, "L1I cache misses": 310304, "L1D cache misses": 495806, "L1D cache accesses": 33829187, "LL cache misses": 546628, "LL cache accesses": 570044, "L2D TLB accesses": 196794, "L2D TLB misses": 24071, "L2D cache misses": 316028, "L2D cache accesses": 1836018, "instructions per cycle": 1.7387362981496954, "branch miss rate": 0.016677455338006797, "ITLB miss rate": 0.00022458968748125855, "DTLB miss rate": 0.000469938936509444, "L2D TLB miss rate": 0.1223157210077543, "L1I cache miss rate": 0.009638832058236556, "L1D cache miss rate": 0.014656160669779029, "L2D cache miss rate": 0.1721268527868463, "LL cache miss rate": 0.9589224691427328}
--- a/pytorch/output/altra_10_30_email-Enron_1000.output
+++ b/pytorch/output/altra_10_30_email-Enron_1000.output
@ -5,8 +5,8 @@ srun: # All submission nodes and all other compute nodes have x86_64 architectur
 srun: # CPUs. Programs, environments, or other software that was built on x86_64     #
 srun: # nodes may need to be rebuilt to properly execute on these nodes.             #
 srun: ################################################################################
-srun: job 3394152 queued and waiting for resources
-srun: job 3394152 has been allocated resources
+srun: job 3394986 queued and waiting for resources
+srun: job 3394986 has been allocated resources
 /nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
  ).to_sparse_csr().type(torch.float)
 tensor(crow_indices=tensor([     0,      1,     71,  ..., 367660, 367661,
@ -14,37 +14,38 @@ tensor(crow_indices=tensor([     0,      1,     71,  ..., 367660, 367661,
       col_indices=tensor([    1,     0,     2,  ..., 36690, 36689,  8203]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(36692, 36692),
       nnz=367662, layout=torch.sparse_csr)
-tensor([0.3626, 0.7532, 0.0782,  ..., 0.6679, 0.4308, 0.6586])
+tensor([0.9906, 0.9401, 0.5661,  ..., 0.4491, 0.7550, 0.2452])
+Matrix: email-Enron
 Shape: torch.Size([36692, 36692])
 NNZ: 367662
 Density: 0.0002730901120626302
-Time: 1.3745801448822021 seconds
+Time: 12.80848503112793 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/email-Enron.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/email-Enron.mtx 1000':

-             60.43 msec task-clock:u                     #    0.012 CPUs utilized             
+             48.76 msec task-clock:u                     #    0.003 CPUs utilized             
                 0      context-switches:u               #    0.000 /sec                      
                 0      cpu-migrations:u                 #    0.000 /sec                      
-             3,319      page-faults:u                    #   54.926 K/sec                     
-        66,114,448      cycles:u                         #    1.094 GHz                         (58.10%)
-        90,786,829      instructions:u                   #    1.37  insn per cycle              (92.25%)
+             3,281      page-faults:u                    #   67.289 K/sec                     
+        45,495,589      cycles:u                         #    0.933 GHz                         (57.79%)
+        79,104,832      instructions:u                   #    1.74  insn per cycle              (81.70%)
   <not supported>      branches:u                                                            
-           372,381      branch-misses:u                                                       
-        32,997,410      L1-dcache-loads:u                #  546.070 M/sec                     
-           470,216      L1-dcache-load-misses:u          #    1.43% of all L1-dcache accesses 
+           372,161      branch-misses:u                                                       
+        32,089,348      L1-dcache-loads:u                #  658.113 M/sec                     
+           467,576      L1-dcache-load-misses:u          #    1.46% of all L1-dcache accesses 
   <not supported>      LLC-loads:u                                                           
   <not supported>      LLC-load-misses:u                                                     
-        31,485,339      L1-icache-loads:u                #  521.047 M/sec                     
-           294,395      L1-icache-load-misses:u          #    0.94% of all L1-icache accesses 
-        31,376,646      dTLB-loads:u                     #  519.248 M/sec                       (10.03%)
+        30,688,995      L1-icache-loads:u                #  629.393 M/sec                     
+           289,698      L1-icache-load-misses:u          #    0.94% of all L1-icache accesses 
+        47,006,355      dTLB-loads:u                     #  964.042 M/sec                       (22.12%)
     <not counted>      dTLB-load-misses:u                                                      (0.00%)
     <not counted>      iTLB-loads:u                                                            (0.00%)
     <not counted>      iTLB-load-misses:u                                                      (0.00%)

-       4.904488673 seconds time elapsed
+      16.331438990 seconds time elapsed

-      22.874521000 seconds user
-     139.276239000 seconds sys
+      76.869141000 seconds user
+     999.179638000 seconds sys



@ -55,21 +56,22 @@ tensor(crow_indices=tensor([     0,      1,     71,  ..., 367660, 367661,
       col_indices=tensor([    1,     0,     2,  ..., 36690, 36689,  8203]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(36692, 36692),
       nnz=367662, layout=torch.sparse_csr)
-tensor([0.2040, 0.8252, 0.0215,  ..., 0.2921, 0.9143, 0.8728])
+tensor([0.7565, 0.5273, 0.1038,  ..., 0.9432, 0.1309, 0.5542])
+Matrix: email-Enron
 Shape: torch.Size([36692, 36692])
 NNZ: 367662
 Density: 0.0002730901120626302
-Time: 1.3087654113769531 seconds
+Time: 26.91536283493042 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/email-Enron.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/email-Enron.mtx 1000':

-           341,625      BR_MIS_PRED_RETIRED:u            #      0.0 per branch  branch_misprediction_ratio
-        20,129,354      BR_RETIRED:u                                                          
+           335,574      BR_MIS_PRED_RETIRED:u            #      0.0 per branch  branch_misprediction_ratio
+        20,121,415      BR_RETIRED:u                                                          

-       4.644873434 seconds time elapsed
+      30.559245388 seconds time elapsed

-      22.729927000 seconds user
-     132.278582000 seconds sys
+     126.799314000 seconds user
+    2081.777635000 seconds sys



@ -80,23 +82,24 @@ tensor(crow_indices=tensor([     0,      1,     71,  ..., 367660, 367661,
       col_indices=tensor([    1,     0,     2,  ..., 36690, 36689,  8203]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(36692, 36692),
       nnz=367662, layout=torch.sparse_csr)
-tensor([0.6154, 0.6641, 0.3794,  ..., 0.9736, 0.0619, 0.4790])
+tensor([0.2321, 0.0702, 0.2538,  ..., 0.6254, 0.6308, 0.5317])
+Matrix: email-Enron
 Shape: torch.Size([36692, 36692])
 NNZ: 367662
 Density: 0.0002730901120626302
-Time: 1.2701547145843506 seconds
+Time: 14.841739892959595 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/email-Enron.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/email-Enron.mtx 1000':

-        27,441,303      L1I_TLB:u                        #      0.0 per TLB access  itlb_walk_ratio
-             6,807      ITLB_WALK:u                                                           
-            20,551      DTLB_WALK:u                      #      0.0 per TLB access  dtlb_walk_ratio
-        36,867,114      L1D_TLB:u                                                             
+        26,011,880      L1I_TLB:u                        #      0.0 per TLB access  itlb_walk_ratio
+             5,842      ITLB_WALK:u                                                           
+            16,448      DTLB_WALK:u                      #      0.0 per TLB access  dtlb_walk_ratio
+        35,000,292      L1D_TLB:u                                                             

-       4.861510767 seconds time elapsed
+      18.443612527 seconds time elapsed

-      22.111354000 seconds user
-     132.431608000 seconds sys
+      80.694133000 seconds user
+    1159.740575000 seconds sys



@ -107,23 +110,24 @@ tensor(crow_indices=tensor([     0,      1,     71,  ..., 367660, 367661,
       col_indices=tensor([    1,     0,     2,  ..., 36690, 36689,  8203]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(36692, 36692),
       nnz=367662, layout=torch.sparse_csr)
-tensor([0.4201, 0.4134, 0.8169,  ..., 0.6631, 0.0087, 0.8439])
+tensor([0.7091, 0.9447, 0.0959,  ..., 0.0090, 0.7012, 0.6025])
+Matrix: email-Enron
 Shape: torch.Size([36692, 36692])
 NNZ: 367662
 Density: 0.0002730901120626302
-Time: 1.1176586151123047 seconds
+Time: 10.863199234008789 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/email-Enron.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/email-Enron.mtx 1000':

-        31,744,243      L1I_CACHE:u                      #      0.0 per cache access  l1i_cache_miss_ratio
-           271,027      L1I_CACHE_REFILL:u                                                    
-           464,135      L1D_CACHE_REFILL:u               #      0.0 per cache access  l1d_cache_miss_ratio
-        33,441,141      L1D_CACHE:u                                                           
+        32,193,112      L1I_CACHE:u                      #      0.0 per cache access  l1i_cache_miss_ratio
+           310,304      L1I_CACHE_REFILL:u                                                    
+           495,806      L1D_CACHE_REFILL:u               #      0.0 per cache access  l1d_cache_miss_ratio
+        33,829,187      L1D_CACHE:u                                                           

-       4.693803969 seconds time elapsed
+      14.426841778 seconds time elapsed

-      21.724904000 seconds user
-     119.873018000 seconds sys
+      70.728541000 seconds user
+     853.184507000 seconds sys



@ -134,25 +138,26 @@ tensor(crow_indices=tensor([     0,      1,     71,  ..., 367660, 367661,
       col_indices=tensor([    1,     0,     2,  ..., 36690, 36689,  8203]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(36692, 36692),
       nnz=367662, layout=torch.sparse_csr)
-tensor([0.1285, 0.3989, 0.3903,  ..., 0.7892, 0.2737, 0.2659])
+tensor([0.8267, 0.6185, 0.8015,  ..., 0.8593, 0.4881, 0.8599])
+Matrix: email-Enron
 Shape: torch.Size([36692, 36692])
 NNZ: 367662
 Density: 0.0002730901120626302
-Time: 1.196892261505127 seconds
+Time: 12.076026678085327 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/email-Enron.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/email-Enron.mtx 1000':

-           539,935      LL_CACHE_MISS_RD:u               #      1.0 per cache access  ll_cache_read_miss_ratio
-           552,519      LL_CACHE_RD:u                                                         
-           188,291      L2D_TLB:u                        #      0.1 per TLB access  l2_tlb_miss_ratio
-            24,177      L2D_TLB_REFILL:u                                                      
-           301,281      L2D_CACHE_REFILL:u               #      0.2 per cache access  l2_cache_miss_ratio
-         1,737,575      L2D_CACHE:u                                                           
+           546,628      LL_CACHE_MISS_RD:u               #      1.0 per cache access  ll_cache_read_miss_ratio
+           570,044      LL_CACHE_RD:u                                                         
+           196,794      L2D_TLB:u                        #      0.1 per TLB access  l2_tlb_miss_ratio
+            24,071      L2D_TLB_REFILL:u                                                      
+           316,028      L2D_CACHE_REFILL:u               #      0.2 per cache access  l2_cache_miss_ratio
+         1,836,018      L2D_CACHE:u                                                           

-       4.741030347 seconds time elapsed
+      15.581045199 seconds time elapsed

-      23.793930000 seconds user
-     125.634838000 seconds sys
+      77.345591000 seconds user
+     942.987439000 seconds sys



--- a/pytorch/output/altra_10_30_p2p-Gnutella04_1000.json
+++ b/pytorch/output/altra_10_30_p2p-Gnutella04_1000.json
@ -0,0 +1 @@
+{"architecture": "altra", "iterations": 1000, "baseline_time_s": 10, "baseline_delay_s": 30, "power_before": [16.12, 16.36, 16.8, 16.76, 16.6, 16.48, 16.44, 16.28, 16.28, 16.16], "matrix": "p2p-Gnutella04", "shape": [10879, 10879], "nnz": 39994, "% density": 0.0003379223282393842, "time_s": 1.0642461776733398, "power": [26.6, 27.52, 27.52, 31.16, 28.48], "power_after": [16.28, 16.4, 16.32, 16.12, 16.24, 16.0, 16.0, 16.24, 16.52, 17.04], "task clock (msec)": 50.59, "page faults": 3303, "cycles": 51318459, "instructions": 74705078, "branch mispredictions": 328853, "branches": 19620312, "ITLB accesses": 27939682, "ITLB misses": 5470, "DTLB misses": 17679, "DTLB accesses": 37425602, "L1I cache accesses": 30276633, "L1I cache misses": 291467, "L1D cache misses": 479061, "L1D cache accesses": 31689326, "LL cache misses": 529426, "LL cache accesses": 550033, "L2D TLB accesses": 171913, "L2D TLB misses": 20624, "L2D cache misses": 296662, "L2D cache accesses": 1714211, "instructions per cycle": 1.455715535028049, "branch miss rate": 0.01676084457780284, "ITLB miss rate": 0.0001957788925443031, "DTLB miss rate": 0.00047237717111404113, "L2D TLB miss rate": 0.11996765805959991, "L1I cache miss rate": 0.009626797008769106, "L1D cache miss rate": 0.015117424712661923, "L2D cache miss rate": 0.17306037588138215, "LL cache miss rate": 0.9625349751742168}
--- a/pytorch/output/altra_10_30_p2p-Gnutella04_1000.output
+++ b/pytorch/output/altra_10_30_p2p-Gnutella04_1000.output
@ -5,45 +5,46 @@ srun: # All submission nodes and all other compute nodes have x86_64 architectur
 srun: # CPUs. Programs, environments, or other software that was built on x86_64     #
 srun: # nodes may need to be rebuilt to properly execute on these nodes.             #
 srun: ################################################################################
-srun: job 3394153 queued and waiting for resources
-srun: job 3394153 has been allocated resources
+srun: job 3394992 queued and waiting for resources
+srun: job 3394992 has been allocated resources
 /nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
  ).to_sparse_csr().type(torch.float)
 tensor(crow_indices=tensor([    0,    10,    20,  ..., 39994, 39994, 39994]),
       col_indices=tensor([    1,     2,     3,  ...,  9711, 10875, 10876]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(10879, 10879),
       nnz=39994, layout=torch.sparse_csr)
-tensor([0.6982, 0.7263, 0.0064,  ..., 0.9256, 0.7249, 0.5065])
+tensor([0.1181, 0.8387, 0.0554,  ..., 0.8107, 0.4393, 0.9489])
+Matrix: p2p-Gnutella04
 Shape: torch.Size([10879, 10879])
 NNZ: 39994
 Density: 0.0003379223282393842
-Time: 0.18009519577026367 seconds
+Time: 1.061662197113037 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella04.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella04.mtx 1000':

-             67.56 msec task-clock:u                     #    0.019 CPUs utilized             
+             50.59 msec task-clock:u                     #    0.012 CPUs utilized             
                 0      context-switches:u               #    0.000 /sec                      
                 0      cpu-migrations:u                 #    0.000 /sec                      
-             3,829      page-faults:u                    #   56.674 K/sec                     
-        47,862,000      cycles:u                         #    0.708 GHz                         (59.24%)
-        84,392,375      instructions:u                   #    1.76  insn per cycle              (87.61%)
+             3,303      page-faults:u                    #   65.291 K/sec                     
+        51,318,459      cycles:u                         #    1.014 GHz                         (59.34%)
+        74,705,078      instructions:u                   #    1.46  insn per cycle              (83.02%)
   <not supported>      branches:u                                                            
-           368,432      branch-misses:u                                                       
-        32,507,448      L1-dcache-loads:u                #  481.147 M/sec                     
-           481,389      L1-dcache-load-misses:u          #    1.48% of all L1-dcache accesses 
+           366,825      branch-misses:u                                                       
+        31,809,194      L1-dcache-loads:u                #  628.781 M/sec                     
+           466,198      L1-dcache-load-misses:u          #    1.47% of all L1-dcache accesses 
   <not supported>      LLC-loads:u                                                           
   <not supported>      LLC-load-misses:u                                                     
-        31,030,656      L1-icache-loads:u                #  459.289 M/sec                     
-           308,582      L1-icache-load-misses:u          #    0.99% of all L1-icache accesses 
-        34,988,046      dTLB-loads:u                     #  517.863 M/sec                       (20.00%)
+        30,390,161      L1-icache-loads:u                #  600.731 M/sec                     
+           296,270      L1-icache-load-misses:u          #    0.97% of all L1-icache accesses 
+        61,518,375      dTLB-loads:u                     #    1.216 G/sec                       (17.94%)
     <not counted>      dTLB-load-misses:u                                                      (0.00%)
     <not counted>      iTLB-loads:u                                                            (0.00%)
     <not counted>      iTLB-load-misses:u                                                      (0.00%)

-       3.538329547 seconds time elapsed
+       4.302241563 seconds time elapsed

-      14.667604000 seconds user
-      29.534487000 seconds sys
+      16.122298000 seconds user
+      29.141140000 seconds sys



@ -53,21 +54,22 @@ tensor(crow_indices=tensor([    0,    10,    20,  ..., 39994, 39994, 39994]),
       col_indices=tensor([    1,     2,     3,  ...,  9711, 10875, 10876]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(10879, 10879),
       nnz=39994, layout=torch.sparse_csr)
-tensor([0.4946, 0.3509, 0.5239,  ..., 0.4520, 0.4206, 0.8181])
+tensor([0.7249, 0.8723, 0.3843,  ..., 0.2264, 0.4891, 0.9107])
+Matrix: p2p-Gnutella04
 Shape: torch.Size([10879, 10879])
 NNZ: 39994
 Density: 0.0003379223282393842
-Time: 0.18875432014465332 seconds
+Time: 1.0079431533813477 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella04.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella04.mtx 1000':

-           331,622      BR_MIS_PRED_RETIRED:u            #      0.0 per branch  branch_misprediction_ratio
-        19,800,140      BR_RETIRED:u                                                          
+           328,853      BR_MIS_PRED_RETIRED:u            #      0.0 per branch  branch_misprediction_ratio
+        19,620,312      BR_RETIRED:u                                                          

-       3.556031790 seconds time elapsed
+       4.241400567 seconds time elapsed

-      14.799719000 seconds user
-      27.876987000 seconds sys
+      15.325937000 seconds user
+      28.223386000 seconds sys



@ -77,23 +79,24 @@ tensor(crow_indices=tensor([    0,    10,    20,  ..., 39994, 39994, 39994]),
       col_indices=tensor([    1,     2,     3,  ...,  9711, 10875, 10876]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(10879, 10879),
       nnz=39994, layout=torch.sparse_csr)
-tensor([0.2184, 0.4999, 0.9567,  ..., 0.8794, 0.8213, 0.8713])
+tensor([0.7608, 0.2449, 0.5322,  ..., 0.5547, 0.8659, 0.8437])
+Matrix: p2p-Gnutella04
 Shape: torch.Size([10879, 10879])
 NNZ: 39994
 Density: 0.0003379223282393842
-Time: 0.1066896915435791 seconds
+Time: 1.1017234325408936 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella04.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella04.mtx 1000':

-        25,905,045      L1I_TLB:u                        #      0.0 per TLB access  itlb_walk_ratio
-             6,746      ITLB_WALK:u                                                           
-            17,547      DTLB_WALK:u                      #      0.0 per TLB access  dtlb_walk_ratio
-        35,220,079      L1D_TLB:u                                                             
+        27,939,682      L1I_TLB:u                        #      0.0 per TLB access  itlb_walk_ratio
+             5,470      ITLB_WALK:u                                                           
+            17,679      DTLB_WALK:u                      #      0.0 per TLB access  dtlb_walk_ratio
+        37,425,602      L1D_TLB:u                                                             

-       3.505367779 seconds time elapsed
+       4.296820500 seconds time elapsed

-      14.557493000 seconds user
-      29.642958000 seconds sys
+      15.875162000 seconds user
+      28.803412000 seconds sys



@ -103,23 +106,24 @@ tensor(crow_indices=tensor([    0,    10,    20,  ..., 39994, 39994, 39994]),
       col_indices=tensor([    1,     2,     3,  ...,  9711, 10875, 10876]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(10879, 10879),
       nnz=39994, layout=torch.sparse_csr)
-tensor([0.2180, 0.0881, 0.5532,  ..., 0.4961, 0.0093, 0.4929])
+tensor([0.9980, 0.9991, 0.6749,  ..., 0.4225, 0.7297, 0.3717])
+Matrix: p2p-Gnutella04
 Shape: torch.Size([10879, 10879])
 NNZ: 39994
 Density: 0.0003379223282393842
-Time: 0.12433028221130371 seconds
+Time: 1.0812580585479736 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella04.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella04.mtx 1000':

-        30,359,576      L1I_CACHE:u                      #      0.0 per cache access  l1i_cache_miss_ratio
-           283,204      L1I_CACHE_REFILL:u                                                    
-           465,520      L1D_CACHE_REFILL:u               #      0.0 per cache access  l1d_cache_miss_ratio
-        31,843,274      L1D_CACHE:u                                                           
+        30,276,633      L1I_CACHE:u                      #      0.0 per cache access  l1i_cache_miss_ratio
+           291,467      L1I_CACHE_REFILL:u                                                    
+           479,061      L1D_CACHE_REFILL:u               #      0.0 per cache access  l1d_cache_miss_ratio
+        31,689,326      L1D_CACHE:u                                                           

-       3.565310130 seconds time elapsed
+       4.500137840 seconds time elapsed

-      14.913239000 seconds user
-      28.125605000 seconds sys
+      15.794710000 seconds user
+      27.773851000 seconds sys



@ -129,25 +133,26 @@ tensor(crow_indices=tensor([    0,    10,    20,  ..., 39994, 39994, 39994]),
       col_indices=tensor([    1,     2,     3,  ...,  9711, 10875, 10876]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(10879, 10879),
       nnz=39994, layout=torch.sparse_csr)
-tensor([0.6394, 0.6808, 0.7957,  ..., 0.1529, 0.0561, 0.7834])
+tensor([0.8707, 0.5871, 0.5970,  ..., 0.8826, 0.4673, 0.4994])
+Matrix: p2p-Gnutella04
 Shape: torch.Size([10879, 10879])
 NNZ: 39994
 Density: 0.0003379223282393842
-Time: 0.13401126861572266 seconds
+Time: 0.9900743961334229 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella04.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella04.mtx 1000':

-           560,542      LL_CACHE_MISS_RD:u               #      1.0 per cache access  ll_cache_read_miss_ratio
-           575,610      LL_CACHE_RD:u                                                         
-           173,643      L2D_TLB:u                        #      0.1 per TLB access  l2_tlb_miss_ratio
-            21,499      L2D_TLB_REFILL:u                                                      
-           313,335      L2D_CACHE_REFILL:u               #      0.2 per cache access  l2_cache_miss_ratio
-         1,741,621      L2D_CACHE:u                                                           
+           529,426      LL_CACHE_MISS_RD:u               #      1.0 per cache access  ll_cache_read_miss_ratio
+           550,033      LL_CACHE_RD:u                                                         
+           171,913      L2D_TLB:u                        #      0.1 per TLB access  l2_tlb_miss_ratio
+            20,624      L2D_TLB_REFILL:u                                                      
+           296,662      L2D_CACHE_REFILL:u               #      0.2 per cache access  l2_cache_miss_ratio
+         1,714,211      L2D_CACHE:u                                                           

-       3.503362704 seconds time elapsed
+       4.284402033 seconds time elapsed

-      15.287949000 seconds user
-      28.752303000 seconds sys
+      15.584671000 seconds user
+      27.523772000 seconds sys



--- a/pytorch/output/altra_10_30_p2p-Gnutella24_1000.json
+++ b/pytorch/output/altra_10_30_p2p-Gnutella24_1000.json
@ -0,0 +1 @@
+{"architecture": "altra", "iterations": 1000, "baseline_time_s": 10, "baseline_delay_s": 30, "power_before": [16.12, 16.12, 16.12, 16.36, 16.56, 16.52, 17.04, 16.76, 16.64, 16.92], "matrix": "p2p-Gnutella24", "shape": [26518, 26518], "nnz": 65369, "% density": 9.295875717624285e-05, "time_s": 1.6947758197784424, "power": [25.2, 25.2, 26.6, 26.28, 26.48], "power_after": [16.4, 16.6, 16.6, 16.64, 16.8, 16.48, 16.44, 16.16, 16.12, 16.2], "task clock (msec)": 66.78, "page faults": 3520, "cycles": 28858055, "instructions": 64429843, "branch mispredictions": 331167, "branches": 19518210, "ITLB accesses": 26964483, "ITLB misses": 4666, "DTLB misses": 14001, "DTLB accesses": 36143905, "L1I cache accesses": 31901160, "L1I cache misses": 302516, "L1D cache misses": 475663, "L1D cache accesses": 33507563, "LL cache misses": 558546, "LL cache accesses": 578676, "L2D TLB accesses": 187549, "L2D TLB misses": 22990, "L2D cache misses": 321826, "L2D cache accesses": 1816571, "instructions per cycle": 2.2326467601506756, "branch miss rate": 0.016967078435983628, "ITLB miss rate": 0.00017304244253449992, "DTLB miss rate": 0.00038736821602425086, "L2D TLB miss rate": 0.12258129875392564, "L1I cache miss rate": 0.009482915354802146, "L1D cache miss rate": 0.01419569068630864, "L2D cache miss rate": 0.1771612560147663, "LL cache miss rate": 0.9652136947099932}
--- a/pytorch/output/altra_10_30_p2p-Gnutella24_1000.output
+++ b/pytorch/output/altra_10_30_p2p-Gnutella24_1000.output
@ -0,0 +1,158 @@
+srun: Job time limit was unset; set to partition default of 60 minutes
+srun: ################################################################################
+srun: # Please note that the oasis compute nodes have aarch64 architecture CPUs.     #
+srun: # All submission nodes and all other compute nodes have x86_64 architecture    #
+srun: # CPUs. Programs, environments, or other software that was built on x86_64     #
+srun: # nodes may need to be rebuilt to properly execute on these nodes.             #
+srun: ################################################################################
+srun: job 3394989 queued and waiting for resources
+srun: job 3394989 has been allocated resources
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([    0,     9,     9,  ..., 65369, 65369, 65369]),
+       col_indices=tensor([    1,     2,     3,  ..., 15065,  9401, 26517]),
+       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(26518, 26518),
+       nnz=65369, layout=torch.sparse_csr)
+tensor([0.2470, 0.4231, 0.1036,  ..., 0.7937, 0.3241, 0.7116])
+Matrix: p2p-Gnutella24
+Shape: torch.Size([26518, 26518])
+NNZ: 65369
+Density: 9.295875717624285e-05
+Time: 1.6974337100982666 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella24.mtx 1000':
+
+             66.78 msec task-clock:u                     #    0.013 CPUs utilized             
+                 0      context-switches:u               #    0.000 /sec                      
+                 0      cpu-migrations:u                 #    0.000 /sec                      
+             3,520      page-faults:u                    #   52.713 K/sec                     
+        28,858,055      cycles:u                         #    0.432 GHz                         (26.93%)
+        64,429,843      instructions:u                   #    2.23  insn per cycle              (67.63%)
+   <not supported>      branches:u                                                            
+           296,857      branch-misses:u                                                         (84.08%)
+        33,646,348      L1-dcache-loads:u                #  503.866 M/sec                     
+           493,998      L1-dcache-load-misses:u          #    1.47% of all L1-dcache accesses 
+   <not supported>      LLC-loads:u                                                           
+   <not supported>      LLC-load-misses:u                                                     
+        32,070,415      L1-icache-loads:u                #  480.266 M/sec                     
+           305,993      L1-icache-load-misses:u          #    0.95% of all L1-icache accesses 
+        46,903,081      dTLB-loads:u                     #  702.391 M/sec                       (46.16%)
+           114,272      dTLB-load-misses:u               #    0.24% of all dTLB cache accesses  (32.45%)
+     <not counted>      iTLB-loads:u                                                            (0.00%)
+     <not counted>      iTLB-load-misses:u                                                      (0.00%)
+
+       5.106933083 seconds time elapsed
+
+      16.391614000 seconds user
+      28.913912000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([    0,     9,     9,  ..., 65369, 65369, 65369]),
+       col_indices=tensor([    1,     2,     3,  ..., 15065,  9401, 26517]),
+       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(26518, 26518),
+       nnz=65369, layout=torch.sparse_csr)
+tensor([0.2307, 0.4662, 0.3789,  ..., 0.0144, 0.6300, 0.7829])
+Matrix: p2p-Gnutella24
+Shape: torch.Size([26518, 26518])
+NNZ: 65369
+Density: 9.295875717624285e-05
+Time: 1.6379659175872803 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella24.mtx 1000':
+
+           331,167      BR_MIS_PRED_RETIRED:u            #      0.0 per branch  branch_misprediction_ratio
+        19,518,210      BR_RETIRED:u                                                          
+
+       5.017894585 seconds time elapsed
+
+      16.446505000 seconds user
+      31.004338000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([    0,     9,     9,  ..., 65369, 65369, 65369]),
+       col_indices=tensor([    1,     2,     3,  ..., 15065,  9401, 26517]),
+       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(26518, 26518),
+       nnz=65369, layout=torch.sparse_csr)
+tensor([0.7309, 0.0314, 0.4424,  ..., 0.7434, 0.2124, 0.1432])
+Matrix: p2p-Gnutella24
+Shape: torch.Size([26518, 26518])
+NNZ: 65369
+Density: 9.295875717624285e-05
+Time: 1.7232718467712402 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella24.mtx 1000':
+
+        26,964,483      L1I_TLB:u                        #      0.0 per TLB access  itlb_walk_ratio
+             4,666      ITLB_WALK:u                                                           
+            14,001      DTLB_WALK:u                      #      0.0 per TLB access  dtlb_walk_ratio
+        36,143,905      L1D_TLB:u                                                             
+
+       5.053286721 seconds time elapsed
+
+      16.447780000 seconds user
+      28.580949000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([    0,     9,     9,  ..., 65369, 65369, 65369]),
+       col_indices=tensor([    1,     2,     3,  ..., 15065,  9401, 26517]),
+       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(26518, 26518),
+       nnz=65369, layout=torch.sparse_csr)
+tensor([0.5695, 0.5025, 0.1946,  ..., 0.7428, 0.9634, 0.4327])
+Matrix: p2p-Gnutella24
+Shape: torch.Size([26518, 26518])
+NNZ: 65369
+Density: 9.295875717624285e-05
+Time: 1.644775629043579 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella24.mtx 1000':
+
+        31,901,160      L1I_CACHE:u                      #      0.0 per cache access  l1i_cache_miss_ratio
+           302,516      L1I_CACHE_REFILL:u                                                    
+           475,663      L1D_CACHE_REFILL:u               #      0.0 per cache access  l1d_cache_miss_ratio
+        33,507,563      L1D_CACHE:u                                                           
+
+       4.978338941 seconds time elapsed
+
+      16.455298000 seconds user
+      30.249373000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([    0,     9,     9,  ..., 65369, 65369, 65369]),
+       col_indices=tensor([    1,     2,     3,  ..., 15065,  9401, 26517]),
+       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(26518, 26518),
+       nnz=65369, layout=torch.sparse_csr)
+tensor([0.0969, 0.1950, 0.8456,  ..., 0.3315, 0.1512, 0.3182])
+Matrix: p2p-Gnutella24
+Shape: torch.Size([26518, 26518])
+NNZ: 65369
+Density: 9.295875717624285e-05
+Time: 1.752812385559082 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella24.mtx 1000':
+
+           558,546      LL_CACHE_MISS_RD:u               #      1.0 per cache access  ll_cache_read_miss_ratio
+           578,676      LL_CACHE_RD:u                                                         
+           187,549      L2D_TLB:u                        #      0.1 per TLB access  l2_tlb_miss_ratio
+            22,990      L2D_TLB_REFILL:u                                                      
+           321,826      L2D_CACHE_REFILL:u               #      0.2 per cache access  l2_cache_miss_ratio
+         1,816,571      L2D_CACHE:u                                                           
+
+       4.952297819 seconds time elapsed
+
+      16.648691000 seconds user
+      27.005944000 seconds sys
+
+
+
--- a/pytorch/output/altra_10_30_p2p-Gnutella25_1000.json
+++ b/pytorch/output/altra_10_30_p2p-Gnutella25_1000.json
@ -0,0 +1 @@
+{"architecture": "altra", "iterations": 1000, "baseline_time_s": 10, "baseline_delay_s": 30, "power_before": [16.0, 16.4, 16.4, 16.28, 16.48, 16.6, 16.48, 16.56, 16.88, 16.92], "matrix": "p2p-Gnutella25", "shape": [22687, 22687], "nnz": 54705, "% density": 0.00010628522108964806, "time_s": 1.4688231945037842, "power": [23.04, 29.0, 30.24, 27.96, 28.04], "power_after": [16.52, 16.68, 16.88, 17.12, 17.08, 17.04, 16.84, 16.72, 16.84, 16.84], "task clock (msec)": 48.61, "page faults": 3308, "cycles": 60072179, "instructions": 70991785, "branch mispredictions": 331765, "branches": 19906014, "ITLB accesses": 28194337, "ITLB misses": 5083, "DTLB misses": 17916, "DTLB accesses": 37944713, "L1I cache accesses": 31162212, "L1I cache misses": 270684, "L1D cache misses": 465467, "L1D cache accesses": 32857500, "LL cache misses": 541118, "LL cache accesses": 564199, "L2D TLB accesses": 194022, "L2D TLB misses": 23932, "L2D cache misses": 311476, "L2D cache accesses": 1783574, "instructions per cycle": 1.1817747613250387, "branch miss rate": 0.016666571218125335, "ITLB miss rate": 0.00018028443087702328, "DTLB miss rate": 0.00047216064066685654, "L2D TLB miss rate": 0.12334683695663379, "L1I cache miss rate": 0.008686289663904475, "L1D cache miss rate": 0.014166232975728525, "L2D cache miss rate": 0.17463587157022922, "LL cache miss rate": 0.9590906754531646}
--- a/pytorch/output/altra_10_30_p2p-Gnutella25_1000.output
+++ b/pytorch/output/altra_10_30_p2p-Gnutella25_1000.output
@ -5,45 +5,46 @@ srun: # All submission nodes and all other compute nodes have x86_64 architectur
 srun: # CPUs. Programs, environments, or other software that was built on x86_64     #
 srun: # nodes may need to be rebuilt to properly execute on these nodes.             #
 srun: ################################################################################
-srun: job 3394140 queued and waiting for resources
-srun: job 3394140 has been allocated resources
+srun: job 3394994 queued and waiting for resources
+srun: job 3394994 has been allocated resources
 /nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
  ).to_sparse_csr().type(torch.float)
 tensor(crow_indices=tensor([    0,     9,     9,  ..., 54704, 54704, 54705]),
       col_indices=tensor([    1,     2,     3,  ..., 17949, 22685,   144]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(22687, 22687),
       nnz=54705, layout=torch.sparse_csr)
-tensor([0.8199, 0.9849, 0.4642,  ..., 0.7594, 0.3568, 0.4020])
+tensor([0.1465, 0.4354, 0.7334,  ..., 0.2837, 0.5913, 0.9525])
+Matrix: p2p-Gnutella25
 Shape: torch.Size([22687, 22687])
 NNZ: 54705
 Density: 0.00010628522108964806
-Time: 0.19272208213806152 seconds
+Time: 1.4786670207977295 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella25.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella25.mtx 1000':

-             64.71 msec task-clock:u                     #    0.018 CPUs utilized             
+             48.61 msec task-clock:u                     #    0.010 CPUs utilized             
                 0      context-switches:u               #    0.000 /sec                      
                 0      cpu-migrations:u                 #    0.000 /sec                      
-             3,319      page-faults:u                    #   51.288 K/sec                     
-        57,611,295      cycles:u                         #    0.890 GHz                         (39.00%)
-        83,148,228      instructions:u                   #    1.44  insn per cycle              (82.73%)
+             3,308      page-faults:u                    #   68.054 K/sec                     
+        60,072,179      cycles:u                         #    1.236 GHz                         (53.26%)
+        70,991,785      instructions:u                   #    1.18  insn per cycle              (71.54%)
   <not supported>      branches:u                                                            
-           375,111      branch-misses:u                                                       
-        32,759,228      L1-dcache-loads:u                #  506.221 M/sec                     
-           475,086      L1-dcache-load-misses:u          #    1.45% of all L1-dcache accesses 
+           371,197      branch-misses:u                                                       
+        32,964,378      L1-dcache-loads:u                #  678.165 M/sec                     
+           465,448      L1-dcache-load-misses:u          #    1.41% of all L1-dcache accesses 
   <not supported>      LLC-loads:u                                                           
   <not supported>      LLC-load-misses:u                                                     
-        31,366,158      L1-icache-loads:u                #  484.694 M/sec                     
-           297,293      L1-icache-load-misses:u          #    0.95% of all L1-icache accesses 
-        35,611,781      dTLB-loads:u                     #  550.301 M/sec                       (25.73%)
+        31,435,424      L1-icache-loads:u                #  646.710 M/sec                     
+           293,561      L1-icache-load-misses:u          #    0.93% of all L1-icache accesses 
+        56,761,270      dTLB-loads:u                     #    1.168 G/sec                       (30.54%)
     <not counted>      dTLB-load-misses:u                                                      (0.00%)
     <not counted>      iTLB-loads:u                                                            (0.00%)
     <not counted>      iTLB-load-misses:u                                                      (0.00%)

-       3.578384817 seconds time elapsed
+       4.700046411 seconds time elapsed

-      14.435258000 seconds user
-      27.700836000 seconds sys
+      16.235801000 seconds user
+      28.396327000 seconds sys



@ -53,21 +54,22 @@ tensor(crow_indices=tensor([    0,     9,     9,  ..., 54704, 54704, 54705]),
       col_indices=tensor([    1,     2,     3,  ..., 17949, 22685,   144]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(22687, 22687),
       nnz=54705, layout=torch.sparse_csr)
-tensor([0.0069, 0.9904, 0.5316,  ..., 0.2082, 0.4858, 0.4936])
+tensor([0.7780, 0.3388, 0.1540,  ..., 0.2989, 0.3682, 0.9160])
+Matrix: p2p-Gnutella25
 Shape: torch.Size([22687, 22687])
 NNZ: 54705
 Density: 0.00010628522108964806
-Time: 0.1423017978668213 seconds
+Time: 1.4235138893127441 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella25.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella25.mtx 1000':

-           318,386      BR_MIS_PRED_RETIRED:u            #      0.0 per branch  branch_misprediction_ratio
-        19,233,431      BR_RETIRED:u                                                          
+           331,765      BR_MIS_PRED_RETIRED:u            #      0.0 per branch  branch_misprediction_ratio
+        19,906,014      BR_RETIRED:u                                                          

-       3.555753224 seconds time elapsed
+       4.757340585 seconds time elapsed

-      14.642518000 seconds user
-      30.112207000 seconds sys
+      16.412311000 seconds user
+      29.238029000 seconds sys



@ -77,23 +79,24 @@ tensor(crow_indices=tensor([    0,     9,     9,  ..., 54704, 54704, 54705]),
       col_indices=tensor([    1,     2,     3,  ..., 17949, 22685,   144]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(22687, 22687),
       nnz=54705, layout=torch.sparse_csr)
-tensor([0.2250, 0.5676, 0.3018,  ..., 0.5431, 0.7314, 0.5593])
+tensor([0.4944, 0.8057, 0.8211,  ..., 0.5137, 0.3388, 0.6316])
+Matrix: p2p-Gnutella25
 Shape: torch.Size([22687, 22687])
 NNZ: 54705
 Density: 0.00010628522108964806
-Time: 0.14638042449951172 seconds
+Time: 1.4664146900177002 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella25.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella25.mtx 1000':

-        27,039,805      L1I_TLB:u                        #      0.0 per TLB access  itlb_walk_ratio
-             6,375      ITLB_WALK:u                                                           
-            17,290      DTLB_WALK:u                      #      0.0 per TLB access  dtlb_walk_ratio
-        36,688,544      L1D_TLB:u                                                             
+        28,194,337      L1I_TLB:u                        #      0.0 per TLB access  itlb_walk_ratio
+             5,083      ITLB_WALK:u                                                           
+            17,916      DTLB_WALK:u                      #      0.0 per TLB access  dtlb_walk_ratio
+        37,944,713      L1D_TLB:u                                                             

-       3.566915241 seconds time elapsed
+       4.844329421 seconds time elapsed

-      16.116565000 seconds user
-      28.752519000 seconds sys
+      16.081022000 seconds user
+      28.021902000 seconds sys



@ -103,23 +106,24 @@ tensor(crow_indices=tensor([    0,     9,     9,  ..., 54704, 54704, 54705]),
       col_indices=tensor([    1,     2,     3,  ..., 17949, 22685,   144]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(22687, 22687),
       nnz=54705, layout=torch.sparse_csr)
-tensor([0.0220, 0.7494, 0.7913,  ..., 0.8924, 0.8542, 0.5491])
+tensor([0.0963, 0.5806, 0.0397,  ..., 0.1604, 0.5700, 0.8103])
+Matrix: p2p-Gnutella25
 Shape: torch.Size([22687, 22687])
 NNZ: 54705
 Density: 0.00010628522108964806
-Time: 0.17815685272216797 seconds
+Time: 1.3717434406280518 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella25.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella25.mtx 1000':

-        32,508,072      L1I_CACHE:u                      #      0.0 per cache access  l1i_cache_miss_ratio
-           297,568      L1I_CACHE_REFILL:u                                                    
-           477,654      L1D_CACHE_REFILL:u               #      0.0 per cache access  l1d_cache_miss_ratio
-        34,044,579      L1D_CACHE:u                                                           
+        31,162,212      L1I_CACHE:u                      #      0.0 per cache access  l1i_cache_miss_ratio
+           270,684      L1I_CACHE_REFILL:u                                                    
+           465,467      L1D_CACHE_REFILL:u               #      0.0 per cache access  l1d_cache_miss_ratio
+        32,857,500      L1D_CACHE:u                                                           

-       3.435706033 seconds time elapsed
+       4.598461782 seconds time elapsed

-      14.690285000 seconds user
-      28.763423000 seconds sys
+      15.609727000 seconds user
+      30.606837000 seconds sys



@ -129,25 +133,26 @@ tensor(crow_indices=tensor([    0,     9,     9,  ..., 54704, 54704, 54705]),
       col_indices=tensor([    1,     2,     3,  ..., 17949, 22685,   144]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(22687, 22687),
       nnz=54705, layout=torch.sparse_csr)
-tensor([0.6277, 0.4955, 0.9335,  ..., 0.1476, 0.2079, 0.0931])
+tensor([0.9137, 0.5009, 0.7507,  ..., 0.6623, 0.8760, 0.2991])
+Matrix: p2p-Gnutella25
 Shape: torch.Size([22687, 22687])
 NNZ: 54705
 Density: 0.00010628522108964806
-Time: 0.14432048797607422 seconds
+Time: 1.4291880130767822 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella25.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella25.mtx 1000':

-           549,474      LL_CACHE_MISS_RD:u               #      1.0 per cache access  ll_cache_read_miss_ratio
-           561,939      LL_CACHE_RD:u                                                         
-           185,622      L2D_TLB:u                        #      0.1 per TLB access  l2_tlb_miss_ratio
-            23,295      L2D_TLB_REFILL:u                                                      
-           305,878      L2D_CACHE_REFILL:u               #      0.2 per cache access  l2_cache_miss_ratio
-         1,763,089      L2D_CACHE:u                                                           
+           541,118      LL_CACHE_MISS_RD:u               #      1.0 per cache access  ll_cache_read_miss_ratio
+           564,199      LL_CACHE_RD:u                                                         
+           194,022      L2D_TLB:u                        #      0.1 per TLB access  l2_tlb_miss_ratio
+            23,932      L2D_TLB_REFILL:u                                                      
+           311,476      L2D_CACHE_REFILL:u               #      0.2 per cache access  l2_cache_miss_ratio
+         1,783,574      L2D_CACHE:u                                                           

-       3.538826979 seconds time elapsed
+       4.792239951 seconds time elapsed

-      15.006109000 seconds user
-      29.644298000 seconds sys
+      15.902307000 seconds user
+      28.747620000 seconds sys



--- a/pytorch/output/altra_10_30_p2p-Gnutella30_1000.json
+++ b/pytorch/output/altra_10_30_p2p-Gnutella30_1000.json
@ -0,0 +1 @@
+{"architecture": "altra", "iterations": 1000, "baseline_time_s": 10, "baseline_delay_s": 30, "power_before": [31.96, 22.0, 22.16, 22.16, 21.84, 22.08, 22.4, 22.08, 22.0, 21.48], "matrix": "p2p-Gnutella30", "shape": [36682, 36682], "nnz": 88328, "% density": 6.564359899804003e-05, "time_s": 3.504030466079712, "power": [54.2, 64.16, 67.64, 67.64, 65.92, 58.96, 59.92], "power_after": [20.72, 20.76, 20.76, 20.8, 20.8, 20.88, 20.92, 21.04, 21.04, 21.12], "task clock (msec)": 56.52, "page faults": 3194, "cycles": 58074747, "instructions": 90036443, "branch mispredictions": 327895, "branches": 20553601, "ITLB accesses": 26120611, "ITLB misses": 7531, "DTLB misses": 19097, "DTLB accesses": 35744928, "L1I cache accesses": 31819981, "L1I cache misses": 284493, "L1D cache misses": 486709, "L1D cache accesses": 33545755, "LL cache misses": 544742, "LL cache accesses": 558323, "L2D TLB accesses": 190574, "L2D TLB misses": 23746, "L2D cache misses": 305844, "L2D cache accesses": 1736964, "instructions per cycle": 1.5503544595725918, "branch miss rate": 0.015953165579111903, "ITLB miss rate": 0.00028831637973552763, "DTLB miss rate": 0.0005342576155140109, "L2D TLB miss rate": 0.12460251660772194, "L1I cache miss rate": 0.008940703012990485, "L1D cache miss rate": 0.014508810429218243, "L2D cache miss rate": 0.17607964241055082, "LL cache miss rate": 0.9756753707083534}
--- a/pytorch/output/altra_10_30_p2p-Gnutella30_1000.output
+++ b/pytorch/output/altra_10_30_p2p-Gnutella30_1000.output
@ -0,0 +1,158 @@
+srun: Job time limit was unset; set to partition default of 60 minutes
+srun: ################################################################################
+srun: # Please note that the oasis compute nodes have aarch64 architecture CPUs.     #
+srun: # All submission nodes and all other compute nodes have x86_64 architecture    #
+srun: # CPUs. Programs, environments, or other software that was built on x86_64     #
+srun: # nodes may need to be rebuilt to properly execute on these nodes.             #
+srun: ################################################################################
+srun: job 3394991 queued and waiting for resources
+srun: job 3394991 has been allocated resources
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([    0,    10,    10,  ..., 88328, 88328, 88328]),
+       col_indices=tensor([    1,     2,     3,  ..., 36675, 36676, 36677]),
+       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(36682, 36682),
+       nnz=88328, layout=torch.sparse_csr)
+tensor([0.3046, 0.0725, 0.4580,  ..., 0.0593, 0.5121, 0.2116])
+Matrix: p2p-Gnutella30
+Shape: torch.Size([36682, 36682])
+NNZ: 88328
+Density: 6.564359899804003e-05
+Time: 3.6646029949188232 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella30.mtx 1000':
+
+             56.52 msec task-clock:u                     #    0.008 CPUs utilized             
+                 0      context-switches:u               #    0.000 /sec                      
+                 0      cpu-migrations:u                 #    0.000 /sec                      
+             3,194      page-faults:u                    #   56.515 K/sec                     
+        58,074,747      cycles:u                         #    1.028 GHz                         (51.20%)
+        90,036,443      instructions:u                   #    1.55  insn per cycle              (89.06%)
+   <not supported>      branches:u                                                            
+           363,262      branch-misses:u                                                       
+        33,111,438      L1-dcache-loads:u                #  585.875 M/sec                     
+           454,665      L1-dcache-load-misses:u          #    1.37% of all L1-dcache accesses 
+   <not supported>      LLC-loads:u                                                           
+   <not supported>      LLC-load-misses:u                                                     
+        31,646,314      L1-icache-loads:u                #  559.951 M/sec                     
+           281,443      L1-icache-load-misses:u          #    0.89% of all L1-icache accesses 
+        43,495,524      dTLB-loads:u                     #  769.611 M/sec                       (11.87%)
+     <not counted>      dTLB-load-misses:u                                                      (0.00%)
+     <not counted>      iTLB-loads:u                                                            (0.00%)
+     <not counted>      iTLB-load-misses:u                                                      (0.00%)
+
+       7.033463989 seconds time elapsed
+
+      34.670765000 seconds user
+     307.031553000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([    0,    10,    10,  ..., 88328, 88328, 88328]),
+       col_indices=tensor([    1,     2,     3,  ..., 36675, 36676, 36677]),
+       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(36682, 36682),
+       nnz=88328, layout=torch.sparse_csr)
+tensor([0.9700, 0.1728, 0.2199,  ..., 0.6107, 0.3357, 0.2661])
+Matrix: p2p-Gnutella30
+Shape: torch.Size([36682, 36682])
+NNZ: 88328
+Density: 6.564359899804003e-05
+Time: 2.3380045890808105 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella30.mtx 1000':
+
+           327,895      BR_MIS_PRED_RETIRED:u            #      0.0 per branch  branch_misprediction_ratio
+        20,553,601      BR_RETIRED:u                                                          
+
+       5.895917276 seconds time elapsed
+
+      31.121063000 seconds user
+     208.127447000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([    0,    10,    10,  ..., 88328, 88328, 88328]),
+       col_indices=tensor([    1,     2,     3,  ..., 36675, 36676, 36677]),
+       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(36682, 36682),
+       nnz=88328, layout=torch.sparse_csr)
+tensor([0.9533, 0.7568, 0.8141,  ..., 0.8395, 0.5617, 0.7830])
+Matrix: p2p-Gnutella30
+Shape: torch.Size([36682, 36682])
+NNZ: 88328
+Density: 6.564359899804003e-05
+Time: 4.476518869400024 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella30.mtx 1000':
+
+        26,120,611      L1I_TLB:u                        #      0.0 per TLB access  itlb_walk_ratio
+             7,531      ITLB_WALK:u                                                           
+            19,097      DTLB_WALK:u                      #      0.0 per TLB access  dtlb_walk_ratio
+        35,744,928      L1D_TLB:u                                                             
+
+       8.109622410 seconds time elapsed
+
+      38.467161000 seconds user
+     370.437915000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([    0,    10,    10,  ..., 88328, 88328, 88328]),
+       col_indices=tensor([    1,     2,     3,  ..., 36675, 36676, 36677]),
+       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(36682, 36682),
+       nnz=88328, layout=torch.sparse_csr)
+tensor([0.6886, 0.7814, 0.9957,  ..., 0.8460, 0.1015, 0.8097])
+Matrix: p2p-Gnutella30
+Shape: torch.Size([36682, 36682])
+NNZ: 88328
+Density: 6.564359899804003e-05
+Time: 2.856834888458252 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella30.mtx 1000':
+
+        31,819,981      L1I_CACHE:u                      #      0.0 per cache access  l1i_cache_miss_ratio
+           284,493      L1I_CACHE_REFILL:u                                                    
+           486,709      L1D_CACHE_REFILL:u               #      0.0 per cache access  l1d_cache_miss_ratio
+        33,545,755      L1D_CACHE:u                                                           
+
+       6.374371632 seconds time elapsed
+
+      30.817943000 seconds user
+     247.363843000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([    0,    10,    10,  ..., 88328, 88328, 88328]),
+       col_indices=tensor([    1,     2,     3,  ..., 36675, 36676, 36677]),
+       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(36682, 36682),
+       nnz=88328, layout=torch.sparse_csr)
+tensor([0.8464, 0.0437, 0.1230,  ..., 0.6221, 0.9268, 0.5436])
+Matrix: p2p-Gnutella30
+Shape: torch.Size([36682, 36682])
+NNZ: 88328
+Density: 6.564359899804003e-05
+Time: 4.838747978210449 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella30.mtx 1000':
+
+           544,742      LL_CACHE_MISS_RD:u               #      1.0 per cache access  ll_cache_read_miss_ratio
+           558,323      LL_CACHE_RD:u                                                         
+           190,574      L2D_TLB:u                        #      0.1 per TLB access  l2_tlb_miss_ratio
+            23,746      L2D_TLB_REFILL:u                                                      
+           305,844      L2D_CACHE_REFILL:u               #      0.2 per cache access  l2_cache_miss_ratio
+         1,736,964      L2D_CACHE:u                                                           
+
+       8.386896120 seconds time elapsed
+
+      39.861141000 seconds user
+     395.959334000 seconds sys
+
+
+
--- a/pytorch/output/altra_10_30_ri2010_1000.json
+++ b/pytorch/output/altra_10_30_ri2010_1000.json
@ -0,0 +1 @@
+{"architecture": "altra", "iterations": 1000, "baseline_time_s": 10, "baseline_delay_s": 30, "power_before": [31.2, 31.56, 31.56, 30.84, 24.52, 23.2, 21.32, 20.76, 20.84, 20.84], "matrix": "ri2010", "shape": [25181, 25181], "nnz": 125750, "% density": 0.00019831796057928155, "time_s": 3.077709913253784, "power": [27.76, 28.28, 28.44, 28.28, 25.16, 30.44, 30.6], "power_after": [21.08, 20.88, 20.68, 20.68, 20.6, 20.56, 20.68, 20.8, 20.96, 21.24], "task clock (msec)": 64.49, "page faults": 3473, "cycles": 42783607, "instructions": 84598454, "branch mispredictions": 331326, "branches": 20438455, "ITLB accesses": 26869742, "ITLB misses": 6302, "DTLB misses": 14926, "DTLB accesses": 36876841, "L1I cache accesses": 31664385, "L1I cache misses": 301678, "L1D cache misses": 493536, "L1D cache accesses": 33219437, "LL cache misses": 552180, "LL cache accesses": 564990, "L2D TLB accesses": 167824, "L2D TLB misses": 19594, "L2D cache misses": 304114, "L2D cache accesses": 1716370, "instructions per cycle": 1.977356747877756, "branch miss rate": 0.01621091222404042, "ITLB miss rate": 0.00023453890997539165, "DTLB miss rate": 0.00040475267390718204, "L2D TLB miss rate": 0.11675326532557918, "L1I cache miss rate": 0.009527360155581737, "L1D cache miss rate": 0.014856844202386693, "L2D cache miss rate": 0.17718440662561102, "LL cache miss rate": 0.9773270323368555}
--- a/pytorch/output/altra_10_30_ri2010_1000.output
+++ b/pytorch/output/altra_10_30_ri2010_1000.output
@ -0,0 +1,163 @@
+srun: Job time limit was unset; set to partition default of 60 minutes
+srun: ################################################################################
+srun: # Please note that the oasis compute nodes have aarch64 architecture CPUs.     #
+srun: # All submission nodes and all other compute nodes have x86_64 architecture    #
+srun: # CPUs. Programs, environments, or other software that was built on x86_64     #
+srun: # nodes may need to be rebuilt to properly execute on these nodes.             #
+srun: ################################################################################
+srun: job 3394984 queued and waiting for resources
+srun: job 3394984 has been allocated resources
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([     0,      3,      8,  ..., 125742, 125747,
+                            125750]),
+       col_indices=tensor([   25,    56,   662,  ..., 21738, 22279, 23882]),
+       values=tensor([17171., 37318.,  5284.,  ..., 25993., 24918.,   803.]),
+       size=(25181, 25181), nnz=125750, layout=torch.sparse_csr)
+tensor([0.5906, 0.9651, 0.2033,  ..., 0.2175, 0.4484, 0.0412])
+Matrix: ri2010
+Shape: torch.Size([25181, 25181])
+NNZ: 125750
+Density: 0.00019831796057928155
+Time: 3.107008934020996 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/ri2010.mtx 1000':
+
+             64.49 msec task-clock:u                     #    0.010 CPUs utilized             
+                 0      context-switches:u               #    0.000 /sec                      
+                 0      cpu-migrations:u                 #    0.000 /sec                      
+             3,473      page-faults:u                    #   53.852 K/sec                     
+        42,783,607      cycles:u                         #    0.663 GHz                         (37.27%)
+        84,598,454      instructions:u                   #    1.98  insn per cycle              (73.53%)
+   <not supported>      branches:u                                                            
+           353,558      branch-misses:u                                                         (89.57%)
+        33,192,964      L1-dcache-loads:u                #  514.689 M/sec                     
+           466,217      L1-dcache-load-misses:u          #    1.40% of all L1-dcache accesses 
+   <not supported>      LLC-loads:u                                                           
+   <not supported>      LLC-load-misses:u                                                     
+        31,727,502      L1-icache-loads:u                #  491.965 M/sec                     
+           292,570      L1-icache-load-misses:u          #    0.92% of all L1-icache accesses 
+        38,623,737      dTLB-loads:u                     #  598.898 M/sec                       (34.88%)
+           124,174      dTLB-load-misses:u               #    0.32% of all dTLB cache accesses  (14.74%)
+     <not counted>      iTLB-loads:u                                                            (0.00%)
+     <not counted>      iTLB-load-misses:u                                                      (0.00%)
+
+       6.612563197 seconds time elapsed
+
+      18.114584000 seconds user
+      29.808542000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([     0,      3,      8,  ..., 125742, 125747,
+                            125750]),
+       col_indices=tensor([   25,    56,   662,  ..., 21738, 22279, 23882]),
+       values=tensor([17171., 37318.,  5284.,  ..., 25993., 24918.,   803.]),
+       size=(25181, 25181), nnz=125750, layout=torch.sparse_csr)
+tensor([0.6092, 0.5511, 0.6052,  ..., 0.8002, 0.0295, 0.2972])
+Matrix: ri2010
+Shape: torch.Size([25181, 25181])
+NNZ: 125750
+Density: 0.00019831796057928155
+Time: 2.9385879039764404 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/ri2010.mtx 1000':
+
+           331,326      BR_MIS_PRED_RETIRED:u            #      0.0 per branch  branch_misprediction_ratio
+        20,438,455      BR_RETIRED:u                                                          
+
+       6.446731410 seconds time elapsed
+
+      17.939571000 seconds user
+      33.272929000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([     0,      3,      8,  ..., 125742, 125747,
+                            125750]),
+       col_indices=tensor([   25,    56,   662,  ..., 21738, 22279, 23882]),
+       values=tensor([17171., 37318.,  5284.,  ..., 25993., 24918.,   803.]),
+       size=(25181, 25181), nnz=125750, layout=torch.sparse_csr)
+tensor([0.3348, 0.2974, 0.2569,  ..., 0.2397, 0.1965, 0.5651])
+Matrix: ri2010
+Shape: torch.Size([25181, 25181])
+NNZ: 125750
+Density: 0.00019831796057928155
+Time: 2.972891330718994 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/ri2010.mtx 1000':
+
+        26,869,742      L1I_TLB:u                        #      0.0 per TLB access  itlb_walk_ratio
+             6,302      ITLB_WALK:u                                                           
+            14,926      DTLB_WALK:u                      #      0.0 per TLB access  dtlb_walk_ratio
+        36,876,841      L1D_TLB:u                                                             
+
+       6.376775396 seconds time elapsed
+
+      17.836418000 seconds user
+      29.830135000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([     0,      3,      8,  ..., 125742, 125747,
+                            125750]),
+       col_indices=tensor([   25,    56,   662,  ..., 21738, 22279, 23882]),
+       values=tensor([17171., 37318.,  5284.,  ..., 25993., 24918.,   803.]),
+       size=(25181, 25181), nnz=125750, layout=torch.sparse_csr)
+tensor([0.7889, 0.7395, 0.6553,  ..., 0.3938, 0.2478, 0.7923])
+Matrix: ri2010
+Shape: torch.Size([25181, 25181])
+NNZ: 125750
+Density: 0.00019831796057928155
+Time: 2.9658284187316895 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/ri2010.mtx 1000':
+
+        31,664,385      L1I_CACHE:u                      #      0.0 per cache access  l1i_cache_miss_ratio
+           301,678      L1I_CACHE_REFILL:u                                                    
+           493,536      L1D_CACHE_REFILL:u               #      0.0 per cache access  l1d_cache_miss_ratio
+        33,219,437      L1D_CACHE:u                                                           
+
+       6.559158078 seconds time elapsed
+
+      19.008146000 seconds user
+      38.233666000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([     0,      3,      8,  ..., 125742, 125747,
+                            125750]),
+       col_indices=tensor([   25,    56,   662,  ..., 21738, 22279, 23882]),
+       values=tensor([17171., 37318.,  5284.,  ..., 25993., 24918.,   803.]),
+       size=(25181, 25181), nnz=125750, layout=torch.sparse_csr)
+tensor([0.1256, 0.1417, 0.9800,  ..., 0.2509, 0.8121, 0.6210])
+Matrix: ri2010
+Shape: torch.Size([25181, 25181])
+NNZ: 125750
+Density: 0.00019831796057928155
+Time: 2.9228267669677734 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/ri2010.mtx 1000':
+
+           552,180      LL_CACHE_MISS_RD:u               #      1.0 per cache access  ll_cache_read_miss_ratio
+           564,990      LL_CACHE_RD:u                                                         
+           167,824      L2D_TLB:u                        #      0.1 per TLB access  l2_tlb_miss_ratio
+            19,594      L2D_TLB_REFILL:u                                                      
+           304,114      L2D_CACHE_REFILL:u               #      0.2 per cache access  l2_cache_miss_ratio
+         1,716,370      L2D_CACHE:u                                                           
+
+       6.135787277 seconds time elapsed
+
+      18.029630000 seconds user
+      28.723217000 seconds sys
+
+
+
--- a/pytorch/output/altra_10_30_soc-sign-Slashdot090216_1000.json
+++ b/pytorch/output/altra_10_30_soc-sign-Slashdot090216_1000.json
@ -0,0 +1 @@
+{"architecture": "altra", "iterations": 1000, "baseline_time_s": 10, "baseline_delay_s": 30, "power_before": [29.88, 23.64, 23.08, 21.84, 21.4, 21.2, 21.0, 21.0, 21.16, 21.0], "matrix": "soc-sign-Slashdot090216", "shape": [81871, 81871], "nnz": 545671, "% density": 8.140867447881048e-05, "time_s": 19.113287687301636, "power": [81.08, 81.56, 71.96, 60.52, 47.16, 48.4, 53.84, 53.84, 67.4, 82.64, 90.8, 89.16, 87.96, 85.76, 84.64, 84.04, 83.64, 84.68, 84.88, 84.88, 84.64, 84.04, 83.6], "power_after": [20.72, 20.6, 20.68, 20.88, 21.2, 21.28, 21.28, 21.48, 21.56, 21.36], "task clock (msec)": 67.66, "page faults": 3317, "cycles": 41915850, "instructions": 84471787, "branch mispredictions": 344452, "branches": 20610765, "ITLB accesses": 27276117, "ITLB misses": 6358, "DTLB misses": 17361, "DTLB accesses": 36565837, "L1I cache accesses": 32022662, "L1I cache misses": 293044, "L1D cache misses": 458939, "L1D cache accesses": 33505164, "LL cache misses": 553814, "LL cache accesses": 567372, "L2D TLB accesses": 199301, "L2D TLB misses": 25193, "L2D cache misses": 313278, "L2D cache accesses": 1796299, "instructions per cycle": 2.015270762730566, "branch miss rate": 0.016712237512775483, "ITLB miss rate": 0.00023309769495416082, "DTLB miss rate": 0.0004747874361524939, "L2D TLB miss rate": 0.12640679173712124, "L1I cache miss rate": 0.009151144274014446, "L1D cache miss rate": 0.01369756017311242, "L2D cache miss rate": 0.17440192306514674, "LL cache miss rate": 0.97610386131145}
--- a/pytorch/output/altra_10_30_soc-sign-Slashdot090216_1000.output
+++ b/pytorch/output/altra_10_30_soc-sign-Slashdot090216_1000.output
@ -5,8 +5,8 @@ srun: # All submission nodes and all other compute nodes have x86_64 architectur
 srun: # CPUs. Programs, environments, or other software that was built on x86_64     #
 srun: # nodes may need to be rebuilt to properly execute on these nodes.             #
 srun: ################################################################################
-srun: job 3394151 queued and waiting for resources
-srun: job 3394151 has been allocated resources
+srun: job 3394981 queued and waiting for resources
+srun: job 3394981 has been allocated resources
 /nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
  ).to_sparse_csr().type(torch.float)
 tensor(crow_indices=tensor([     0,     29,    124,  ..., 545669, 545669,
@ -14,37 +14,38 @@ tensor(crow_indices=tensor([     0,     29,    124,  ..., 545669, 545669,
       col_indices=tensor([    1,     2,     3,  ..., 81869, 81699, 81863]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(81871, 81871),
       nnz=545671, layout=torch.sparse_csr)
-tensor([0.3831, 0.6714, 0.8380,  ..., 0.7892, 0.5274, 0.9035])
+tensor([0.6780, 0.5234, 0.1205,  ..., 0.2995, 0.6275, 0.1399])
+Matrix: soc-sign-Slashdot090216
 Shape: torch.Size([81871, 81871])
 NNZ: 545671
 Density: 8.140867447881048e-05
-Time: 2.044952392578125 seconds
+Time: 30.653191089630127 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/soc-sign-Slashdot090216.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/soc-sign-Slashdot090216.mtx 1000':

-             59.01 msec task-clock:u                     #    0.010 CPUs utilized             
+             67.66 msec task-clock:u                     #    0.002 CPUs utilized             
                 0      context-switches:u               #    0.000 /sec                      
                 0      cpu-migrations:u                 #    0.000 /sec                      
-             3,448      page-faults:u                    #   58.432 K/sec                     
-        73,062,796      cycles:u                         #    1.238 GHz                         (59.95%)
-        88,329,175      instructions:u                   #    1.21  insn per cycle              (93.89%)
+             3,317      page-faults:u                    #   49.022 K/sec                     
+        41,915,850      cycles:u                         #    0.619 GHz                         (57.88%)
+        84,471,787      instructions:u                   #    2.02  insn per cycle              (88.19%)
   <not supported>      branches:u                                                            
-           365,177      branch-misses:u                                                       
-        31,850,867      L1-dcache-loads:u                #  539.766 M/sec                     
-           473,835      L1-dcache-load-misses:u          #    1.49% of all L1-dcache accesses 
+           375,016      branch-misses:u                                                       
+        32,438,527      L1-dcache-loads:u                #  479.407 M/sec                     
+           499,618      L1-dcache-load-misses:u          #    1.54% of all L1-dcache accesses 
   <not supported>      LLC-loads:u                                                           
   <not supported>      LLC-load-misses:u                                                     
-        30,385,913      L1-icache-loads:u                #  514.940 M/sec                     
-           299,969      L1-icache-load-misses:u          #    0.99% of all L1-icache accesses 
-        24,365,554      dTLB-loads:u                     #  412.915 M/sec                       (8.42%)
+        30,998,693      L1-icache-loads:u                #  458.127 M/sec                     
+           306,445      L1-icache-load-misses:u          #    0.99% of all L1-icache accesses 
+        34,294,934      dTLB-loads:u                     #  506.842 M/sec                       (18.86%)
     <not counted>      dTLB-load-misses:u                                                      (0.00%)
     <not counted>      iTLB-loads:u                                                            (0.00%)
     <not counted>      iTLB-load-misses:u                                                      (0.00%)

-       5.680365622 seconds time elapsed
+      34.340632995 seconds time elapsed

-      27.656957000 seconds user
-     194.823873000 seconds sys
+     149.743244000 seconds user
+    2355.852109000 seconds sys



@ -55,21 +56,22 @@ tensor(crow_indices=tensor([     0,     29,    124,  ..., 545669, 545669,
       col_indices=tensor([    1,     2,     3,  ..., 81869, 81699, 81863]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(81871, 81871),
       nnz=545671, layout=torch.sparse_csr)
-tensor([0.6906, 0.4067, 0.7042,  ..., 0.8333, 0.7120, 0.3519])
+tensor([0.9875, 0.2031, 0.7260,  ..., 0.5908, 0.1575, 0.7971])
+Matrix: soc-sign-Slashdot090216
 Shape: torch.Size([81871, 81871])
 NNZ: 545671
 Density: 8.140867447881048e-05
-Time: 1.3788115978240967 seconds
+Time: 13.671181440353394 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/soc-sign-Slashdot090216.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/soc-sign-Slashdot090216.mtx 1000':

-           331,091      BR_MIS_PRED_RETIRED:u            #      0.0 per branch  branch_misprediction_ratio
-        20,013,316      BR_RETIRED:u                                                          
+           344,452      BR_MIS_PRED_RETIRED:u            #      0.0 per branch  branch_misprediction_ratio
+        20,610,765      BR_RETIRED:u                                                          

-       4.886021169 seconds time elapsed
+      17.331425967 seconds time elapsed

-      23.105025000 seconds user
-     141.491451000 seconds sys
+      83.136180000 seconds user
+    1069.027469000 seconds sys



@ -80,23 +82,24 @@ tensor(crow_indices=tensor([     0,     29,    124,  ..., 545669, 545669,
       col_indices=tensor([    1,     2,     3,  ..., 81869, 81699, 81863]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(81871, 81871),
       nnz=545671, layout=torch.sparse_csr)
-tensor([0.8755, 0.6165, 0.4104,  ..., 0.6974, 0.9453, 0.9872])
+tensor([0.2046, 0.3645, 0.7960,  ..., 0.6490, 0.4098, 0.5342])
+Matrix: soc-sign-Slashdot090216
 Shape: torch.Size([81871, 81871])
 NNZ: 545671
 Density: 8.140867447881048e-05
-Time: 2.8570749759674072 seconds
+Time: 19.569235801696777 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/soc-sign-Slashdot090216.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/soc-sign-Slashdot090216.mtx 1000':

-        26,330,936      L1I_TLB:u                        #      0.0 per TLB access  itlb_walk_ratio
-             5,193      ITLB_WALK:u                                                           
-            16,837      DTLB_WALK:u                      #      0.0 per TLB access  dtlb_walk_ratio
-        35,930,477      L1D_TLB:u                                                             
+        27,276,117      L1I_TLB:u                        #      0.0 per TLB access  itlb_walk_ratio
+             6,358      ITLB_WALK:u                                                           
+            17,361      DTLB_WALK:u                      #      0.0 per TLB access  dtlb_walk_ratio
+        36,565,837      L1D_TLB:u                                                             

-       6.371573603 seconds time elapsed
+      23.323243037 seconds time elapsed

-      30.986329000 seconds user
-     254.347216000 seconds sys
+     108.830923000 seconds user
+    1521.834565000 seconds sys



@ -107,23 +110,24 @@ tensor(crow_indices=tensor([     0,     29,    124,  ..., 545669, 545669,
       col_indices=tensor([    1,     2,     3,  ..., 81869, 81699, 81863]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(81871, 81871),
       nnz=545671, layout=torch.sparse_csr)
-tensor([0.3573, 0.9331, 0.0611,  ..., 0.9133, 0.6057, 0.2374])
+tensor([0.4164, 0.2188, 0.5460,  ..., 0.1057, 0.5277, 0.0624])
+Matrix: soc-sign-Slashdot090216
 Shape: torch.Size([81871, 81871])
 NNZ: 545671
 Density: 8.140867447881048e-05
-Time: 2.311248540878296 seconds
+Time: 26.337355375289917 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/soc-sign-Slashdot090216.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/soc-sign-Slashdot090216.mtx 1000':

-        31,853,890      L1I_CACHE:u                      #      0.0 per cache access  l1i_cache_miss_ratio
-           306,147      L1I_CACHE_REFILL:u                                                    
-           479,933      L1D_CACHE_REFILL:u               #      0.0 per cache access  l1d_cache_miss_ratio
-        33,426,019      L1D_CACHE:u                                                           
+        32,022,662      L1I_CACHE:u                      #      0.0 per cache access  l1i_cache_miss_ratio
+           293,044      L1I_CACHE_REFILL:u                                                    
+           458,939      L1D_CACHE_REFILL:u               #      0.0 per cache access  l1d_cache_miss_ratio
+        33,505,164      L1D_CACHE:u                                                           

-       5.718741260 seconds time elapsed
+      30.017812847 seconds time elapsed

-      28.451593000 seconds user
-     214.350594000 seconds sys
+     131.976276000 seconds user
+    2029.636174000 seconds sys



@ -134,25 +138,26 @@ tensor(crow_indices=tensor([     0,     29,    124,  ..., 545669, 545669,
       col_indices=tensor([    1,     2,     3,  ..., 81869, 81699, 81863]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(81871, 81871),
       nnz=545671, layout=torch.sparse_csr)
-tensor([0.6021, 0.5679, 0.4538,  ..., 0.9086, 0.9552, 0.5329])
+tensor([0.7679, 0.9196, 0.3474,  ..., 0.5624, 0.0163, 0.8596])
+Matrix: soc-sign-Slashdot090216
 Shape: torch.Size([81871, 81871])
 NNZ: 545671
 Density: 8.140867447881048e-05
-Time: 1.8193013668060303 seconds
+Time: 29.926054000854492 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/soc-sign-Slashdot090216.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/soc-sign-Slashdot090216.mtx 1000':

-           540,302      LL_CACHE_MISS_RD:u               #      1.0 per cache access  ll_cache_read_miss_ratio
-           553,181      LL_CACHE_RD:u                                                         
-           173,206      L2D_TLB:u                        #      0.1 per TLB access  l2_tlb_miss_ratio
-            21,390      L2D_TLB_REFILL:u                                                      
-           300,032      L2D_CACHE_REFILL:u               #      0.2 per cache access  l2_cache_miss_ratio
-         1,739,931      L2D_CACHE:u                                                           
+           553,814      LL_CACHE_MISS_RD:u               #      1.0 per cache access  ll_cache_read_miss_ratio
+           567,372      LL_CACHE_RD:u                                                         
+           199,301      L2D_TLB:u                        #      0.1 per TLB access  l2_tlb_miss_ratio
+            25,193      L2D_TLB_REFILL:u                                                      
+           313,278      L2D_CACHE_REFILL:u               #      0.2 per cache access  l2_cache_miss_ratio
+         1,796,299      L2D_CACHE:u                                                           

-       5.546861941 seconds time elapsed
+      33.553779692 seconds time elapsed

-      28.194596000 seconds user
-     181.004698000 seconds sys
+     154.498461000 seconds user
+    2293.574463000 seconds sys



--- a/pytorch/output/altra_10_30_soc-sign-Slashdot090221_1000.json
+++ b/pytorch/output/altra_10_30_soc-sign-Slashdot090221_1000.json
@ -0,0 +1 @@
+{"architecture": "altra", "iterations": 1000, "baseline_time_s": 10, "baseline_delay_s": 30, "power_before": [21.92, 21.84, 20.96, 20.24, 20.28, 20.16, 19.96, 19.72, 19.88, 19.76], "matrix": "soc-sign-Slashdot090221", "shape": [82144, 82144], "nnz": 549202, "% density": 8.13917555860553e-05, "time_s": 18.79910135269165, "power": [80.48, 80.08, 69.04, 69.04, 55.0, 46.8, 49.16, 56.2, 70.84, 82.84, 86.52, 84.28, 82.56, 81.2, 80.28, 80.28, 80.04, 80.16, 80.8, 81.0, 81.92, 83.04, 82.88], "power_after": [21.0, 20.96, 21.12, 20.76, 20.72, 20.56, 20.52, 20.64, 20.88, 21.04], "task clock (msec)": 58.57, "page faults": 3259, "cycles": 74509373, "instructions": 88672751, "branch mispredictions": 342121, "branches": 20436338, "ITLB accesses": 27189335, "ITLB misses": 6437, "DTLB misses": 18156, "DTLB accesses": 36676625, "L1I cache accesses": 30721032, "L1I cache misses": 302777, "L1D cache misses": 469833, "L1D cache accesses": 32109077, "LL cache misses": 551850, "LL cache accesses": 565355, "L2D TLB accesses": 200417, "L2D TLB misses": 25536, "L2D cache misses": 304133, "L2D cache accesses": 1801849, "instructions per cycle": 1.190088540941017, "branch miss rate": 0.016740817263836603, "ITLB miss rate": 0.0002367472393127673, "DTLB miss rate": 0.0004950291909356436, "L2D TLB miss rate": 0.12741434109880898, "L1I cache miss rate": 0.009855691045795596, "L1D cache miss rate": 0.014632404413244267, "L2D cache miss rate": 0.16878939356183564, "LL cache miss rate": 0.9761123541845389}
--- a/pytorch/output/altra_10_30_soc-sign-Slashdot090221_1000.output
+++ b/pytorch/output/altra_10_30_soc-sign-Slashdot090221_1000.output
@ -5,8 +5,8 @@ srun: # All submission nodes and all other compute nodes have x86_64 architectur
 srun: # CPUs. Programs, environments, or other software that was built on x86_64     #
 srun: # nodes may need to be rebuilt to properly execute on these nodes.             #
 srun: ################################################################################
-srun: job 3394147 queued and waiting for resources
-srun: job 3394147 has been allocated resources
+srun: job 3394979 queued and waiting for resources
+srun: job 3394979 has been allocated resources
 /nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
  ).to_sparse_csr().type(torch.float)
 tensor(crow_indices=tensor([     0,     29,    124,  ..., 549200, 549200,
@ -14,37 +14,38 @@ tensor(crow_indices=tensor([     0,     29,    124,  ..., 549200, 549200,
       col_indices=tensor([    1,     2,     3,  ..., 82142, 81974, 82136]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(82144, 82144),
       nnz=549202, layout=torch.sparse_csr)
-tensor([0.2696, 0.6106, 0.1626,  ..., 0.2215, 0.5107, 0.8609])
+tensor([0.4201, 0.7748, 0.6565,  ..., 0.0517, 0.6958, 0.5341])
+Matrix: soc-sign-Slashdot090221
 Shape: torch.Size([82144, 82144])
 NNZ: 549202
 Density: 8.13917555860553e-05
-Time: 1.4500706195831299 seconds
+Time: 27.35153603553772 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/soc-sign-Slashdot090221.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/soc-sign-Slashdot090221.mtx 1000':

-             61.26 msec task-clock:u                     #    0.012 CPUs utilized             
+             58.57 msec task-clock:u                     #    0.002 CPUs utilized             
                 0      context-switches:u               #    0.000 /sec                      
                 0      cpu-migrations:u                 #    0.000 /sec                      
-             3,303      page-faults:u                    #   53.917 K/sec                     
-        44,515,786      cycles:u                         #    0.727 GHz                         (40.46%)
-        81,513,738      instructions:u                   #    1.83  insn per cycle              (73.51%)
+             3,259      page-faults:u                    #   55.640 K/sec                     
+        74,509,373      cycles:u                         #    1.272 GHz                         (58.00%)
+        88,672,751      instructions:u                   #    1.19  insn per cycle              (90.97%)
   <not supported>      branches:u                                                            
-           344,479      branch-misses:u                                                         (89.42%)
-        34,411,073      L1-dcache-loads:u                #  561.710 M/sec                     
-           484,811      L1-dcache-load-misses:u          #    1.41% of all L1-dcache accesses 
+           361,568      branch-misses:u                                                       
+        31,594,797      L1-dcache-loads:u                #  539.410 M/sec                     
+           460,467      L1-dcache-load-misses:u          #    1.46% of all L1-dcache accesses 
   <not supported>      LLC-loads:u                                                           
   <not supported>      LLC-load-misses:u                                                     
-        32,789,672      L1-icache-loads:u                #  535.243 M/sec                     
-           293,487      L1-icache-load-misses:u          #    0.90% of all L1-icache accesses 
-        47,065,740      dTLB-loads:u                     #  768.279 M/sec                       (32.81%)
-           146,215      dTLB-load-misses:u               #    0.31% of all dTLB cache accesses  (13.39%)
+        30,148,838      L1-icache-loads:u                #  514.724 M/sec                     
+           282,768      L1-icache-load-misses:u          #    0.94% of all L1-icache accesses 
+        19,757,856      dTLB-loads:u                     #  337.321 M/sec                       (11.69%)
+     <not counted>      dTLB-load-misses:u                                                      (0.00%)
     <not counted>      iTLB-loads:u                                                            (0.00%)
     <not counted>      iTLB-load-misses:u                                                      (0.00%)

-       4.966101053 seconds time elapsed
+      31.087250856 seconds time elapsed

-      23.375418000 seconds user
-     148.052989000 seconds sys
+     142.716222000 seconds user
+    2102.420776000 seconds sys



@ -55,21 +56,22 @@ tensor(crow_indices=tensor([     0,     29,    124,  ..., 549200, 549200,
       col_indices=tensor([    1,     2,     3,  ..., 82142, 81974, 82136]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(82144, 82144),
       nnz=549202, layout=torch.sparse_csr)
-tensor([0.1999, 0.3932, 0.8035,  ..., 0.5079, 0.5903, 0.7606])
+tensor([0.7637, 0.5328, 0.8286,  ..., 0.7084, 0.8903, 0.1707])
+Matrix: soc-sign-Slashdot090221
 Shape: torch.Size([82144, 82144])
 NNZ: 549202
 Density: 8.13917555860553e-05
-Time: 1.9677543640136719 seconds
+Time: 17.188836336135864 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/soc-sign-Slashdot090221.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/soc-sign-Slashdot090221.mtx 1000':

-           328,019      BR_MIS_PRED_RETIRED:u            #      0.0 per branch  branch_misprediction_ratio
-        19,893,662      BR_RETIRED:u                                                          
+           342,121      BR_MIS_PRED_RETIRED:u            #      0.0 per branch  branch_misprediction_ratio
+        20,436,338      BR_RETIRED:u                                                          

-       5.529871590 seconds time elapsed
+      20.753346873 seconds time elapsed

-      26.844356000 seconds user
-     190.429440000 seconds sys
+      98.605331000 seconds user
+    1332.291974000 seconds sys



@ -80,23 +82,24 @@ tensor(crow_indices=tensor([     0,     29,    124,  ..., 549200, 549200,
       col_indices=tensor([    1,     2,     3,  ..., 82142, 81974, 82136]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(82144, 82144),
       nnz=549202, layout=torch.sparse_csr)
-tensor([0.2933, 0.6999, 0.0078,  ..., 0.6213, 0.9377, 0.6359])
+tensor([0.9017, 0.8505, 0.0023,  ..., 0.4182, 0.6895, 0.5023])
+Matrix: soc-sign-Slashdot090221
 Shape: torch.Size([82144, 82144])
 NNZ: 549202
 Density: 8.13917555860553e-05
-Time: 1.4976201057434082 seconds
+Time: 16.22375249862671 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/soc-sign-Slashdot090221.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/soc-sign-Slashdot090221.mtx 1000':

-        27,248,112      L1I_TLB:u                        #      0.0 per TLB access  itlb_walk_ratio
-             5,792      ITLB_WALK:u                                                           
-            16,632      DTLB_WALK:u                      #      0.0 per TLB access  dtlb_walk_ratio
-        36,929,042      L1D_TLB:u                                                             
+        27,189,335      L1I_TLB:u                        #      0.0 per TLB access  itlb_walk_ratio
+             6,437      ITLB_WALK:u                                                           
+            18,156      DTLB_WALK:u                      #      0.0 per TLB access  dtlb_walk_ratio
+        36,676,625      L1D_TLB:u                                                             

-       4.971341163 seconds time elapsed
+      19.748749363 seconds time elapsed

-      24.247480000 seconds user
-     151.276717000 seconds sys
+     103.049578000 seconds user
+    1249.814927000 seconds sys



@ -107,23 +110,24 @@ tensor(crow_indices=tensor([     0,     29,    124,  ..., 549200, 549200,
       col_indices=tensor([    1,     2,     3,  ..., 82142, 81974, 82136]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(82144, 82144),
       nnz=549202, layout=torch.sparse_csr)
-tensor([0.1310, 0.6695, 0.9479,  ..., 0.3141, 0.9327, 0.2117])
+tensor([0.4805, 0.2325, 0.2103,  ..., 0.1710, 0.7638, 0.9368])
+Matrix: soc-sign-Slashdot090221
 Shape: torch.Size([82144, 82144])
 NNZ: 549202
 Density: 8.13917555860553e-05
-Time: 1.0877256393432617 seconds
+Time: 15.453373908996582 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/soc-sign-Slashdot090221.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/soc-sign-Slashdot090221.mtx 1000':

-        31,702,830      L1I_CACHE:u                      #      0.0 per cache access  l1i_cache_miss_ratio
-           295,778      L1I_CACHE_REFILL:u                                                    
-           470,423      L1D_CACHE_REFILL:u               #      0.0 per cache access  l1d_cache_miss_ratio
-        33,155,119      L1D_CACHE:u                                                           
+        30,721,032      L1I_CACHE:u                      #      0.0 per cache access  l1i_cache_miss_ratio
+           302,777      L1I_CACHE_REFILL:u                                                    
+           469,833      L1D_CACHE_REFILL:u               #      0.0 per cache access  l1d_cache_miss_ratio
+        32,109,077      L1D_CACHE:u                                                           

-       4.675682406 seconds time elapsed
+      19.090250444 seconds time elapsed

-      23.098007000 seconds user
-     119.827712000 seconds sys
+      94.904880000 seconds user
+    1195.102767000 seconds sys



@ -134,25 +138,26 @@ tensor(crow_indices=tensor([     0,     29,    124,  ..., 549200, 549200,
       col_indices=tensor([    1,     2,     3,  ..., 82142, 81974, 82136]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(82144, 82144),
       nnz=549202, layout=torch.sparse_csr)
-tensor([0.0860, 0.5402, 0.6738,  ..., 0.3856, 0.5968, 0.4203])
+tensor([0.8430, 0.9439, 0.4260,  ..., 0.8172, 0.4243, 0.3834])
+Matrix: soc-sign-Slashdot090221
 Shape: torch.Size([82144, 82144])
 NNZ: 549202
 Density: 8.13917555860553e-05
-Time: 1.2302696704864502 seconds
+Time: 29.316507816314697 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/soc-sign-Slashdot090221.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/soc-sign-Slashdot090221.mtx 1000':

-           545,220      LL_CACHE_MISS_RD:u               #      1.0 per cache access  ll_cache_read_miss_ratio
-           562,139      LL_CACHE_RD:u                                                         
-           192,206      L2D_TLB:u                        #      0.1 per TLB access  l2_tlb_miss_ratio
-            24,891      L2D_TLB_REFILL:u                                                      
-           307,033      L2D_CACHE_REFILL:u               #      0.2 per cache access  l2_cache_miss_ratio
-         1,782,260      L2D_CACHE:u                                                           
+           551,850      LL_CACHE_MISS_RD:u               #      1.0 per cache access  ll_cache_read_miss_ratio
+           565,355      LL_CACHE_RD:u                                                         
+           200,417      L2D_TLB:u                        #      0.1 per TLB access  l2_tlb_miss_ratio
+            25,536      L2D_TLB_REFILL:u                                                      
+           304,133      L2D_CACHE_REFILL:u               #      0.2 per cache access  l2_cache_miss_ratio
+         1,801,849      L2D_CACHE:u                                                           

-       4.781838296 seconds time elapsed
+      32.859276963 seconds time elapsed

-      23.716896000 seconds user
-     130.971947000 seconds sys
+     148.969816000 seconds user
+    2252.321936000 seconds sys



--- a/pytorch/output/altra_10_30_soc-sign-epinions_1000.json
+++ b/pytorch/output/altra_10_30_soc-sign-epinions_1000.json
@ -0,0 +1 @@
+{"architecture": "altra", "iterations": 1000, "baseline_time_s": 10, "baseline_delay_s": 30, "power_before": [20.32, 20.52, 20.52, 20.56, 20.6, 20.4, 20.76, 20.6, 20.36, 20.4], "matrix": "soc-sign-epinions", "shape": [131828, 131828], "nnz": 841372, "% density": 4.841419648464106e-05, "time_s": 22.52380871772766, "power": [81.24, 81.16, 74.84, 62.04, 51.6, 50.56, 52.4, 52.4, 68.24, 80.56, 91.44, 91.36, 90.28, 88.32, 86.4, 85.16, 83.64, 82.36, 82.96, 82.84, 82.84, 82.56, 82.44, 82.08, 83.64, 84.4], "power_after": [20.8, 20.88, 20.8, 20.92, 20.88, 20.88, 20.8, 20.84, 20.84, 20.6], "task clock (msec)": 63.9, "page faults": 3446, "cycles": 55931043, "instructions": 77907356, "branch mispredictions": 332778, "branches": 20000746, "ITLB accesses": 27000304, "ITLB misses": 6713, "DTLB misses": 18689, "DTLB accesses": 36395663, "L1I cache accesses": 32396405, "L1I cache misses": 292629, "L1D cache misses": 473799, "L1D cache accesses": 34061981, "LL cache misses": 542765, "LL cache accesses": 557193, "L2D TLB accesses": 203626, "L2D TLB misses": 24363, "L2D cache misses": 303397, "L2D cache accesses": 1772084, "instructions per cycle": 1.3929179901043505, "branch miss rate": 0.01663827939217867, "ITLB miss rate": 0.00024862683027568875, "DTLB miss rate": 0.0005134952480464499, "L2D TLB miss rate": 0.11964582126054629, "L1I cache miss rate": 0.009032761505481858, "L1D cache miss rate": 0.01390990735389113, "L2D cache miss rate": 0.171209152613533, "LL cache miss rate": 0.9741059202107708}
--- a/pytorch/output/altra_10_30_soc-sign-epinions_1000.output
+++ b/pytorch/output/altra_10_30_soc-sign-epinions_1000.output
@ -5,8 +5,8 @@ srun: # All submission nodes and all other compute nodes have x86_64 architectur
 srun: # CPUs. Programs, environments, or other software that was built on x86_64     #
 srun: # nodes may need to be rebuilt to properly execute on these nodes.             #
 srun: ################################################################################
-srun: job 3394154 queued and waiting for resources
-srun: job 3394154 has been allocated resources
+srun: job 3394990 queued and waiting for resources
+srun: job 3394990 has been allocated resources
 /nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
  ).to_sparse_csr().type(torch.float)
 tensor(crow_indices=tensor([     0,      1,      2,  ..., 841371, 841371,
@ -15,37 +15,38 @@ tensor(crow_indices=tensor([     0,      1,      2,  ..., 841371, 841371,
                             7714]),
       values=tensor([-1., -1.,  1.,  ...,  1.,  1.,  1.]),
       size=(131828, 131828), nnz=841372, layout=torch.sparse_csr)
-tensor([0.5842, 0.3042, 0.7358,  ..., 0.7882, 0.7596, 0.5895])
+tensor([0.3914, 0.2076, 0.6733,  ..., 0.4758, 0.6360, 0.6316])
+Matrix: soc-sign-epinions
 Shape: torch.Size([131828, 131828])
 NNZ: 841372
 Density: 4.841419648464106e-05
-Time: 2.4407293796539307 seconds
+Time: 20.04187798500061 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/soc-sign-epinions.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/soc-sign-epinions.mtx 1000':

-             49.87 msec task-clock:u                     #    0.008 CPUs utilized             
+             63.90 msec task-clock:u                     #    0.003 CPUs utilized             
                 0      context-switches:u               #    0.000 /sec                      
                 0      cpu-migrations:u                 #    0.000 /sec                      
-             3,300      page-faults:u                    #   66.174 K/sec                     
-        51,935,476      cycles:u                         #    1.041 GHz                         (65.00%)
-        83,731,856      instructions:u                   #    1.61  insn per cycle              (84.25%)
+             3,446      page-faults:u                    #   53.927 K/sec                     
+        55,931,043      cycles:u                         #    0.875 GHz                         (85.43%)
+        77,907,356      instructions:u                   #    1.39  insn per cycle            
   <not supported>      branches:u                                                            
-           375,900      branch-misses:u                                                       
-        34,169,837      L1-dcache-loads:u                #  685.197 M/sec                     
-           474,410      L1-dcache-load-misses:u          #    1.39% of all L1-dcache accesses 
+           357,739      branch-misses:u                                                       
+        33,000,188      L1-dcache-loads:u                #  516.421 M/sec                     
+           466,824      L1-dcache-load-misses:u          #    1.41% of all L1-dcache accesses 
   <not supported>      LLC-loads:u                                                           
   <not supported>      LLC-load-misses:u                                                     
-        32,443,215      L1-icache-loads:u                #  650.574 M/sec                     
-           294,146      L1-icache-load-misses:u          #    0.91% of all L1-icache accesses 
-        63,709,518      dTLB-loads:u                     #    1.278 G/sec                       (16.44%)
-     <not counted>      dTLB-load-misses:u                                                      (0.00%)
+        31,503,048      L1-icache-loads:u                #  492.992 M/sec                     
+           301,112      L1-icache-load-misses:u          #    0.96% of all L1-icache accesses 
+        34,740,872      dTLB-loads:u                     #  543.661 M/sec                       (18.37%)
+            32,355      dTLB-load-misses:u               #    0.09% of all dTLB cache accesses  (12.00%)
     <not counted>      iTLB-loads:u                                                            (0.00%)
     <not counted>      iTLB-load-misses:u                                                      (0.00%)

-       6.058862056 seconds time elapsed
+      23.478083368 seconds time elapsed

-      29.101578000 seconds user
-     224.790489000 seconds sys
+     119.232326000 seconds user
+    1541.081607000 seconds sys



@ -57,21 +58,22 @@ tensor(crow_indices=tensor([     0,      1,      2,  ..., 841371, 841371,
                             7714]),
       values=tensor([-1., -1.,  1.,  ...,  1.,  1.,  1.]),
       size=(131828, 131828), nnz=841372, layout=torch.sparse_csr)
-tensor([0.9696, 0.8139, 0.4858,  ..., 0.2374, 0.1716, 0.9756])
+tensor([0.3970, 0.5643, 0.0036,  ..., 0.0338, 0.0807, 0.3885])
+Matrix: soc-sign-epinions
 Shape: torch.Size([131828, 131828])
 NNZ: 841372
 Density: 4.841419648464106e-05
-Time: 2.0945546627044678 seconds
+Time: 16.115705490112305 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/soc-sign-epinions.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/soc-sign-epinions.mtx 1000':

-           326,464      BR_MIS_PRED_RETIRED:u            #      0.0 per branch  branch_misprediction_ratio
-        20,341,367      BR_RETIRED:u                                                          
+           332,778      BR_MIS_PRED_RETIRED:u            #      0.0 per branch  branch_misprediction_ratio
+        20,000,746      BR_RETIRED:u                                                          

-       5.525378890 seconds time elapsed
+      19.765627973 seconds time elapsed

-      28.841740000 seconds user
-     199.678982000 seconds sys
+     103.591961000 seconds user
+    1250.845091000 seconds sys



@ -83,23 +85,24 @@ tensor(crow_indices=tensor([     0,      1,      2,  ..., 841371, 841371,
                             7714]),
       values=tensor([-1., -1.,  1.,  ...,  1.,  1.,  1.]),
       size=(131828, 131828), nnz=841372, layout=torch.sparse_csr)
-tensor([0.3478, 0.0057, 0.8574,  ..., 0.6409, 0.1876, 0.8429])
+tensor([0.0049, 0.4550, 0.3166,  ..., 0.3734, 0.8337, 0.5156])
+Matrix: soc-sign-epinions
 Shape: torch.Size([131828, 131828])
 NNZ: 841372
 Density: 4.841419648464106e-05
-Time: 2.8504912853240967 seconds
+Time: 18.55180263519287 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/soc-sign-epinions.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/soc-sign-epinions.mtx 1000':

-        27,590,154      L1I_TLB:u                        #      0.0 per TLB access  itlb_walk_ratio
-             6,210      ITLB_WALK:u                                                           
-            17,536      DTLB_WALK:u                      #      0.0 per TLB access  dtlb_walk_ratio
-        36,763,243      L1D_TLB:u                                                             
+        27,000,304      L1I_TLB:u                        #      0.0 per TLB access  itlb_walk_ratio
+             6,713      ITLB_WALK:u                                                           
+            18,689      DTLB_WALK:u                      #      0.0 per TLB access  dtlb_walk_ratio
+        36,395,663      L1D_TLB:u                                                             

-       6.425887143 seconds time elapsed
+      22.333459337 seconds time elapsed

-      33.069094000 seconds user
-     256.667850000 seconds sys
+     109.075160000 seconds user
+    1441.055730000 seconds sys



@ -111,23 +114,24 @@ tensor(crow_indices=tensor([     0,      1,      2,  ..., 841371, 841371,
                             7714]),
       values=tensor([-1., -1.,  1.,  ...,  1.,  1.,  1.]),
       size=(131828, 131828), nnz=841372, layout=torch.sparse_csr)
-tensor([0.5381, 0.6651, 0.4689,  ..., 0.7251, 0.3759, 0.8516])
+tensor([0.0560, 0.8530, 0.8946,  ..., 0.4591, 0.5391, 0.2898])
+Matrix: soc-sign-epinions
 Shape: torch.Size([131828, 131828])
 NNZ: 841372
 Density: 4.841419648464106e-05
-Time: 1.6941111087799072 seconds
+Time: 25.587534427642822 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/soc-sign-epinions.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/soc-sign-epinions.mtx 1000':

-        31,663,300      L1I_CACHE:u                      #      0.0 per cache access  l1i_cache_miss_ratio
-           289,727      L1I_CACHE_REFILL:u                                                    
-           462,864      L1D_CACHE_REFILL:u               #      0.0 per cache access  l1d_cache_miss_ratio
-        33,262,254      L1D_CACHE:u                                                           
+        32,396,405      L1I_CACHE:u                      #      0.0 per cache access  l1i_cache_miss_ratio
+           292,629      L1I_CACHE_REFILL:u                                                    
+           473,799      L1D_CACHE_REFILL:u               #      0.0 per cache access  l1d_cache_miss_ratio
+        34,061,981      L1D_CACHE:u                                                           

-       5.304170809 seconds time elapsed
+      29.367381835 seconds time elapsed

-      25.992245000 seconds user
-     173.752913000 seconds sys
+     142.233743000 seconds user
+    1962.747683000 seconds sys



@ -139,25 +143,26 @@ tensor(crow_indices=tensor([     0,      1,      2,  ..., 841371, 841371,
                             7714]),
       values=tensor([-1., -1.,  1.,  ...,  1.,  1.,  1.]),
       size=(131828, 131828), nnz=841372, layout=torch.sparse_csr)
-tensor([0.4145, 0.8515, 0.7222,  ..., 0.1386, 0.6641, 0.6662])
+tensor([0.7002, 0.7829, 0.1511,  ..., 0.3651, 0.2391, 0.7788])
+Matrix: soc-sign-epinions
 Shape: torch.Size([131828, 131828])
 NNZ: 841372
 Density: 4.841419648464106e-05
-Time: 3.0850296020507812 seconds
+Time: 23.656178951263428 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/soc-sign-epinions.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/soc-sign-epinions.mtx 1000':

-           530,272      LL_CACHE_MISS_RD:u               #      1.0 per cache access  ll_cache_read_miss_ratio
-           551,373      LL_CACHE_RD:u                                                         
-           196,152      L2D_TLB:u                        #      0.1 per TLB access  l2_tlb_miss_ratio
-            23,542      L2D_TLB_REFILL:u                                                      
-           301,998      L2D_CACHE_REFILL:u               #      0.2 per cache access  l2_cache_miss_ratio
-         1,732,662      L2D_CACHE:u                                                           
+           542,765      LL_CACHE_MISS_RD:u               #      1.0 per cache access  ll_cache_read_miss_ratio
+           557,193      LL_CACHE_RD:u                                                         
+           203,626      L2D_TLB:u                        #      0.1 per TLB access  l2_tlb_miss_ratio
+            24,363      L2D_TLB_REFILL:u                                                      
+           303,397      L2D_CACHE_REFILL:u               #      0.2 per cache access  l2_cache_miss_ratio
+         1,772,084      L2D_CACHE:u                                                           

-       6.733517838 seconds time elapsed
+      27.453055481 seconds time elapsed

-      34.030476000 seconds user
-     271.397968000 seconds sys
+     128.709934000 seconds user
+    1831.887905000 seconds sys



--- a/pytorch/output/altra_10_30_sx-mathoverflow_1000.json
+++ b/pytorch/output/altra_10_30_sx-mathoverflow_1000.json
@ -0,0 +1 @@
+{"architecture": "altra", "iterations": 1000, "baseline_time_s": 10, "baseline_delay_s": 30, "power_before": [16.28, 16.44, 16.68, 16.68, 16.84, 17.04, 16.84, 16.84, 16.72, 16.72], "matrix": "sx-mathoverflow", "shape": [24818, 24818], "nnz": 239978, "% density": 0.00038961697406616504, "time_s": 5.405760288238525, "power": [25.64, 20.44, 21.24, 22.16, 22.28, 27.04, 26.92, 26.28, 25.32], "power_after": [16.32, 16.44, 16.4, 16.4, 16.6, 16.48, 16.56, 16.6, 16.32, 16.44], "task clock (msec)": 50.36, "page faults": 3296, "cycles": 56049457, "instructions": 72333565, "branch mispredictions": 325529, "branches": 19463406, "ITLB accesses": 27374917, "ITLB misses": 5203, "DTLB misses": 16771, "DTLB accesses": 36373182, "L1I cache accesses": 31839975, "L1I cache misses": 274158, "L1D cache misses": 471992, "L1D cache accesses": 33638817, "LL cache misses": 538067, "LL cache accesses": 557981, "L2D TLB accesses": 170169, "L2D TLB misses": 21987, "L2D cache misses": 301746, "L2D cache accesses": 1735872, "instructions per cycle": 1.2905310572411077, "branch miss rate": 0.016725181604905125, "ITLB miss rate": 0.00019006450320927, "DTLB miss rate": 0.00046108146381034247, "L2D TLB miss rate": 0.12920684731061474, "L1I cache miss rate": 0.00861049671050307, "L1D cache miss rate": 0.014031171191305569, "L2D cache miss rate": 0.1738296372082734, "LL cache miss rate": 0.9643106127269566}
--- a/pytorch/output/altra_10_30_sx-mathoverflow_1000.output
+++ b/pytorch/output/altra_10_30_sx-mathoverflow_1000.output
@ -5,8 +5,8 @@ srun: # All submission nodes and all other compute nodes have x86_64 architectur
 srun: # CPUs. Programs, environments, or other software that was built on x86_64     #
 srun: # nodes may need to be rebuilt to properly execute on these nodes.             #
 srun: ################################################################################
-srun: job 3394144 queued and waiting for resources
-srun: job 3394144 has been allocated resources
+srun: job 3394987 queued and waiting for resources
+srun: job 3394987 has been allocated resources
 /nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
  ).to_sparse_csr().type(torch.float)
 tensor(crow_indices=tensor([     0,    317,    416,  ..., 239976, 239977,
@ -14,37 +14,38 @@ tensor(crow_indices=tensor([     0,    317,    416,  ..., 239976, 239977,
       col_indices=tensor([    0,     1,     2,  ...,  1483,  2179, 24817]),
       values=tensor([151.,  17.,   6.,  ...,   1.,   1.,   1.]),
       size=(24818, 24818), nnz=239978, layout=torch.sparse_csr)
-tensor([0.7658, 0.2874, 0.7506,  ..., 0.3335, 0.5056, 0.9767])
+tensor([0.8864, 0.5637, 0.9805,  ..., 0.0234, 0.9487, 0.4860])
+Matrix: sx-mathoverflow
 Shape: torch.Size([24818, 24818])
 NNZ: 239978
 Density: 0.00038961697406616504
-Time: 0.5561239719390869 seconds
+Time: 5.484489917755127 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/sx-mathoverflow.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/sx-mathoverflow.mtx 1000':

-             62.49 msec task-clock:u                     #    0.015 CPUs utilized             
+             50.36 msec task-clock:u                     #    0.006 CPUs utilized             
                 0      context-switches:u               #    0.000 /sec                      
                 0      cpu-migrations:u                 #    0.000 /sec                      
-             3,312      page-faults:u                    #   53.003 K/sec                     
-        76,783,170      cycles:u                         #    1.229 GHz                         (62.65%)
-        77,095,702      instructions:u                   #    1.00  insn per cycle              (80.20%)
+             3,296      page-faults:u                    #   65.452 K/sec                     
+        56,049,457      cycles:u                         #    1.113 GHz                         (49.66%)
+        72,333,565      instructions:u                   #    1.29  insn per cycle              (66.35%)
   <not supported>      branches:u                                                            
-           370,891      branch-misses:u                                                         (94.99%)
-        32,730,448      L1-dcache-loads:u                #  523.800 M/sec                     
-           467,718      L1-dcache-load-misses:u          #    1.43% of all L1-dcache accesses 
+           369,218      branch-misses:u                                                         (86.12%)
+        33,730,437      L1-dcache-loads:u                #  669.814 M/sec                       (93.88%)
+           459,922      L1-dcache-load-misses:u          #    1.36% of all L1-dcache accesses 
   <not supported>      LLC-loads:u                                                           
   <not supported>      LLC-load-misses:u                                                     
-        31,548,469      L1-icache-loads:u                #  504.885 M/sec                     
-           298,966      L1-icache-load-misses:u          #    0.95% of all L1-icache accesses 
-        61,098,419      dTLB-loads:u                     #  977.786 M/sec                       (20.67%)
-            64,747      dTLB-load-misses:u               #    0.11% of all dTLB cache accesses  (10.91%)
-     <not counted>      iTLB-loads:u                                                            (0.00%)
+        31,827,672      L1-icache-loads:u                #  632.030 M/sec                     
+           295,060      L1-icache-load-misses:u          #    0.93% of all L1-icache accesses 
+        54,366,618      dTLB-loads:u                     #    1.080 G/sec                       (35.64%)
+            84,768      dTLB-load-misses:u               #    0.16% of all dTLB cache accesses  (25.48%)
+        12,107,953      iTLB-loads:u                     #  240.438 M/sec                       (10.11%)
     <not counted>      iTLB-load-misses:u                                                      (0.00%)

-       4.062782709 seconds time elapsed
+       8.968532171 seconds time elapsed

-      16.106338000 seconds user
-      32.399716000 seconds sys
+      20.749643000 seconds user
+      28.745486000 seconds sys



@ -55,21 +56,22 @@ tensor(crow_indices=tensor([     0,    317,    416,  ..., 239976, 239977,
       col_indices=tensor([    0,     1,     2,  ...,  1483,  2179, 24817]),
       values=tensor([151.,  17.,   6.,  ...,   1.,   1.,   1.]),
       size=(24818, 24818), nnz=239978, layout=torch.sparse_csr)
-tensor([0.7531, 0.4727, 0.4126,  ..., 0.1574, 0.5247, 0.8875])
+tensor([0.5549, 0.0336, 0.9472,  ..., 0.2657, 0.3394, 0.6185])
+Matrix: sx-mathoverflow
 Shape: torch.Size([24818, 24818])
 NNZ: 239978
 Density: 0.00038961697406616504
-Time: 0.6003477573394775 seconds
+Time: 5.532417297363281 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/sx-mathoverflow.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/sx-mathoverflow.mtx 1000':

-           323,514      BR_MIS_PRED_RETIRED:u            #      0.0 per branch  branch_misprediction_ratio
-        19,769,937      BR_RETIRED:u                                                          
+           325,529      BR_MIS_PRED_RETIRED:u            #      0.0 per branch  branch_misprediction_ratio
+        19,463,406      BR_RETIRED:u                                                          

-       4.061021393 seconds time elapsed
+       8.912497962 seconds time elapsed

-      16.155442000 seconds user
-      31.047278000 seconds sys
+      20.214519000 seconds user
+      31.566513000 seconds sys



@ -80,23 +82,24 @@ tensor(crow_indices=tensor([     0,    317,    416,  ..., 239976, 239977,
       col_indices=tensor([    0,     1,     2,  ...,  1483,  2179, 24817]),
       values=tensor([151.,  17.,   6.,  ...,   1.,   1.,   1.]),
       size=(24818, 24818), nnz=239978, layout=torch.sparse_csr)
-tensor([0.3067, 0.4335, 0.8814,  ..., 0.2370, 0.1210, 0.7695])
+tensor([0.3330, 0.8843, 0.5150,  ..., 0.7292, 0.0873, 0.4184])
+Matrix: sx-mathoverflow
 Shape: torch.Size([24818, 24818])
 NNZ: 239978
 Density: 0.00038961697406616504
-Time: 0.5404119491577148 seconds
+Time: 5.457342863082886 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/sx-mathoverflow.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/sx-mathoverflow.mtx 1000':

-        26,809,325      L1I_TLB:u                        #      0.0 per TLB access  itlb_walk_ratio
-             6,925      ITLB_WALK:u                                                           
-            19,003      DTLB_WALK:u                      #      0.0 per TLB access  dtlb_walk_ratio
-        36,516,965      L1D_TLB:u                                                             
+        27,374,917      L1I_TLB:u                        #      0.0 per TLB access  itlb_walk_ratio
+             5,203      ITLB_WALK:u                                                           
+            16,771      DTLB_WALK:u                      #      0.0 per TLB access  dtlb_walk_ratio
+        36,373,182      L1D_TLB:u                                                             

-       4.031175418 seconds time elapsed
+       8.730534933 seconds time elapsed

-      15.607232000 seconds user
-      30.562258000 seconds sys
+      20.156482000 seconds user
+      31.426118000 seconds sys



@ -107,23 +110,24 @@ tensor(crow_indices=tensor([     0,    317,    416,  ..., 239976, 239977,
       col_indices=tensor([    0,     1,     2,  ...,  1483,  2179, 24817]),
       values=tensor([151.,  17.,   6.,  ...,   1.,   1.,   1.]),
       size=(24818, 24818), nnz=239978, layout=torch.sparse_csr)
-tensor([0.5013, 0.5961, 0.5565,  ..., 0.3779, 0.1835, 0.6722])
+tensor([0.5864, 0.4449, 0.4042,  ..., 0.1651, 0.7793, 0.8302])
+Matrix: sx-mathoverflow
 Shape: torch.Size([24818, 24818])
 NNZ: 239978
 Density: 0.00038961697406616504
-Time: 0.6185996532440186 seconds
+Time: 5.449937582015991 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/sx-mathoverflow.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/sx-mathoverflow.mtx 1000':

-        31,104,231      L1I_CACHE:u                      #      0.0 per cache access  l1i_cache_miss_ratio
-           285,499      L1I_CACHE_REFILL:u                                                    
-           468,498      L1D_CACHE_REFILL:u               #      0.0 per cache access  l1d_cache_miss_ratio
-        32,677,465      L1D_CACHE:u                                                           
+        31,839,975      L1I_CACHE:u                      #      0.0 per cache access  l1i_cache_miss_ratio
+           274,158      L1I_CACHE_REFILL:u                                                    
+           471,992      L1D_CACHE_REFILL:u               #      0.0 per cache access  l1d_cache_miss_ratio
+        33,638,817      L1D_CACHE:u                                                           

-       4.083129305 seconds time elapsed
+       8.845491835 seconds time elapsed

-      16.243642000 seconds user
-      36.578375000 seconds sys
+      20.577696000 seconds user
+      35.105662000 seconds sys



@ -134,25 +138,26 @@ tensor(crow_indices=tensor([     0,    317,    416,  ..., 239976, 239977,
       col_indices=tensor([    0,     1,     2,  ...,  1483,  2179, 24817]),
       values=tensor([151.,  17.,   6.,  ...,   1.,   1.,   1.]),
       size=(24818, 24818), nnz=239978, layout=torch.sparse_csr)
-tensor([0.9075, 0.2788, 0.1365,  ..., 0.4240, 0.8832, 0.1064])
+tensor([0.8880, 0.4700, 0.5542,  ..., 0.8505, 0.9123, 0.5742])
+Matrix: sx-mathoverflow
 Shape: torch.Size([24818, 24818])
 NNZ: 239978
 Density: 0.00038961697406616504
-Time: 0.54673171043396 seconds
+Time: 5.400304794311523 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/sx-mathoverflow.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/sx-mathoverflow.mtx 1000':

-           559,358      LL_CACHE_MISS_RD:u               #      1.0 per cache access  ll_cache_read_miss_ratio
-           571,935      LL_CACHE_RD:u                                                         
-           194,840      L2D_TLB:u                        #      0.1 per TLB access  l2_tlb_miss_ratio
-            23,481      L2D_TLB_REFILL:u                                                      
-           313,487      L2D_CACHE_REFILL:u               #      0.2 per cache access  l2_cache_miss_ratio
-         1,779,730      L2D_CACHE:u                                                           
+           538,067      LL_CACHE_MISS_RD:u               #      1.0 per cache access  ll_cache_read_miss_ratio
+           557,981      LL_CACHE_RD:u                                                         
+           170,169      L2D_TLB:u                        #      0.1 per TLB access  l2_tlb_miss_ratio
+            21,987      L2D_TLB_REFILL:u                                                      
+           301,746      L2D_CACHE_REFILL:u               #      0.2 per cache access  l2_cache_miss_ratio
+         1,735,872      L2D_CACHE:u                                                           

-       3.961843929 seconds time elapsed
+       8.606800178 seconds time elapsed

-      15.425912000 seconds user
-      28.864046000 seconds sys
+      21.064990000 seconds user
+      34.158762000 seconds sys



--- a/pytorch/output/altra_10_30_ut2010_1000.json
+++ b/pytorch/output/altra_10_30_ut2010_1000.json
@ -0,0 +1 @@
+{"architecture": "altra", "iterations": 1000, "baseline_time_s": 10, "baseline_delay_s": 30, "power_before": [20.36, 20.4, 20.68, 20.64, 20.92, 20.92, 20.88, 20.68, 20.68, 20.6], "matrix": "ut2010", "shape": [115406, 115406], "nnz": 572066, "% density": 4.295259032005559e-05, "time_s": 11.10523509979248, "power": [90.68, 90.68, 88.24, 72.2, 59.48, 52.0, 54.72, 64.28, 79.24, 94.08, 96.24, 93.72, 92.36, 92.36, 90.08], "power_after": [21.24, 21.28, 20.96, 21.16, 20.92, 21.04, 21.32, 21.56, 21.16, 21.24], "task clock (msec)": 52.22, "page faults": 3288, "cycles": 67463873, "instructions": 73042754, "branch mispredictions": 344635, "branches": 20775821, "ITLB accesses": 27488750, "ITLB misses": 6494, "DTLB misses": 18293, "DTLB accesses": 36697113, "L1I cache accesses": 31066176, "L1I cache misses": 298652, "L1D cache misses": 473808, "L1D cache accesses": 32572985, "LL cache misses": 547428, "LL cache accesses": 566356, "L2D TLB accesses": 162858, "L2D TLB misses": 19852, "L2D cache misses": 304056, "L2D cache accesses": 1713420, "instructions per cycle": 1.0826943481291091, "branch miss rate": 0.01658827345499367, "ITLB miss rate": 0.00023624209904051657, "DTLB miss rate": 0.0004984860798177775, "L2D TLB miss rate": 0.12189760404769799, "L1I cache miss rate": 0.009613413636747567, "L1D cache miss rate": 0.014546041758223879, "L2D cache miss rate": 0.17745561508561825, "LL cache miss rate": 0.9665793246650517}
--- a/pytorch/output/altra_10_30_ut2010_1000.output
+++ b/pytorch/output/altra_10_30_ut2010_1000.output
@ -0,0 +1,173 @@
+srun: Job time limit was unset; set to partition default of 60 minutes
+srun: ################################################################################
+srun: # Please note that the oasis compute nodes have aarch64 architecture CPUs.     #
+srun: # All submission nodes and all other compute nodes have x86_64 architecture    #
+srun: # CPUs. Programs, environments, or other software that was built on x86_64     #
+srun: # nodes may need to be rebuilt to properly execute on these nodes.             #
+srun: ################################################################################
+srun: job 3394993 queued and waiting for resources
+srun: job 3394993 has been allocated resources
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([     0,      3,      9,  ..., 572056, 572061,
+                            572066]),
+       col_indices=tensor([   453,   1291,   1979,  ..., 113521, 114509,
+                           114602]),
+       values=tensor([160642.,  31335., 282373.,  ...,  88393.,  99485.,
+                       18651.]), size=(115406, 115406), nnz=572066,
+       layout=torch.sparse_csr)
+tensor([0.6983, 0.2845, 0.5984,  ..., 0.1182, 0.9468, 0.3161])
+Matrix: ut2010
+Shape: torch.Size([115406, 115406])
+NNZ: 572066
+Density: 4.295259032005559e-05
+Time: 8.604448795318604 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/ut2010.mtx 1000':
+
+             52.22 msec task-clock:u                     #    0.004 CPUs utilized             
+                 0      context-switches:u               #    0.000 /sec                      
+                 0      cpu-migrations:u                 #    0.000 /sec                      
+             3,288      page-faults:u                    #   62.965 K/sec                     
+        67,463,873      cycles:u                         #    1.292 GHz                         (52.95%)
+        73,042,754      instructions:u                   #    1.08  insn per cycle              (71.78%)
+   <not supported>      branches:u                                                            
+           376,297      branch-misses:u                                                         (87.57%)
+        34,189,906      L1-dcache-loads:u                #  654.731 M/sec                       (97.72%)
+           471,636      L1-dcache-load-misses:u          #    1.38% of all L1-dcache accesses 
+   <not supported>      LLC-loads:u                                                           
+   <not supported>      LLC-load-misses:u                                                     
+        31,870,328      L1-icache-loads:u                #  610.312 M/sec                     
+           297,680      L1-icache-load-misses:u          #    0.93% of all L1-icache accesses 
+        57,623,823      dTLB-loads:u                     #    1.103 G/sec                       (30.16%)
+            75,454      dTLB-load-misses:u               #    0.13% of all dTLB cache accesses  (24.31%)
+                 0      iTLB-loads:u                     #    0.000 /sec                        (3.96%)
+     <not counted>      iTLB-load-misses:u                                                      (0.00%)
+
+      12.112100803 seconds time elapsed
+
+      66.253313000 seconds user
+     675.855469000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([     0,      3,      9,  ..., 572056, 572061,
+                            572066]),
+       col_indices=tensor([   453,   1291,   1979,  ..., 113521, 114509,
+                           114602]),
+       values=tensor([160642.,  31335., 282373.,  ...,  88393.,  99485.,
+                       18651.]), size=(115406, 115406), nnz=572066,
+       layout=torch.sparse_csr)
+tensor([0.0260, 0.8569, 0.4315,  ..., 0.5243, 0.8018, 0.1763])
+Matrix: ut2010
+Shape: torch.Size([115406, 115406])
+NNZ: 572066
+Density: 4.295259032005559e-05
+Time: 8.702903270721436 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/ut2010.mtx 1000':
+
+           344,635      BR_MIS_PRED_RETIRED:u            #      0.0 per branch  branch_misprediction_ratio
+        20,775,821      BR_RETIRED:u                                                          
+
+      12.383096073 seconds time elapsed
+
+      64.544546000 seconds user
+     688.477174000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([     0,      3,      9,  ..., 572056, 572061,
+                            572066]),
+       col_indices=tensor([   453,   1291,   1979,  ..., 113521, 114509,
+                           114602]),
+       values=tensor([160642.,  31335., 282373.,  ...,  88393.,  99485.,
+                       18651.]), size=(115406, 115406), nnz=572066,
+       layout=torch.sparse_csr)
+tensor([0.7940, 0.1585, 0.6879,  ..., 0.4017, 0.1738, 0.9713])
+Matrix: ut2010
+Shape: torch.Size([115406, 115406])
+NNZ: 572066
+Density: 4.295259032005559e-05
+Time: 7.38647985458374 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/ut2010.mtx 1000':
+
+        27,488,750      L1I_TLB:u                        #      0.0 per TLB access  itlb_walk_ratio
+             6,494      ITLB_WALK:u                                                           
+            18,293      DTLB_WALK:u                      #      0.0 per TLB access  dtlb_walk_ratio
+        36,697,113      L1D_TLB:u                                                             
+
+      10.936742446 seconds time elapsed
+
+      63.993242000 seconds user
+     580.515047000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([     0,      3,      9,  ..., 572056, 572061,
+                            572066]),
+       col_indices=tensor([   453,   1291,   1979,  ..., 113521, 114509,
+                           114602]),
+       values=tensor([160642.,  31335., 282373.,  ...,  88393.,  99485.,
+                       18651.]), size=(115406, 115406), nnz=572066,
+       layout=torch.sparse_csr)
+tensor([0.2725, 0.6578, 0.8180,  ..., 0.0148, 0.5094, 0.1155])
+Matrix: ut2010
+Shape: torch.Size([115406, 115406])
+NNZ: 572066
+Density: 4.295259032005559e-05
+Time: 12.719107389450073 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/ut2010.mtx 1000':
+
+        31,066,176      L1I_CACHE:u                      #      0.0 per cache access  l1i_cache_miss_ratio
+           298,652      L1I_CACHE_REFILL:u                                                    
+           473,808      L1D_CACHE_REFILL:u               #      0.0 per cache access  l1d_cache_miss_ratio
+        32,572,985      L1D_CACHE:u                                                           
+
+      16.299576479 seconds time elapsed
+
+      86.072431000 seconds user
+     987.199923000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([     0,      3,      9,  ..., 572056, 572061,
+                            572066]),
+       col_indices=tensor([   453,   1291,   1979,  ..., 113521, 114509,
+                           114602]),
+       values=tensor([160642.,  31335., 282373.,  ...,  88393.,  99485.,
+                       18651.]), size=(115406, 115406), nnz=572066,
+       layout=torch.sparse_csr)
+tensor([0.1156, 0.5715, 0.3099,  ..., 0.3964, 0.9672, 0.5694])
+Matrix: ut2010
+Shape: torch.Size([115406, 115406])
+NNZ: 572066
+Density: 4.295259032005559e-05
+Time: 12.682909727096558 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/ut2010.mtx 1000':
+
+           547,428      LL_CACHE_MISS_RD:u               #      1.0 per cache access  ll_cache_read_miss_ratio
+           566,356      LL_CACHE_RD:u                                                         
+           162,858      L2D_TLB:u                        #      0.1 per TLB access  l2_tlb_miss_ratio
+            19,852      L2D_TLB_REFILL:u                                                      
+           304,056      L2D_CACHE_REFILL:u               #      0.2 per cache access  l2_cache_miss_ratio
+         1,713,420      L2D_CACHE:u                                                           
+
+      16.221517033 seconds time elapsed
+
+      79.927661000 seconds user
+     988.333919000 seconds sys
+
+
+
--- a/pytorch/output/altra_10_30_vt2010_1000.json
+++ b/pytorch/output/altra_10_30_vt2010_1000.json
@ -0,0 +1 @@
+{"architecture": "altra", "iterations": 1000, "baseline_time_s": 10, "baseline_delay_s": 30, "power_before": [20.88, 20.76, 20.76, 20.96, 20.92, 20.88, 20.72, 20.4, 20.4, 20.24], "matrix": "vt2010", "shape": [32580, 32580], "nnz": 155598, "% density": 0.00014658915806621921, "time_s": 3.6774682998657227, "power": [34.12, 31.52, 30.36, 27.2, 27.16, 30.64, 31.0, 31.32], "power_after": [20.44, 20.52, 20.68, 20.72, 20.68, 20.72, 20.88, 20.8, 20.88, 20.52], "task clock (msec)": 48.59, "page faults": 3274, "cycles": 55030923, "instructions": 78222423, "branch mispredictions": 323004, "branches": 19091130, "ITLB accesses": 27178617, "ITLB misses": 6398, "DTLB misses": 19770, "DTLB accesses": 36355567, "L1I cache accesses": 31341858, "L1I cache misses": 291951, "L1D cache misses": 468242, "L1D cache accesses": 32805413, "LL cache misses": 520057, "LL cache accesses": 541186, "L2D TLB accesses": 191068, "L2D TLB misses": 22725, "L2D cache misses": 288895, "L2D cache accesses": 1728320, "instructions per cycle": 1.4214266949511278, "branch miss rate": 0.01691906136514706, "ITLB miss rate": 0.00023540564996371965, "DTLB miss rate": 0.0005437956723381593, "L2D TLB miss rate": 0.11893671363074926, "L1I cache miss rate": 0.009315050817982775, "L1D cache miss rate": 0.014273315199537345, "L2D cache miss rate": 0.16715365210146269, "LL cache miss rate": 0.9609579700879181}
--- a/pytorch/output/altra_10_30_vt2010_1000.output
+++ b/pytorch/output/altra_10_30_vt2010_1000.output
@ -5,8 +5,8 @@ srun: # All submission nodes and all other compute nodes have x86_64 architectur
 srun: # CPUs. Programs, environments, or other software that was built on x86_64     #
 srun: # nodes may need to be rebuilt to properly execute on these nodes.             #
 srun: ################################################################################
-srun: job 3394143 queued and waiting for resources
-srun: job 3394143 has been allocated resources
+srun: job 3394988 queued and waiting for resources
+srun: job 3394988 has been allocated resources
 /nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
  ).to_sparse_csr().type(torch.float)
 tensor(crow_indices=tensor([     0,      4,      7,  ..., 155588, 155592,
@ -14,37 +14,38 @@ tensor(crow_indices=tensor([     0,      4,      7,  ..., 155588, 155592,
       col_indices=tensor([  131,   561,   996,  ..., 32237, 32238, 32570]),
       values=tensor([79040.,  7820., 15136.,  ...,  2828., 17986.,  2482.]),
       size=(32580, 32580), nnz=155598, layout=torch.sparse_csr)
-tensor([0.9170, 0.7306, 0.1175,  ..., 0.0616, 0.0147, 0.6403])
+tensor([0.2022, 0.3400, 0.2561,  ..., 0.8370, 0.0285, 0.6506])
+Matrix: vt2010
 Shape: torch.Size([32580, 32580])
 NNZ: 155598
 Density: 0.00014658915806621921
-Time: 0.4440653324127197 seconds
+Time: 3.74875545501709 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/vt2010.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/vt2010.mtx 1000':

-             61.63 msec task-clock:u                     #    0.016 CPUs utilized             
+             48.59 msec task-clock:u                     #    0.007 CPUs utilized             
                 0      context-switches:u               #    0.000 /sec                      
                 0      cpu-migrations:u                 #    0.000 /sec                      
-             3,304      page-faults:u                    #   53.611 K/sec                     
-        64,734,203      cycles:u                         #    1.050 GHz                         (50.46%)
-        53,597,991      instructions:u                   #    0.83  insn per cycle              (70.10%)
+             3,274      page-faults:u                    #   67.376 K/sec                     
+        55,030,923      cycles:u                         #    1.132 GHz                         (65.54%)
+        78,222,423      instructions:u                   #    1.42  insn per cycle              (83.60%)
   <not supported>      branches:u                                                            
-           347,389      branch-misses:u                                                         (91.95%)
-        31,363,842      L1-dcache-loads:u                #  508.915 M/sec                     
-           482,780      L1-dcache-load-misses:u          #    1.54% of all L1-dcache accesses 
+           369,917      branch-misses:u                                                       
+        32,435,815      L1-dcache-loads:u                #  667.500 M/sec                     
+           467,963      L1-dcache-load-misses:u          #    1.44% of all L1-dcache accesses 
   <not supported>      LLC-loads:u                                                           
   <not supported>      LLC-load-misses:u                                                     
-        30,027,001      L1-icache-loads:u                #  487.223 M/sec                     
-           288,023      L1-icache-load-misses:u          #    0.96% of all L1-icache accesses 
-        44,333,825      dTLB-loads:u                     #  719.368 M/sec                       (48.58%)
-            74,525      dTLB-load-misses:u               #    0.17% of all dTLB cache accesses  (16.71%)
+        31,013,287      L1-icache-loads:u                #  638.226 M/sec                     
+           289,982      L1-icache-load-misses:u          #    0.94% of all L1-icache accesses 
+        60,644,978      dTLB-loads:u                     #    1.248 G/sec                       (17.29%)
+     <not counted>      dTLB-load-misses:u                                                      (0.00%)
     <not counted>      iTLB-loads:u                                                            (0.00%)
     <not counted>      iTLB-load-misses:u                                                      (0.00%)

-       3.811654040 seconds time elapsed
+       6.978143797 seconds time elapsed

-      15.616953000 seconds user
-      30.906234000 seconds sys
+      18.401752000 seconds user
+      28.060858000 seconds sys



@ -55,21 +56,22 @@ tensor(crow_indices=tensor([     0,      4,      7,  ..., 155588, 155592,
       col_indices=tensor([  131,   561,   996,  ..., 32237, 32238, 32570]),
       values=tensor([79040.,  7820., 15136.,  ...,  2828., 17986.,  2482.]),
       size=(32580, 32580), nnz=155598, layout=torch.sparse_csr)
-tensor([0.5548, 0.3514, 0.6283,  ..., 0.5672, 0.1575, 0.4493])
+tensor([0.3381, 0.0423, 0.5363,  ..., 0.0429, 0.4077, 0.4744])
+Matrix: vt2010
 Shape: torch.Size([32580, 32580])
 NNZ: 155598
 Density: 0.00014658915806621921
-Time: 0.44233155250549316 seconds
+Time: 3.7925527095794678 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/vt2010.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/vt2010.mtx 1000':

-           330,777      BR_MIS_PRED_RETIRED:u            #      0.0 per branch  branch_misprediction_ratio
-        20,357,034      BR_RETIRED:u                                                          
+           323,004      BR_MIS_PRED_RETIRED:u            #      0.0 per branch  branch_misprediction_ratio
+        19,091,130      BR_RETIRED:u                                                          

-       3.835342404 seconds time elapsed
+       7.233250772 seconds time elapsed

-      15.497637000 seconds user
-      28.676763000 seconds sys
+      19.111768000 seconds user
+      32.178633000 seconds sys



@ -80,23 +82,24 @@ tensor(crow_indices=tensor([     0,      4,      7,  ..., 155588, 155592,
       col_indices=tensor([  131,   561,   996,  ..., 32237, 32238, 32570]),
       values=tensor([79040.,  7820., 15136.,  ...,  2828., 17986.,  2482.]),
       size=(32580, 32580), nnz=155598, layout=torch.sparse_csr)
-tensor([0.0953, 0.5790, 0.0112,  ..., 0.9540, 0.3173, 0.4731])
+tensor([0.7962, 0.6492, 0.2778,  ..., 0.5407, 0.1159, 0.3587])
+Matrix: vt2010
 Shape: torch.Size([32580, 32580])
 NNZ: 155598
 Density: 0.00014658915806621921
-Time: 0.43302106857299805 seconds
+Time: 3.668635129928589 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/vt2010.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/vt2010.mtx 1000':

-        27,381,387      L1I_TLB:u                        #      0.0 per TLB access  itlb_walk_ratio
-             6,248      ITLB_WALK:u                                                           
-            17,636      DTLB_WALK:u                      #      0.0 per TLB access  dtlb_walk_ratio
-        37,436,110      L1D_TLB:u                                                             
+        27,178,617      L1I_TLB:u                        #      0.0 per TLB access  itlb_walk_ratio
+             6,398      ITLB_WALK:u                                                           
+            19,770      DTLB_WALK:u                      #      0.0 per TLB access  dtlb_walk_ratio
+        36,355,567      L1D_TLB:u                                                             

-       3.828586094 seconds time elapsed
+       6.925944164 seconds time elapsed

-      15.518057000 seconds user
-      31.389361000 seconds sys
+      18.970654000 seconds user
+      30.786317000 seconds sys



@ -107,23 +110,24 @@ tensor(crow_indices=tensor([     0,      4,      7,  ..., 155588, 155592,
       col_indices=tensor([  131,   561,   996,  ..., 32237, 32238, 32570]),
       values=tensor([79040.,  7820., 15136.,  ...,  2828., 17986.,  2482.]),
       size=(32580, 32580), nnz=155598, layout=torch.sparse_csr)
-tensor([0.5456, 0.8708, 0.2037,  ..., 0.8669, 0.9122, 0.2046])
+tensor([0.8340, 0.3434, 0.3449,  ..., 0.9828, 0.6683, 0.0312])
+Matrix: vt2010
 Shape: torch.Size([32580, 32580])
 NNZ: 155598
 Density: 0.00014658915806621921
-Time: 0.4426534175872803 seconds
+Time: 3.623232126235962 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/vt2010.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/vt2010.mtx 1000':

-        32,505,993      L1I_CACHE:u                      #      0.0 per cache access  l1i_cache_miss_ratio
-           303,849      L1I_CACHE_REFILL:u                                                    
-           467,426      L1D_CACHE_REFILL:u               #      0.0 per cache access  l1d_cache_miss_ratio
-        34,241,110      L1D_CACHE:u                                                           
+        31,341,858      L1I_CACHE:u                      #      0.0 per cache access  l1i_cache_miss_ratio
+           291,951      L1I_CACHE_REFILL:u                                                    
+           468,242      L1D_CACHE_REFILL:u               #      0.0 per cache access  l1d_cache_miss_ratio
+        32,805,413      L1D_CACHE:u                                                           

-       3.811299200 seconds time elapsed
+       6.941260499 seconds time elapsed

-      15.932195000 seconds user
-      30.887870000 seconds sys
+      18.410270000 seconds user
+      27.908787000 seconds sys



@ -134,25 +138,26 @@ tensor(crow_indices=tensor([     0,      4,      7,  ..., 155588, 155592,
       col_indices=tensor([  131,   561,   996,  ..., 32237, 32238, 32570]),
       values=tensor([79040.,  7820., 15136.,  ...,  2828., 17986.,  2482.]),
       size=(32580, 32580), nnz=155598, layout=torch.sparse_csr)
-tensor([0.5024, 0.2304, 0.7925,  ..., 0.1397, 0.5558, 0.6450])
+tensor([0.2754, 0.3661, 0.9484,  ..., 0.7285, 0.5354, 0.4116])
+Matrix: vt2010
 Shape: torch.Size([32580, 32580])
 NNZ: 155598
 Density: 0.00014658915806621921
-Time: 0.3671383857727051 seconds
+Time: 3.7337992191314697 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/vt2010.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/vt2010.mtx 1000':

-           550,075      LL_CACHE_MISS_RD:u               #      1.0 per cache access  ll_cache_read_miss_ratio
-           562,829      LL_CACHE_RD:u                                                         
-           199,285      L2D_TLB:u                        #      0.1 per TLB access  l2_tlb_miss_ratio
-            24,424      L2D_TLB_REFILL:u                                                      
-           310,155      L2D_CACHE_REFILL:u               #      0.2 per cache access  l2_cache_miss_ratio
-         1,783,824      L2D_CACHE:u                                                           
+           520,057      LL_CACHE_MISS_RD:u               #      1.0 per cache access  ll_cache_read_miss_ratio
+           541,186      LL_CACHE_RD:u                                                         
+           191,068      L2D_TLB:u                        #      0.1 per TLB access  l2_tlb_miss_ratio
+            22,725      L2D_TLB_REFILL:u                                                      
+           288,895      L2D_CACHE_REFILL:u               #      0.2 per cache access  l2_cache_miss_ratio
+         1,728,320      L2D_CACHE:u                                                           

-       3.824434783 seconds time elapsed
+       7.164825085 seconds time elapsed

-      15.754438000 seconds user
-      28.226523000 seconds sys
+      18.193885000 seconds user
+      30.023194000 seconds sys



--- a/pytorch/output/altra_2_2_Oregon-2_100.json
+++ b/pytorch/output/altra_2_2_Oregon-2_100.json
@ -1 +0,0 @@
-{"power_before": [50.88, 50.88], "shape": [11806, 11806], "nnz": 65460, "% density": 0.0004696458003979807, "time_s": 0.1896660327911377, "power": [25.52, 32.28, 33.12, 33.12], "power_after": [32.88, 26.52], "task clock (msec)": 42.01, "page faults": 3263, "cycles": 47084933, "instructions": 77895119, "branch mispredictions": 330923, "branches": 19740519, "ITLB accesses": 27761239, "ITLB misses": 6471, "DTLB misses": 17268, "DTLB accesses": 36993265, "L1I cache accesses": 31834980, "L1I cache misses": 298333, "L1D cache misses": 466901, "L1D cache accesses": 33528976, "LL cache misses": 525505, "LL cache accesses": 546521, "L2D TLB accesses": 184884, "L2D TLB misses": 22933, "L2D cache misses": 292367, "L2D cache accesses": 1706226, "instructions per cycle": 1.6543534000568716, "branch miss rate": 0.016763642333821112, "ITLB miss rate": 0.00023309478370183695, "DTLB miss rate": 0.0004667876706746485, "L2D TLB miss rate": 0.12403993855606758, "L1I cache miss rate": 0.009371232524725947, "L1D cache miss rate": 0.013925298523879763, "L2D cache miss rate": 0.1713530329510862, "LL cache miss rate": 0.9615458509371094}
--- a/pytorch/output/altra_2_2_as-caida_100.json
+++ b/pytorch/output/altra_2_2_as-caida_100.json
@ -1 +0,0 @@
-{"power_before": [20.16, 20.08], "shape": [31379, 31379], "nnz": 106762, "% density": 0.00010842726485909405, "time_s": 0.336850643157959, "power": [24.28, 30.72, 30.72, 34.56], "power_after": [37.32, 32.92], "task clock (msec)": 60.78, "page faults": 3300, "cycles": 66733059, "instructions": 87889334, "branch mispredictions": 326300, "branches": 19832700, "ITLB accesses": 27233629, "ITLB misses": 5868, "DTLB misses": 16893, "DTLB accesses": 36409508, "L1I cache accesses": 30924532, "L1I cache misses": 288199, "L1D cache misses": 462816, "L1D cache accesses": 32428375, "LL cache misses": 551997, "LL cache accesses": 568528, "L2D TLB accesses": 193991, "L2D TLB misses": 24353, "L2D cache misses": 312207, "L2D cache accesses": 1821196, "instructions per cycle": 1.3170284011707, "branch miss rate": 0.016452626218316214, "ITLB miss rate": 0.0002154688969288669, "DTLB miss rate": 0.00046397221297250155, "L2D TLB miss rate": 0.125536751704976, "L1I cache miss rate": 0.009319429635992551, "L1D cache miss rate": 0.014271945479845968, "L2D cache miss rate": 0.17142965391973186, "LL cache miss rate": 0.9709231559395491}
--- a/pytorch/output/altra_2_2_dc2_100.json
+++ b/pytorch/output/altra_2_2_dc2_100.json
@ -1 +0,0 @@
-{"power_before": [16.32, 16.2], "shape": [116835, 116835], "nnz": 766396, "% density": 5.614451099680581e-05, "time_s": 2.2665774822235107, "power": [35.16, 50.8, 53.4, 53.4, 46.08, 46.88], "power_after": [58.4, 57.32], "task clock (msec)": 50.43, "page faults": 3285, "cycles": 54118679, "instructions": 77692421, "branch mispredictions": 325039, "branches": 19383216, "ITLB accesses": 26060519, "ITLB misses": 4749, "DTLB misses": 16865, "DTLB accesses": 34819729, "L1I cache accesses": 30777115, "L1I cache misses": 293980, "L1D cache misses": 461522, "L1D cache accesses": 32216597, "LL cache misses": 567700, "LL cache accesses": 588689, "L2D TLB accesses": 189417, "L2D TLB misses": 22360, "L2D cache misses": 328306, "L2D cache accesses": 1908607, "instructions per cycle": 1.4355934482436277, "branch miss rate": 0.0167690954896236, "ITLB miss rate": 0.00018222967854170517, "DTLB miss rate": 0.00048435184547243316, "L2D TLB miss rate": 0.11804642666708902, "L1I cache miss rate": 0.009551902444397404, "L1D cache miss rate": 0.014325597455249542, "L2D cache miss rate": 0.172013410827897, "LL cache miss rate": 0.9643461997761127}
--- a/pytorch/output/altra_2_2_de2010_100.json
+++ b/pytorch/output/altra_2_2_de2010_100.json
@ -1 +0,0 @@
-{"power_before": [20.48, 20.96], "shape": [24115, 24115], "nnz": 116056, "% density": 0.0001995689928120616, "time_s": 0.3271017074584961, "power": [25.28, 26.08, 31.28, 32.96], "power_after": [33.4, 30.24], "task clock (msec)": 59.88, "page faults": 3313, "cycles": 58169777, "instructions": 57993431, "branch mispredictions": 330494, "branches": 20578427, "ITLB accesses": 27982097, "ITLB misses": 6614, "DTLB misses": 17270, "DTLB accesses": 37728899, "L1I cache accesses": 29754926, "L1I cache misses": 278786, "L1D cache misses": 454742, "L1D cache accesses": 31173246, "LL cache misses": 543243, "LL cache accesses": 560716, "L2D TLB accesses": 162281, "L2D TLB misses": 19847, "L2D cache misses": 300577, "L2D cache accesses": 1696278, "instructions per cycle": 0.9969684257170179, "branch miss rate": 0.016060216847478187, "ITLB miss rate": 0.0002363654160729984, "DTLB miss rate": 0.00045773930482307474, "L2D TLB miss rate": 0.12230020766448321, "L1I cache miss rate": 0.009369406598423401, "L1D cache miss rate": 0.014587572946365611, "L2D cache miss rate": 0.1771979592967662, "LL cache miss rate": 0.9688380570556218}
--- a/pytorch/output/altra_2_2_email-Enron_100.json
+++ b/pytorch/output/altra_2_2_email-Enron_100.json
@ -1 +0,0 @@
-{"power_before": [20.28, 20.32], "shape": [36692, 36692], "nnz": 367662, "% density": 0.0002730901120626302, "time_s": 1.030203104019165, "power": [32.08, 47.84, 55.76, 58.08, 58.24], "power_after": [48.76, 45.16], "task clock (msec)": 60.43, "page faults": 3319, "cycles": 66114448, "instructions": 90786829, "branch mispredictions": 341625, "branches": 20129354, "ITLB accesses": 27441303, "ITLB misses": 6807, "DTLB misses": 20551, "DTLB accesses": 36867114, "L1I cache accesses": 31744243, "L1I cache misses": 271027, "L1D cache misses": 464135, "L1D cache accesses": 33441141, "LL cache misses": 539935, "LL cache accesses": 552519, "L2D TLB accesses": 188291, "L2D TLB misses": 24177, "L2D cache misses": 301281, "L2D cache accesses": 1737575, "instructions per cycle": 1.3731768432824245, "branch miss rate": 0.016971483535934636, "ITLB miss rate": 0.00024805673404065397, "DTLB miss rate": 0.0005574344658494288, "L2D TLB miss rate": 0.12840231344036623, "L1I cache miss rate": 0.008537831568388637, "L1D cache miss rate": 0.01387916159918108, "L2D cache miss rate": 0.17339165215889962, "LL cache miss rate": 0.9772243126480719}
--- a/pytorch/output/altra_2_2_p2p-Gnutella04_100.json
+++ b/pytorch/output/altra_2_2_p2p-Gnutella04_100.json
@ -1 +0,0 @@
-{"power_before": [50.68, 49.4], "shape": [10879, 10879], "nnz": 39994, "% density": 0.0003379223282393842, "time_s": 0.11296772956848145, "power": [26.2, 29.76, 33.64, 34.44], "power_after": [36.84, 29.44], "task clock (msec)": 67.56, "page faults": 3829, "cycles": 47862000, "instructions": 84392375, "branch mispredictions": 331622, "branches": 19800140, "ITLB accesses": 25905045, "ITLB misses": 6746, "DTLB misses": 17547, "DTLB accesses": 35220079, "L1I cache accesses": 30359576, "L1I cache misses": 283204, "L1D cache misses": 465520, "L1D cache accesses": 31843274, "LL cache misses": 560542, "LL cache accesses": 575610, "L2D TLB accesses": 173643, "L2D TLB misses": 21499, "L2D cache misses": 313335, "L2D cache accesses": 1741621, "instructions per cycle": 1.7632438051063475, "branch miss rate": 0.016748467435078743, "ITLB miss rate": 0.0002604125953072075, "DTLB miss rate": 0.0004982101261044871, "L2D TLB miss rate": 0.12381149830399152, "L1I cache miss rate": 0.009328325270418797, "L1D cache miss rate": 0.014619099782264852, "L2D cache miss rate": 0.17990998041479747, "LL cache miss rate": 0.9738225534650197}
--- a/pytorch/output/altra_2_2_p2p-Gnutella24_100.json
+++ b/pytorch/output/altra_2_2_p2p-Gnutella24_100.json
@ -1 +0,0 @@
-{"power_before": [16.52, 16.24], "shape": [26518, 26518], "nnz": 65369, "% density": 9.295875717624285e-05, "time_s": 0.1715233325958252, "power": [18.56, 24.92, 27.84, 27.84], "power_after": [33.2, 27.28], "task clock (msec)": 61.92, "page faults": 3281, "cycles": 66250810, "instructions": 75178179, "branch mispredictions": 332366, "branches": 19076182, "ITLB accesses": 27005133, "ITLB misses": 4791, "DTLB misses": 13403, "DTLB accesses": 36457054, "L1I cache accesses": 32367686, "L1I cache misses": 287524, "L1D cache misses": 467557, "L1D cache accesses": 34022862, "LL cache misses": 535707, "LL cache accesses": 556316, "L2D TLB accesses": 150149, "L2D TLB misses": 18418, "L2D cache misses": 297042, "L2D cache accesses": 1687364, "instructions per cycle": 1.1347510920998551, "branch miss rate": 0.017423088121092577, "ITLB miss rate": 0.00017741071669597036, "DTLB miss rate": 0.00036763804338112453, "L2D TLB miss rate": 0.12266481961251822, "L1I cache miss rate": 0.008883057009388932, "L1D cache miss rate": 0.013742435895016709, "L2D cache miss rate": 0.1760390763344483, "LL cache miss rate": 0.9629545078696281}
--- a/pytorch/output/altra_2_2_p2p-Gnutella25_100.json
+++ b/pytorch/output/altra_2_2_p2p-Gnutella25_100.json
@ -1 +0,0 @@
-{"power_before": [29.76, 33.16], "shape": [22687, 22687], "nnz": 54705, "% density": 0.00010628522108964806, "time_s": 0.14322686195373535, "power": [22.6, 22.6, 26.16, 29.2], "power_after": [34.0, 30.16], "task clock (msec)": 64.71, "page faults": 3319, "cycles": 57611295, "instructions": 83148228, "branch mispredictions": 318386, "branches": 19233431, "ITLB accesses": 27039805, "ITLB misses": 6375, "DTLB misses": 17290, "DTLB accesses": 36688544, "L1I cache accesses": 32508072, "L1I cache misses": 297568, "L1D cache misses": 477654, "L1D cache accesses": 34044579, "LL cache misses": 549474, "LL cache accesses": 561939, "L2D TLB accesses": 185622, "L2D TLB misses": 23295, "L2D cache misses": 305878, "L2D cache accesses": 1763089, "instructions per cycle": 1.4432626102225268, "branch miss rate": 0.01655378075809771, "ITLB miss rate": 0.00023576353453732377, "DTLB miss rate": 0.00047126427257511227, "L2D TLB miss rate": 0.12549697772893298, "L1I cache miss rate": 0.009153664972810446, "L1D cache miss rate": 0.014030251336049713, "L2D cache miss rate": 0.17348982382625042, "LL cache miss rate": 0.9778178770293573}
--- a/pytorch/output/altra_2_2_p2p-Gnutella30_100.json
+++ b/pytorch/output/altra_2_2_p2p-Gnutella30_100.json
@ -1 +0,0 @@
-{"power_before": [20.56, 20.28], "shape": [36682, 36682], "nnz": 88328, "% density": 6.564359899804003e-05, "time_s": 0.30861377716064453, "power": [23.88, 27.6, 39.8, 40.12], "power_after": [39.28, 35.2], "task clock (msec)": 65.91, "page faults": 3247, "cycles": 92293071, "instructions": 76208632, "branch mispredictions": 320083, "branches": 19285106, "ITLB accesses": 26853940, "ITLB misses": 6728, "DTLB misses": 13955, "DTLB accesses": 37111059, "L1I cache accesses": 32554796, "L1I cache misses": 298729, "L1D cache misses": 473779, "L1D cache accesses": 34117102, "LL cache misses": 535040, "LL cache accesses": 547502, "L2D TLB accesses": 179876, "L2D TLB misses": 21809, "L2D cache misses": 298620, "L2D cache accesses": 1722959, "instructions per cycle": 0.8257243059990929, "branch miss rate": 0.016597419791210898, "ITLB miss rate": 0.0002505405165871377, "DTLB miss rate": 0.0003760334621547717, "L2D TLB miss rate": 0.12124463519313304, "L1I cache miss rate": 0.009176190199440968, "L1D cache miss rate": 0.013886847716432655, "L2D cache miss rate": 0.17331811145825293, "LL cache miss rate": 0.9772384393116372}
--- a/pytorch/output/altra_2_2_ri2010_100.json
+++ b/pytorch/output/altra_2_2_ri2010_100.json
@ -1 +0,0 @@
-{"power_before": [30.44, 35.52], "shape": [25181, 25181], "nnz": 125750, "% density": 0.00019831796057928155, "time_s": 0.29622840881347656, "power": [23.84, 29.44, 33.0, 33.04], "power_after": [36.32, 30.0], "task clock (msec)": 60.77, "page faults": 3361, "cycles": 63493475, "instructions": 91578911, "branch mispredictions": 329084, "branches": 20406595, "ITLB accesses": 26859919, "ITLB misses": 6237, "DTLB misses": 16689, "DTLB accesses": 36348977, "L1I cache accesses": 30979764, "L1I cache misses": 292038, "L1D cache misses": 469219, "L1D cache accesses": 32411890, "LL cache misses": 571870, "LL cache accesses": 598306, "L2D TLB accesses": 205488, "L2D TLB misses": 26392, "L2D cache misses": 342141, "L2D cache accesses": 1857697, "instructions per cycle": 1.442335783322617, "branch miss rate": 0.01612635522976763, "ITLB miss rate": 0.00023220472109390948, "DTLB miss rate": 0.0004591325912693499, "L2D TLB miss rate": 0.12843572374055906, "L1I cache miss rate": 0.009426734173959492, "L1D cache miss rate": 0.014476755289494072, "L2D cache miss rate": 0.1841748142996409, "LL cache miss rate": 0.9558152517273769}
--- a/pytorch/output/altra_2_2_soc-sign-Slashdot090216_100.json
+++ b/pytorch/output/altra_2_2_soc-sign-Slashdot090216_100.json
@ -1 +0,0 @@
-{"power_before": [16.52, 16.64], "shape": [81871, 81871], "nnz": 545671, "% density": 8.140867447881048e-05, "time_s": 1.3372814655303955, "power": [23.92, 38.6, 46.04, 48.2, 48.2], "power_after": [45.0, 44.08], "task clock (msec)": 59.01, "page faults": 3448, "cycles": 73062796, "instructions": 88329175, "branch mispredictions": 331091, "branches": 20013316, "ITLB accesses": 26330936, "ITLB misses": 5193, "DTLB misses": 16837, "DTLB accesses": 35930477, "L1I cache accesses": 31853890, "L1I cache misses": 306147, "L1D cache misses": 479933, "L1D cache accesses": 33426019, "LL cache misses": 540302, "LL cache accesses": 553181, "L2D TLB accesses": 173206, "L2D TLB misses": 21390, "L2D cache misses": 300032, "L2D cache accesses": 1739931, "instructions per cycle": 1.2089487377406143, "branch miss rate": 0.016543535314187813, "ITLB miss rate": 0.0001972204861991993, "DTLB miss rate": 0.000468599401004334, "L2D TLB miss rate": 0.12349456716280037, "L1I cache miss rate": 0.009610976869701, "L1D cache miss rate": 0.014358066391334247, "L2D cache miss rate": 0.17243902200719455, "LL cache miss rate": 0.9767182893121781}
--- a/pytorch/output/altra_2_2_soc-sign-Slashdot090221_100.json
+++ b/pytorch/output/altra_2_2_soc-sign-Slashdot090221_100.json
@ -1 +0,0 @@
-{"power_before": [53.64, 46.88], "shape": [82144, 82144], "nnz": 549202, "% density": 8.13917555860553e-05, "time_s": 1.2292509078979492, "power": [40.64, 52.44, 54.8, 54.96, 46.8], "power_after": [47.88, 47.08], "task clock (msec)": 61.26, "page faults": 3303, "cycles": 44515786, "instructions": 81513738, "branch mispredictions": 328019, "branches": 19893662, "ITLB accesses": 27248112, "ITLB misses": 5792, "DTLB misses": 16632, "DTLB accesses": 36929042, "L1I cache accesses": 31702830, "L1I cache misses": 295778, "L1D cache misses": 470423, "L1D cache accesses": 33155119, "LL cache misses": 545220, "LL cache accesses": 562139, "L2D TLB accesses": 192206, "L2D TLB misses": 24891, "L2D cache misses": 307033, "L2D cache accesses": 1782260, "instructions per cycle": 1.8311198189334452, "branch miss rate": 0.01648861833482443, "ITLB miss rate": 0.0002125651861677609, "DTLB miss rate": 0.0004503772396803578, "L2D TLB miss rate": 0.12950168048864238, "L1I cache miss rate": 0.009329703373484323, "L1D cache miss rate": 0.014188548079106578, "L2D cache miss rate": 0.17227172241984895, "LL cache miss rate": 0.9699024618466251}
--- a/pytorch/output/altra_2_2_soc-sign-epinions_100.json
+++ b/pytorch/output/altra_2_2_soc-sign-epinions_100.json
@ -1 +0,0 @@
-{"power_before": [30.48, 33.04], "shape": [131828, 131828], "nnz": 841372, "% density": 4.841419648464106e-05, "time_s": 2.848874092102051, "power": [65.52, 75.88, 71.16, 71.16, 59.72, 47.92, 48.68], "power_after": [68.68, 67.88], "task clock (msec)": 49.87, "page faults": 3300, "cycles": 51935476, "instructions": 83731856, "branch mispredictions": 326464, "branches": 20341367, "ITLB accesses": 27590154, "ITLB misses": 6210, "DTLB misses": 17536, "DTLB accesses": 36763243, "L1I cache accesses": 31663300, "L1I cache misses": 289727, "L1D cache misses": 462864, "L1D cache accesses": 33262254, "LL cache misses": 530272, "LL cache accesses": 551373, "L2D TLB accesses": 196152, "L2D TLB misses": 23542, "L2D cache misses": 301998, "L2D cache accesses": 1732662, "instructions per cycle": 1.6122285275675532, "branch miss rate": 0.01604926551888081, "ITLB miss rate": 0.000225080294948698, "DTLB miss rate": 0.0004769981799483794, "L2D TLB miss rate": 0.12001916880786329, "L1I cache miss rate": 0.00915024649989104, "L1D cache miss rate": 0.013915593332911234, "L2D cache miss rate": 0.17429712200071334, "LL cache miss rate": 0.9617300810884828}
--- a/pytorch/output/altra_2_2_sx-mathoverflow_100.json
+++ b/pytorch/output/altra_2_2_sx-mathoverflow_100.json
@ -1 +0,0 @@
-{"power_before": [20.44, 20.2], "shape": [24818, 24818], "nnz": 239978, "% density": 0.00038961697406616504, "time_s": 0.556269645690918, "power": [25.24, 32.16, 33.0, 32.52], "power_after": [34.24, 30.28], "task clock (msec)": 62.49, "page faults": 3312, "cycles": 76783170, "instructions": 77095702, "branch mispredictions": 323514, "branches": 19769937, "ITLB accesses": 26809325, "ITLB misses": 6925, "DTLB misses": 19003, "DTLB accesses": 36516965, "L1I cache accesses": 31104231, "L1I cache misses": 285499, "L1D cache misses": 468498, "L1D cache accesses": 32677465, "LL cache misses": 559358, "LL cache accesses": 571935, "L2D TLB accesses": 194840, "L2D TLB misses": 23481, "L2D cache misses": 313487, "L2D cache accesses": 1779730, "instructions per cycle": 1.004070319055595, "branch miss rate": 0.016363936819829016, "ITLB miss rate": 0.00025830564551699827, "DTLB miss rate": 0.0005203882633729282, "L2D TLB miss rate": 0.12051426811742968, "L1I cache miss rate": 0.009178783426601994, "L1D cache miss rate": 0.01433703624194839, "L2D cache miss rate": 0.1761430104566423, "LL cache miss rate": 0.9780097388689274}
--- a/pytorch/output/altra_2_2_ut2010_100.json
+++ b/pytorch/output/altra_2_2_ut2010_100.json
@ -1 +0,0 @@
-{"power_before": [34.6, 37.16], "shape": [115406, 115406], "nnz": 572066, "% density": 4.295259032005559e-05, "time_s": 1.0817186832427979, "power": [34.32, 50.84, 52.12, 52.4, 52.76], "power_after": [49.0, 45.08], "task clock (msec)": 60.55, "page faults": 3490, "cycles": 49977496, "instructions": 78622993, "branch mispredictions": 327078, "branches": 20135808, "ITLB accesses": 27608093, "ITLB misses": 6616, "DTLB misses": 17185, "DTLB accesses": 36866957, "L1I cache accesses": 32639204, "L1I cache misses": 309643, "L1D cache misses": 478856, "L1D cache accesses": 34280618, "LL cache misses": 555275, "LL cache accesses": 578455, "L2D TLB accesses": 188723, "L2D TLB misses": 24635, "L2D cache misses": 319663, "L2D cache accesses": 1799940, "instructions per cycle": 1.573167911413569, "branch miss rate": 0.016243599462211798, "ITLB miss rate": 0.00023963987661154286, "DTLB miss rate": 0.00046613556958335347, "L2D TLB miss rate": 0.13053522888042263, "L1I cache miss rate": 0.009486842877663316, "L1D cache miss rate": 0.013968709665619214, "L2D cache miss rate": 0.17759647543807017, "LL cache miss rate": 0.9599277385449171}
--- a/pytorch/output/altra_2_2_vt2010_100.json
+++ b/pytorch/output/altra_2_2_vt2010_100.json
@ -1 +0,0 @@
-{"power_before": [34.04, 43.96], "shape": [32580, 32580], "nnz": 155598, "% density": 0.00014658915806621921, "time_s": 0.4164857864379883, "power": [23.72, 23.72, 29.88, 33.32], "power_after": [33.36, 32.52], "task clock (msec)": 61.63, "page faults": 3304, "cycles": 64734203, "instructions": 53597991, "branch mispredictions": 330777, "branches": 20357034, "ITLB accesses": 27381387, "ITLB misses": 6248, "DTLB misses": 17636, "DTLB accesses": 37436110, "L1I cache accesses": 32505993, "L1I cache misses": 303849, "L1D cache misses": 467426, "L1D cache accesses": 34241110, "LL cache misses": 550075, "LL cache accesses": 562829, "L2D TLB accesses": 199285, "L2D TLB misses": 24424, "L2D cache misses": 310155, "L2D cache accesses": 1783824, "instructions per cycle": 0.8279701999266138, "branch miss rate": 0.016248781625063848, "ITLB miss rate": 0.00022818420410916364, "DTLB miss rate": 0.00047109595521543235, "L2D TLB miss rate": 0.12255814536969667, "L1I cache miss rate": 0.009347476325365603, "L1D cache miss rate": 0.01365101773861887, "L2D cache miss rate": 0.17387085272986572, "LL cache miss rate": 0.9773394761108614}
--- a/pytorch/output_HPC/altra_10_30_ASIC_680k_1000.json
+++ b/pytorch/output_HPC/altra_10_30_ASIC_680k_1000.json
@ -0,0 +1 @@
+{"architecture": "altra", "iterations": 1000, "baseline_time_s": 10, "baseline_delay_s": 30, "power_before": [80.64, 75.2, 61.4, 49.84, 38.12, 38.12, 24.16, 22.68, 22.36, 22.2], "matrix": "ASIC_680k", "shape": [682862, 682862], "nnz": 3871773, "% density": 8.303171256088674e-06, "time_s": 41.51614689826965, "power": [92.4, 92.44, 84.28, 73.04, 59.64, 53.28, 56.32, 65.88, 80.28, 93.76, 96.16, 94.44, 94.44, 94.2, 93.92, 92.48, 92.16, 91.84, 92.08, 91.84, 91.68, 90.68, 90.88, 90.28, 90.28, 92.44, 92.52, 92.84, 90.0, 89.64, 88.16, 87.28, 88.12, 88.24, 88.08, 85.72, 85.12, 85.12, 81.72, 82.52, 83.84, 86.32, 88.8, 91.0, 90.2], "power_after": [21.92, 21.88, 21.92, 21.88, 21.88, 21.72, 21.72, 21.72, 21.72, 21.44], "task clock (msec)": 55.74, "page faults": 3266, "cycles": 51085608, "instructions": 88049969, "branch mispredictions": 332704, "branches": 20219525, "ITLB accesses": 27856157, "ITLB misses": 6496, "DTLB misses": 17046, "DTLB accesses": 37522360, "L1I cache accesses": 31475230, "L1I cache misses": 277921, "L1D cache misses": 462005, "L1D cache accesses": 33126938, "LL cache misses": 558923, "LL cache accesses": 571263, "L2D TLB accesses": 190627, "L2D TLB misses": 24234, "L2D cache misses": 314815, "L2D cache accesses": 1760110, "instructions per cycle": 1.7235768046452535, "branch miss rate": 0.01645459030318467, "ITLB miss rate": 0.00023319799640704206, "DTLB miss rate": 0.0004542891225392006, "L2D TLB miss rate": 0.12712784652751186, "L1I cache miss rate": 0.008829832220447635, "L1D cache miss rate": 0.013946504805243395, "L2D cache miss rate": 0.17886098027964162, "LL cache miss rate": 0.978398741035215}
--- a/pytorch/output_HPC/altra_10_30_ASIC_680k_1000.output
+++ b/pytorch/output_HPC/altra_10_30_ASIC_680k_1000.output
@ -0,0 +1,173 @@
+srun: Job time limit was unset; set to partition default of 60 minutes
+srun: ################################################################################
+srun: # Please note that the oasis compute nodes have aarch64 architecture CPUs.     #
+srun: # All submission nodes and all other compute nodes have x86_64 architecture    #
+srun: # CPUs. Programs, environments, or other software that was built on x86_64     #
+srun: # nodes may need to be rebuilt to properly execute on these nodes.             #
+srun: ################################################################################
+srun: job 3395287 queued and waiting for resources
+srun: job 3395287 has been allocated resources
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([      0,       3,       4,  ..., 3871767,
+                            3871770, 3871773]),
+       col_indices=tensor([     0,  11698,  11699,  ..., 169456, 645874,
+                           682861]),
+       values=tensor([ 3.8333e-04, -3.3333e-04, -5.0000e-05,  ...,
+                       0.0000e+00,  0.0000e+00,  7.9289e-02]),
+       size=(682862, 682862), nnz=3871773, layout=torch.sparse_csr)
+tensor([0.9283, 0.0381, 0.0668,  ..., 0.8379, 0.4193, 0.2544])
+Matrix: ASIC_680k
+Shape: torch.Size([682862, 682862])
+NNZ: 3871773
+Density: 8.303171256088674e-06
+Time: 29.317893266677856 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/ASIC_680k.mtx 1000':
+
+             55.74 msec task-clock:u                     #    0.002 CPUs utilized             
+                 0      context-switches:u               #    0.000 /sec                      
+                 0      cpu-migrations:u                 #    0.000 /sec                      
+             3,266      page-faults:u                    #   58.589 K/sec                     
+        51,085,608      cycles:u                         #    0.916 GHz                         (47.05%)
+        88,049,969      instructions:u                   #    1.72  insn per cycle              (92.14%)
+   <not supported>      branches:u                                                            
+           360,079      branch-misses:u                                                       
+        31,381,953      L1-dcache-loads:u                #  562.963 M/sec                     
+           471,072      L1-dcache-load-misses:u          #    1.50% of all L1-dcache accesses 
+   <not supported>      LLC-loads:u                                                           
+   <not supported>      LLC-load-misses:u                                                     
+        29,944,756      L1-icache-loads:u                #  537.181 M/sec                     
+           283,203      L1-icache-load-misses:u          #    0.95% of all L1-icache accesses 
+        20,217,238      dTLB-loads:u                     #  362.679 M/sec                       (11.38%)
+     <not counted>      dTLB-load-misses:u                                                      (0.00%)
+     <not counted>      iTLB-loads:u                                                            (0.00%)
+     <not counted>      iTLB-load-misses:u                                                      (0.00%)
+
+      33.488240295 seconds time elapsed
+
+     222.678572000 seconds user
+    2205.889153000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([      0,       3,       4,  ..., 3871767,
+                            3871770, 3871773]),
+       col_indices=tensor([     0,  11698,  11699,  ..., 169456, 645874,
+                           682861]),
+       values=tensor([ 3.8333e-04, -3.3333e-04, -5.0000e-05,  ...,
+                       0.0000e+00,  0.0000e+00,  7.9289e-02]),
+       size=(682862, 682862), nnz=3871773, layout=torch.sparse_csr)
+tensor([0.3482, 0.5546, 0.8398,  ..., 0.6137, 0.0654, 0.9075])
+Matrix: ASIC_680k
+Shape: torch.Size([682862, 682862])
+NNZ: 3871773
+Density: 8.303171256088674e-06
+Time: 38.4066903591156 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/ASIC_680k.mtx 1000':
+
+           332,704      BR_MIS_PRED_RETIRED:u            #      0.0 per branch  branch_misprediction_ratio
+        20,219,525      BR_RETIRED:u                                                          
+
+      42.582064532 seconds time elapsed
+
+     238.965431000 seconds user
+    2914.615754000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([      0,       3,       4,  ..., 3871767,
+                            3871770, 3871773]),
+       col_indices=tensor([     0,  11698,  11699,  ..., 169456, 645874,
+                           682861]),
+       values=tensor([ 3.8333e-04, -3.3333e-04, -5.0000e-05,  ...,
+                       0.0000e+00,  0.0000e+00,  7.9289e-02]),
+       size=(682862, 682862), nnz=3871773, layout=torch.sparse_csr)
+tensor([0.2581, 0.2884, 0.9465,  ..., 0.4833, 0.3421, 0.4862])
+Matrix: ASIC_680k
+Shape: torch.Size([682862, 682862])
+NNZ: 3871773
+Density: 8.303171256088674e-06
+Time: 34.74818539619446 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/ASIC_680k.mtx 1000':
+
+        27,856,157      L1I_TLB:u                        #      0.0 per TLB access  itlb_walk_ratio
+             6,496      ITLB_WALK:u                                                           
+            17,046      DTLB_WALK:u                      #      0.0 per TLB access  dtlb_walk_ratio
+        37,522,360      L1D_TLB:u                                                             
+
+      39.019872270 seconds time elapsed
+
+     239.678206000 seconds user
+    2622.552757000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([      0,       3,       4,  ..., 3871767,
+                            3871770, 3871773]),
+       col_indices=tensor([     0,  11698,  11699,  ..., 169456, 645874,
+                           682861]),
+       values=tensor([ 3.8333e-04, -3.3333e-04, -5.0000e-05,  ...,
+                       0.0000e+00,  0.0000e+00,  7.9289e-02]),
+       size=(682862, 682862), nnz=3871773, layout=torch.sparse_csr)
+tensor([0.8603, 0.0423, 0.3724,  ..., 0.4873, 0.6469, 0.9634])
+Matrix: ASIC_680k
+Shape: torch.Size([682862, 682862])
+NNZ: 3871773
+Density: 8.303171256088674e-06
+Time: 33.05097770690918 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/ASIC_680k.mtx 1000':
+
+        31,475,230      L1I_CACHE:u                      #      0.0 per cache access  l1i_cache_miss_ratio
+           277,921      L1I_CACHE_REFILL:u                                                    
+           462,005      L1D_CACHE_REFILL:u               #      0.0 per cache access  l1d_cache_miss_ratio
+        33,126,938      L1D_CACHE:u                                                           
+
+      37.399374202 seconds time elapsed
+
+     239.238852000 seconds user
+    2492.385966000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([      0,       3,       4,  ..., 3871767,
+                            3871770, 3871773]),
+       col_indices=tensor([     0,  11698,  11699,  ..., 169456, 645874,
+                           682861]),
+       values=tensor([ 3.8333e-04, -3.3333e-04, -5.0000e-05,  ...,
+                       0.0000e+00,  0.0000e+00,  7.9289e-02]),
+       size=(682862, 682862), nnz=3871773, layout=torch.sparse_csr)
+tensor([0.1993, 0.2167, 0.6338,  ..., 0.0614, 0.0230, 0.4851])
+Matrix: ASIC_680k
+Shape: torch.Size([682862, 682862])
+NNZ: 3871773
+Density: 8.303171256088674e-06
+Time: 32.37103772163391 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/ASIC_680k.mtx 1000':
+
+           558,923      LL_CACHE_MISS_RD:u               #      1.0 per cache access  ll_cache_read_miss_ratio
+           571,263      LL_CACHE_RD:u                                                         
+           190,627      L2D_TLB:u                        #      0.1 per TLB access  l2_tlb_miss_ratio
+            24,234      L2D_TLB_REFILL:u                                                      
+           314,815      L2D_CACHE_REFILL:u               #      0.2 per cache access  l2_cache_miss_ratio
+         1,760,110      L2D_CACHE:u                                                           
+
+      36.644016288 seconds time elapsed
+
+     233.933818000 seconds user
+    2439.284669000 seconds sys
+
+
+
--- a/pytorch/output_HPC/altra_10_30_de2010_1000.json
+++ b/pytorch/output_HPC/altra_10_30_de2010_1000.json
@ -0,0 +1 @@
+{"architecture": "altra", "iterations": 1000, "baseline_time_s": 10, "baseline_delay_s": 30, "power_before": [22.08, 21.88, 21.88, 21.88, 21.56, 21.64, 21.84, 21.88, 21.72, 21.92], "matrix": "de2010", "shape": [24115, 24115], "nnz": 116056, "% density": 0.0001995689928120616, "time_s": 2.7533018589019775, "power": [29.48, 30.24, 27.96, 28.4, 26.84, 30.6, 30.92], "power_after": [20.84, 21.24, 21.2, 21.24, 21.28, 20.88, 20.68, 20.56, 20.52, 20.56], "task clock (msec)": 61.38, "page faults": 3315, "cycles": 65013274, "instructions": 87442627, "branch mispredictions": 328392, "branches": 19496396, "ITLB accesses": 28311619, "ITLB misses": 6963, "DTLB misses": 17888, "DTLB accesses": 38223408, "L1I cache accesses": 30063404, "L1I cache misses": 272797, "L1D cache misses": 468341, "L1D cache accesses": 31519623, "LL cache misses": 538689, "LL cache accesses": 552789, "L2D TLB accesses": 192995, "L2D TLB misses": 23339, "L2D cache misses": 300578, "L2D cache accesses": 1764035, "instructions per cycle": 1.344996515634638, "branch miss rate": 0.016843728451145536, "ITLB miss rate": 0.0002459414277933028, "DTLB miss rate": 0.00046798548156668814, "L2D TLB miss rate": 0.12093059405684085, "L1I cache miss rate": 0.009074055619250568, "L1D cache miss rate": 0.01485871198395996, "L2D cache miss rate": 0.17039231081015965, "LL cache miss rate": 0.9744929801425137}
--- a/pytorch/output_HPC/altra_10_30_de2010_1000.output
+++ b/pytorch/output_HPC/altra_10_30_de2010_1000.output
@ -5,8 +5,8 @@ srun: # All submission nodes and all other compute nodes have x86_64 architectur
 srun: # CPUs. Programs, environments, or other software that was built on x86_64     #
 srun: # nodes may need to be rebuilt to properly execute on these nodes.             #
 srun: ################################################################################
-srun: job 3394139 queued and waiting for resources
-srun: job 3394139 has been allocated resources
+srun: job 3395278 queued and waiting for resources
+srun: job 3395278 has been allocated resources
 /nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
  ).to_sparse_csr().type(torch.float)
 tensor(crow_indices=tensor([     0,     13,     21,  ..., 116047, 116051,
@ -15,37 +15,38 @@ tensor(crow_indices=tensor([     0,     13,     21,  ..., 116047, 116051,
       values=tensor([ 14900.,  33341.,  20255.,  ..., 164227.,  52413.,
                       16949.]), size=(24115, 24115), nnz=116056,
       layout=torch.sparse_csr)
-tensor([0.4207, 0.3943, 0.6543,  ..., 0.2191, 0.5415, 0.1575])
+tensor([0.3547, 0.6554, 0.2142,  ..., 0.8854, 0.1041, 0.2243])
+Matrix: de2010
 Shape: torch.Size([24115, 24115])
 NNZ: 116056
 Density: 0.0001995689928120616
-Time: 0.36042284965515137 seconds
+Time: 2.74495267868042 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/de2010.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/de2010.mtx 1000':

-             59.88 msec task-clock:u                     #    0.016 CPUs utilized             
+             61.38 msec task-clock:u                     #    0.010 CPUs utilized             
                 0      context-switches:u               #    0.000 /sec                      
                 0      cpu-migrations:u                 #    0.000 /sec                      
-             3,313      page-faults:u                    #   55.328 K/sec                     
-        58,169,777      cycles:u                         #    0.971 GHz                         (61.49%)
-        57,993,431      instructions:u                   #    1.00  insn per cycle              (81.67%)
+             3,315      page-faults:u                    #   54.008 K/sec                     
+        65,013,274      cycles:u                         #    1.059 GHz                         (90.47%)
+        87,442,627      instructions:u                   #    1.34  insn per cycle            
   <not supported>      branches:u                                                            
-           341,266      branch-misses:u                                                       
-        31,858,781      L1-dcache-loads:u                #  532.049 M/sec                     
-           467,486      L1-dcache-load-misses:u          #    1.47% of all L1-dcache accesses 
+           369,052      branch-misses:u                                                       
+        31,570,549      L1-dcache-loads:u                #  514.350 M/sec                     
+           477,402      L1-dcache-load-misses:u          #    1.51% of all L1-dcache accesses 
   <not supported>      LLC-loads:u                                                           
   <not supported>      LLC-load-misses:u                                                     
-        30,461,310      L1-icache-loads:u                #  508.711 M/sec                     
-           294,156      L1-icache-load-misses:u          #    0.97% of all L1-icache accesses 
-        43,828,130      dTLB-loads:u                     #  731.940 M/sec                       (40.26%)
-            47,836      dTLB-load-misses:u               #    0.11% of all dTLB cache accesses  (25.52%)
-                 0      iTLB-loads:u                     #    0.000 /sec                        (2.73%)
+        30,354,192      L1-icache-loads:u                #  494.533 M/sec                     
+           294,845      L1-icache-load-misses:u          #    0.97% of all L1-icache accesses 
+                 0      dTLB-loads:u                     #    0.000 /sec                        (3.92%)
+     <not counted>      dTLB-load-misses:u                                                      (0.00%)
+     <not counted>      iTLB-loads:u                                                            (0.00%)
     <not counted>      iTLB-load-misses:u                                                      (0.00%)

-       3.824054028 seconds time elapsed
+       6.232986287 seconds time elapsed

-      15.099361000 seconds user
-      28.830417000 seconds sys
+      17.354331000 seconds user
+      29.036034000 seconds sys



@ -57,21 +58,22 @@ tensor(crow_indices=tensor([     0,     13,     21,  ..., 116047, 116051,
       values=tensor([ 14900.,  33341.,  20255.,  ..., 164227.,  52413.,
                       16949.]), size=(24115, 24115), nnz=116056,
       layout=torch.sparse_csr)
-tensor([0.0456, 0.2095, 0.0276,  ..., 0.4209, 0.6824, 0.5475])
+tensor([0.3177, 0.9122, 0.6465,  ..., 0.5489, 0.2254, 0.7965])
+Matrix: de2010
 Shape: torch.Size([24115, 24115])
 NNZ: 116056
 Density: 0.0001995689928120616
-Time: 0.3598823547363281 seconds
+Time: 2.7603256702423096 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/de2010.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/de2010.mtx 1000':

-           330,494      BR_MIS_PRED_RETIRED:u            #      0.0 per branch  branch_misprediction_ratio
-        20,578,427      BR_RETIRED:u                                                          
+           328,392      BR_MIS_PRED_RETIRED:u            #      0.0 per branch  branch_misprediction_ratio
+        19,496,396      BR_RETIRED:u                                                          

-       3.781234836 seconds time elapsed
+       6.149991615 seconds time elapsed

-      14.965545000 seconds user
-      29.444131000 seconds sys
+      17.630426000 seconds user
+      30.586756000 seconds sys



@ -83,23 +85,24 @@ tensor(crow_indices=tensor([     0,     13,     21,  ..., 116047, 116051,
       values=tensor([ 14900.,  33341.,  20255.,  ..., 164227.,  52413.,
                       16949.]), size=(24115, 24115), nnz=116056,
       layout=torch.sparse_csr)
-tensor([0.9882, 0.5477, 0.6307,  ..., 0.1179, 0.6903, 0.1235])
+tensor([0.7815, 0.6240, 0.3715,  ..., 0.5116, 0.5969, 0.4241])
+Matrix: de2010
 Shape: torch.Size([24115, 24115])
 NNZ: 116056
 Density: 0.0001995689928120616
-Time: 0.29088521003723145 seconds
+Time: 2.7978765964508057 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/de2010.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/de2010.mtx 1000':

-        27,982,097      L1I_TLB:u                        #      0.0 per TLB access  itlb_walk_ratio
-             6,614      ITLB_WALK:u                                                           
-            17,270      DTLB_WALK:u                      #      0.0 per TLB access  dtlb_walk_ratio
-        37,728,899      L1D_TLB:u                                                             
+        28,311,619      L1I_TLB:u                        #      0.0 per TLB access  itlb_walk_ratio
+             6,963      ITLB_WALK:u                                                           
+            17,888      DTLB_WALK:u                      #      0.0 per TLB access  dtlb_walk_ratio
+        38,223,408      L1D_TLB:u                                                             

-       3.576632300 seconds time elapsed
+       6.151843492 seconds time elapsed

-      14.864601000 seconds user
-      29.274547000 seconds sys
+      17.202045000 seconds user
+      28.014218000 seconds sys



@ -111,23 +114,24 @@ tensor(crow_indices=tensor([     0,     13,     21,  ..., 116047, 116051,
       values=tensor([ 14900.,  33341.,  20255.,  ..., 164227.,  52413.,
                       16949.]), size=(24115, 24115), nnz=116056,
       layout=torch.sparse_csr)
-tensor([0.3952, 0.0475, 0.1125,  ..., 0.3481, 0.1290, 0.3495])
+tensor([0.9638, 0.0929, 0.0479,  ..., 0.1500, 0.3117, 0.9664])
+Matrix: de2010
 Shape: torch.Size([24115, 24115])
 NNZ: 116056
 Density: 0.0001995689928120616
-Time: 0.30365920066833496 seconds
+Time: 2.684640884399414 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/de2010.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/de2010.mtx 1000':

-        29,754,926      L1I_CACHE:u                      #      0.0 per cache access  l1i_cache_miss_ratio
-           278,786      L1I_CACHE_REFILL:u                                                    
-           454,742      L1D_CACHE_REFILL:u               #      0.0 per cache access  l1d_cache_miss_ratio
-        31,173,246      L1D_CACHE:u                                                           
+        30,063,404      L1I_CACHE:u                      #      0.0 per cache access  l1i_cache_miss_ratio
+           272,797      L1I_CACHE_REFILL:u                                                    
+           468,341      L1D_CACHE_REFILL:u               #      0.0 per cache access  l1d_cache_miss_ratio
+        31,519,623      L1D_CACHE:u                                                           

-       3.730995381 seconds time elapsed
+       5.874324363 seconds time elapsed

-      15.213930000 seconds user
-      30.995070000 seconds sys
+      17.629166000 seconds user
+      29.998701000 seconds sys



@ -139,25 +143,26 @@ tensor(crow_indices=tensor([     0,     13,     21,  ..., 116047, 116051,
       values=tensor([ 14900.,  33341.,  20255.,  ..., 164227.,  52413.,
                       16949.]), size=(24115, 24115), nnz=116056,
       layout=torch.sparse_csr)
-tensor([0.7266, 0.7537, 0.9729,  ..., 0.3349, 0.3523, 0.6532])
+tensor([0.3936, 0.9167, 0.4396,  ..., 0.1628, 0.6361, 0.1875])
+Matrix: de2010
 Shape: torch.Size([24115, 24115])
 NNZ: 116056
 Density: 0.0001995689928120616
-Time: 0.2798902988433838 seconds
+Time: 2.747934103012085 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/de2010.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/de2010.mtx 1000':

-           543,243      LL_CACHE_MISS_RD:u               #      1.0 per cache access  ll_cache_read_miss_ratio
-           560,716      LL_CACHE_RD:u                                                         
-           162,281      L2D_TLB:u                        #      0.1 per TLB access  l2_tlb_miss_ratio
-            19,847      L2D_TLB_REFILL:u                                                      
-           300,577      L2D_CACHE_REFILL:u               #      0.2 per cache access  l2_cache_miss_ratio
-         1,696,278      L2D_CACHE:u                                                           
+           538,689      LL_CACHE_MISS_RD:u               #      1.0 per cache access  ll_cache_read_miss_ratio
+           552,789      LL_CACHE_RD:u                                                         
+           192,995      L2D_TLB:u                        #      0.1 per TLB access  l2_tlb_miss_ratio
+            23,339      L2D_TLB_REFILL:u                                                      
+           300,578      L2D_CACHE_REFILL:u               #      0.2 per cache access  l2_cache_miss_ratio
+         1,764,035      L2D_CACHE:u                                                           

-       3.819959836 seconds time elapsed
+       6.102012809 seconds time elapsed

-      15.346035000 seconds user
-      29.199873000 seconds sys
+      18.001082000 seconds user
+      27.986033000 seconds sys



--- a/pytorch/output_HPC/altra_10_30_fl2010_1000.json
+++ b/pytorch/output_HPC/altra_10_30_fl2010_1000.json
@ -0,0 +1 @@
+{"architecture": "altra", "iterations": 1000, "baseline_time_s": 10, "baseline_delay_s": 30, "power_before": [20.72, 20.8, 20.96, 21.08, 21.4, 21.48, 21.48, 21.36, 21.08, 21.04], "matrix": "fl2010", "shape": [484481, 484481], "nnz": 2346294, "% density": 9.99606174861054e-06, "time_s": 14.43001127243042, "power": [93.04, 93.04, 89.16, 77.68, 62.92, 55.12, 53.84, 64.72, 77.04, 89.56, 94.4, 94.76, 93.52, 93.52, 96.04, 97.12, 96.44, 93.88, 93.72], "power_after": [21.08, 21.28, 21.28, 21.36, 21.08, 21.24, 21.08, 20.8, 21.04, 20.88], "task clock (msec)": 61.6, "page faults": 3276, "cycles": 41408849, "instructions": 49118917, "branch mispredictions": 331330, "branches": 19331189, "ITLB accesses": 27367982, "ITLB misses": 6160, "DTLB misses": 17157, "DTLB accesses": 36828216, "L1I cache accesses": 30147304, "L1I cache misses": 280082, "L1D cache misses": 454022, "L1D cache accesses": 31595140, "LL cache misses": 536056, "LL cache accesses": 550006, "L2D TLB accesses": 185998, "L2D TLB misses": 23735, "L2D cache misses": 296648, "L2D cache accesses": 1723525, "instructions per cycle": 1.1861937287848787, "branch miss rate": 0.017139659645353425, "ITLB miss rate": 0.00022508053388810325, "DTLB miss rate": 0.00046586562867992305, "L2D TLB miss rate": 0.12760889902041958, "L1I cache miss rate": 0.009290449321770198, "L1D cache miss rate": 0.014369994878959232, "L2D cache miss rate": 0.172117027603313, "LL cache miss rate": 0.97463664032756}
--- a/pytorch/output_HPC/altra_10_30_fl2010_1000.output
+++ b/pytorch/output_HPC/altra_10_30_fl2010_1000.output
@ -0,0 +1,169 @@
+srun: Job time limit was unset; set to partition default of 60 minutes
+srun: ################################################################################
+srun: # Please note that the oasis compute nodes have aarch64 architecture CPUs.     #
+srun: # All submission nodes and all other compute nodes have x86_64 architecture    #
+srun: # CPUs. Programs, environments, or other software that was built on x86_64     #
+srun: # nodes may need to be rebuilt to properly execute on these nodes.             #
+srun: ################################################################################
+srun: job 3395283 queued and waiting for resources
+srun: job 3395283 has been allocated resources
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([      0,       2,       5,  ..., 2346288,
+                            2346292, 2346294]),
+       col_indices=tensor([  1513,   5311,    947,  ..., 484460, 482463,
+                           484022]),
+       values=tensor([28364., 12497., 11567.,  ...,  8532., 22622., 35914.]),
+       size=(484481, 484481), nnz=2346294, layout=torch.sparse_csr)
+tensor([2.0367e-04, 1.7661e-01, 2.1772e-01,  ..., 1.8646e-01, 2.2210e-01,
+        4.2364e-02])
+Matrix: fl2010
+Shape: torch.Size([484481, 484481])
+NNZ: 2346294
+Density: 9.99606174861054e-06
+Time: 16.31556534767151 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/fl2010.mtx 1000':
+
+             61.60 msec task-clock:u                     #    0.003 CPUs utilized             
+                 0      context-switches:u               #    0.000 /sec                      
+                 0      cpu-migrations:u                 #    0.000 /sec                      
+             3,276      page-faults:u                    #   53.185 K/sec                     
+        41,408,849      cycles:u                         #    0.672 GHz                         (41.57%)
+        49,118,917      instructions:u                   #    1.19  insn per cycle              (67.74%)
+   <not supported>      branches:u                                                            
+           344,653      branch-misses:u                                                         (91.69%)
+        31,501,274      L1-dcache-loads:u                #  511.418 M/sec                     
+           477,740      L1-dcache-load-misses:u          #    1.52% of all L1-dcache accesses 
+   <not supported>      LLC-loads:u                                                           
+   <not supported>      LLC-load-misses:u                                                     
+        30,099,667      L1-icache-loads:u                #  488.663 M/sec                     
+           285,734      L1-icache-load-misses:u          #    0.95% of all L1-icache accesses 
+        41,879,387      dTLB-loads:u                     #  679.904 M/sec                       (54.00%)
+            99,044      dTLB-load-misses:u               #    0.24% of all dTLB cache accesses  (13.61%)
+     <not counted>      iTLB-loads:u                                                            (0.00%)
+     <not counted>      iTLB-load-misses:u                                                      (0.00%)
+
+      20.288512544 seconds time elapsed
+
+     134.447078000 seconds user
+    1247.121046000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([      0,       2,       5,  ..., 2346288,
+                            2346292, 2346294]),
+       col_indices=tensor([  1513,   5311,    947,  ..., 484460, 482463,
+                           484022]),
+       values=tensor([28364., 12497., 11567.,  ...,  8532., 22622., 35914.]),
+       size=(484481, 484481), nnz=2346294, layout=torch.sparse_csr)
+tensor([0.9700, 0.5813, 0.6566,  ..., 0.4126, 0.7652, 0.9833])
+Matrix: fl2010
+Shape: torch.Size([484481, 484481])
+NNZ: 2346294
+Density: 9.99606174861054e-06
+Time: 16.561575651168823 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/fl2010.mtx 1000':
+
+           331,330      BR_MIS_PRED_RETIRED:u            #      0.0 per branch  branch_misprediction_ratio
+        19,331,189      BR_RETIRED:u                                                          
+
+      20.603578845 seconds time elapsed
+
+     136.555709000 seconds user
+    1264.382740000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([      0,       2,       5,  ..., 2346288,
+                            2346292, 2346294]),
+       col_indices=tensor([  1513,   5311,    947,  ..., 484460, 482463,
+                           484022]),
+       values=tensor([28364., 12497., 11567.,  ...,  8532., 22622., 35914.]),
+       size=(484481, 484481), nnz=2346294, layout=torch.sparse_csr)
+tensor([0.1770, 0.8270, 0.4236,  ..., 0.0091, 0.2300, 0.5084])
+Matrix: fl2010
+Shape: torch.Size([484481, 484481])
+NNZ: 2346294
+Density: 9.99606174861054e-06
+Time: 17.374610424041748 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/fl2010.mtx 1000':
+
+        27,367,982      L1I_TLB:u                        #      0.0 per TLB access  itlb_walk_ratio
+             6,160      ITLB_WALK:u                                                           
+            17,157      DTLB_WALK:u                      #      0.0 per TLB access  dtlb_walk_ratio
+        36,828,216      L1D_TLB:u                                                             
+
+      21.377378255 seconds time elapsed
+
+     140.848520000 seconds user
+    1326.124469000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([      0,       2,       5,  ..., 2346288,
+                            2346292, 2346294]),
+       col_indices=tensor([  1513,   5311,    947,  ..., 484460, 482463,
+                           484022]),
+       values=tensor([28364., 12497., 11567.,  ...,  8532., 22622., 35914.]),
+       size=(484481, 484481), nnz=2346294, layout=torch.sparse_csr)
+tensor([0.1268, 0.8786, 0.9762,  ..., 0.0649, 0.4474, 0.9707])
+Matrix: fl2010
+Shape: torch.Size([484481, 484481])
+NNZ: 2346294
+Density: 9.99606174861054e-06
+Time: 16.753613471984863 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/fl2010.mtx 1000':
+
+        30,147,304      L1I_CACHE:u                      #      0.0 per cache access  l1i_cache_miss_ratio
+           280,082      L1I_CACHE_REFILL:u                                                    
+           454,022      L1D_CACHE_REFILL:u               #      0.0 per cache access  l1d_cache_miss_ratio
+        31,595,140      L1D_CACHE:u                                                           
+
+      20.706929400 seconds time elapsed
+
+     139.881127000 seconds user
+    1278.527504000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([      0,       2,       5,  ..., 2346288,
+                            2346292, 2346294]),
+       col_indices=tensor([  1513,   5311,    947,  ..., 484460, 482463,
+                           484022]),
+       values=tensor([28364., 12497., 11567.,  ...,  8532., 22622., 35914.]),
+       size=(484481, 484481), nnz=2346294, layout=torch.sparse_csr)
+tensor([0.1394, 0.8842, 0.4362,  ..., 0.8265, 0.1643, 0.9034])
+Matrix: fl2010
+Shape: torch.Size([484481, 484481])
+NNZ: 2346294
+Density: 9.99606174861054e-06
+Time: 14.484151124954224 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/fl2010.mtx 1000':
+
+           536,056      LL_CACHE_MISS_RD:u               #      1.0 per cache access  ll_cache_read_miss_ratio
+           550,006      LL_CACHE_RD:u                                                         
+           185,998      L2D_TLB:u                        #      0.1 per TLB access  l2_tlb_miss_ratio
+            23,735      L2D_TLB_REFILL:u                                                      
+           296,648      L2D_CACHE_REFILL:u               #      0.2 per cache access  l2_cache_miss_ratio
+         1,723,525      L2D_CACHE:u                                                           
+
+      18.443039315 seconds time elapsed
+
+     135.498625000 seconds user
+    1101.745145000 seconds sys
+
+
+
--- a/pytorch/output_HPC/altra_10_30_ga2010_1000.json
+++ b/pytorch/output_HPC/altra_10_30_ga2010_1000.json
@ -0,0 +1 @@
+{"architecture": "altra", "iterations": 1000, "baseline_time_s": 10, "baseline_delay_s": 30, "power_before": [51.04, 38.64, 22.84, 22.24, 21.88, 21.88, 21.6, 21.4, 21.24, 21.28], "matrix": "ga2010", "shape": [291086, 291086], "nnz": 1418056, "% density": 1.6735964475229304e-05, "time_s": 15.249999523162842, "power": [88.88, 89.52, 78.6, 64.88, 52.64, 52.64, 54.76, 60.16, 71.44, 86.84, 90.72, 89.6, 90.56, 90.36, 91.68, 91.84, 93.4, 93.4, 92.72], "power_after": [21.68, 21.4, 21.28, 21.04, 21.04, 20.96, 20.92, 20.76, 20.8, 20.96], "task clock (msec)": 72.45, "page faults": 3289, "cycles": 24836161, "instructions": 74134706, "branch mispredictions": 325643, "branches": 19697746, "ITLB accesses": 27767290, "ITLB misses": 5832, "DTLB misses": 18134, "DTLB accesses": 37063060, "L1I cache accesses": 32135376, "L1I cache misses": 302429, "L1D cache misses": 484427, "L1D cache accesses": 33639686, "LL cache misses": 548380, "LL cache accesses": 561312, "L2D TLB accesses": 186006, "L2D TLB misses": 25022, "L2D cache misses": 304539, "L2D cache accesses": 1750107, "instructions per cycle": 2.9849502908279586, "branch miss rate": 0.01653199305138771, "ITLB miss rate": 0.00021003129941740803, "DTLB miss rate": 0.0004892742261432272, "L2D TLB miss rate": 0.13452254228358226, "L1I cache miss rate": 0.009411092622659838, "L1D cache miss rate": 0.014400461407398393, "L2D cache miss rate": 0.17401164614506429, "LL cache miss rate": 0.976961119662505}
--- a/pytorch/output_HPC/altra_10_30_ga2010_1000.output
+++ b/pytorch/output_HPC/altra_10_30_ga2010_1000.output
@ -0,0 +1,168 @@
+srun: Job time limit was unset; set to partition default of 60 minutes
+srun: ################################################################################
+srun: # Please note that the oasis compute nodes have aarch64 architecture CPUs.     #
+srun: # All submission nodes and all other compute nodes have x86_64 architecture    #
+srun: # CPUs. Programs, environments, or other software that was built on x86_64     #
+srun: # nodes may need to be rebuilt to properly execute on these nodes.             #
+srun: ################################################################################
+srun: job 3395281 queued and waiting for resources
+srun: job 3395281 has been allocated resources
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([      0,       3,      10,  ..., 1418047,
+                            1418054, 1418056]),
+       col_indices=tensor([  1566,   1871,   1997,  ..., 291064, 289820,
+                           290176]),
+       values=tensor([18760., 17851., 18847.,  ..., 65219., 56729., 77629.]),
+       size=(291086, 291086), nnz=1418056, layout=torch.sparse_csr)
+tensor([0.8043, 0.7164, 0.5687,  ..., 0.1275, 0.5142, 0.8456])
+Matrix: ga2010
+Shape: torch.Size([291086, 291086])
+NNZ: 1418056
+Density: 1.6735964475229304e-05
+Time: 13.566045045852661 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/ga2010.mtx 1000':
+
+             72.45 msec task-clock:u                     #    0.004 CPUs utilized             
+                 0      context-switches:u               #    0.000 /sec                      
+                 0      cpu-migrations:u                 #    0.000 /sec                      
+             3,289      page-faults:u                    #   45.396 K/sec                     
+        24,836,161      cycles:u                         #    0.343 GHz                         (23.15%)
+        74,134,706      instructions:u                   #    2.98  insn per cycle              (85.49%)
+   <not supported>      branches:u                                                            
+           381,828      branch-misses:u                                                       
+        33,748,654      L1-dcache-loads:u                #  465.814 M/sec                     
+           497,166      L1-dcache-load-misses:u          #    1.47% of all L1-dcache accesses 
+   <not supported>      LLC-loads:u                                                           
+   <not supported>      LLC-load-misses:u                                                     
+        32,271,900      L1-icache-loads:u                #  445.431 M/sec                     
+           311,814      L1-icache-load-misses:u          #    0.97% of all L1-icache accesses 
+        43,431,516      dTLB-loads:u                     #  599.461 M/sec                       (27.81%)
+            33,416      dTLB-load-misses:u               #    0.08% of all dTLB cache accesses  (4.55%)
+     <not counted>      iTLB-loads:u                                                            (0.00%)
+     <not counted>      iTLB-load-misses:u                                                      (0.00%)
+
+      17.276157893 seconds time elapsed
+
+     100.320029000 seconds user
+    1057.703228000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([      0,       3,      10,  ..., 1418047,
+                            1418054, 1418056]),
+       col_indices=tensor([  1566,   1871,   1997,  ..., 291064, 289820,
+                           290176]),
+       values=tensor([18760., 17851., 18847.,  ..., 65219., 56729., 77629.]),
+       size=(291086, 291086), nnz=1418056, layout=torch.sparse_csr)
+tensor([0.6290, 0.2236, 0.0669,  ..., 0.6531, 0.4280, 0.4384])
+Matrix: ga2010
+Shape: torch.Size([291086, 291086])
+NNZ: 1418056
+Density: 1.6735964475229304e-05
+Time: 17.094524145126343 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/ga2010.mtx 1000':
+
+           325,643      BR_MIS_PRED_RETIRED:u            #      0.0 per branch  branch_misprediction_ratio
+        19,697,746      BR_RETIRED:u                                                          
+
+      20.849795214 seconds time elapsed
+
+     115.280665000 seconds user
+    1318.654953000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([      0,       3,      10,  ..., 1418047,
+                            1418054, 1418056]),
+       col_indices=tensor([  1566,   1871,   1997,  ..., 291064, 289820,
+                           290176]),
+       values=tensor([18760., 17851., 18847.,  ..., 65219., 56729., 77629.]),
+       size=(291086, 291086), nnz=1418056, layout=torch.sparse_csr)
+tensor([0.1008, 0.2309, 0.3749,  ..., 0.1568, 0.8852, 0.8182])
+Matrix: ga2010
+Shape: torch.Size([291086, 291086])
+NNZ: 1418056
+Density: 1.6735964475229304e-05
+Time: 15.106332063674927 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/ga2010.mtx 1000':
+
+        27,767,290      L1I_TLB:u                        #      0.0 per TLB access  itlb_walk_ratio
+             5,832      ITLB_WALK:u                                                           
+            18,134      DTLB_WALK:u                      #      0.0 per TLB access  dtlb_walk_ratio
+        37,063,060      L1D_TLB:u                                                             
+
+      18.753509375 seconds time elapsed
+
+     112.958759000 seconds user
+    1167.457916000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([      0,       3,      10,  ..., 1418047,
+                            1418054, 1418056]),
+       col_indices=tensor([  1566,   1871,   1997,  ..., 291064, 289820,
+                           290176]),
+       values=tensor([18760., 17851., 18847.,  ..., 65219., 56729., 77629.]),
+       size=(291086, 291086), nnz=1418056, layout=torch.sparse_csr)
+tensor([0.8347, 0.6624, 0.6196,  ..., 0.2250, 0.0157, 0.1843])
+Matrix: ga2010
+Shape: torch.Size([291086, 291086])
+NNZ: 1418056
+Density: 1.6735964475229304e-05
+Time: 13.73094367980957 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/ga2010.mtx 1000':
+
+        32,135,376      L1I_CACHE:u                      #      0.0 per cache access  l1i_cache_miss_ratio
+           302,429      L1I_CACHE_REFILL:u                                                    
+           484,427      L1D_CACHE_REFILL:u               #      0.0 per cache access  l1d_cache_miss_ratio
+        33,639,686      L1D_CACHE:u                                                           
+
+      17.400567824 seconds time elapsed
+
+     110.027662000 seconds user
+    1054.271122000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([      0,       3,      10,  ..., 1418047,
+                            1418054, 1418056]),
+       col_indices=tensor([  1566,   1871,   1997,  ..., 291064, 289820,
+                           290176]),
+       values=tensor([18760., 17851., 18847.,  ..., 65219., 56729., 77629.]),
+       size=(291086, 291086), nnz=1418056, layout=torch.sparse_csr)
+tensor([0.8369, 0.3399, 0.1689,  ..., 0.2081, 0.0714, 0.7388])
+Matrix: ga2010
+Shape: torch.Size([291086, 291086])
+NNZ: 1418056
+Density: 1.6735964475229304e-05
+Time: 15.809288501739502 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/ga2010.mtx 1000':
+
+           548,380      LL_CACHE_MISS_RD:u               #      1.0 per cache access  ll_cache_read_miss_ratio
+           561,312      LL_CACHE_RD:u                                                         
+           186,006      L2D_TLB:u                        #      0.1 per TLB access  l2_tlb_miss_ratio
+            25,022      L2D_TLB_REFILL:u                                                      
+           304,539      L2D_CACHE_REFILL:u               #      0.2 per cache access  l2_cache_miss_ratio
+         1,750,107      L2D_CACHE:u                                                           
+
+      19.626934574 seconds time elapsed
+
+     116.733174000 seconds user
+    1214.439657000 seconds sys
+
+
+
--- a/pytorch/output_HPC/altra_10_30_mac_econ_fwd500_1000.json
+++ b/pytorch/output_HPC/altra_10_30_mac_econ_fwd500_1000.json
@ -0,0 +1 @@
+{"architecture": "altra", "iterations": 1000, "baseline_time_s": 10, "baseline_delay_s": 30, "power_before": [22.04, 21.32, 21.32, 21.32, 21.12, 21.12, 21.0, 20.68, 20.72, 20.56], "matrix": "mac_econ_fwd500", "shape": [206500, 206500], "nnz": 1273389, "% density": 2.9862143765866013e-05, "time_s": 15.046087741851807, "power": [91.88, 91.12, 83.92, 72.88, 57.76, 51.24, 53.12, 62.84, 78.32, 91.64, 95.8, 95.8, 94.08, 92.48, 91.6, 89.88, 87.36, 87.84, 87.32], "power_after": [20.92, 21.04, 21.12, 20.92, 20.92, 20.88, 20.88, 20.92, 21.04, 20.96], "task clock (msec)": 62.46, "page faults": 3243, "cycles": 57150420, "instructions": 94155455, "branch mispredictions": 320781, "branches": 19491698, "ITLB accesses": 27433101, "ITLB misses": 7382, "DTLB misses": 19213, "DTLB accesses": 37123052, "L1I cache accesses": 32027284, "L1I cache misses": 290368, "L1D cache misses": 471338, "L1D cache accesses": 33366668, "LL cache misses": 571063, "LL cache accesses": 583554, "L2D TLB accesses": 196434, "L2D TLB misses": 25171, "L2D cache misses": 329198, "L2D cache accesses": 1814040, "instructions per cycle": 1.6475024155553013, "branch miss rate": 0.016457314288370363, "ITLB miss rate": 0.0002690909788142434, "DTLB miss rate": 0.0005175490420345827, "L2D TLB miss rate": 0.1281397314110592, "L1I cache miss rate": 0.009066269871650684, "L1D cache miss rate": 0.014126013421537926, "L2D cache miss rate": 0.1814722938854711, "LL cache miss rate": 0.9785949543658342}
--- a/pytorch/output_HPC/altra_10_30_mac_econ_fwd500_1000.output
+++ b/pytorch/output_HPC/altra_10_30_mac_econ_fwd500_1000.output
@ -0,0 +1,173 @@
+srun: Job time limit was unset; set to partition default of 60 minutes
+srun: ################################################################################
+srun: # Please note that the oasis compute nodes have aarch64 architecture CPUs.     #
+srun: # All submission nodes and all other compute nodes have x86_64 architecture    #
+srun: # CPUs. Programs, environments, or other software that was built on x86_64     #
+srun: # nodes may need to be rebuilt to properly execute on these nodes.             #
+srun: ################################################################################
+srun: job 3395279 queued and waiting for resources
+srun: job 3395279 has been allocated resources
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([      0,       3,       8,  ..., 1273376,
+                            1273379, 1273389]),
+       col_indices=tensor([     3,     30,     44,  ..., 206363, 206408,
+                           206459]),
+       values=tensor([-3.7877e-03, -1.5420e-01,  9.5305e-04,  ...,
+                       1.2290e-01,  2.2235e-01, -1.0000e+00]),
+       size=(206500, 206500), nnz=1273389, layout=torch.sparse_csr)
+tensor([0.5388, 0.2921, 0.7349,  ..., 0.6379, 0.9676, 0.6389])
+Matrix: mac_econ_fwd500
+Shape: torch.Size([206500, 206500])
+NNZ: 1273389
+Density: 2.9862143765866013e-05
+Time: 21.700236320495605 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/mac_econ_fwd500.mtx 1000':
+
+             62.46 msec task-clock:u                     #    0.002 CPUs utilized             
+                 0      context-switches:u               #    0.000 /sec                      
+                 0      cpu-migrations:u                 #    0.000 /sec                      
+             3,243      page-faults:u                    #   51.921 K/sec                     
+        57,150,420      cycles:u                         #    0.915 GHz                         (90.14%)
+        94,155,455      instructions:u                   #    1.65  insn per cycle            
+   <not supported>      branches:u                                                            
+           373,032      branch-misses:u                                                       
+        33,654,742      L1-dcache-loads:u                #  538.817 M/sec                     
+           479,068      L1-dcache-load-misses:u          #    1.42% of all L1-dcache accesses 
+   <not supported>      LLC-loads:u                                                           
+   <not supported>      LLC-load-misses:u                                                     
+        32,149,866      L1-icache-loads:u                #  514.724 M/sec                     
+           293,643      L1-icache-load-misses:u          #    0.91% of all L1-icache accesses 
+                 0      dTLB-loads:u                     #    0.000 /sec                        (5.14%)
+     <not counted>      dTLB-load-misses:u                                                      (0.00%)
+     <not counted>      iTLB-loads:u                                                            (0.00%)
+     <not counted>      iTLB-load-misses:u                                                      (0.00%)
+
+      25.310174677 seconds time elapsed
+
+     125.287203000 seconds user
+    1680.798909000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([      0,       3,       8,  ..., 1273376,
+                            1273379, 1273389]),
+       col_indices=tensor([     3,     30,     44,  ..., 206363, 206408,
+                           206459]),
+       values=tensor([-3.7877e-03, -1.5420e-01,  9.5305e-04,  ...,
+                       1.2290e-01,  2.2235e-01, -1.0000e+00]),
+       size=(206500, 206500), nnz=1273389, layout=torch.sparse_csr)
+tensor([0.6433, 0.3677, 0.3308,  ..., 0.5364, 0.2509, 0.4204])
+Matrix: mac_econ_fwd500
+Shape: torch.Size([206500, 206500])
+NNZ: 1273389
+Density: 2.9862143765866013e-05
+Time: 16.171404361724854 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/mac_econ_fwd500.mtx 1000':
+
+           320,781      BR_MIS_PRED_RETIRED:u            #      0.0 per branch  branch_misprediction_ratio
+        19,491,698      BR_RETIRED:u                                                          
+
+      19.988421837 seconds time elapsed
+
+     112.429117000 seconds user
+    1245.246161000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([      0,       3,       8,  ..., 1273376,
+                            1273379, 1273389]),
+       col_indices=tensor([     3,     30,     44,  ..., 206363, 206408,
+                           206459]),
+       values=tensor([-3.7877e-03, -1.5420e-01,  9.5305e-04,  ...,
+                       1.2290e-01,  2.2235e-01, -1.0000e+00]),
+       size=(206500, 206500), nnz=1273389, layout=torch.sparse_csr)
+tensor([0.9344, 0.9844, 0.2313,  ..., 0.8634, 0.6912, 0.9693])
+Matrix: mac_econ_fwd500
+Shape: torch.Size([206500, 206500])
+NNZ: 1273389
+Density: 2.9862143765866013e-05
+Time: 11.788637161254883 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/mac_econ_fwd500.mtx 1000':
+
+        27,433,101      L1I_TLB:u                        #      0.0 per TLB access  itlb_walk_ratio
+             7,382      ITLB_WALK:u                                                           
+            19,213      DTLB_WALK:u                      #      0.0 per TLB access  dtlb_walk_ratio
+        37,123,052      L1D_TLB:u                                                             
+
+      15.542834153 seconds time elapsed
+
+      99.681401000 seconds user
+     906.856853000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([      0,       3,       8,  ..., 1273376,
+                            1273379, 1273389]),
+       col_indices=tensor([     3,     30,     44,  ..., 206363, 206408,
+                           206459]),
+       values=tensor([-3.7877e-03, -1.5420e-01,  9.5305e-04,  ...,
+                       1.2290e-01,  2.2235e-01, -1.0000e+00]),
+       size=(206500, 206500), nnz=1273389, layout=torch.sparse_csr)
+tensor([0.2037, 0.6417, 0.9786,  ..., 0.8187, 0.4933, 0.1289])
+Matrix: mac_econ_fwd500
+Shape: torch.Size([206500, 206500])
+NNZ: 1273389
+Density: 2.9862143765866013e-05
+Time: 13.596147060394287 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/mac_econ_fwd500.mtx 1000':
+
+        32,027,284      L1I_CACHE:u                      #      0.0 per cache access  l1i_cache_miss_ratio
+           290,368      L1I_CACHE_REFILL:u                                                    
+           471,338      L1D_CACHE_REFILL:u               #      0.0 per cache access  l1d_cache_miss_ratio
+        33,366,668      L1D_CACHE:u                                                           
+
+      17.325855116 seconds time elapsed
+
+     101.368582000 seconds user
+    1053.826259000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([      0,       3,       8,  ..., 1273376,
+                            1273379, 1273389]),
+       col_indices=tensor([     3,     30,     44,  ..., 206363, 206408,
+                           206459]),
+       values=tensor([-3.7877e-03, -1.5420e-01,  9.5305e-04,  ...,
+                       1.2290e-01,  2.2235e-01, -1.0000e+00]),
+       size=(206500, 206500), nnz=1273389, layout=torch.sparse_csr)
+tensor([0.2072, 0.8681, 0.4768,  ..., 0.4873, 0.8997, 0.8601])
+Matrix: mac_econ_fwd500
+Shape: torch.Size([206500, 206500])
+NNZ: 1273389
+Density: 2.9862143765866013e-05
+Time: 14.157796382904053 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/mac_econ_fwd500.mtx 1000':
+
+           571,063      LL_CACHE_MISS_RD:u               #      1.0 per cache access  ll_cache_read_miss_ratio
+           583,554      LL_CACHE_RD:u                                                         
+           196,434      L2D_TLB:u                        #      0.1 per TLB access  l2_tlb_miss_ratio
+            25,171      L2D_TLB_REFILL:u                                                      
+           329,198      L2D_CACHE_REFILL:u               #      0.2 per cache access  l2_cache_miss_ratio
+         1,814,040      L2D_CACHE:u                                                           
+
+      17.958287837 seconds time elapsed
+
+     104.145071000 seconds user
+    1089.962121000 seconds sys
+
+
+
--- a/pytorch/output_HPC/altra_10_30_mc2depi_1000.json
+++ b/pytorch/output_HPC/altra_10_30_mc2depi_1000.json
@ -0,0 +1 @@
+{"architecture": "altra", "iterations": 1000, "baseline_time_s": 10, "baseline_delay_s": 30, "power_before": [94.16, 91.68, 78.92, 60.88, 46.72, 28.36, 22.08, 21.64, 21.64, 21.64], "matrix": "mc2depi", "shape": [525825, 525825], "nnz": 2100225, "% density": 7.595972132902821e-06, "time_s": 11.03979206085205, "power": [95.44, 94.0, 88.76, 72.12, 59.48, 51.92, 53.88, 68.6, 83.2, 97.76, 98.4, 97.12, 97.12, 95.28, 94.12], "power_after": [21.48, 21.44, 21.28, 21.24, 21.16, 21.08, 21.24, 21.24, 21.24, 21.16], "task clock (msec)": 56.14, "page faults": 3289, "cycles": 47515158, "instructions": 72388154, "branch mispredictions": 327042, "branches": 19309026, "ITLB accesses": 26093030, "ITLB misses": 6189, "DTLB misses": 17253, "DTLB accesses": 35168741, "L1I cache accesses": 30539322, "L1I cache misses": 285404, "L1D cache misses": 465747, "L1D cache accesses": 31932803, "LL cache misses": 530261, "LL cache accesses": 551030, "L2D TLB accesses": 183570, "L2D TLB misses": 23883, "L2D cache misses": 297006, "L2D cache accesses": 1721848, "instructions per cycle": 1.5234749719236964, "branch miss rate": 0.01693726032581861, "ITLB miss rate": 0.0002371897782664566, "DTLB miss rate": 0.0004905776979619486, "L2D TLB miss rate": 0.13010295799967314, "L1I cache miss rate": 0.009345459601231487, "L1D cache miss rate": 0.014585221347465175, "L2D cache miss rate": 0.1724925777420539, "LL cache miss rate": 0.9623087672177558}
--- a/pytorch/output_HPC/altra_10_30_mc2depi_1000.output
+++ b/pytorch/output_HPC/altra_10_30_mc2depi_1000.output
@ -0,0 +1,168 @@
+srun: Job time limit was unset; set to partition default of 60 minutes
+srun: ################################################################################
+srun: # Please note that the oasis compute nodes have aarch64 architecture CPUs.     #
+srun: # All submission nodes and all other compute nodes have x86_64 architecture    #
+srun: # CPUs. Programs, environments, or other software that was built on x86_64     #
+srun: # nodes may need to be rebuilt to properly execute on these nodes.             #
+srun: ################################################################################
+srun: job 3395280 queued and waiting for resources
+srun: job 3395280 has been allocated resources
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([      0,       2,       5,  ..., 2100220,
+                            2100223, 2100225]),
+       col_indices=tensor([     0,      1,      1,  ..., 525824, 525821,
+                           525824]),
+       values=tensor([-2025.,  2025., -2026.,  ...,  2025.,  1024., -1024.]),
+       size=(525825, 525825), nnz=2100225, layout=torch.sparse_csr)
+tensor([0.7162, 0.9445, 0.3087,  ..., 0.2863, 0.2977, 0.0994])
+Matrix: mc2depi
+Shape: torch.Size([525825, 525825])
+NNZ: 2100225
+Density: 7.595972132902821e-06
+Time: 14.228392839431763 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/mc2depi.mtx 1000':
+
+             56.14 msec task-clock:u                     #    0.003 CPUs utilized             
+                 0      context-switches:u               #    0.000 /sec                      
+                 0      cpu-migrations:u                 #    0.000 /sec                      
+             3,289      page-faults:u                    #   58.584 K/sec                     
+        47,515,158      cycles:u                         #    0.846 GHz                         (55.54%)
+        72,388,154      instructions:u                   #    1.52  insn per cycle              (79.69%)
+   <not supported>      branches:u                                                            
+           369,139      branch-misses:u                                                       
+        32,820,508      L1-dcache-loads:u                #  584.601 M/sec                     
+           483,558      L1-dcache-load-misses:u          #    1.47% of all L1-dcache accesses 
+   <not supported>      LLC-loads:u                                                           
+   <not supported>      LLC-load-misses:u                                                     
+        31,317,848      L1-icache-loads:u                #  557.836 M/sec                     
+           288,398      L1-icache-load-misses:u          #    0.92% of all L1-icache accesses 
+        39,511,659      dTLB-loads:u                     #  703.784 M/sec                       (36.64%)
+                 0      dTLB-load-misses:u                                                      (3.47%)
+     <not counted>      iTLB-loads:u                                                            (0.00%)
+     <not counted>      iTLB-load-misses:u                                                      (0.00%)
+
+      18.186987302 seconds time elapsed
+
+     124.639912000 seconds user
+    1088.590740000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([      0,       2,       5,  ..., 2100220,
+                            2100223, 2100225]),
+       col_indices=tensor([     0,      1,      1,  ..., 525824, 525821,
+                           525824]),
+       values=tensor([-2025.,  2025., -2026.,  ...,  2025.,  1024., -1024.]),
+       size=(525825, 525825), nnz=2100225, layout=torch.sparse_csr)
+tensor([0.4954, 0.2907, 0.0979,  ..., 0.0742, 0.4519, 0.0278])
+Matrix: mc2depi
+Shape: torch.Size([525825, 525825])
+NNZ: 2100225
+Density: 7.595972132902821e-06
+Time: 11.948119163513184 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/mc2depi.mtx 1000':
+
+           327,042      BR_MIS_PRED_RETIRED:u            #      0.0 per branch  branch_misprediction_ratio
+        19,309,026      BR_RETIRED:u                                                          
+
+      15.715674756 seconds time elapsed
+
+     115.898749000 seconds user
+     910.018676000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([      0,       2,       5,  ..., 2100220,
+                            2100223, 2100225]),
+       col_indices=tensor([     0,      1,      1,  ..., 525824, 525821,
+                           525824]),
+       values=tensor([-2025.,  2025., -2026.,  ...,  2025.,  1024., -1024.]),
+       size=(525825, 525825), nnz=2100225, layout=torch.sparse_csr)
+tensor([0.1402, 0.9048, 0.8859,  ..., 0.9542, 0.3509, 0.0695])
+Matrix: mc2depi
+Shape: torch.Size([525825, 525825])
+NNZ: 2100225
+Density: 7.595972132902821e-06
+Time: 14.170094966888428 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/mc2depi.mtx 1000':
+
+        26,093,030      L1I_TLB:u                        #      0.0 per TLB access  itlb_walk_ratio
+             6,189      ITLB_WALK:u                                                           
+            17,253      DTLB_WALK:u                      #      0.0 per TLB access  dtlb_walk_ratio
+        35,168,741      L1D_TLB:u                                                             
+
+      18.132605509 seconds time elapsed
+
+     121.020111000 seconds user
+    1090.508165000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([      0,       2,       5,  ..., 2100220,
+                            2100223, 2100225]),
+       col_indices=tensor([     0,      1,      1,  ..., 525824, 525821,
+                           525824]),
+       values=tensor([-2025.,  2025., -2026.,  ...,  2025.,  1024., -1024.]),
+       size=(525825, 525825), nnz=2100225, layout=torch.sparse_csr)
+tensor([0.1192, 0.6084, 0.4643,  ..., 0.3445, 0.4658, 0.7085])
+Matrix: mc2depi
+Shape: torch.Size([525825, 525825])
+NNZ: 2100225
+Density: 7.595972132902821e-06
+Time: 13.925398826599121 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/mc2depi.mtx 1000':
+
+        30,539,322      L1I_CACHE:u                      #      0.0 per cache access  l1i_cache_miss_ratio
+           285,404      L1I_CACHE_REFILL:u                                                    
+           465,747      L1D_CACHE_REFILL:u               #      0.0 per cache access  l1d_cache_miss_ratio
+        31,932,803      L1D_CACHE:u                                                           
+
+      17.812911214 seconds time elapsed
+
+     119.918777000 seconds user
+    1067.928403000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([      0,       2,       5,  ..., 2100220,
+                            2100223, 2100225]),
+       col_indices=tensor([     0,      1,      1,  ..., 525824, 525821,
+                           525824]),
+       values=tensor([-2025.,  2025., -2026.,  ...,  2025.,  1024., -1024.]),
+       size=(525825, 525825), nnz=2100225, layout=torch.sparse_csr)
+tensor([0.2075, 0.7442, 0.4477,  ..., 0.0794, 0.0859, 0.8652])
+Matrix: mc2depi
+Shape: torch.Size([525825, 525825])
+NNZ: 2100225
+Density: 7.595972132902821e-06
+Time: 12.866743564605713 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/mc2depi.mtx 1000':
+
+           530,261      LL_CACHE_MISS_RD:u               #      1.0 per cache access  ll_cache_read_miss_ratio
+           551,030      LL_CACHE_RD:u                                                         
+           183,570      L2D_TLB:u                        #      0.1 per TLB access  l2_tlb_miss_ratio
+            23,883      L2D_TLB_REFILL:u                                                      
+           297,006      L2D_CACHE_REFILL:u               #      0.2 per cache access  l2_cache_miss_ratio
+         1,721,848      L2D_CACHE:u                                                           
+
+      16.812811712 seconds time elapsed
+
+     117.780323000 seconds user
+     986.834040000 seconds sys
+
+
+
--- a/pytorch/output_HPC/altra_10_30_p2p-Gnutella04_1000.json
+++ b/pytorch/output_HPC/altra_10_30_p2p-Gnutella04_1000.json
@ -0,0 +1 @@
+{"architecture": "altra", "iterations": 1000, "baseline_time_s": 10, "baseline_delay_s": 30, "power_before": [30.08, 25.12, 24.68, 23.68, 22.84, 21.96, 21.08, 20.96, 20.8, 20.96], "matrix": "p2p-Gnutella04", "shape": [10879, 10879], "nnz": 39994, "% density": 0.0003379223282393842, "time_s": 0.9992897510528564, "power": [29.48, 30.52, 31.88, 31.24, 34.32], "power_after": [20.4, 20.6, 20.64, 20.76, 20.92, 20.84, 20.88, 20.88, 20.88, 20.84], "task clock (msec)": 52.68, "page faults": 3272, "cycles": 63019732, "instructions": 73518898, "branch mispredictions": 333423, "branches": 19435905, "ITLB accesses": 27447537, "ITLB misses": 6417, "DTLB misses": 18300, "DTLB accesses": 37569384, "L1I cache accesses": 30830481, "L1I cache misses": 290545, "L1D cache misses": 473875, "L1D cache accesses": 32284772, "LL cache misses": 529403, "LL cache accesses": 549794, "L2D TLB accesses": 198306, "L2D TLB misses": 24497, "L2D cache misses": 298519, "L2D cache accesses": 1772795, "instructions per cycle": 1.1666012480027683, "branch miss rate": 0.017155002558409294, "ITLB miss rate": 0.00023379146915805232, "DTLB miss rate": 0.000487098750408045, "L2D TLB miss rate": 0.12353131019737174, "L1I cache miss rate": 0.009423952873132274, "L1D cache miss rate": 0.014677972636758903, "L2D cache miss rate": 0.16838890001381998, "LL cache miss rate": 0.9629115632400499}
--- a/pytorch/output_HPC/altra_10_30_p2p-Gnutella04_1000.output
+++ b/pytorch/output_HPC/altra_10_30_p2p-Gnutella04_1000.output
@ -0,0 +1,158 @@
+srun: Job time limit was unset; set to partition default of 60 minutes
+srun: ################################################################################
+srun: # Please note that the oasis compute nodes have aarch64 architecture CPUs.     #
+srun: # All submission nodes and all other compute nodes have x86_64 architecture    #
+srun: # CPUs. Programs, environments, or other software that was built on x86_64     #
+srun: # nodes may need to be rebuilt to properly execute on these nodes.             #
+srun: ################################################################################
+srun: job 3395271 queued and waiting for resources
+srun: job 3395271 has been allocated resources
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([    0,    10,    20,  ..., 39994, 39994, 39994]),
+       col_indices=tensor([    1,     2,     3,  ...,  9711, 10875, 10876]),
+       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(10879, 10879),
+       nnz=39994, layout=torch.sparse_csr)
+tensor([0.3559, 0.4732, 0.3024,  ..., 0.9176, 0.7712, 0.4949])
+Matrix: p2p-Gnutella04
+Shape: torch.Size([10879, 10879])
+NNZ: 39994
+Density: 0.0003379223282393842
+Time: 1.0082497596740723 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella04.mtx 1000':
+
+             52.68 msec task-clock:u                     #    0.012 CPUs utilized             
+                 0      context-switches:u               #    0.000 /sec                      
+                 0      cpu-migrations:u                 #    0.000 /sec                      
+             3,272      page-faults:u                    #   62.105 K/sec                     
+        63,019,732      cycles:u                         #    1.196 GHz                         (70.67%)
+        73,518,898      instructions:u                   #    1.17  insn per cycle              (85.80%)
+   <not supported>      branches:u                                                            
+           359,236      branch-misses:u                                                         (99.44%)
+        31,459,751      L1-dcache-loads:u                #  597.131 M/sec                     
+           460,969      L1-dcache-load-misses:u          #    1.47% of all L1-dcache accesses 
+   <not supported>      LLC-loads:u                                                           
+   <not supported>      LLC-load-misses:u                                                     
+        29,975,208      L1-icache-loads:u                #  568.954 M/sec                     
+           281,710      L1-icache-load-misses:u          #    0.94% of all L1-icache accesses 
+        59,589,523      dTLB-loads:u                     #    1.131 G/sec                       (17.10%)
+                 0      dTLB-load-misses:u                                                      (1.27%)
+     <not counted>      iTLB-loads:u                                                            (0.00%)
+     <not counted>      iTLB-load-misses:u                                                      (0.00%)
+
+       4.456867719 seconds time elapsed
+
+      16.389568000 seconds user
+      29.247355000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([    0,    10,    20,  ..., 39994, 39994, 39994]),
+       col_indices=tensor([    1,     2,     3,  ...,  9711, 10875, 10876]),
+       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(10879, 10879),
+       nnz=39994, layout=torch.sparse_csr)
+tensor([0.0123, 0.4107, 0.7785,  ..., 0.7964, 0.7541, 0.4153])
+Matrix: p2p-Gnutella04
+Shape: torch.Size([10879, 10879])
+NNZ: 39994
+Density: 0.0003379223282393842
+Time: 1.030029058456421 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella04.mtx 1000':
+
+           333,423      BR_MIS_PRED_RETIRED:u            #      0.0 per branch  branch_misprediction_ratio
+        19,435,905      BR_RETIRED:u                                                          
+
+       4.359656946 seconds time elapsed
+
+      16.490532000 seconds user
+      28.366462000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([    0,    10,    20,  ..., 39994, 39994, 39994]),
+       col_indices=tensor([    1,     2,     3,  ...,  9711, 10875, 10876]),
+       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(10879, 10879),
+       nnz=39994, layout=torch.sparse_csr)
+tensor([0.1898, 0.0740, 0.4564,  ..., 0.7987, 0.1017, 0.5949])
+Matrix: p2p-Gnutella04
+Shape: torch.Size([10879, 10879])
+NNZ: 39994
+Density: 0.0003379223282393842
+Time: 1.004878044128418 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella04.mtx 1000':
+
+        27,447,537      L1I_TLB:u                        #      0.0 per TLB access  itlb_walk_ratio
+             6,417      ITLB_WALK:u                                                           
+            18,300      DTLB_WALK:u                      #      0.0 per TLB access  dtlb_walk_ratio
+        37,569,384      L1D_TLB:u                                                             
+
+       4.355627133 seconds time elapsed
+
+      15.883078000 seconds user
+      27.120829000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([    0,    10,    20,  ..., 39994, 39994, 39994]),
+       col_indices=tensor([    1,     2,     3,  ...,  9711, 10875, 10876]),
+       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(10879, 10879),
+       nnz=39994, layout=torch.sparse_csr)
+tensor([0.1682, 0.9350, 0.9210,  ..., 0.3758, 0.2263, 0.1068])
+Matrix: p2p-Gnutella04
+Shape: torch.Size([10879, 10879])
+NNZ: 39994
+Density: 0.0003379223282393842
+Time: 1.0207850933074951 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella04.mtx 1000':
+
+        30,830,481      L1I_CACHE:u                      #      0.0 per cache access  l1i_cache_miss_ratio
+           290,545      L1I_CACHE_REFILL:u                                                    
+           473,875      L1D_CACHE_REFILL:u               #      0.0 per cache access  l1d_cache_miss_ratio
+        32,284,772      L1D_CACHE:u                                                           
+
+       4.427088851 seconds time elapsed
+
+      15.711555000 seconds user
+      29.627091000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([    0,    10,    20,  ..., 39994, 39994, 39994]),
+       col_indices=tensor([    1,     2,     3,  ...,  9711, 10875, 10876]),
+       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(10879, 10879),
+       nnz=39994, layout=torch.sparse_csr)
+tensor([0.9351, 0.3836, 0.0822,  ..., 0.9798, 0.3726, 0.7394])
+Matrix: p2p-Gnutella04
+Shape: torch.Size([10879, 10879])
+NNZ: 39994
+Density: 0.0003379223282393842
+Time: 1.041510820388794 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella04.mtx 1000':
+
+           529,403      LL_CACHE_MISS_RD:u               #      1.0 per cache access  ll_cache_read_miss_ratio
+           549,794      LL_CACHE_RD:u                                                         
+           198,306      L2D_TLB:u                        #      0.1 per TLB access  l2_tlb_miss_ratio
+            24,497      L2D_TLB_REFILL:u                                                      
+           298,519      L2D_CACHE_REFILL:u               #      0.2 per cache access  l2_cache_miss_ratio
+         1,772,795      L2D_CACHE:u                                                           
+
+       4.454107604 seconds time elapsed
+
+      16.577921000 seconds user
+      29.390427000 seconds sys
+
+
+
--- a/pytorch/output_HPC/altra_10_30_p2p-Gnutella24_1000.json
+++ b/pytorch/output_HPC/altra_10_30_p2p-Gnutella24_1000.json
@ -0,0 +1 @@
+{"architecture": "altra", "iterations": 1000, "baseline_time_s": 10, "baseline_delay_s": 30, "power_before": [30.72, 30.6, 28.68, 26.48, 22.44, 21.4, 21.28, 21.08, 21.32, 21.6], "matrix": "p2p-Gnutella24", "shape": [26518, 26518], "nnz": 65369, "% density": 9.295875717624285e-05, "time_s": 1.718301773071289, "power": [31.52, 32.48, 33.64, 33.88, 33.44, 31.52], "power_after": [20.96, 20.84, 20.92, 20.8, 20.76, 20.76, 20.76, 20.68, 20.72, 20.92], "task clock (msec)": 67.08, "page faults": 3303, "cycles": 61261862, "instructions": 83757591, "branch mispredictions": 329248, "branches": 19953212, "ITLB accesses": 27084694, "ITLB misses": 7107, "DTLB misses": 17529, "DTLB accesses": 36684333, "L1I cache accesses": 32158234, "L1I cache misses": 286484, "L1D cache misses": 474161, "L1D cache accesses": 33730073, "LL cache misses": 550064, "LL cache accesses": 565245, "L2D TLB accesses": 191046, "L2D TLB misses": 23775, "L2D cache misses": 307419, "L2D cache accesses": 1772169, "instructions per cycle": 1.3672060930828385, "branch miss rate": 0.016501002445120115, "ITLB miss rate": 0.0002623991247602797, "DTLB miss rate": 0.0004778334118818516, "L2D TLB miss rate": 0.12444646838981188, "L1I cache miss rate": 0.008908573773049851, "L1D cache miss rate": 0.014057514788064645, "L2D cache miss rate": 0.1734704760099065, "LL cache miss rate": 0.973142619572044}
--- a/pytorch/output_HPC/altra_10_30_p2p-Gnutella24_1000.output
+++ b/pytorch/output_HPC/altra_10_30_p2p-Gnutella24_1000.output
@ -5,45 +5,46 @@ srun: # All submission nodes and all other compute nodes have x86_64 architectur
 srun: # CPUs. Programs, environments, or other software that was built on x86_64     #
 srun: # nodes may need to be rebuilt to properly execute on these nodes.             #
 srun: ################################################################################
-srun: job 3394141 queued and waiting for resources
-srun: job 3394141 has been allocated resources
+srun: job 3395289 queued and waiting for resources
+srun: job 3395289 has been allocated resources
 /nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
  ).to_sparse_csr().type(torch.float)
 tensor(crow_indices=tensor([    0,     9,     9,  ..., 65369, 65369, 65369]),
       col_indices=tensor([    1,     2,     3,  ..., 15065,  9401, 26517]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(26518, 26518),
       nnz=65369, layout=torch.sparse_csr)
-tensor([0.6616, 0.1149, 0.0110,  ..., 0.2481, 0.7877, 0.5589])
+tensor([0.3210, 0.3418, 0.9584,  ..., 0.8929, 0.9807, 0.5532])
+Matrix: p2p-Gnutella24
 Shape: torch.Size([26518, 26518])
 NNZ: 65369
 Density: 9.295875717624285e-05
-Time: 0.16974925994873047 seconds
+Time: 1.6565663814544678 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella24.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella24.mtx 1000':

-             61.92 msec task-clock:u                     #    0.017 CPUs utilized             
+             67.08 msec task-clock:u                     #    0.013 CPUs utilized             
                 0      context-switches:u               #    0.000 /sec                      
                 0      cpu-migrations:u                 #    0.000 /sec                      
-             3,281      page-faults:u                    #   52.988 K/sec                     
-        66,250,810      cycles:u                         #    1.070 GHz                         (62.94%)
-        75,178,179      instructions:u                   #    1.13  insn per cycle              (83.47%)
+             3,303      page-faults:u                    #   49.241 K/sec                     
+        61,261,862      cycles:u                         #    0.913 GHz                         (49.19%)
+        83,757,591      instructions:u                   #    1.37  insn per cycle              (88.30%)
   <not supported>      branches:u                                                            
-           367,749      branch-misses:u                                                       
-        33,064,095      L1-dcache-loads:u                #  533.986 M/sec                     
-           465,542      L1-dcache-load-misses:u          #    1.41% of all L1-dcache accesses 
+           364,692      branch-misses:u                                                       
+        31,954,743      L1-dcache-loads:u                #  476.379 M/sec                     
+           490,953      L1-dcache-load-misses:u          #    1.54% of all L1-dcache accesses 
   <not supported>      LLC-loads:u                                                           
   <not supported>      LLC-load-misses:u                                                     
-        31,552,264      L1-icache-loads:u                #  509.570 M/sec                     
-           296,060      L1-icache-load-misses:u          #    0.94% of all L1-icache accesses 
-        73,155,896      dTLB-loads:u                     #    1.181 G/sec                       (17.31%)
+        30,490,915      L1-icache-loads:u                #  454.556 M/sec                     
+           291,964      L1-icache-load-misses:u          #    0.96% of all L1-icache accesses 
+        32,131,046      dTLB-loads:u                     #  479.007 M/sec                       (19.20%)
     <not counted>      dTLB-load-misses:u                                                      (0.00%)
     <not counted>      iTLB-loads:u                                                            (0.00%)
     <not counted>      iTLB-load-misses:u                                                      (0.00%)

-       3.675971385 seconds time elapsed
+       5.107407925 seconds time elapsed

-      14.857293000 seconds user
-      29.791187000 seconds sys
+      16.045361000 seconds user
+      30.574855000 seconds sys



@ -53,21 +54,22 @@ tensor(crow_indices=tensor([    0,     9,     9,  ..., 65369, 65369, 65369]),
       col_indices=tensor([    1,     2,     3,  ..., 15065,  9401, 26517]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(26518, 26518),
       nnz=65369, layout=torch.sparse_csr)
-tensor([0.1683, 0.8999, 0.0578,  ..., 0.5893, 0.0628, 0.8262])
+tensor([0.4851, 0.2524, 0.2134,  ..., 0.5976, 0.0089, 0.2284])
+Matrix: p2p-Gnutella24
 Shape: torch.Size([26518, 26518])
 NNZ: 65369
 Density: 9.295875717624285e-05
-Time: 0.2227163314819336 seconds
+Time: 1.6902527809143066 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella24.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella24.mtx 1000':

-           332,366      BR_MIS_PRED_RETIRED:u            #      0.0 per branch  branch_misprediction_ratio
-        19,076,182      BR_RETIRED:u                                                          
+           329,248      BR_MIS_PRED_RETIRED:u            #      0.0 per branch  branch_misprediction_ratio
+        19,953,212      BR_RETIRED:u                                                          

-       3.532329673 seconds time elapsed
+       4.990707186 seconds time elapsed

-      14.883993000 seconds user
-      28.516661000 seconds sys
+      16.713526000 seconds user
+      27.761595000 seconds sys



@ -77,23 +79,24 @@ tensor(crow_indices=tensor([    0,     9,     9,  ..., 65369, 65369, 65369]),
       col_indices=tensor([    1,     2,     3,  ..., 15065,  9401, 26517]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(26518, 26518),
       nnz=65369, layout=torch.sparse_csr)
-tensor([0.8389, 0.5614, 0.9033,  ..., 0.2231, 0.0349, 0.5167])
+tensor([0.1844, 0.9003, 0.0155,  ..., 0.5184, 0.1445, 0.3588])
+Matrix: p2p-Gnutella24
 Shape: torch.Size([26518, 26518])
 NNZ: 65369
 Density: 9.295875717624285e-05
-Time: 0.17095375061035156 seconds
+Time: 1.6478993892669678 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella24.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella24.mtx 1000':

-        27,005,133      L1I_TLB:u                        #      0.0 per TLB access  itlb_walk_ratio
-             4,791      ITLB_WALK:u                                                           
-            13,403      DTLB_WALK:u                      #      0.0 per TLB access  dtlb_walk_ratio
-        36,457,054      L1D_TLB:u                                                             
+        27,084,694      L1I_TLB:u                        #      0.0 per TLB access  itlb_walk_ratio
+             7,107      ITLB_WALK:u                                                           
+            17,529      DTLB_WALK:u                      #      0.0 per TLB access  dtlb_walk_ratio
+        36,684,333      L1D_TLB:u                                                             

-       3.579041343 seconds time elapsed
+       5.010572757 seconds time elapsed

-      14.885159000 seconds user
-      29.562650000 seconds sys
+      16.570396000 seconds user
+      27.387405000 seconds sys



@ -103,23 +106,24 @@ tensor(crow_indices=tensor([    0,     9,     9,  ..., 65369, 65369, 65369]),
       col_indices=tensor([    1,     2,     3,  ..., 15065,  9401, 26517]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(26518, 26518),
       nnz=65369, layout=torch.sparse_csr)
-tensor([0.8849, 0.5982, 0.0578,  ..., 0.9975, 0.2204, 0.0718])
+tensor([0.2313, 0.8375, 0.3065,  ..., 0.2374, 0.2281, 0.2100])
+Matrix: p2p-Gnutella24
 Shape: torch.Size([26518, 26518])
 NNZ: 65369
 Density: 9.295875717624285e-05
-Time: 0.18003463745117188 seconds
+Time: 1.637598991394043 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella24.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella24.mtx 1000':

-        32,367,686      L1I_CACHE:u                      #      0.0 per cache access  l1i_cache_miss_ratio
-           287,524      L1I_CACHE_REFILL:u                                                    
-           467,557      L1D_CACHE_REFILL:u               #      0.0 per cache access  l1d_cache_miss_ratio
-        34,022,862      L1D_CACHE:u                                                           
+        32,158,234      L1I_CACHE:u                      #      0.0 per cache access  l1i_cache_miss_ratio
+           286,484      L1I_CACHE_REFILL:u                                                    
+           474,161      L1D_CACHE_REFILL:u               #      0.0 per cache access  l1d_cache_miss_ratio
+        33,730,073      L1D_CACHE:u                                                           

-       3.405321132 seconds time elapsed
+       4.963121627 seconds time elapsed

-      15.291636000 seconds user
-      28.005015000 seconds sys
+      16.730431000 seconds user
+      29.869416000 seconds sys



@ -129,25 +133,26 @@ tensor(crow_indices=tensor([    0,     9,     9,  ..., 65369, 65369, 65369]),
       col_indices=tensor([    1,     2,     3,  ..., 15065,  9401, 26517]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(26518, 26518),
       nnz=65369, layout=torch.sparse_csr)
-tensor([0.2790, 0.1291, 0.6053,  ..., 0.1651, 0.4973, 0.6821])
+tensor([0.5006, 0.8470, 0.3527,  ..., 0.3901, 0.3581, 0.1154])
+Matrix: p2p-Gnutella24
 Shape: torch.Size([26518, 26518])
 NNZ: 65369
 Density: 9.295875717624285e-05
-Time: 0.22036528587341309 seconds
+Time: 1.6584653854370117 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella24.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella24.mtx 1000':

-           535,707      LL_CACHE_MISS_RD:u               #      1.0 per cache access  ll_cache_read_miss_ratio
-           556,316      LL_CACHE_RD:u                                                         
-           150,149      L2D_TLB:u                        #      0.1 per TLB access  l2_tlb_miss_ratio
-            18,418      L2D_TLB_REFILL:u                                                      
-           297,042      L2D_CACHE_REFILL:u               #      0.2 per cache access  l2_cache_miss_ratio
-         1,687,364      L2D_CACHE:u                                                           
+           550,064      LL_CACHE_MISS_RD:u               #      1.0 per cache access  ll_cache_read_miss_ratio
+           565,245      LL_CACHE_RD:u                                                         
+           191,046      L2D_TLB:u                        #      0.1 per TLB access  l2_tlb_miss_ratio
+            23,775      L2D_TLB_REFILL:u                                                      
+           307,419      L2D_CACHE_REFILL:u               #      0.2 per cache access  l2_cache_miss_ratio
+         1,772,169      L2D_CACHE:u                                                           

-       3.505209576 seconds time elapsed
+       5.019317303 seconds time elapsed

-      15.297738000 seconds user
-      29.848441000 seconds sys
+      16.518292000 seconds user
+      30.069880000 seconds sys



--- a/pytorch/output_HPC/altra_10_30_p2p-Gnutella25_1000.json
+++ b/pytorch/output_HPC/altra_10_30_p2p-Gnutella25_1000.json
@ -0,0 +1 @@
+{"architecture": "altra", "iterations": 1000, "baseline_time_s": 10, "baseline_delay_s": 30, "power_before": [86.48, 72.16, 59.36, 41.84, 28.44, 22.96, 22.92, 22.92, 23.04, 23.24], "matrix": "p2p-Gnutella25", "shape": [22687, 22687], "nnz": 54705, "% density": 0.00010628522108964806, "time_s": 1.431199073791504, "power": [35.16, 36.2, 36.72, 37.52, 37.52], "power_after": [21.32, 21.2, 21.2, 21.28, 21.52, 21.44, 21.92, 21.68, 21.6, 21.36], "task clock (msec)": 59.85, "page faults": 3318, "cycles": 76505130, "instructions": 72343215, "branch mispredictions": 322338, "branches": 19784096, "ITLB accesses": 27270404, "ITLB misses": 6607, "DTLB misses": 17981, "DTLB accesses": 36751047, "L1I cache accesses": 30620441, "L1I cache misses": 302139, "L1D cache misses": 471011, "L1D cache accesses": 32141810, "LL cache misses": 531907, "LL cache accesses": 545159, "L2D TLB accesses": 188244, "L2D TLB misses": 23034, "L2D cache misses": 293848, "L2D cache accesses": 1757551, "instructions per cycle": 0.945599530384433, "branch miss rate": 0.016292783860329025, "ITLB miss rate": 0.00024227730546272803, "DTLB miss rate": 0.0004892649725054092, "L2D TLB miss rate": 0.12236246573595971, "L1I cache miss rate": 0.009867232153841285, "L1D cache miss rate": 0.014654152955294054, "L2D cache miss rate": 0.1671917344077071, "LL cache miss rate": 0.9756914955086498}
--- a/pytorch/output_HPC/altra_10_30_p2p-Gnutella25_1000.output
+++ b/pytorch/output_HPC/altra_10_30_p2p-Gnutella25_1000.output
@ -0,0 +1,158 @@
+srun: Job time limit was unset; set to partition default of 60 minutes
+srun: ################################################################################
+srun: # Please note that the oasis compute nodes have aarch64 architecture CPUs.     #
+srun: # All submission nodes and all other compute nodes have x86_64 architecture    #
+srun: # CPUs. Programs, environments, or other software that was built on x86_64     #
+srun: # nodes may need to be rebuilt to properly execute on these nodes.             #
+srun: ################################################################################
+srun: job 3395288 queued and waiting for resources
+srun: job 3395288 has been allocated resources
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([    0,     9,     9,  ..., 54704, 54704, 54705]),
+       col_indices=tensor([    1,     2,     3,  ..., 17949, 22685,   144]),
+       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(22687, 22687),
+       nnz=54705, layout=torch.sparse_csr)
+tensor([0.9962, 0.2550, 0.9564,  ..., 0.7113, 0.6635, 0.3831])
+Matrix: p2p-Gnutella25
+Shape: torch.Size([22687, 22687])
+NNZ: 54705
+Density: 0.00010628522108964806
+Time: 1.4832944869995117 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella25.mtx 1000':
+
+             59.85 msec task-clock:u                     #    0.012 CPUs utilized             
+                 0      context-switches:u               #    0.000 /sec                      
+                 0      cpu-migrations:u                 #    0.000 /sec                      
+             3,318      page-faults:u                    #   55.439 K/sec                     
+        76,505,130      cycles:u                         #    1.278 GHz                         (43.11%)
+        72,343,215      instructions:u                   #    0.95  insn per cycle              (62.06%)
+   <not supported>      branches:u                                                            
+           371,337      branch-misses:u                                                         (77.63%)
+        33,969,604      L1-dcache-loads:u                #  567.579 M/sec                       (88.85%)
+           472,023      L1-dcache-load-misses:u          #    1.39% of all L1-dcache accesses 
+   <not supported>      LLC-loads:u                                                           
+   <not supported>      LLC-load-misses:u                                                     
+        31,728,689      L1-icache-loads:u                #  530.137 M/sec                     
+           299,356      L1-icache-load-misses:u          #    0.94% of all L1-icache accesses 
+        50,921,898      dTLB-loads:u                     #  850.825 M/sec                       (39.93%)
+            90,542      dTLB-load-misses:u               #    0.18% of all dTLB cache accesses  (36.53%)
+        11,563,883      iTLB-loads:u                     #  193.214 M/sec                       (20.26%)
+     <not counted>      iTLB-load-misses:u                                                      (0.00%)
+
+       4.953668960 seconds time elapsed
+
+      16.652653000 seconds user
+      30.408692000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([    0,     9,     9,  ..., 54704, 54704, 54705]),
+       col_indices=tensor([    1,     2,     3,  ..., 17949, 22685,   144]),
+       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(22687, 22687),
+       nnz=54705, layout=torch.sparse_csr)
+tensor([0.9968, 0.7101, 0.9319,  ..., 0.2871, 0.7386, 0.8934])
+Matrix: p2p-Gnutella25
+Shape: torch.Size([22687, 22687])
+NNZ: 54705
+Density: 0.00010628522108964806
+Time: 1.3799591064453125 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella25.mtx 1000':
+
+           322,338      BR_MIS_PRED_RETIRED:u            #      0.0 per branch  branch_misprediction_ratio
+        19,784,096      BR_RETIRED:u                                                          
+
+       4.633544255 seconds time elapsed
+
+      16.572749000 seconds user
+      26.228349000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([    0,     9,     9,  ..., 54704, 54704, 54705]),
+       col_indices=tensor([    1,     2,     3,  ..., 17949, 22685,   144]),
+       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(22687, 22687),
+       nnz=54705, layout=torch.sparse_csr)
+tensor([0.3551, 0.8297, 0.9950,  ..., 0.9625, 0.7129, 0.2173])
+Matrix: p2p-Gnutella25
+Shape: torch.Size([22687, 22687])
+NNZ: 54705
+Density: 0.00010628522108964806
+Time: 1.400240182876587 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella25.mtx 1000':
+
+        27,270,404      L1I_TLB:u                        #      0.0 per TLB access  itlb_walk_ratio
+             6,607      ITLB_WALK:u                                                           
+            17,981      DTLB_WALK:u                      #      0.0 per TLB access  dtlb_walk_ratio
+        36,751,047      L1D_TLB:u                                                             
+
+       4.696092090 seconds time elapsed
+
+      15.781810000 seconds user
+      28.383624000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([    0,     9,     9,  ..., 54704, 54704, 54705]),
+       col_indices=tensor([    1,     2,     3,  ..., 17949, 22685,   144]),
+       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(22687, 22687),
+       nnz=54705, layout=torch.sparse_csr)
+tensor([0.3600, 0.0388, 0.5262,  ..., 0.5849, 0.3707, 0.1514])
+Matrix: p2p-Gnutella25
+Shape: torch.Size([22687, 22687])
+NNZ: 54705
+Density: 0.00010628522108964806
+Time: 1.4545772075653076 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella25.mtx 1000':
+
+        30,620,441      L1I_CACHE:u                      #      0.0 per cache access  l1i_cache_miss_ratio
+           302,139      L1I_CACHE_REFILL:u                                                    
+           471,011      L1D_CACHE_REFILL:u               #      0.0 per cache access  l1d_cache_miss_ratio
+        32,141,810      L1D_CACHE:u                                                           
+
+       4.897499310 seconds time elapsed
+
+      16.207163000 seconds user
+      32.246890000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([    0,     9,     9,  ..., 54704, 54704, 54705]),
+       col_indices=tensor([    1,     2,     3,  ..., 17949, 22685,   144]),
+       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(22687, 22687),
+       nnz=54705, layout=torch.sparse_csr)
+tensor([0.1220, 0.8435, 0.7035,  ..., 0.2109, 0.0289, 0.0715])
+Matrix: p2p-Gnutella25
+Shape: torch.Size([22687, 22687])
+NNZ: 54705
+Density: 0.00010628522108964806
+Time: 1.4200170040130615 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella25.mtx 1000':
+
+           531,907      LL_CACHE_MISS_RD:u               #      1.0 per cache access  ll_cache_read_miss_ratio
+           545,159      LL_CACHE_RD:u                                                         
+           188,244      L2D_TLB:u                        #      0.1 per TLB access  l2_tlb_miss_ratio
+            23,034      L2D_TLB_REFILL:u                                                      
+           293,848      L2D_CACHE_REFILL:u               #      0.2 per cache access  l2_cache_miss_ratio
+         1,757,551      L2D_CACHE:u                                                           
+
+       4.683262937 seconds time elapsed
+
+      16.111909000 seconds user
+      29.660483000 seconds sys
+
+
+
--- a/pytorch/output_HPC/altra_10_30_p2p-Gnutella30_1000.json
+++ b/pytorch/output_HPC/altra_10_30_p2p-Gnutella30_1000.json
@ -0,0 +1 @@
+{"architecture": "altra", "iterations": 1000, "baseline_time_s": 10, "baseline_delay_s": 30, "power_before": [16.44, 16.44, 16.44, 16.84, 16.72, 16.6, 16.72, 16.84, 16.68, 16.84], "matrix": "p2p-Gnutella30", "shape": [36682, 36682], "nnz": 88328, "% density": 6.564359899804003e-05, "time_s": 2.896674871444702, "power": [56.32, 68.24, 71.76, 59.48, 47.6, 48.76, 52.6], "power_after": [16.92, 17.0, 16.96, 16.8, 16.48, 16.52, 16.52, 16.52, 16.24, 16.36], "task clock (msec)": 56.47, "page faults": 3222, "cycles": 69105836, "instructions": 89065155, "branch mispredictions": 333669, "branches": 20078755, "ITLB accesses": 26015038, "ITLB misses": 5212, "DTLB misses": 17039, "DTLB accesses": 35296010, "L1I cache accesses": 31837486, "L1I cache misses": 293353, "L1D cache misses": 462358, "L1D cache accesses": 33478540, "LL cache misses": 546516, "LL cache accesses": 559865, "L2D TLB accesses": 190400, "L2D TLB misses": 23787, "L2D cache misses": 307032, "L2D cache accesses": 1768186, "instructions per cycle": 1.288822480926213, "branch miss rate": 0.016618012421586895, "ITLB miss rate": 0.00020034566161310238, "DTLB miss rate": 0.00048274578344691083, "L2D TLB miss rate": 0.12493172268907562, "L1I cache miss rate": 0.009214075508348869, "L1D cache miss rate": 0.013810578358554464, "L2D cache miss rate": 0.17364236567872385, "LL cache miss rate": 0.9761567520741607}
--- a/pytorch/output_HPC/altra_10_30_p2p-Gnutella30_1000.output
+++ b/pytorch/output_HPC/altra_10_30_p2p-Gnutella30_1000.output
@ -5,45 +5,46 @@ srun: # All submission nodes and all other compute nodes have x86_64 architectur
 srun: # CPUs. Programs, environments, or other software that was built on x86_64     #
 srun: # nodes may need to be rebuilt to properly execute on these nodes.             #
 srun: ################################################################################
-srun: job 3394142 queued and waiting for resources
-srun: job 3394142 has been allocated resources
+srun: job 3395282 queued and waiting for resources
+srun: job 3395282 has been allocated resources
 /nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
  ).to_sparse_csr().type(torch.float)
 tensor(crow_indices=tensor([    0,    10,    10,  ..., 88328, 88328, 88328]),
       col_indices=tensor([    1,     2,     3,  ..., 36675, 36676, 36677]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(36682, 36682),
       nnz=88328, layout=torch.sparse_csr)
-tensor([0.5867, 0.3729, 0.0718,  ..., 0.5551, 0.6046, 0.6005])
+tensor([0.0302, 0.1334, 0.4142,  ..., 0.9516, 0.6030, 0.3883])
+Matrix: p2p-Gnutella30
 Shape: torch.Size([36682, 36682])
 NNZ: 88328
 Density: 6.564359899804003e-05
-Time: 0.3765556812286377 seconds
+Time: 2.790724277496338 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella30.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella30.mtx 1000':

-             65.91 msec task-clock:u                     #    0.017 CPUs utilized             
+             56.47 msec task-clock:u                     #    0.009 CPUs utilized             
                 0      context-switches:u               #    0.000 /sec                      
                 0      cpu-migrations:u                 #    0.000 /sec                      
-             3,247      page-faults:u                    #   49.267 K/sec                     
-        92,293,071      cycles:u                         #    1.400 GHz                         (58.72%)
-        76,208,632      instructions:u                   #    0.83  insn per cycle              (75.47%)
+             3,222      page-faults:u                    #   57.061 K/sec                     
+        69,105,836      cycles:u                         #    1.224 GHz                         (53.55%)
+        89,065,155      instructions:u                   #    1.29  insn per cycle              (92.79%)
   <not supported>      branches:u                                                            
-           336,620      branch-misses:u                                                         (89.96%)
-        33,256,017      L1-dcache-loads:u                #  504.599 M/sec                     
-           479,188      L1-dcache-load-misses:u          #    1.44% of all L1-dcache accesses 
+           367,525      branch-misses:u                                                       
+        32,122,654      L1-dcache-loads:u                #  568.886 M/sec                     
+           467,921      L1-dcache-load-misses:u          #    1.46% of all L1-dcache accesses 
   <not supported>      LLC-loads:u                                                           
   <not supported>      LLC-load-misses:u                                                     
-        31,686,331      L1-icache-loads:u                #  480.782 M/sec                     
-           297,521      L1-icache-load-misses:u          #    0.94% of all L1-icache accesses 
-        55,295,804      dTLB-loads:u                     #  839.012 M/sec                       (27.47%)
-           103,616      dTLB-load-misses:u               #    0.19% of all dTLB cache accesses  (20.17%)
+        30,765,438      L1-icache-loads:u                #  544.850 M/sec                     
+           289,327      L1-icache-load-misses:u          #    0.94% of all L1-icache accesses 
+        24,642,710      dTLB-loads:u                     #  436.418 M/sec                       (11.11%)
+     <not counted>      dTLB-load-misses:u                                                      (0.00%)
     <not counted>      iTLB-loads:u                                                            (0.00%)
     <not counted>      iTLB-load-misses:u                                                      (0.00%)

-       3.803094533 seconds time elapsed
+       6.334250152 seconds time elapsed

-      16.585763000 seconds user
-      62.703127000 seconds sys
+      32.099712000 seconds user
+     240.206702000 seconds sys



@ -53,21 +54,22 @@ tensor(crow_indices=tensor([    0,    10,    10,  ..., 88328, 88328, 88328]),
       col_indices=tensor([    1,     2,     3,  ..., 36675, 36676, 36677]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(36682, 36682),
       nnz=88328, layout=torch.sparse_csr)
-tensor([0.2027, 0.2128, 0.5093,  ..., 0.8069, 0.6413, 0.1136])
+tensor([0.6147, 0.4171, 0.2258,  ..., 0.0253, 0.8932, 0.8040])
+Matrix: p2p-Gnutella30
 Shape: torch.Size([36682, 36682])
 NNZ: 88328
 Density: 6.564359899804003e-05
-Time: 0.2942969799041748 seconds
+Time: 2.092158079147339 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella30.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella30.mtx 1000':

-           320,083      BR_MIS_PRED_RETIRED:u            #      0.0 per branch  branch_misprediction_ratio
-        19,285,106      BR_RETIRED:u                                                          
+           333,669      BR_MIS_PRED_RETIRED:u            #      0.0 per branch  branch_misprediction_ratio
+        20,078,755      BR_RETIRED:u                                                          

-       3.763535833 seconds time elapsed
+       5.557038624 seconds time elapsed

-      16.476022000 seconds user
-      55.208213000 seconds sys
+      29.074016000 seconds user
+     186.372846000 seconds sys



@ -77,23 +79,24 @@ tensor(crow_indices=tensor([    0,    10,    10,  ..., 88328, 88328, 88328]),
       col_indices=tensor([    1,     2,     3,  ..., 36675, 36676, 36677]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(36682, 36682),
       nnz=88328, layout=torch.sparse_csr)
-tensor([0.5930, 0.8044, 0.8115,  ..., 0.6366, 0.1026, 0.6914])
+tensor([0.0146, 0.2151, 0.1948,  ..., 0.7633, 0.4329, 0.7106])
+Matrix: p2p-Gnutella30
 Shape: torch.Size([36682, 36682])
 NNZ: 88328
 Density: 6.564359899804003e-05
-Time: 0.2431955337524414 seconds
+Time: 3.1269772052764893 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella30.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella30.mtx 1000':

-        26,853,940      L1I_TLB:u                        #      0.0 per TLB access  itlb_walk_ratio
-             6,728      ITLB_WALK:u                                                           
-            13,955      DTLB_WALK:u                      #      0.0 per TLB access  dtlb_walk_ratio
-        37,111,059      L1D_TLB:u                                                             
+        26,015,038      L1I_TLB:u                        #      0.0 per TLB access  itlb_walk_ratio
+             5,212      ITLB_WALK:u                                                           
+            17,039      DTLB_WALK:u                      #      0.0 per TLB access  dtlb_walk_ratio
+        35,296,010      L1D_TLB:u                                                             

-       3.752433570 seconds time elapsed
+       6.550798214 seconds time elapsed

-      16.433982000 seconds user
-      53.207908000 seconds sys
+      36.334689000 seconds user
+     263.614426000 seconds sys



@ -103,23 +106,24 @@ tensor(crow_indices=tensor([    0,    10,    10,  ..., 88328, 88328, 88328]),
       col_indices=tensor([    1,     2,     3,  ..., 36675, 36676, 36677]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(36682, 36682),
       nnz=88328, layout=torch.sparse_csr)
-tensor([0.9666, 0.8206, 0.6252,  ..., 0.5180, 0.8170, 0.7406])
+tensor([0.1810, 0.5208, 0.0542,  ..., 0.6108, 0.4905, 0.8918])
+Matrix: p2p-Gnutella30
 Shape: torch.Size([36682, 36682])
 NNZ: 88328
 Density: 6.564359899804003e-05
-Time: 0.15313339233398438 seconds
+Time: 1.9065814018249512 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella30.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella30.mtx 1000':

-        32,554,796      L1I_CACHE:u                      #      0.0 per cache access  l1i_cache_miss_ratio
-           298,729      L1I_CACHE_REFILL:u                                                    
-           473,779      L1D_CACHE_REFILL:u               #      0.0 per cache access  l1d_cache_miss_ratio
-        34,117,102      L1D_CACHE:u                                                           
+        31,837,486      L1I_CACHE:u                      #      0.0 per cache access  l1i_cache_miss_ratio
+           293,353      L1I_CACHE_REFILL:u                                                    
+           462,358      L1D_CACHE_REFILL:u               #      0.0 per cache access  l1d_cache_miss_ratio
+        33,478,540      L1D_CACHE:u                                                           

-       3.595579651 seconds time elapsed
+       5.319975004 seconds time elapsed

-      15.817851000 seconds user
-      44.491315000 seconds sys
+      26.918342000 seconds user
+     175.603919000 seconds sys



@ -129,25 +133,26 @@ tensor(crow_indices=tensor([    0,    10,    10,  ..., 88328, 88328, 88328]),
       col_indices=tensor([    1,     2,     3,  ..., 36675, 36676, 36677]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]), size=(36682, 36682),
       nnz=88328, layout=torch.sparse_csr)
-tensor([0.9800, 0.9021, 0.5677,  ..., 0.3869, 0.2468, 0.3286])
+tensor([0.8456, 0.8302, 0.2078,  ..., 0.8155, 0.5148, 0.5853])
+Matrix: p2p-Gnutella30
 Shape: torch.Size([36682, 36682])
 NNZ: 88328
 Density: 6.564359899804003e-05
-Time: 0.2539215087890625 seconds
+Time: 3.8523874282836914 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella30.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/p2p-Gnutella30.mtx 1000':

-           535,040      LL_CACHE_MISS_RD:u               #      1.0 per cache access  ll_cache_read_miss_ratio
-           547,502      LL_CACHE_RD:u                                                         
-           179,876      L2D_TLB:u                        #      0.1 per TLB access  l2_tlb_miss_ratio
-            21,809      L2D_TLB_REFILL:u                                                      
-           298,620      L2D_CACHE_REFILL:u               #      0.2 per cache access  l2_cache_miss_ratio
-         1,722,959      L2D_CACHE:u                                                           
+           546,516      LL_CACHE_MISS_RD:u               #      1.0 per cache access  ll_cache_read_miss_ratio
+           559,865      LL_CACHE_RD:u                                                         
+           190,400      L2D_TLB:u                        #      0.1 per TLB access  l2_tlb_miss_ratio
+            23,787      L2D_TLB_REFILL:u                                                      
+           307,032      L2D_CACHE_REFILL:u               #      0.2 per cache access  l2_cache_miss_ratio
+         1,768,186      L2D_CACHE:u                                                           

-       3.549060962 seconds time elapsed
+       7.266305868 seconds time elapsed

-      16.570077000 seconds user
-      52.238012000 seconds sys
+      37.085321000 seconds user
+     320.780766000 seconds sys



--- a/pytorch/output_HPC/altra_10_30_ri2010_1000.json
+++ b/pytorch/output_HPC/altra_10_30_ri2010_1000.json
@ -0,0 +1 @@
+{"architecture": "altra", "iterations": 1000, "baseline_time_s": 10, "baseline_delay_s": 30, "power_before": [16.6, 16.64, 17.04, 17.08, 16.92, 17.24, 16.88, 16.36, 16.4, 16.4], "matrix": "ri2010", "shape": [25181, 25181], "nnz": 125750, "% density": 0.00019831796057928155, "time_s": 2.970583200454712, "power": [23.04, 23.28, 23.76, 24.12, 21.4, 26.28, 26.36], "power_after": [16.16, 16.16, 16.52, 16.48, 16.52, 16.44, 16.36, 16.48, 16.76, 16.6], "task clock (msec)": 52.61, "page faults": 3292, "cycles": 42915672, "instructions": 71002596, "branch mispredictions": 344300, "branches": 20224759, "ITLB accesses": 26039851, "ITLB misses": 5035, "DTLB misses": 16402, "DTLB accesses": 34820806, "L1I cache accesses": 31878105, "L1I cache misses": 299057, "L1D cache misses": 471869, "L1D cache accesses": 33450518, "LL cache misses": 530093, "LL cache accesses": 551126, "L2D TLB accesses": 188315, "L2D TLB misses": 22856, "L2D cache misses": 299885, "L2D cache accesses": 1763155, "instructions per cycle": 1.6544677664607, "branch miss rate": 0.01702368863826758, "ITLB miss rate": 0.00019335748119296073, "DTLB miss rate": 0.0004710402165877493, "L2D TLB miss rate": 0.12137110692191275, "L1I cache miss rate": 0.009381266546427399, "L1D cache miss rate": 0.014106478111938357, "L2D cache miss rate": 0.1700843090936418, "LL cache miss rate": 0.9618363132931489}
--- a/pytorch/output_HPC/altra_10_30_ri2010_1000.output
+++ b/pytorch/output_HPC/altra_10_30_ri2010_1000.output
@ -5,8 +5,8 @@ srun: # All submission nodes and all other compute nodes have x86_64 architectur
 srun: # CPUs. Programs, environments, or other software that was built on x86_64     #
 srun: # nodes may need to be rebuilt to properly execute on these nodes.             #
 srun: ################################################################################
-srun: job 3394145 queued and waiting for resources
-srun: job 3394145 has been allocated resources
+srun: job 3395268 queued and waiting for resources
+srun: job 3395268 has been allocated resources
 /nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
  ).to_sparse_csr().type(torch.float)
 tensor(crow_indices=tensor([     0,      3,      8,  ..., 125742, 125747,
@ -14,37 +14,38 @@ tensor(crow_indices=tensor([     0,      3,      8,  ..., 125742, 125747,
       col_indices=tensor([   25,    56,   662,  ..., 21738, 22279, 23882]),
       values=tensor([17171., 37318.,  5284.,  ..., 25993., 24918.,   803.]),
       size=(25181, 25181), nnz=125750, layout=torch.sparse_csr)
-tensor([0.1402, 0.0708, 0.4576,  ..., 0.4700, 0.5629, 0.9120])
+tensor([0.4029, 0.5373, 0.8376,  ..., 0.9299, 0.3127, 0.4778])
+Matrix: ri2010
 Shape: torch.Size([25181, 25181])
 NNZ: 125750
 Density: 0.00019831796057928155
-Time: 0.3585643768310547 seconds
+Time: 2.9858975410461426 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/ri2010.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/ri2010.mtx 1000':

-             60.77 msec task-clock:u                     #    0.016 CPUs utilized             
+             52.61 msec task-clock:u                     #    0.008 CPUs utilized             
                 0      context-switches:u               #    0.000 /sec                      
                 0      cpu-migrations:u                 #    0.000 /sec                      
-             3,361      page-faults:u                    #   55.311 K/sec                     
-        63,493,475      cycles:u                         #    1.045 GHz                         (49.59%)
-        91,578,911      instructions:u                   #    1.44  insn per cycle              (92.22%)
+             3,292      page-faults:u                    #   62.576 K/sec                     
+        42,915,672      cycles:u                         #    0.816 GHz                         (55.04%)
+        71,002,596      instructions:u                   #    1.65  insn per cycle              (81.89%)
   <not supported>      branches:u                                                            
-           374,941      branch-misses:u                                                       
-        33,905,978      L1-dcache-loads:u                #  557.979 M/sec                     
-           470,553      L1-dcache-load-misses:u          #    1.39% of all L1-dcache accesses 
+           369,793      branch-misses:u                                                       
+        33,163,106      L1-dcache-loads:u                #  630.381 M/sec                     
+           471,533      L1-dcache-load-misses:u          #    1.42% of all L1-dcache accesses 
   <not supported>      LLC-loads:u                                                           
   <not supported>      LLC-load-misses:u                                                     
-        32,247,376      L1-icache-loads:u                #  530.684 M/sec                     
-           299,037      L1-icache-load-misses:u          #    0.93% of all L1-icache accesses 
-        27,428,635      dTLB-loads:u                     #  451.384 M/sec                       (13.50%)
-     <not counted>      dTLB-load-misses:u                                                      (0.00%)
+        31,640,002      L1-icache-loads:u                #  601.429 M/sec                     
+           297,919      L1-icache-load-misses:u          #    0.94% of all L1-icache accesses 
+        48,642,108      dTLB-loads:u                     #  924.614 M/sec                       (29.77%)
+                 0      dTLB-load-misses:u                                                      (5.06%)
     <not counted>      iTLB-loads:u                                                            (0.00%)
     <not counted>      iTLB-load-misses:u                                                      (0.00%)

-       3.818532962 seconds time elapsed
+       6.215745697 seconds time elapsed

-      15.563570000 seconds user
-      30.194882000 seconds sys
+      17.600216000 seconds user
+      30.777524000 seconds sys



@ -55,21 +56,22 @@ tensor(crow_indices=tensor([     0,      3,      8,  ..., 125742, 125747,
       col_indices=tensor([   25,    56,   662,  ..., 21738, 22279, 23882]),
       values=tensor([17171., 37318.,  5284.,  ..., 25993., 24918.,   803.]),
       size=(25181, 25181), nnz=125750, layout=torch.sparse_csr)
-tensor([0.1841, 0.4436, 0.8281,  ..., 0.0546, 0.5967, 0.9496])
+tensor([0.8706, 0.3724, 0.8779,  ..., 0.4299, 0.0920, 0.4238])
+Matrix: ri2010
 Shape: torch.Size([25181, 25181])
 NNZ: 125750
 Density: 0.00019831796057928155
-Time: 0.3050577640533447 seconds
+Time: 2.9231789112091064 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/ri2010.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/ri2010.mtx 1000':

-           329,084      BR_MIS_PRED_RETIRED:u            #      0.0 per branch  branch_misprediction_ratio
-        20,406,595      BR_RETIRED:u                                                          
+           344,300      BR_MIS_PRED_RETIRED:u            #      0.0 per branch  branch_misprediction_ratio
+        20,224,759      BR_RETIRED:u                                                          

-       3.673527837 seconds time elapsed
+       6.297708483 seconds time elapsed

-      15.520198000 seconds user
-      29.068211000 seconds sys
+      17.546068000 seconds user
+      26.920857000 seconds sys



@ -80,23 +82,24 @@ tensor(crow_indices=tensor([     0,      3,      8,  ..., 125742, 125747,
       col_indices=tensor([   25,    56,   662,  ..., 21738, 22279, 23882]),
       values=tensor([17171., 37318.,  5284.,  ..., 25993., 24918.,   803.]),
       size=(25181, 25181), nnz=125750, layout=torch.sparse_csr)
-tensor([0.1849, 0.5991, 0.5040,  ..., 0.4916, 0.4789, 0.8887])
+tensor([0.2988, 0.0160, 0.4360,  ..., 0.7543, 0.0919, 0.2321])
+Matrix: ri2010
 Shape: torch.Size([25181, 25181])
 NNZ: 125750
 Density: 0.00019831796057928155
-Time: 0.3605458736419678 seconds
+Time: 2.9701316356658936 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/ri2010.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/ri2010.mtx 1000':

-        26,859,919      L1I_TLB:u                        #      0.0 per TLB access  itlb_walk_ratio
-             6,237      ITLB_WALK:u                                                           
-            16,689      DTLB_WALK:u                      #      0.0 per TLB access  dtlb_walk_ratio
-        36,348,977      L1D_TLB:u                                                             
+        26,039,851      L1I_TLB:u                        #      0.0 per TLB access  itlb_walk_ratio
+             5,035      ITLB_WALK:u                                                           
+            16,402      DTLB_WALK:u                      #      0.0 per TLB access  dtlb_walk_ratio
+        34,820,806      L1D_TLB:u                                                             

-       3.769690988 seconds time elapsed
+       6.227977259 seconds time elapsed

-      15.173839000 seconds user
-      29.963392000 seconds sys
+      17.937381000 seconds user
+      30.196552000 seconds sys



@ -107,23 +110,24 @@ tensor(crow_indices=tensor([     0,      3,      8,  ..., 125742, 125747,
       col_indices=tensor([   25,    56,   662,  ..., 21738, 22279, 23882]),
       values=tensor([17171., 37318.,  5284.,  ..., 25993., 24918.,   803.]),
       size=(25181, 25181), nnz=125750, layout=torch.sparse_csr)
-tensor([0.0513, 0.4498, 0.6748,  ..., 0.2114, 0.6847, 0.2188])
+tensor([0.5797, 0.8992, 0.8317,  ..., 0.0283, 0.7124, 0.2690])
+Matrix: ri2010
 Shape: torch.Size([25181, 25181])
 NNZ: 125750
 Density: 0.00019831796057928155
-Time: 0.3485410213470459 seconds
+Time: 2.968733072280884 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/ri2010.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/ri2010.mtx 1000':

-        30,979,764      L1I_CACHE:u                      #      0.0 per cache access  l1i_cache_miss_ratio
-           292,038      L1I_CACHE_REFILL:u                                                    
-           469,219      L1D_CACHE_REFILL:u               #      0.0 per cache access  l1d_cache_miss_ratio
-        32,411,890      L1D_CACHE:u                                                           
+        31,878,105      L1I_CACHE:u                      #      0.0 per cache access  l1i_cache_miss_ratio
+           299,057      L1I_CACHE_REFILL:u                                                    
+           471,869      L1D_CACHE_REFILL:u               #      0.0 per cache access  l1d_cache_miss_ratio
+        33,450,518      L1D_CACHE:u                                                           

-       3.598754329 seconds time elapsed
+       6.278062824 seconds time elapsed

-      16.139631000 seconds user
-      29.287026000 seconds sys
+      17.822878000 seconds user
+      27.932170000 seconds sys



@ -134,25 +138,26 @@ tensor(crow_indices=tensor([     0,      3,      8,  ..., 125742, 125747,
       col_indices=tensor([   25,    56,   662,  ..., 21738, 22279, 23882]),
       values=tensor([17171., 37318.,  5284.,  ..., 25993., 24918.,   803.]),
       size=(25181, 25181), nnz=125750, layout=torch.sparse_csr)
-tensor([0.7270, 0.7858, 0.3165,  ..., 0.7139, 0.8270, 0.9478])
+tensor([0.0630, 0.5194, 0.8720,  ..., 0.9537, 0.3959, 0.5550])
+Matrix: ri2010
 Shape: torch.Size([25181, 25181])
 NNZ: 125750
 Density: 0.00019831796057928155
-Time: 0.3687746524810791 seconds
+Time: 2.9069995880126953 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/ri2010.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/ri2010.mtx 1000':

-           571,870      LL_CACHE_MISS_RD:u               #      1.0 per cache access  ll_cache_read_miss_ratio
-           598,306      LL_CACHE_RD:u                                                         
-           205,488      L2D_TLB:u                        #      0.1 per TLB access  l2_tlb_miss_ratio
-            26,392      L2D_TLB_REFILL:u                                                      
-           342,141      L2D_CACHE_REFILL:u               #      0.2 per cache access  l2_cache_miss_ratio
-         1,857,697      L2D_CACHE:u                                                           
+           530,093      LL_CACHE_MISS_RD:u               #      1.0 per cache access  ll_cache_read_miss_ratio
+           551,126      LL_CACHE_RD:u                                                         
+           188,315      L2D_TLB:u                        #      0.1 per TLB access  l2_tlb_miss_ratio
+            22,856      L2D_TLB_REFILL:u                                                      
+           299,885      L2D_CACHE_REFILL:u               #      0.2 per cache access  l2_cache_miss_ratio
+         1,763,155      L2D_CACHE:u                                                           

-       3.726794738 seconds time elapsed
+       6.075529293 seconds time elapsed

-      15.231331000 seconds user
-      32.108693000 seconds sys
+      17.073983000 seconds user
+      27.811966000 seconds sys



--- a/pytorch/output_HPC/altra_10_30_rma10_1000.json
+++ b/pytorch/output_HPC/altra_10_30_rma10_1000.json
@ -0,0 +1 @@
+{"architecture": "altra", "iterations": 1000, "baseline_time_s": 10, "baseline_delay_s": 30, "power_before": [31.36, 30.64, 31.12, 24.52, 24.16, 23.12, 22.08, 21.28, 21.16, 20.88], "matrix": "rma10", "shape": [46835, 46835], "nnz": 2374001, "% density": 0.0010822805369125833, "time_s": 68.86891412734985, "power": [81.8, 81.32, 75.08, 63.48, 51.92, 51.96, 51.8, 65.0, 65.0, 75.12, 82.68, 82.32, 82.08, 82.76, 82.8, 83.6, 83.36, 83.08, 82.88, 83.0, 83.32, 83.32, 83.36, 84.64, 84.56, 84.24, 83.52, 83.4, 83.36, 83.36, 83.72, 84.16, 83.24, 82.76, 82.76, 82.96, 82.36, 82.24, 81.64, 81.6, 81.4, 81.6, 81.88, 82.32, 83.04, 83.48, 83.48, 84.32, 84.04, 84.32, 83.16, 82.44, 81.96, 81.4, 81.8, 82.08, 81.8, 81.84, 82.04, 82.04, 82.08, 82.44, 82.6, 82.84, 83.8, 84.24, 84.6, 85.4, 85.6, 86.0, 85.72, 85.36], "power_after": [21.96, 21.88, 21.96, 21.96, 22.0, 21.68, 21.44, 21.16, 21.04, 20.92], "task clock (msec)": 58.3, "page faults": 3281, "cycles": 81319364, "instructions": 90830397, "branch mispredictions": 342237, "branches": 20641135, "ITLB accesses": 27974213, "ITLB misses": 6660, "DTLB misses": 18441, "DTLB accesses": 37780346, "L1I cache accesses": 31166891, "L1I cache misses": 291301, "L1D cache misses": 477186, "L1D cache accesses": 32682323, "LL cache misses": 538552, "LL cache accesses": 552543, "L2D TLB accesses": 202351, "L2D TLB misses": 24178, "L2D cache misses": 298051, "L2D cache accesses": 1775481, "instructions per cycle": 1.1169590185186398, "branch miss rate": 0.01658033824205888, "ITLB miss rate": 0.00023807640272132053, "DTLB miss rate": 0.00048811093471722044, "L2D TLB miss rate": 0.11948544855226809, "L1I cache miss rate": 0.00934648887500521, "L1D cache miss rate": 0.014600736918241704, "L2D cache miss rate": 0.1678705657790762, "LL cache miss rate": 0.9746788937693537}
--- a/pytorch/output_HPC/altra_10_30_rma10_1000.output
+++ b/pytorch/output_HPC/altra_10_30_rma10_1000.output
@ -0,0 +1,168 @@
+srun: Job time limit was unset; set to partition default of 60 minutes
+srun: ################################################################################
+srun: # Please note that the oasis compute nodes have aarch64 architecture CPUs.     #
+srun: # All submission nodes and all other compute nodes have x86_64 architecture    #
+srun: # CPUs. Programs, environments, or other software that was built on x86_64     #
+srun: # nodes may need to be rebuilt to properly execute on these nodes.             #
+srun: ################################################################################
+srun: job 3395286 queued and waiting for resources
+srun: job 3395286 has been allocated resources
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([      0,      17,      34,  ..., 2373939,
+                            2373970, 2374001]),
+       col_indices=tensor([    0,     1,     2,  ..., 46831, 46833, 46834]),
+       values=tensor([ 1.2636e+05, -1.6615e+07, -8.2015e+04,  ...,
+                       8.3378e+01,  2.5138e+00,  1.2184e+03]),
+       size=(46835, 46835), nnz=2374001, layout=torch.sparse_csr)
+tensor([0.4937, 0.5946, 0.4240,  ..., 0.9888, 0.5278, 0.9155])
+Matrix: rma10
+Shape: torch.Size([46835, 46835])
+NNZ: 2374001
+Density: 0.0010822805369125833
+Time: 52.320035219192505 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/rma10.mtx 1000':
+
+             58.30 msec task-clock:u                     #    0.001 CPUs utilized             
+                 0      context-switches:u               #    0.000 /sec                      
+                 0      cpu-migrations:u                 #    0.000 /sec                      
+             3,281      page-faults:u                    #   56.279 K/sec                     
+        81,319,364      cycles:u                         #    1.395 GHz                         (62.38%)
+        90,830,397      instructions:u                   #    1.12  insn per cycle              (94.62%)
+   <not supported>      branches:u                                                            
+           358,947      branch-misses:u                                                       
+        32,561,141      L1-dcache-loads:u                #  558.523 M/sec                     
+           477,147      L1-dcache-load-misses:u          #    1.47% of all L1-dcache accesses 
+   <not supported>      LLC-loads:u                                                           
+   <not supported>      LLC-load-misses:u                                                     
+        31,044,361      L1-icache-loads:u                #  532.506 M/sec                     
+           286,125      L1-icache-load-misses:u          #    0.92% of all L1-icache accesses 
+        29,678,379      dTLB-loads:u                     #  509.075 M/sec                       (5.72%)
+     <not counted>      dTLB-load-misses:u                                                      (0.00%)
+     <not counted>      iTLB-loads:u                                                            (0.00%)
+     <not counted>      iTLB-load-misses:u                                                      (0.00%)
+
+      56.145511940 seconds time elapsed
+
+     269.541895000 seconds user
+    3993.928150000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([      0,      17,      34,  ..., 2373939,
+                            2373970, 2374001]),
+       col_indices=tensor([    0,     1,     2,  ..., 46831, 46833, 46834]),
+       values=tensor([ 1.2636e+05, -1.6615e+07, -8.2015e+04,  ...,
+                       8.3378e+01,  2.5138e+00,  1.2184e+03]),
+       size=(46835, 46835), nnz=2374001, layout=torch.sparse_csr)
+tensor([0.2401, 0.9608, 0.9686,  ..., 0.2643, 0.1097, 0.0695])
+Matrix: rma10
+Shape: torch.Size([46835, 46835])
+NNZ: 2374001
+Density: 0.0010822805369125833
+Time: 65.29214668273926 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/rma10.mtx 1000':
+
+           342,237      BR_MIS_PRED_RETIRED:u            #      0.0 per branch  branch_misprediction_ratio
+        20,641,135      BR_RETIRED:u                                                          
+
+      69.131216008 seconds time elapsed
+
+     324.908899000 seconds user
+    4969.165543000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([      0,      17,      34,  ..., 2373939,
+                            2373970, 2374001]),
+       col_indices=tensor([    0,     1,     2,  ..., 46831, 46833, 46834]),
+       values=tensor([ 1.2636e+05, -1.6615e+07, -8.2015e+04,  ...,
+                       8.3378e+01,  2.5138e+00,  1.2184e+03]),
+       size=(46835, 46835), nnz=2374001, layout=torch.sparse_csr)
+tensor([0.5237, 0.3525, 0.2809,  ..., 0.8641, 0.3894, 0.4198])
+Matrix: rma10
+Shape: torch.Size([46835, 46835])
+NNZ: 2374001
+Density: 0.0010822805369125833
+Time: 66.05637407302856 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/rma10.mtx 1000':
+
+        27,974,213      L1I_TLB:u                        #      0.0 per TLB access  itlb_walk_ratio
+             6,660      ITLB_WALK:u                                                           
+            18,441      DTLB_WALK:u                      #      0.0 per TLB access  dtlb_walk_ratio
+        37,780,346      L1D_TLB:u                                                             
+
+      69.880637029 seconds time elapsed
+
+     320.759259000 seconds user
+    5037.255757000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([      0,      17,      34,  ..., 2373939,
+                            2373970, 2374001]),
+       col_indices=tensor([    0,     1,     2,  ..., 46831, 46833, 46834]),
+       values=tensor([ 1.2636e+05, -1.6615e+07, -8.2015e+04,  ...,
+                       8.3378e+01,  2.5138e+00,  1.2184e+03]),
+       size=(46835, 46835), nnz=2374001, layout=torch.sparse_csr)
+tensor([0.8185, 0.4278, 0.7553,  ..., 0.5022, 0.1058, 0.0783])
+Matrix: rma10
+Shape: torch.Size([46835, 46835])
+NNZ: 2374001
+Density: 0.0010822805369125833
+Time: 63.55399775505066 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/rma10.mtx 1000':
+
+        31,166,891      L1I_CACHE:u                      #      0.0 per cache access  l1i_cache_miss_ratio
+           291,301      L1I_CACHE_REFILL:u                                                    
+           477,186      L1D_CACHE_REFILL:u               #      0.0 per cache access  l1d_cache_miss_ratio
+        32,682,323      L1D_CACHE:u                                                           
+
+      67.517251505 seconds time elapsed
+
+     319.301754000 seconds user
+    4839.755901000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([      0,      17,      34,  ..., 2373939,
+                            2373970, 2374001]),
+       col_indices=tensor([    0,     1,     2,  ..., 46831, 46833, 46834]),
+       values=tensor([ 1.2636e+05, -1.6615e+07, -8.2015e+04,  ...,
+                       8.3378e+01,  2.5138e+00,  1.2184e+03]),
+       size=(46835, 46835), nnz=2374001, layout=torch.sparse_csr)
+tensor([0.8358, 0.0086, 0.1779,  ..., 0.6354, 0.7134, 0.5745])
+Matrix: rma10
+Shape: torch.Size([46835, 46835])
+NNZ: 2374001
+Density: 0.0010822805369125833
+Time: 63.55393171310425 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/rma10.mtx 1000':
+
+           538,552      LL_CACHE_MISS_RD:u               #      1.0 per cache access  ll_cache_read_miss_ratio
+           552,543      LL_CACHE_RD:u                                                         
+           202,351      L2D_TLB:u                        #      0.1 per TLB access  l2_tlb_miss_ratio
+            24,178      L2D_TLB_REFILL:u                                                      
+           298,051      L2D_CACHE_REFILL:u               #      0.2 per cache access  l2_cache_miss_ratio
+         1,775,481      L2D_CACHE:u                                                           
+
+      67.538674790 seconds time elapsed
+
+     321.810383000 seconds user
+    4836.154538000 seconds sys
+
+
+
--- a/pytorch/output_HPC/altra_10_30_tn2010_1000.json
+++ b/pytorch/output_HPC/altra_10_30_tn2010_1000.json
@ -0,0 +1 @@
+{"architecture": "altra", "iterations": 1000, "baseline_time_s": 10, "baseline_delay_s": 30, "power_before": [37.56, 23.12, 22.32, 22.28, 22.28, 21.96, 21.76, 21.72, 21.88, 21.84], "matrix": "tn2010", "shape": [240116, 240116], "nnz": 1193966, "% density": 2.070855328296721e-05, "time_s": 16.282614707946777, "power": [85.48, 85.84, 79.28, 70.16, 55.52, 49.48, 49.48, 60.48, 76.32, 88.88, 91.0, 91.0, 90.68, 88.32, 86.92, 86.4, 88.08, 86.8, 87.32, 87.8], "power_after": [21.68, 21.48, 21.44, 21.36, 21.52, 21.4, 21.4, 21.32, 21.2, 21.04], "task clock (msec)": 68.11, "page faults": 3486, "cycles": 70427921, "instructions": 85638293, "branch mispredictions": 333780, "branches": 19402540, "ITLB accesses": 26935483, "ITLB misses": 5639, "DTLB misses": 16688, "DTLB accesses": 36421540, "L1I cache accesses": 33029213, "L1I cache misses": 302558, "L1D cache misses": 481598, "L1D cache accesses": 34668833, "LL cache misses": 551659, "LL cache accesses": 564579, "L2D TLB accesses": 188346, "L2D TLB misses": 24479, "L2D cache misses": 311796, "L2D cache accesses": 1767924, "instructions per cycle": 1.215970765344614, "branch miss rate": 0.017202902300420462, "ITLB miss rate": 0.0002093521025778524, "DTLB miss rate": 0.00045819040051573877, "L2D TLB miss rate": 0.12996824992301403, "L1I cache miss rate": 0.00916031514284037, "L1D cache miss rate": 0.013891381922200843, "L2D cache miss rate": 0.17636278482559206, "LL cache miss rate": 0.9771156915152707}
--- a/pytorch/output_HPC/altra_10_30_tn2010_1000.output
+++ b/pytorch/output_HPC/altra_10_30_tn2010_1000.output
@ -0,0 +1,173 @@
+srun: Job time limit was unset; set to partition default of 60 minutes
+srun: ################################################################################
+srun: # Please note that the oasis compute nodes have aarch64 architecture CPUs.     #
+srun: # All submission nodes and all other compute nodes have x86_64 architecture    #
+srun: # CPUs. Programs, environments, or other software that was built on x86_64     #
+srun: # nodes may need to be rebuilt to properly execute on these nodes.             #
+srun: ################################################################################
+srun: job 3395275 queued and waiting for resources
+srun: job 3395275 has been allocated resources
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([      0,       4,      20,  ..., 1193961,
+                            1193963, 1193966]),
+       col_indices=tensor([  1152,   1272,   1961,  ..., 238254, 239142,
+                           240113]),
+       values=tensor([  5728.,   2871., 418449.,  ...,  10058.,  33324.,
+                       34928.]), size=(240116, 240116), nnz=1193966,
+       layout=torch.sparse_csr)
+tensor([0.2511, 0.1104, 0.8257,  ..., 0.4006, 0.1534, 0.0009])
+Matrix: tn2010
+Shape: torch.Size([240116, 240116])
+NNZ: 1193966
+Density: 2.070855328296721e-05
+Time: 12.89618182182312 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/tn2010.mtx 1000':
+
+             68.11 msec task-clock:u                     #    0.004 CPUs utilized             
+                 0      context-switches:u               #    0.000 /sec                      
+                 0      cpu-migrations:u                 #    0.000 /sec                      
+             3,486      page-faults:u                    #   51.182 K/sec                     
+        70,427,921      cycles:u                         #    1.034 GHz                         (46.81%)
+        85,638,293      instructions:u                   #    1.22  insn per cycle              (74.19%)
+   <not supported>      branches:u                                                            
+           356,748      branch-misses:u                                                         (89.74%)
+        34,044,117      L1-dcache-loads:u                #  499.843 M/sec                     
+           481,076      L1-dcache-load-misses:u          #    1.41% of all L1-dcache accesses 
+   <not supported>      LLC-loads:u                                                           
+   <not supported>      LLC-load-misses:u                                                     
+        32,553,977      L1-icache-loads:u                #  477.965 M/sec                     
+           309,127      L1-icache-load-misses:u          #    0.95% of all L1-icache accesses 
+        41,245,978      dTLB-loads:u                     #  605.583 M/sec                       (33.60%)
+           127,770      dTLB-load-misses:u               #    0.31% of all dTLB cache accesses  (15.43%)
+     <not counted>      iTLB-loads:u                                                            (0.00%)
+     <not counted>      iTLB-load-misses:u                                                      (0.00%)
+
+      16.626373547 seconds time elapsed
+
+     101.073288000 seconds user
+     996.348020000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([      0,       4,      20,  ..., 1193961,
+                            1193963, 1193966]),
+       col_indices=tensor([  1152,   1272,   1961,  ..., 238254, 239142,
+                           240113]),
+       values=tensor([  5728.,   2871., 418449.,  ...,  10058.,  33324.,
+                       34928.]), size=(240116, 240116), nnz=1193966,
+       layout=torch.sparse_csr)
+tensor([0.0138, 0.1394, 0.6273,  ..., 0.8681, 0.0444, 0.2705])
+Matrix: tn2010
+Shape: torch.Size([240116, 240116])
+NNZ: 1193966
+Density: 2.070855328296721e-05
+Time: 14.216531038284302 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/tn2010.mtx 1000':
+
+           333,780      BR_MIS_PRED_RETIRED:u            #      0.0 per branch  branch_misprediction_ratio
+        19,402,540      BR_RETIRED:u                                                          
+
+      17.985093703 seconds time elapsed
+
+     106.904608000 seconds user
+    1091.172933000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([      0,       4,      20,  ..., 1193961,
+                            1193963, 1193966]),
+       col_indices=tensor([  1152,   1272,   1961,  ..., 238254, 239142,
+                           240113]),
+       values=tensor([  5728.,   2871., 418449.,  ...,  10058.,  33324.,
+                       34928.]), size=(240116, 240116), nnz=1193966,
+       layout=torch.sparse_csr)
+tensor([0.6279, 0.1696, 0.6937,  ..., 0.4267, 0.4847, 0.6447])
+Matrix: tn2010
+Shape: torch.Size([240116, 240116])
+NNZ: 1193966
+Density: 2.070855328296721e-05
+Time: 12.462992429733276 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/tn2010.mtx 1000':
+
+        26,935,483      L1I_TLB:u                        #      0.0 per TLB access  itlb_walk_ratio
+             5,639      ITLB_WALK:u                                                           
+            16,688      DTLB_WALK:u                      #      0.0 per TLB access  dtlb_walk_ratio
+        36,421,540      L1D_TLB:u                                                             
+
+      15.984498303 seconds time elapsed
+
+      95.195897000 seconds user
+     962.237122000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([      0,       4,      20,  ..., 1193961,
+                            1193963, 1193966]),
+       col_indices=tensor([  1152,   1272,   1961,  ..., 238254, 239142,
+                           240113]),
+       values=tensor([  5728.,   2871., 418449.,  ...,  10058.,  33324.,
+                       34928.]), size=(240116, 240116), nnz=1193966,
+       layout=torch.sparse_csr)
+tensor([0.4060, 0.4915, 0.8557,  ..., 0.9902, 0.0548, 0.2450])
+Matrix: tn2010
+Shape: torch.Size([240116, 240116])
+NNZ: 1193966
+Density: 2.070855328296721e-05
+Time: 9.298198223114014 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/tn2010.mtx 1000':
+
+        33,029,213      L1I_CACHE:u                      #      0.0 per cache access  l1i_cache_miss_ratio
+           302,558      L1I_CACHE_REFILL:u                                                    
+           481,598      L1D_CACHE_REFILL:u               #      0.0 per cache access  l1d_cache_miss_ratio
+        34,668,833      L1D_CACHE:u                                                           
+
+      12.985459942 seconds time elapsed
+
+      78.950722000 seconds user
+     727.126874000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([      0,       4,      20,  ..., 1193961,
+                            1193963, 1193966]),
+       col_indices=tensor([  1152,   1272,   1961,  ..., 238254, 239142,
+                           240113]),
+       values=tensor([  5728.,   2871., 418449.,  ...,  10058.,  33324.,
+                       34928.]), size=(240116, 240116), nnz=1193966,
+       layout=torch.sparse_csr)
+tensor([0.0166, 0.6910, 0.0311,  ..., 0.6156, 0.5689, 0.9849])
+Matrix: tn2010
+Shape: torch.Size([240116, 240116])
+NNZ: 1193966
+Density: 2.070855328296721e-05
+Time: 12.012693405151367 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/tn2010.mtx 1000':
+
+           551,659      LL_CACHE_MISS_RD:u               #      1.0 per cache access  ll_cache_read_miss_ratio
+           564,579      LL_CACHE_RD:u                                                         
+           188,346      L2D_TLB:u                        #      0.1 per TLB access  l2_tlb_miss_ratio
+            24,479      L2D_TLB_REFILL:u                                                      
+           311,796      L2D_CACHE_REFILL:u               #      0.2 per cache access  l2_cache_miss_ratio
+         1,767,924      L2D_CACHE:u                                                           
+
+      15.749851583 seconds time elapsed
+
+      98.008506000 seconds user
+     926.127594000 seconds sys
+
+
+
--- a/pytorch/output_HPC/altra_10_30_ut2010_1000.json
+++ b/pytorch/output_HPC/altra_10_30_ut2010_1000.json
@ -0,0 +1 @@
+{"architecture": "altra", "iterations": 1000, "baseline_time_s": 10, "baseline_delay_s": 30, "power_before": [93.52, 87.76, 71.6, 58.32, 39.6, 26.24, 26.24, 22.16, 22.24, 22.24], "matrix": "ut2010", "shape": [115406, 115406], "nnz": 572066, "% density": 4.295259032005559e-05, "time_s": 8.478580713272095, "power": [89.68, 88.92, 80.84, 68.96, 56.64, 54.52, 55.88, 70.44, 85.36, 85.36, 98.2, 96.52], "power_after": [21.24, 21.32, 21.16, 21.44, 21.68, 21.76, 21.72, 22.0, 21.72, 21.72], "task clock (msec)": 53.84, "page faults": 3291, "cycles": 66389970, "instructions": 74935543, "branch mispredictions": 330515, "branches": 19475058, "ITLB accesses": 26125490, "ITLB misses": 6431, "DTLB misses": 13728, "DTLB accesses": 35274185, "L1I cache accesses": 30428652, "L1I cache misses": 288897, "L1D cache misses": 475615, "L1D cache accesses": 31855716, "LL cache misses": 553829, "LL cache accesses": 574192, "L2D TLB accesses": 181148, "L2D TLB misses": 23202, "L2D cache misses": 307806, "L2D cache accesses": 1767037, "instructions per cycle": 1.1287178319255153, "branch miss rate": 0.016971194642911976, "ITLB miss rate": 0.00024615806248992844, "DTLB miss rate": 0.0003891797925309968, "L2D TLB miss rate": 0.12808311435952924, "L1I cache miss rate": 0.009494242465949527, "L1D cache miss rate": 0.014930287550278261, "L2D cache miss rate": 0.17419329646181717, "LL cache miss rate": 0.9645362526820297}
--- a/pytorch/output_HPC/altra_10_30_ut2010_1000.output
+++ b/pytorch/output_HPC/altra_10_30_ut2010_1000.output
@ -5,8 +5,8 @@ srun: # All submission nodes and all other compute nodes have x86_64 architectur
 srun: # CPUs. Programs, environments, or other software that was built on x86_64     #
 srun: # nodes may need to be rebuilt to properly execute on these nodes.             #
 srun: ################################################################################
-srun: job 3394146 queued and waiting for resources
-srun: job 3394146 has been allocated resources
+srun: job 3395284 queued and waiting for resources
+srun: job 3395284 has been allocated resources
 /nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
  ).to_sparse_csr().type(torch.float)
 tensor(crow_indices=tensor([     0,      3,      9,  ..., 572056, 572061,
@ -16,37 +16,38 @@ tensor(crow_indices=tensor([     0,      3,      9,  ..., 572056, 572061,
       values=tensor([160642.,  31335., 282373.,  ...,  88393.,  99485.,
                       18651.]), size=(115406, 115406), nnz=572066,
       layout=torch.sparse_csr)
-tensor([0.4608, 0.1516, 0.8492,  ..., 0.8920, 0.4275, 0.8070])
+tensor([0.1487, 0.4275, 0.9471,  ..., 0.3851, 0.0801, 0.4295])
+Matrix: ut2010
 Shape: torch.Size([115406, 115406])
 NNZ: 572066
 Density: 4.295259032005559e-05
-Time: 1.3751039505004883 seconds
+Time: 8.772023677825928 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/ut2010.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/ut2010.mtx 1000':

-             60.55 msec task-clock:u                     #    0.012 CPUs utilized             
+             53.84 msec task-clock:u                     #    0.004 CPUs utilized             
                 0      context-switches:u               #    0.000 /sec                      
                 0      cpu-migrations:u                 #    0.000 /sec                      
-             3,490      page-faults:u                    #   57.638 K/sec                     
-        49,977,496      cycles:u                         #    0.825 GHz                         (40.93%)
-        78,622,993      instructions:u                   #    1.57  insn per cycle              (85.37%)
+             3,291      page-faults:u                    #   61.127 K/sec                     
+        66,389,970      cycles:u                         #    1.233 GHz                         (67.37%)
+        74,935,543      instructions:u                   #    1.13  insn per cycle              (83.30%)
   <not supported>      branches:u                                                            
-           358,029      branch-misses:u                                                       
-        31,478,500      L1-dcache-loads:u                #  519.877 M/sec                     
-           479,449      L1-dcache-load-misses:u          #    1.52% of all L1-dcache accesses 
+           365,846      branch-misses:u                                                       
+        31,684,169      L1-dcache-loads:u                #  588.504 M/sec                     
+           462,583      L1-dcache-load-misses:u          #    1.46% of all L1-dcache accesses 
   <not supported>      LLC-loads:u                                                           
   <not supported>      LLC-load-misses:u                                                     
-        29,991,824      L1-icache-loads:u                #  495.324 M/sec                     
-           294,864      L1-icache-load-misses:u          #    0.98% of all L1-icache accesses 
-        35,154,647      dTLB-loads:u                     #  580.589 M/sec                       (23.19%)
+        30,260,337      L1-icache-loads:u                #  562.058 M/sec                     
+           288,196      L1-icache-load-misses:u          #    0.95% of all L1-icache accesses 
+        57,721,334      dTLB-loads:u                     #    1.072 G/sec                       (18.54%)
     <not counted>      dTLB-load-misses:u                                                      (0.00%)
     <not counted>      iTLB-loads:u                                                            (0.00%)
     <not counted>      iTLB-load-misses:u                                                      (0.00%)

-       4.986156121 seconds time elapsed
+      12.179628060 seconds time elapsed

-      23.724703000 seconds user
-     145.034521000 seconds sys
+      68.068275000 seconds user
+     690.223452000 seconds sys



@ -59,21 +60,22 @@ tensor(crow_indices=tensor([     0,      3,      9,  ..., 572056, 572061,
       values=tensor([160642.,  31335., 282373.,  ...,  88393.,  99485.,
                       18651.]), size=(115406, 115406), nnz=572066,
       layout=torch.sparse_csr)
-tensor([0.4697, 0.7121, 0.5987,  ..., 0.2619, 0.7308, 0.3129])
+tensor([0.9553, 0.9401, 0.7135,  ..., 0.8664, 0.5986, 0.8459])
+Matrix: ut2010
 Shape: torch.Size([115406, 115406])
 NNZ: 572066
 Density: 4.295259032005559e-05
-Time: 1.6881086826324463 seconds
+Time: 8.94040060043335 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/ut2010.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/ut2010.mtx 1000':

-           327,078      BR_MIS_PRED_RETIRED:u            #      0.0 per branch  branch_misprediction_ratio
-        20,135,808      BR_RETIRED:u                                                          
+           330,515      BR_MIS_PRED_RETIRED:u            #      0.0 per branch  branch_misprediction_ratio
+        19,475,058      BR_RETIRED:u                                                          

-       5.374156677 seconds time elapsed
+      12.428594105 seconds time elapsed

-      25.609168000 seconds user
-     167.278028000 seconds sys
+      67.011228000 seconds user
+     709.528404000 seconds sys



@ -86,23 +88,24 @@ tensor(crow_indices=tensor([     0,      3,      9,  ..., 572056, 572061,
       values=tensor([160642.,  31335., 282373.,  ...,  88393.,  99485.,
                       18651.]), size=(115406, 115406), nnz=572066,
       layout=torch.sparse_csr)
-tensor([0.9215, 0.6706, 0.8015,  ..., 0.8507, 0.8546, 0.4441])
+tensor([0.6289, 0.8171, 0.1590,  ..., 0.7515, 0.5400, 0.3693])
+Matrix: ut2010
 Shape: torch.Size([115406, 115406])
 NNZ: 572066
 Density: 4.295259032005559e-05
-Time: 1.2785694599151611 seconds
+Time: 14.403366804122925 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/ut2010.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/ut2010.mtx 1000':

-        27,608,093      L1I_TLB:u                        #      0.0 per TLB access  itlb_walk_ratio
-             6,616      ITLB_WALK:u                                                           
-            17,185      DTLB_WALK:u                      #      0.0 per TLB access  dtlb_walk_ratio
-        36,866,957      L1D_TLB:u                                                             
+        26,125,490      L1I_TLB:u                        #      0.0 per TLB access  itlb_walk_ratio
+             6,431      ITLB_WALK:u                                                           
+            13,728      DTLB_WALK:u                      #      0.0 per TLB access  dtlb_walk_ratio
+        35,274,185      L1D_TLB:u                                                             

-       4.861513311 seconds time elapsed
+      18.084508405 seconds time elapsed

-      23.339077000 seconds user
-     141.584760000 seconds sys
+      95.162133000 seconds user
+    1117.716009000 seconds sys



@ -115,23 +118,24 @@ tensor(crow_indices=tensor([     0,      3,      9,  ..., 572056, 572061,
       values=tensor([160642.,  31335., 282373.,  ...,  88393.,  99485.,
                       18651.]), size=(115406, 115406), nnz=572066,
       layout=torch.sparse_csr)
-tensor([0.8973, 0.5228, 0.4492,  ..., 0.7677, 0.7722, 0.1700])
+tensor([0.8824, 0.0692, 0.7225,  ..., 0.8736, 0.6854, 0.7514])
+Matrix: ut2010
 Shape: torch.Size([115406, 115406])
 NNZ: 572066
 Density: 4.295259032005559e-05
-Time: 1.1654376983642578 seconds
+Time: 9.64679503440857 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/ut2010.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/ut2010.mtx 1000':

-        32,639,204      L1I_CACHE:u                      #      0.0 per cache access  l1i_cache_miss_ratio
-           309,643      L1I_CACHE_REFILL:u                                                    
-           478,856      L1D_CACHE_REFILL:u               #      0.0 per cache access  l1d_cache_miss_ratio
-        34,280,618      L1D_CACHE:u                                                           
+        30,428,652      L1I_CACHE:u                      #      0.0 per cache access  l1i_cache_miss_ratio
+           288,897      L1I_CACHE_REFILL:u                                                    
+           475,615      L1D_CACHE_REFILL:u               #      0.0 per cache access  l1d_cache_miss_ratio
+        31,855,716      L1D_CACHE:u                                                           

-       4.677973310 seconds time elapsed
+      13.170070008 seconds time elapsed

-      22.972655000 seconds user
-     125.062401000 seconds sys
+      68.362809000 seconds user
+     761.360459000 seconds sys



@ -144,25 +148,26 @@ tensor(crow_indices=tensor([     0,      3,      9,  ..., 572056, 572061,
       values=tensor([160642.,  31335., 282373.,  ...,  88393.,  99485.,
                       18651.]), size=(115406, 115406), nnz=572066,
       layout=torch.sparse_csr)
-tensor([0.4542, 0.7095, 0.5701,  ..., 0.2172, 0.8829, 0.7757])
+tensor([0.9552, 0.0509, 0.7738,  ..., 0.7722, 0.4417, 0.7772])
+Matrix: ut2010
 Shape: torch.Size([115406, 115406])
 NNZ: 572066
 Density: 4.295259032005559e-05
-Time: 1.1153452396392822 seconds
+Time: 12.372079133987427 seconds

- Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/ut2010.mtx 100':
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/ut2010.mtx 1000':

-           555,275      LL_CACHE_MISS_RD:u               #      1.0 per cache access  ll_cache_read_miss_ratio
-           578,455      LL_CACHE_RD:u                                                         
-           188,723      L2D_TLB:u                        #      0.1 per TLB access  l2_tlb_miss_ratio
-            24,635      L2D_TLB_REFILL:u                                                      
-           319,663      L2D_CACHE_REFILL:u               #      0.2 per cache access  l2_cache_miss_ratio
-         1,799,940      L2D_CACHE:u                                                           
+           553,829      LL_CACHE_MISS_RD:u               #      1.0 per cache access  ll_cache_read_miss_ratio
+           574,192      LL_CACHE_RD:u                                                         
+           181,148      L2D_TLB:u                        #      0.1 per TLB access  l2_tlb_miss_ratio
+            23,202      L2D_TLB_REFILL:u                                                      
+           307,806      L2D_CACHE_REFILL:u               #      0.2 per cache access  l2_cache_miss_ratio
+         1,767,037      L2D_CACHE:u                                                           

-       4.655024760 seconds time elapsed
+      15.923392394 seconds time elapsed

-      23.104641000 seconds user
-     122.294597000 seconds sys
+      83.307253000 seconds user
+     958.949992000 seconds sys



--- a/pytorch/output_HPC/altra_10_30_va2010_1000.json
+++ b/pytorch/output_HPC/altra_10_30_va2010_1000.json
@ -0,0 +1 @@
+{"architecture": "altra", "iterations": 1000, "baseline_time_s": 10, "baseline_delay_s": 30, "power_before": [32.08, 31.8, 28.68, 27.6, 22.96, 22.08, 21.0, 20.84, 20.68, 20.72], "matrix": "va2010", "shape": [285762, 285762], "nnz": 1402128, "% density": 1.717033263003816e-05, "time_s": 14.632386922836304, "power": [85.16, 83.48, 76.96, 67.44, 54.04, 51.4, 54.24, 66.76, 83.2, 96.44, 96.44, 95.84, 94.24, 92.36, 91.2, 89.32, 87.48, 88.68, 88.24], "power_after": [21.12, 21.0, 21.16, 21.4, 21.32, 21.36, 21.36, 21.12, 20.76, 20.84], "task clock (msec)": 57.32, "page faults": 3280, "cycles": 39497791, "instructions": 64385555, "branch mispredictions": 332792, "branches": 19983954, "ITLB accesses": 27156853, "ITLB misses": 6466, "DTLB misses": 18244, "DTLB accesses": 36466301, "L1I cache accesses": 30929971, "L1I cache misses": 291811, "L1D cache misses": 473063, "L1D cache accesses": 32462905, "LL cache misses": 544953, "LL cache accesses": 565172, "L2D TLB accesses": 183225, "L2D TLB misses": 23924, "L2D cache misses": 301362, "L2D cache accesses": 1756590, "instructions per cycle": 1.6301052127193645, "branch miss rate": 0.01665296067034582, "ITLB miss rate": 0.00023809828038616994, "DTLB miss rate": 0.000500297521264907, "L2D TLB miss rate": 0.13057170145995362, "L1I cache miss rate": 0.009434570759862659, "L1D cache miss rate": 0.014572417348354991, "L2D cache miss rate": 0.17156080815671274, "LL cache miss rate": 0.964225050073252}
--- a/pytorch/output_HPC/altra_10_30_va2010_1000.output
+++ b/pytorch/output_HPC/altra_10_30_va2010_1000.output
@ -0,0 +1,173 @@
+srun: Job time limit was unset; set to partition default of 60 minutes
+srun: ################################################################################
+srun: # Please note that the oasis compute nodes have aarch64 architecture CPUs.     #
+srun: # All submission nodes and all other compute nodes have x86_64 architecture    #
+srun: # CPUs. Programs, environments, or other software that was built on x86_64     #
+srun: # nodes may need to be rebuilt to properly execute on these nodes.             #
+srun: ################################################################################
+srun: job 3395277 queued and waiting for resources
+srun: job 3395277 has been allocated resources
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([      0,       2,       8,  ..., 1402119,
+                            1402123, 1402128]),
+       col_indices=tensor([  2006,   2464,   1166,  ..., 285581, 285634,
+                           285760]),
+       values=tensor([125334.,   3558.,   1192.,  ...,  10148.,   1763.,
+                        9832.]), size=(285762, 285762), nnz=1402128,
+       layout=torch.sparse_csr)
+tensor([0.2920, 0.3583, 0.0598,  ..., 0.2208, 0.1741, 0.4955])
+Matrix: va2010
+Shape: torch.Size([285762, 285762])
+NNZ: 1402128
+Density: 1.717033263003816e-05
+Time: 14.792448997497559 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/va2010.mtx 1000':
+
+             57.32 msec task-clock:u                     #    0.003 CPUs utilized             
+                 0      context-switches:u               #    0.000 /sec                      
+                 0      cpu-migrations:u                 #    0.000 /sec                      
+             3,280      page-faults:u                    #   57.220 K/sec                     
+        39,497,791      cycles:u                         #    0.689 GHz                         (54.25%)
+        64,385,555      instructions:u                   #    1.63  insn per cycle              (81.24%)
+   <not supported>      branches:u                                                            
+           362,674      branch-misses:u                                                       
+        33,532,520      L1-dcache-loads:u                #  584.977 M/sec                     
+           481,355      L1-dcache-load-misses:u          #    1.44% of all L1-dcache accesses 
+   <not supported>      LLC-loads:u                                                           
+   <not supported>      LLC-load-misses:u                                                     
+        31,924,348      L1-icache-loads:u                #  556.922 M/sec                     
+           296,637      L1-icache-load-misses:u          #    0.93% of all L1-icache accesses 
+        43,420,143      dTLB-loads:u                     #  757.467 M/sec                       (40.22%)
+            30,923      dTLB-load-misses:u               #    0.07% of all dTLB cache accesses  (19.05%)
+     <not counted>      iTLB-loads:u                                                            (0.00%)
+     <not counted>      iTLB-load-misses:u                                                      (0.00%)
+
+      18.678937115 seconds time elapsed
+
+     112.979167000 seconds user
+    1135.785668000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([      0,       2,       8,  ..., 1402119,
+                            1402123, 1402128]),
+       col_indices=tensor([  2006,   2464,   1166,  ..., 285581, 285634,
+                           285760]),
+       values=tensor([125334.,   3558.,   1192.,  ...,  10148.,   1763.,
+                        9832.]), size=(285762, 285762), nnz=1402128,
+       layout=torch.sparse_csr)
+tensor([0.7703, 0.7481, 0.5351,  ..., 0.4663, 0.6089, 0.3679])
+Matrix: va2010
+Shape: torch.Size([285762, 285762])
+NNZ: 1402128
+Density: 1.717033263003816e-05
+Time: 14.130552530288696 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/va2010.mtx 1000':
+
+           332,792      BR_MIS_PRED_RETIRED:u            #      0.0 per branch  branch_misprediction_ratio
+        19,983,954      BR_RETIRED:u                                                          
+
+      17.923156218 seconds time elapsed
+
+     107.999690000 seconds user
+    1091.659165000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([      0,       2,       8,  ..., 1402119,
+                            1402123, 1402128]),
+       col_indices=tensor([  2006,   2464,   1166,  ..., 285581, 285634,
+                           285760]),
+       values=tensor([125334.,   3558.,   1192.,  ...,  10148.,   1763.,
+                        9832.]), size=(285762, 285762), nnz=1402128,
+       layout=torch.sparse_csr)
+tensor([0.8850, 0.1406, 0.0617,  ..., 0.4325, 0.2725, 0.9292])
+Matrix: va2010
+Shape: torch.Size([285762, 285762])
+NNZ: 1402128
+Density: 1.717033263003816e-05
+Time: 13.32977032661438 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/va2010.mtx 1000':
+
+        27,156,853      L1I_TLB:u                        #      0.0 per TLB access  itlb_walk_ratio
+             6,466      ITLB_WALK:u                                                           
+            18,244      DTLB_WALK:u                      #      0.0 per TLB access  dtlb_walk_ratio
+        36,466,301      L1D_TLB:u                                                             
+
+      17.186572497 seconds time elapsed
+
+     104.940187000 seconds user
+    1032.527271000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([      0,       2,       8,  ..., 1402119,
+                            1402123, 1402128]),
+       col_indices=tensor([  2006,   2464,   1166,  ..., 285581, 285634,
+                           285760]),
+       values=tensor([125334.,   3558.,   1192.,  ...,  10148.,   1763.,
+                        9832.]), size=(285762, 285762), nnz=1402128,
+       layout=torch.sparse_csr)
+tensor([0.6289, 0.0403, 0.9207,  ..., 0.0183, 0.4807, 0.7504])
+Matrix: va2010
+Shape: torch.Size([285762, 285762])
+NNZ: 1402128
+Density: 1.717033263003816e-05
+Time: 13.460915803909302 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/va2010.mtx 1000':
+
+        30,929,971      L1I_CACHE:u                      #      0.0 per cache access  l1i_cache_miss_ratio
+           291,811      L1I_CACHE_REFILL:u                                                    
+           473,063      L1D_CACHE_REFILL:u               #      0.0 per cache access  l1d_cache_miss_ratio
+        32,462,905      L1D_CACHE:u                                                           
+
+      17.219448483 seconds time elapsed
+
+     100.274467000 seconds user
+    1045.271682000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([      0,       2,       8,  ..., 1402119,
+                            1402123, 1402128]),
+       col_indices=tensor([  2006,   2464,   1166,  ..., 285581, 285634,
+                           285760]),
+       values=tensor([125334.,   3558.,   1192.,  ...,  10148.,   1763.,
+                        9832.]), size=(285762, 285762), nnz=1402128,
+       layout=torch.sparse_csr)
+tensor([0.6412, 0.1151, 0.5075,  ..., 0.9251, 0.9288, 0.3560])
+Matrix: va2010
+Shape: torch.Size([285762, 285762])
+NNZ: 1402128
+Density: 1.717033263003816e-05
+Time: 15.992860555648804 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/va2010.mtx 1000':
+
+           544,953      LL_CACHE_MISS_RD:u               #      1.0 per cache access  ll_cache_read_miss_ratio
+           565,172      LL_CACHE_RD:u                                                         
+           183,225      L2D_TLB:u                        #      0.1 per TLB access  l2_tlb_miss_ratio
+            23,924      L2D_TLB_REFILL:u                                                      
+           301,362      L2D_CACHE_REFILL:u               #      0.2 per cache access  l2_cache_miss_ratio
+         1,756,590      L2D_CACHE:u                                                           
+
+      19.884223259 seconds time elapsed
+
+     113.211516000 seconds user
+    1230.525804000 seconds sys
+
+
+
--- a/pytorch/output_HPC/altra_10_30_vt2010_1000.json
+++ b/pytorch/output_HPC/altra_10_30_vt2010_1000.json
@ -0,0 +1 @@
+{"architecture": "altra", "iterations": 1000, "baseline_time_s": 10, "baseline_delay_s": 30, "power_before": [77.2, 64.12, 64.12, 48.92, 36.2, 21.72, 21.88, 22.36, 22.36, 22.44], "matrix": "vt2010", "shape": [32580, 32580], "nnz": 155598, "% density": 0.00014658915806621921, "time_s": 3.5892834663391113, "power": [33.44, 30.68, 31.08, 26.96, 26.88, 32.48, 32.24], "power_after": [21.24, 21.24, 21.36, 21.36, 21.2, 21.04, 20.84, 20.72, 20.72, 20.56], "task clock (msec)": 55.26, "page faults": 3297, "cycles": 49276491, "instructions": 64763517, "branch mispredictions": 340611, "branches": 20355849, "ITLB accesses": 27946393, "ITLB misses": 6805, "DTLB misses": 17877, "DTLB accesses": 38226912, "L1I cache accesses": 31946141, "L1I cache misses": 295259, "L1D cache misses": 468136, "L1D cache accesses": 33395666, "LL cache misses": 527109, "LL cache accesses": 540409, "L2D TLB accesses": 192519, "L2D TLB misses": 24204, "L2D cache misses": 290933, "L2D cache accesses": 1743452, "instructions per cycle": 1.3142883286880147, "branch miss rate": 0.016732831924622747, "ITLB miss rate": 0.00024350190738389746, "DTLB miss rate": 0.0004676548291423592, "L2D TLB miss rate": 0.1257226559456469, "L1I cache miss rate": 0.009242399574959616, "L1D cache miss rate": 0.014017866869311724, "L2D cache miss rate": 0.16687181522634406, "LL cache miss rate": 0.9753890109158063}
--- a/pytorch/output_HPC/altra_10_30_vt2010_1000.output
+++ b/pytorch/output_HPC/altra_10_30_vt2010_1000.output
@ -0,0 +1,163 @@
+srun: Job time limit was unset; set to partition default of 60 minutes
+srun: ################################################################################
+srun: # Please note that the oasis compute nodes have aarch64 architecture CPUs.     #
+srun: # All submission nodes and all other compute nodes have x86_64 architecture    #
+srun: # CPUs. Programs, environments, or other software that was built on x86_64     #
+srun: # nodes may need to be rebuilt to properly execute on these nodes.             #
+srun: ################################################################################
+srun: job 3395285 queued and waiting for resources
+srun: job 3395285 has been allocated resources
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([     0,      4,      7,  ..., 155588, 155592,
+                            155598]),
+       col_indices=tensor([  131,   561,   996,  ..., 32237, 32238, 32570]),
+       values=tensor([79040.,  7820., 15136.,  ...,  2828., 17986.,  2482.]),
+       size=(32580, 32580), nnz=155598, layout=torch.sparse_csr)
+tensor([0.1179, 0.2288, 0.5357,  ..., 0.4845, 0.6375, 0.4513])
+Matrix: vt2010
+Shape: torch.Size([32580, 32580])
+NNZ: 155598
+Density: 0.00014658915806621921
+Time: 3.628732681274414 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/vt2010.mtx 1000':
+
+             55.26 msec task-clock:u                     #    0.008 CPUs utilized             
+                 0      context-switches:u               #    0.000 /sec                      
+                 0      cpu-migrations:u                 #    0.000 /sec                      
+             3,297      page-faults:u                    #   59.661 K/sec                     
+        49,276,491      cycles:u                         #    0.892 GHz                         (31.65%)
+        64,763,517      instructions:u                   #    1.31  insn per cycle              (57.73%)
+   <not supported>      branches:u                                                            
+           357,693      branch-misses:u                                                         (76.18%)
+        32,426,852      L1-dcache-loads:u                #  586.784 M/sec                       (88.36%)
+           469,495      L1-dcache-load-misses:u          #    1.45% of all L1-dcache accesses 
+   <not supported>      LLC-loads:u                                                           
+   <not supported>      LLC-load-misses:u                                                     
+        30,941,957      L1-icache-loads:u                #  559.914 M/sec                     
+           279,512      L1-icache-load-misses:u          #    0.90% of all L1-icache accesses 
+        47,128,547      dTLB-loads:u                     #  852.821 M/sec                       (46.73%)
+           108,931      dTLB-load-misses:u               #    0.23% of all dTLB cache accesses  (32.30%)
+        14,189,608      iTLB-loads:u                     #  256.770 M/sec                       (19.86%)
+     <not counted>      iTLB-load-misses:u                                                      (0.00%)
+
+       7.117399121 seconds time elapsed
+
+      18.404618000 seconds user
+      29.532104000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([     0,      4,      7,  ..., 155588, 155592,
+                            155598]),
+       col_indices=tensor([  131,   561,   996,  ..., 32237, 32238, 32570]),
+       values=tensor([79040.,  7820., 15136.,  ...,  2828., 17986.,  2482.]),
+       size=(32580, 32580), nnz=155598, layout=torch.sparse_csr)
+tensor([0.7544, 0.0071, 0.0491,  ..., 0.7236, 0.5537, 0.4901])
+Matrix: vt2010
+Shape: torch.Size([32580, 32580])
+NNZ: 155598
+Density: 0.00014658915806621921
+Time: 3.6322426795959473 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/vt2010.mtx 1000':
+
+           340,611      BR_MIS_PRED_RETIRED:u            #      0.0 per branch  branch_misprediction_ratio
+        20,355,849      BR_RETIRED:u                                                          
+
+       7.112879848 seconds time elapsed
+
+      18.362004000 seconds user
+      29.398677000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([     0,      4,      7,  ..., 155588, 155592,
+                            155598]),
+       col_indices=tensor([  131,   561,   996,  ..., 32237, 32238, 32570]),
+       values=tensor([79040.,  7820., 15136.,  ...,  2828., 17986.,  2482.]),
+       size=(32580, 32580), nnz=155598, layout=torch.sparse_csr)
+tensor([0.7651, 0.6605, 0.7128,  ..., 0.7434, 0.6656, 0.3987])
+Matrix: vt2010
+Shape: torch.Size([32580, 32580])
+NNZ: 155598
+Density: 0.00014658915806621921
+Time: 3.7933311462402344 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/vt2010.mtx 1000':
+
+        27,946,393      L1I_TLB:u                        #      0.0 per TLB access  itlb_walk_ratio
+             6,805      ITLB_WALK:u                                                           
+            17,877      DTLB_WALK:u                      #      0.0 per TLB access  dtlb_walk_ratio
+        38,226,912      L1D_TLB:u                                                             
+
+       7.235266934 seconds time elapsed
+
+      18.566568000 seconds user
+      29.759130000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([     0,      4,      7,  ..., 155588, 155592,
+                            155598]),
+       col_indices=tensor([  131,   561,   996,  ..., 32237, 32238, 32570]),
+       values=tensor([79040.,  7820., 15136.,  ...,  2828., 17986.,  2482.]),
+       size=(32580, 32580), nnz=155598, layout=torch.sparse_csr)
+tensor([0.3319, 0.1241, 0.4830,  ..., 0.5188, 0.8684, 0.1488])
+Matrix: vt2010
+Shape: torch.Size([32580, 32580])
+NNZ: 155598
+Density: 0.00014658915806621921
+Time: 3.662006378173828 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/vt2010.mtx 1000':
+
+        31,946,141      L1I_CACHE:u                      #      0.0 per cache access  l1i_cache_miss_ratio
+           295,259      L1I_CACHE_REFILL:u                                                    
+           468,136      L1D_CACHE_REFILL:u               #      0.0 per cache access  l1d_cache_miss_ratio
+        33,395,666      L1D_CACHE:u                                                           
+
+       7.187008251 seconds time elapsed
+
+      18.275672000 seconds user
+      30.724065000 seconds sys
+
+
+
+/nfshomes/vut/ampere_research/pytorch/spmv.py:20: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /space/jenkins/workspace/Releases/pytorch-dls/pytorch-dls/aten/src/ATen/SparseCsrTensorImpl.cpp:55.)
+  ).to_sparse_csr().type(torch.float)
+tensor(crow_indices=tensor([     0,      4,      7,  ..., 155588, 155592,
+                            155598]),
+       col_indices=tensor([  131,   561,   996,  ..., 32237, 32238, 32570]),
+       values=tensor([79040.,  7820., 15136.,  ...,  2828., 17986.,  2482.]),
+       size=(32580, 32580), nnz=155598, layout=torch.sparse_csr)
+tensor([0.0055, 0.7774, 0.9046,  ..., 0.5143, 0.0678, 0.4725])
+Matrix: vt2010
+Shape: torch.Size([32580, 32580])
+NNZ: 155598
+Density: 0.00014658915806621921
+Time: 3.616023063659668 seconds
+
+ Performance counter stats for 'apptainer run pytorch-altra.sif -c numactl --cpunodebind=0 --membind=0 python spmv.py matrices/vt2010.mtx 1000':
+
+           527,109      LL_CACHE_MISS_RD:u               #      1.0 per cache access  ll_cache_read_miss_ratio
+           540,409      LL_CACHE_RD:u                                                         
+           192,519      L2D_TLB:u                        #      0.1 per TLB access  l2_tlb_miss_ratio
+            24,204      L2D_TLB_REFILL:u                                                      
+           290,933      L2D_CACHE_REFILL:u               #      0.2 per cache access  l2_cache_miss_ratio
+         1,743,452      L2D_CACHE:u                                                           
+
+       7.030605378 seconds time elapsed
+
+      18.274323000 seconds user
+      28.779020000 seconds sys
+
+
+
--- a/pytorch/run.py
+++ b/pytorch/run.py
@ -42,6 +42,10 @@ def run_program(program: list[str]) -> tuple[dict, str]:
    return (json.loads(process.stdout), process.stderr)

 result = dict()
+result['architecture'] = args.arch
+result['iterations'] = args.iterations
+result['baseline_time_s'] = args.baseline_time_s
+result['baseline_delay_s'] = args.baseline_delay_s

 if args.power is True:
    result['power_before'] = baseline_power(args.baseline_time_s)
--- a/pytorch/spmv.py
+++ b/pytorch/spmv.py
@ -3,7 +3,7 @@ import numpy as np
 import argparse
 import time
 import json
-import sys
+import sys, os

 parser = argparse.ArgumentParser()
 parser.add_argument('matrix_file', help='the input matrix (.mtx) file')
@ -32,6 +32,9 @@ end = time.time()

 result = dict()

+result['matrix'] = os.path.splitext(os.path.basename(args.matrix_file))[0]
+print(f"Matrix: {result['matrix']}", file=sys.stderr)
+
 result['shape'] = matrix.shape
 print(f"Shape: {result['shape']}", file=sys.stderr)
Author	SHA1	Message	Date
cephi	9e54411c5a	ignored matrices	2024-12-03 08:54:48 -05:00
cephi	93690abfee	new output	2024-12-03 08:53:39 -05:00
				`@ -0,0 +1 @@`
				{"architecture": "altra", "iterations": 1000, "baseline_time_s": 10, "baseline_delay_s": 30, "power_before": [37.36, 22.88, 22.36, 22.72, 22.52, 22.2, 21.96, 21.8, 21.48, 21.48], "matrix": "Oregon-2", "shape": [11806, 11806], "nnz": 65460, "% density": 0.0004696458003979807, "time_s": 1.5312557220458984, "power": [26.68, 27.84, 28.48, 29.92, 30.0], "power_after": [21.16, 21.32, 21.16, 21.16, 21.16, 20.88, 20.92, 20.76, 20.96, 21.2], "task clock (msec)": 64.81, "page faults": 3244, "cycles": 82069432, "instructions": 78292700, "branch mispredictions": 319703, "branches": 19996903, "ITLB accesses": 26988315, "ITLB misses": 5988, "DTLB misses": 14570, "DTLB accesses": 36879854, "L1I cache accesses": 30465174, "L1I cache misses": 293085, "L1D cache misses": 487330, "L1D cache accesses": 31932249, "LL cache misses": 545501, "LL cache accesses": 558084, "L2D TLB accesses": 204746, "L2D TLB misses": 25302, "L2D cache misses": 314594, "L2D cache accesses": 1828047, "instructions per cycle": 0.9539812582107307, "branch miss rate": 0.01598762568383714, "ITLB miss rate": 0.00022187379982781437, "DTLB miss rate": 0.0003950666399058955, "L2D TLB miss rate": 0.12357750578765886, "L1I cache miss rate": 0.009620329101025322, "L1D cache miss rate": 0.015261374167538278, "L2D cache miss rate": 0.17209294947011755, "LL cache miss rate": 0.9774532149282187}
				`@ -1 +0,0 @@`
				{"power_before": [50.88, 50.88], "shape": [11806, 11806], "nnz": 65460, "% density": 0.0004696458003979807, "time_s": 0.1896660327911377, "power": [25.52, 32.28, 33.12, 33.12], "power_after": [32.88, 26.52], "task clock (msec)": 42.01, "page faults": 3263, "cycles": 47084933, "instructions": 77895119, "branch mispredictions": 330923, "branches": 19740519, "ITLB accesses": 27761239, "ITLB misses": 6471, "DTLB misses": 17268, "DTLB accesses": 36993265, "L1I cache accesses": 31834980, "L1I cache misses": 298333, "L1D cache misses": 466901, "L1D cache accesses": 33528976, "LL cache misses": 525505, "LL cache accesses": 546521, "L2D TLB accesses": 184884, "L2D TLB misses": 22933, "L2D cache misses": 292367, "L2D cache accesses": 1706226, "instructions per cycle": 1.6543534000568716, "branch miss rate": 0.016763642333821112, "ITLB miss rate": 0.00023309478370183695, "DTLB miss rate": 0.0004667876706746485, "L2D TLB miss rate": 0.12403993855606758, "L1I cache miss rate": 0.009371232524725947, "L1D cache miss rate": 0.013925298523879763, "L2D cache miss rate": 0.1713530329510862, "LL cache miss rate": 0.9615458509371094}
				`@ -1 +0,0 @@`
				{"power_before": [20.16, 20.08], "shape": [31379, 31379], "nnz": 106762, "% density": 0.00010842726485909405, "time_s": 0.336850643157959, "power": [24.28, 30.72, 30.72, 34.56], "power_after": [37.32, 32.92], "task clock (msec)": 60.78, "page faults": 3300, "cycles": 66733059, "instructions": 87889334, "branch mispredictions": 326300, "branches": 19832700, "ITLB accesses": 27233629, "ITLB misses": 5868, "DTLB misses": 16893, "DTLB accesses": 36409508, "L1I cache accesses": 30924532, "L1I cache misses": 288199, "L1D cache misses": 462816, "L1D cache accesses": 32428375, "LL cache misses": 551997, "LL cache accesses": 568528, "L2D TLB accesses": 193991, "L2D TLB misses": 24353, "L2D cache misses": 312207, "L2D cache accesses": 1821196, "instructions per cycle": 1.3170284011707, "branch miss rate": 0.016452626218316214, "ITLB miss rate": 0.0002154688969288669, "DTLB miss rate": 0.00046397221297250155, "L2D TLB miss rate": 0.125536751704976, "L1I cache miss rate": 0.009319429635992551, "L1D cache miss rate": 0.014271945479845968, "L2D cache miss rate": 0.17142965391973186, "LL cache miss rate": 0.9709231559395491}
				`@ -1 +0,0 @@`
				{"power_before": [16.32, 16.2], "shape": [116835, 116835], "nnz": 766396, "% density": 5.614451099680581e-05, "time_s": 2.2665774822235107, "power": [35.16, 50.8, 53.4, 53.4, 46.08, 46.88], "power_after": [58.4, 57.32], "task clock (msec)": 50.43, "page faults": 3285, "cycles": 54118679, "instructions": 77692421, "branch mispredictions": 325039, "branches": 19383216, "ITLB accesses": 26060519, "ITLB misses": 4749, "DTLB misses": 16865, "DTLB accesses": 34819729, "L1I cache accesses": 30777115, "L1I cache misses": 293980, "L1D cache misses": 461522, "L1D cache accesses": 32216597, "LL cache misses": 567700, "LL cache accesses": 588689, "L2D TLB accesses": 189417, "L2D TLB misses": 22360, "L2D cache misses": 328306, "L2D cache accesses": 1908607, "instructions per cycle": 1.4355934482436277, "branch miss rate": 0.0167690954896236, "ITLB miss rate": 0.00018222967854170517, "DTLB miss rate": 0.00048435184547243316, "L2D TLB miss rate": 0.11804642666708902, "L1I cache miss rate": 0.009551902444397404, "L1D cache miss rate": 0.014325597455249542, "L2D cache miss rate": 0.172013410827897, "LL cache miss rate": 0.9643461997761127}
				`@ -1 +0,0 @@`
				{"power_before": [20.48, 20.96], "shape": [24115, 24115], "nnz": 116056, "% density": 0.0001995689928120616, "time_s": 0.3271017074584961, "power": [25.28, 26.08, 31.28, 32.96], "power_after": [33.4, 30.24], "task clock (msec)": 59.88, "page faults": 3313, "cycles": 58169777, "instructions": 57993431, "branch mispredictions": 330494, "branches": 20578427, "ITLB accesses": 27982097, "ITLB misses": 6614, "DTLB misses": 17270, "DTLB accesses": 37728899, "L1I cache accesses": 29754926, "L1I cache misses": 278786, "L1D cache misses": 454742, "L1D cache accesses": 31173246, "LL cache misses": 543243, "LL cache accesses": 560716, "L2D TLB accesses": 162281, "L2D TLB misses": 19847, "L2D cache misses": 300577, "L2D cache accesses": 1696278, "instructions per cycle": 0.9969684257170179, "branch miss rate": 0.016060216847478187, "ITLB miss rate": 0.0002363654160729984, "DTLB miss rate": 0.00045773930482307474, "L2D TLB miss rate": 0.12230020766448321, "L1I cache miss rate": 0.009369406598423401, "L1D cache miss rate": 0.014587572946365611, "L2D cache miss rate": 0.1771979592967662, "LL cache miss rate": 0.9688380570556218}
				`@ -1 +0,0 @@`
				{"power_before": [20.28, 20.32], "shape": [36692, 36692], "nnz": 367662, "% density": 0.0002730901120626302, "time_s": 1.030203104019165, "power": [32.08, 47.84, 55.76, 58.08, 58.24], "power_after": [48.76, 45.16], "task clock (msec)": 60.43, "page faults": 3319, "cycles": 66114448, "instructions": 90786829, "branch mispredictions": 341625, "branches": 20129354, "ITLB accesses": 27441303, "ITLB misses": 6807, "DTLB misses": 20551, "DTLB accesses": 36867114, "L1I cache accesses": 31744243, "L1I cache misses": 271027, "L1D cache misses": 464135, "L1D cache accesses": 33441141, "LL cache misses": 539935, "LL cache accesses": 552519, "L2D TLB accesses": 188291, "L2D TLB misses": 24177, "L2D cache misses": 301281, "L2D cache accesses": 1737575, "instructions per cycle": 1.3731768432824245, "branch miss rate": 0.016971483535934636, "ITLB miss rate": 0.00024805673404065397, "DTLB miss rate": 0.0005574344658494288, "L2D TLB miss rate": 0.12840231344036623, "L1I cache miss rate": 0.008537831568388637, "L1D cache miss rate": 0.01387916159918108, "L2D cache miss rate": 0.17339165215889962, "LL cache miss rate": 0.9772243126480719}
				`@ -1 +0,0 @@`
				{"power_before": [50.68, 49.4], "shape": [10879, 10879], "nnz": 39994, "% density": 0.0003379223282393842, "time_s": 0.11296772956848145, "power": [26.2, 29.76, 33.64, 34.44], "power_after": [36.84, 29.44], "task clock (msec)": 67.56, "page faults": 3829, "cycles": 47862000, "instructions": 84392375, "branch mispredictions": 331622, "branches": 19800140, "ITLB accesses": 25905045, "ITLB misses": 6746, "DTLB misses": 17547, "DTLB accesses": 35220079, "L1I cache accesses": 30359576, "L1I cache misses": 283204, "L1D cache misses": 465520, "L1D cache accesses": 31843274, "LL cache misses": 560542, "LL cache accesses": 575610, "L2D TLB accesses": 173643, "L2D TLB misses": 21499, "L2D cache misses": 313335, "L2D cache accesses": 1741621, "instructions per cycle": 1.7632438051063475, "branch miss rate": 0.016748467435078743, "ITLB miss rate": 0.0002604125953072075, "DTLB miss rate": 0.0004982101261044871, "L2D TLB miss rate": 0.12381149830399152, "L1I cache miss rate": 0.009328325270418797, "L1D cache miss rate": 0.014619099782264852, "L2D cache miss rate": 0.17990998041479747, "LL cache miss rate": 0.9738225534650197}
				`@ -1 +0,0 @@`
				{"power_before": [16.52, 16.24], "shape": [26518, 26518], "nnz": 65369, "% density": 9.295875717624285e-05, "time_s": 0.1715233325958252, "power": [18.56, 24.92, 27.84, 27.84], "power_after": [33.2, 27.28], "task clock (msec)": 61.92, "page faults": 3281, "cycles": 66250810, "instructions": 75178179, "branch mispredictions": 332366, "branches": 19076182, "ITLB accesses": 27005133, "ITLB misses": 4791, "DTLB misses": 13403, "DTLB accesses": 36457054, "L1I cache accesses": 32367686, "L1I cache misses": 287524, "L1D cache misses": 467557, "L1D cache accesses": 34022862, "LL cache misses": 535707, "LL cache accesses": 556316, "L2D TLB accesses": 150149, "L2D TLB misses": 18418, "L2D cache misses": 297042, "L2D cache accesses": 1687364, "instructions per cycle": 1.1347510920998551, "branch miss rate": 0.017423088121092577, "ITLB miss rate": 0.00017741071669597036, "DTLB miss rate": 0.00036763804338112453, "L2D TLB miss rate": 0.12266481961251822, "L1I cache miss rate": 0.008883057009388932, "L1D cache miss rate": 0.013742435895016709, "L2D cache miss rate": 0.1760390763344483, "LL cache miss rate": 0.9629545078696281}
				`@ -1 +0,0 @@`
				{"power_before": [29.76, 33.16], "shape": [22687, 22687], "nnz": 54705, "% density": 0.00010628522108964806, "time_s": 0.14322686195373535, "power": [22.6, 22.6, 26.16, 29.2], "power_after": [34.0, 30.16], "task clock (msec)": 64.71, "page faults": 3319, "cycles": 57611295, "instructions": 83148228, "branch mispredictions": 318386, "branches": 19233431, "ITLB accesses": 27039805, "ITLB misses": 6375, "DTLB misses": 17290, "DTLB accesses": 36688544, "L1I cache accesses": 32508072, "L1I cache misses": 297568, "L1D cache misses": 477654, "L1D cache accesses": 34044579, "LL cache misses": 549474, "LL cache accesses": 561939, "L2D TLB accesses": 185622, "L2D TLB misses": 23295, "L2D cache misses": 305878, "L2D cache accesses": 1763089, "instructions per cycle": 1.4432626102225268, "branch miss rate": 0.01655378075809771, "ITLB miss rate": 0.00023576353453732377, "DTLB miss rate": 0.00047126427257511227, "L2D TLB miss rate": 0.12549697772893298, "L1I cache miss rate": 0.009153664972810446, "L1D cache miss rate": 0.014030251336049713, "L2D cache miss rate": 0.17348982382625042, "LL cache miss rate": 0.9778178770293573}