
Commit

feat: experimental performance modeller / browser for CPU multithreading
Added the experimental "discopop_optimizer" to the discopop_library and GUI
lukasrothenberger authored Sep 27, 2023
1 parent 31b2150 commit b6d2d0a
Showing 74 changed files with 5,180 additions and 107 deletions.
83 changes: 49 additions & 34 deletions discopop_explorer/pattern_detection.py
@@ -7,9 +7,11 @@
# directory for details.
import os
import sys
from typing import Dict, Union

from discopop_library.discopop_optimizer.OptimizationGraph import OptimizationGraph
from discopop_library.discopop_optimizer.Variables.Experiment import Experiment
from discopop_library.discopop_optimizer.classes.system.System import System
from discopop_library.discopop_optimizer.scheduling.workload_delta import (
get_workload_delta_for_cu_node,
)
@@ -124,39 +126,52 @@ def __identify_scheduling_clauses(
) -> DetectionResult:
"""Identifies scheduling clauses for suggestions and returns the updated DetectionResult"""
# construct optimization graph (basically an acyclic representation of the PET)
experiment = Experiment(project_folder_path, res, file_mapping_path)
print("\tcreating optimization graph...")
# saves optimization graph in experiment
optimization_graph = OptimizationGraph(project_folder_path, experiment)
print("\tDetermining scheduling clauses...")
with alive_bar(len(res.do_all)) as progress_bar:
for do_all_suggestion in res.do_all:
for node_id in get_nodes_from_cu_id(
experiment.optimization_graph, do_all_suggestion.node_id
):
workload_delta, min_workload, max_workload = get_workload_delta_for_cu_node(
experiment, node_id
)
print(
"DOALL @ ",
do_all_suggestion.node_id,
" -> ",
"node_id: ",
node_id,
" --> Delta WL: ",
workload_delta,
" (",
min_workload,
"/",
max_workload,
")",
file=sys.stderr,
)
# todo
# very naive and non-robust approach, needs improvement in the future
# reflects the behavior as described in https://dl.acm.org/doi/pdf/10.1145/3330345.3330375
if workload_delta != 0:
do_all_suggestion.scheduling_clause = "dynamic"
progress_bar()
system = System(headless=True)
discopop_output_path = project_folder_path
discopop_optimizer_path = "INVALID_DUMMY"
code_export_path = "INVALID_DUMMY"
arguments_1 = {"--compile-command": "make"}
experiment = Experiment(
project_folder_path,
discopop_output_path,
discopop_optimizer_path,
code_export_path,
file_mapping_path,
system,
res,
arguments_1,
)
arguments_2 = {"--exhaustive-search": False, "--headless-mode": True}
optimization_graph = OptimizationGraph(
project_folder_path, experiment, arguments_2, None, False
)

for do_all_suggestion in res.do_all:
for node_id in get_nodes_from_cu_id(
experiment.optimization_graph, do_all_suggestion.node_id
):
workload_delta, min_workload, max_workload = get_workload_delta_for_cu_node(
experiment, node_id
)
print(
"DOALL @ ",
do_all_suggestion.node_id,
" -> ",
"node_id: ",
node_id,
" --> Delta WL: ",
workload_delta,
" (",
min_workload,
"/",
max_workload,
")",
file=sys.stderr,
)
# todo
# very naive and non-robust approach, needs improvement in the future
# reflects the behavior as described in https://dl.acm.org/doi/pdf/10.1145/3330345.3330375
if workload_delta != 0:
do_all_suggestion.scheduling_clause = "dynamic"

return res
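
The heuristic above switches a do-all suggestion to a "dynamic" scheduling clause as soon as any of its CU nodes reports a non-zero workload delta; otherwise the clause is left unchanged. Below is a minimal standalone sketch of that decision rule; CuWorkload and pick_scheduling_clause are illustrative names standing in for the (workload_delta, min_workload, max_workload) tuples returned by get_workload_delta_for_cu_node, not part of the DiscoPoP API:

from typing import List, Tuple

# Illustrative alias for the (workload_delta, min_workload, max_workload) tuple
# produced by get_workload_delta_for_cu_node for each CU node of a do-all loop.
CuWorkload = Tuple[int, int, int]

def pick_scheduling_clause(cu_workloads: List[CuWorkload], default: str = "") -> str:
    """Return "dynamic" if any CU shows a workload imbalance, otherwise keep the default clause."""
    for workload_delta, _min_wl, _max_wl in cu_workloads:
        if workload_delta != 0:
            return "dynamic"
    return default

# Example: the second CU is imbalanced by 5 units of workload -> "dynamic"
print(pick_scheduling_clause([(0, 12, 12), (5, 10, 15)]))
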
61 changes: 48 additions & 13 deletions discopop_explorer/utils.py
@@ -98,7 +98,9 @@ def is_loop_index2(pet: PETGraphX, root_loop: Node, var_name: str) -> bool:

# NOTE: kept the old code as it may become relevant again in the near future
# We decided to omit the computed workload information and the relevant code sections, since for large programs (e.g., ffmpeg) the generated Data.xml file becomes very large. However, we keep the code here because we would like to integrate a hotspot detection algorithm (TODO: Bertin) with the parallelism discovery. Then we need to retrieve this information to decide which code sections (loops or functions) are worth parallelizing.
def calculate_workload(pet: PETGraphX, node: Node) -> int:
def calculate_workload(
pet: PETGraphX, node: Node, ignore_function_calls_and_cached_values: bool = False
) -> int:
"""Calculates and stores the workload for a given node
The workload is the number of instructions multiplied by the respective number of iterations
@@ -108,7 +110,8 @@ def calculate_workload(pet: PETGraphX, node: Node) -> int:
"""
# check if value already present
if node.workload is not None:
return node.workload
if not ignore_function_calls_and_cached_values:
return node.workload
res = 0
if node.type == NodeType.DUMMY:
# store workload
@@ -118,15 +121,25 @@ def calculate_workload(pet: PETGraphX, node: Node) -> int:
# if a function is called, replace the instruction with the costs of the called function
# note: recursive function calls are counted as a single instruction
res += cast(CUNode, node).instructions_count
for calls_edge in pet.out_edges(cast(CUNode, node).id, EdgeType.CALLSNODE):
# add costs of the called function
res += calculate_workload(pet, pet.node_at(calls_edge[1]))
# subtract 1 to ignore the call instruction
# todo: should we keep the cost for the call instruction and just add the costs of the called function?
res -= 1
if not ignore_function_calls_and_cached_values:
for calls_edge in pet.out_edges(cast(CUNode, node).id, EdgeType.CALLSNODE):
# add costs of the called function
res += calculate_workload(
pet,
pet.node_at(calls_edge[1]),
ignore_function_calls_and_cached_values=ignore_function_calls_and_cached_values,
)
# subtract 1 to ignore the call instruction
# todo: should we keep the cost for the call instruction and just add the costs of the called function?
res -= 1
elif node.type == NodeType.FUNC:
for child in find_subnodes(pet, node, EdgeType.CHILD):
res += calculate_workload(pet, child)
if not ignore_function_calls_and_cached_values:
for child in find_subnodes(pet, node, EdgeType.CHILD):
res += calculate_workload(
pet,
child,
ignore_function_calls_and_cached_values=ignore_function_calls_and_cached_values,
)
elif node.type == NodeType.LOOP:
for child in find_subnodes(pet, node, EdgeType.CHILD):
if child.type == NodeType.CU:
@@ -139,23 +152,45 @@
if cast(LoopNode, node).loop_data is None
else cast(LoopData, cast(LoopNode, node).loop_data).average_iteration_count
)
res += calculate_workload(pet, child) * average_iteration_count + 1
res += (
calculate_workload(
pet,
child,
ignore_function_calls_and_cached_values=ignore_function_calls_and_cached_values,
)
* average_iteration_count
+ 1
)
else:
# determine average iteration count. Use traditional iteration count as a fallback
average_iteration_count = (
cast(LoopNode, node).loop_iterations
if cast(LoopNode, node).loop_data is None
else cast(LoopData, cast(LoopNode, node).loop_data).average_iteration_count
)
res += calculate_workload(pet, child) * average_iteration_count
res += (
calculate_workload(
pet,
child,
ignore_function_calls_and_cached_values=ignore_function_calls_and_cached_values,
)
* average_iteration_count
)
else:
# determine average iteration count. Use traditional iteration count as a fallback
average_iteration_count = (
cast(LoopNode, node).loop_iterations
if cast(LoopNode, node).loop_data is None
else cast(LoopData, cast(LoopNode, node).loop_data).average_iteration_count
)
res += calculate_workload(pet, child) * average_iteration_count
res += (
calculate_workload(
pet,
child,
ignore_function_calls_and_cached_values=ignore_function_calls_and_cached_values,
)
* average_iteration_count
)
# store workload
node.workload = res
return res
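
The new ignore_function_calls_and_cached_values flag affects two things in the hunk above: a cached node.workload is no longer returned early, and the costs of called functions (and, for FUNC nodes, of their children) are not added. Below is a stripped-down sketch of just the caching part of that behavior, using a hypothetical FakeNode stand-in rather than the real PETGraphX node types:

from typing import List, Optional

class FakeNode:
    # Hypothetical stand-in for a PET node: an instruction count, children, and a workload cache.
    def __init__(self, instructions: int, children: Optional[List["FakeNode"]] = None):
        self.instructions = instructions
        self.children = children or []
        self.workload: Optional[int] = None  # cache, analogous to node.workload

def calc_workload(node: FakeNode, ignore_cached: bool = False) -> int:
    # Fast path: reuse the cached value unless the caller asks for a fresh computation.
    if node.workload is not None and not ignore_cached:
        return node.workload
    res = node.instructions + sum(calc_workload(c, ignore_cached) for c in node.children)
    node.workload = res  # store the result for later calls
    return res

leaf = FakeNode(3)
root = FakeNode(2, [leaf, FakeNode(5)])
print(calc_workload(root))                      # 10, computed and cached
leaf.instructions = 100
print(calc_workload(root))                      # still 10, served from the cache
print(calc_workload(root, ignore_cached=True))  # 107, recomputed while ignoring caches
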
