# SPDX-License-Identifier: GPL-2.0

# Page Isolation
# Note: Run-time configuration is unsupported, service restart needed.
# Note: this file should be installed at /etc/sysconfig/rasdaemon

# Specify the threshold of isolating buggy pages.
#
# Format:
#   [0-9]+[unit]
# Note: the value must match this format; rasdaemon uses the default value for any invalid input.
#
# Supported units:
# PAGE_CE_REFRESH_CYCLE: D|d (day), H|h (hour), M|m (minute); the default unit is hour
# PAGE_CE_THRESHOLD: K|k (x1000), M|m (x1000000); the default is no multiplier
#
# These two settings are ignored only when PAGE_CE_ACTION is "off".
PAGE_CE_REFRESH_CYCLE="24h"
PAGE_CE_THRESHOLD="50"

# Specify the threshold of isolating buggy memory rows.
#
# Format:
#   [0-9]+[unit]
# Note: the value must match this format; rasdaemon uses the default value for any invalid input.
#
# Supported units:
# ROW_CE_REFRESH_CYCLE: D|d (day), H|h (hour), M|m (minute); the default unit is hour
# ROW_CE_THRESHOLD: K|k (x1000), M|m (x1000000); the default is no multiplier
#
# These two settings are ignored only when ROW_CE_ACTION is "off".
ROW_CE_REFRESH_CYCLE="24h"
ROW_CE_THRESHOLD="50"

# Specify the action rasdaemon takes internally when a row error threshold is exceeded.
#
# off      no action
# account  only account errors
# soft     try to soft-offline row without killing any processes
#          This requires an up-to-date kernel. Might not be successful.
# hard     try to hard-offline row by killing processes
#          Requires an up-to-date kernel. Might not be successful.
# soft-then-hard   First try to soft offline, then try hard offlining.
# Note: default offline choice is "off".
ROW_CE_ACTION="off"

# Specify the action rasdaemon takes internally when a page error threshold is exceeded.
#
# off      no action
# account  only account errors
# soft     try to soft-offline page without killing any processes
#          This requires an up-to-date kernel. Might not be successful.
# hard     try to hard-offline page by killing processes
#          Requires an up-to-date kernel. Might not be successful.
# soft-then-hard   First try to soft offline, then try hard offlining.
# Note: default offline choice is "soft"; this file explicitly sets it to "off" below.
PAGE_CE_ACTION="off"

# CPU Online Fault Isolation
# Whether to enable cpu online fault isolation (yes|no).
CPU_ISOLATION_ENABLE="no"
# Specify the threshold of CE numbers.
#
# Format:
#   [0-9]+[unit]
#
# Supported units:
# CPU_CE_THRESHOLD: no unit
# CPU_ISOLATION_CYCLE: D|d (day), H|h (hour), M|m (minute), S|s (second); the default unit is second
CPU_CE_THRESHOLD="18"
CPU_ISOLATION_CYCLE="24h"

# Prevent excessive isolation from causing an avalanche effect
CPU_ISOLATION_LIMIT="10"

DISABLE="json_report,kmsg_monitor,block:block_rq_complete,devlink:devlink_health_report"

# Event Trigger

# Event trigger will be executed when the specified event occurs.
#
# Path of the directory that holds the triggers to execute.
# For example: TRIGGER_DIR=/etc/ras/triggers
TRIGGER_DIR=

# Execute these triggers when an mc_event occurs; a trigger is not
# executed if it is not specified.
# You can set a timeout for each trigger; the trigger thread is killed when it times out.
# The default timeout is 1. If you do not want any timeout, set it to 0.
# For example:
#   MC_UE_TRIGGER=mc_event_trigger
#   MC_UE_TRIGGER_TIMEOUT=1

# trigger for mc_event
MC_UE_TRIGGER=
MC_UE_TRIGGER_TIMEOUT=0

MCE_DE_TRIGGER=
MCE_UE_TRIGGER=
MCE_DE_TRIGGER_TIMEOUT=0
MCE_UE_TRIGGER_TIMEOUT=0

MF_TRIGGER=
MF_TRIGGER_TIMEOUT=0

AER_CE_TRIGGER=
AER_UE_TRIGGER=
AER_FATAL_TRIGGER=
AER_CE_TRIGGER_TIMEOUT=0
AER_UE_TRIGGER_TIMEOUT=0
AER_FATAL_TRIGGER_TIMEOUT=0

PRE_PAGE_OFFLINE_TRIGGER=
POST_PAGE_OFFLINE_TRIGGER=
PRE_PAGE_OFFLINE_TRIGGER_TIMEOUT=0
POST_PAGE_OFFLINE_TRIGGER_TIMEOUT=0

# trigger for kmsg
KMSG_TRIGGER=
KMSG_TRIGGER_TIMEOUT=0

NVGPU_TRIGGER=
NVGPU_TRIGGER_TIMEOUT=0

# CE Statistic Threshold
#
# Specify the threshold of CE per second.
MC_CE_STAT_THRESHOLD=2000

# Poison page statistics
#
# Supported units:
# POISON_STAT_THRESHOLD: kB
POISON_STAT_THRESHOLD=102400

ERST_DELETE=1
AMDGPU_MCA_ENABLED=0

# EDPC config
#
# rasdaemon enables EDPC for fatal errors if PCIE_EDPC_ENABLE is set to 1.
# By default, EDPC is enabled on every device with EDPC capability;
# if EDPC_DEVICE is specified, EDPC is enabled only on the specified device.
# For example (only enable device 0000:01:00.0):
#   PCIE_EDPC_ENABLE=1
#   EDPC_DEVICE=0000:01:00.0
PCIE_EDPC_ENABLE=0
EDPC_DEVICE=

# Registered event type for nvgpu, default is
# nvmlEventTypeAll & ~nvmlEventTypeClock
# ref: https://docs.nvidia.com/deploy/nvml-api/group__nvmlEventType.html
# For example:
#   NVGPU_DISABLE_EVENT="0x10" # disable nvmlEventTypeClock
NVGPU_DISABLE_EVENT="0x10"


# KMSG MONITOR
KMSG_IGNORE_XID=""
KMSG_LIMIT=100
KMSG_TRACE_NUM=6
KMSG_TRACE_END=1

KMSG_TRACER_NAME_0="xid"
KMSG_TRACER_REGEX_0="NVRM: Xid \\(PCI:(.*)( GPU-I:[0-9]+)?( GPU-CI:[0-9]+)?\\): ([0-9]+), pid=([^,]*)(, name=([^,]*))?, (.*)"
KMSG_TRACER_GROUP_COUNT_0=8
KMSG_TRACER_GROUP_KEY_0="pci_port,gpu-i,gpu-ci,xid,pid,has_name,name,msg"

KMSG_TRACER_NAME_1="sxid"
KMSG_TRACER_REGEX_1="nvidia-nvswitch[0-9]+: SXid \\(PCI:(.*)\\): ([0-9]+), (.*)"
KMSG_TRACER_GROUP_COUNT_1=3
KMSG_TRACER_GROUP_KEY_1="pci_port,xid,msg"

KMSG_TRACER_NAME_2="axid"
KMSG_TRACER_REGEX_2="PPU.* Xid \\((.*)\\): ([0-9]+)(, pid=([^,]*))?, (.*)"
KMSG_TRACER_GROUP_COUNT_2=5
KMSG_TRACER_GROUP_KEY_2="pci_port,xid,has_pid,pid,msg"

KMSG_TRACER_NAME_3="aer_recovery"
KMSG_TRACER_REGEX_3="pcieport (.*): AER: device recovery (successful|failed)"
KMSG_TRACER_GROUP_COUNT_3=2
KMSG_TRACER_GROUP_KEY_3="pci_port,res"

KMSG_TRACER_NAME_4="pcihp"
KMSG_TRACER_REGEX_4="pcieport (.*): pciehp: Slot\\(([0-9]+)\\): (Link Up|Link Down|Card present|Card not present|Link Down/Up ignored \\(recovered by DPC\\))"
KMSG_TRACER_GROUP_COUNT_4=3
KMSG_TRACER_GROUP_KEY_4="pci_port,slot,res"

KMSG_TRACER_NAME_5="cmci_storm"
KMSG_TRACER_REGEX_5="CMCI storm (.*): switching to .* mode"
KMSG_TRACER_GROUP_COUNT_5=1
KMSG_TRACER_GROUP_KEY_5="storm"
