#!/bin/sh
# SPDX-License-Identifier: GPL-2.0-only
#
# Basic EEH test: build a list of candidate PCI devices (function 0 only,
# excluding bridges, virtual functions and devices whose PE is not in an OK
# state), run eeh_one_dev on each of them, and report how many failed.
# A diff of the lspci output taken before and after the run is printed so
# that devices which went missing are visible.
#
# The helpers eeh_supported, pe_ok and eeh_one_dev come from
# eeh-functions.sh, which must be in the current directory.
#
# Exit status:
#   0      every tested device passed, or EEH is not supported (skipped)
#   1..255 number of devices that failed, capped at 255; 1 is also used
#          when the debugfs files are missing or the temp file can't be made

. ./eeh-functions.sh

if ! eeh_supported ; then
	echo "EEH not supported on this system, skipping"
	exit 0;
fi

# NOTE(review): this only bails out when BOTH files are missing. If the
# helpers in eeh-functions.sh need both of them, this should probably be
# an "||" instead - confirm against eeh-functions.sh.
if [ ! -e "/sys/kernel/debug/powerpc/eeh_dev_check" ] && \
   [ ! -e "/sys/kernel/debug/powerpc/eeh_dev_break" ] ; then
	echo "debugfs EEH testing files are missing. Is debugfs mounted?"
	exit 1;
fi

# Snapshot the device list so it can be compared once the test is done.
if ! pre_lspci="$(mktemp)" ; then
	echo "Unable to create a temporary file" >&2
	exit 1;
fi
# Make sure the snapshot is removed on every exit path.
trap 'rm -f -- "$pre_lspci"' EXIT
lspci > "$pre_lspci"

# Bump the max freeze count to something absurd so we don't
# trip over it while breaking things.
echo 5000 > /sys/kernel/debug/powerpc/eeh_max_freezes

# Record the devices that we break in here. Assuming everything
# goes to plan we should get them back once the recovery process
# is finished.
devices=""
dev_count=0

# Build up a list of candidate devices. Only function 0 of each device
# (names ending in ".0") is considered.
for dev_path in /sys/bus/pci/devices/*.0 ; do
	# An unmatched glob is left as the literal pattern; skip it.
	[ -e "$dev_path" ] || [ -L "$dev_path" ] || continue

	dev="${dev_path##*/}"

	# skip bridges since we can't recover them (yet...)
	if [ -e "/sys/bus/pci/devices/$dev/pci_bus" ] ; then
		echo "$dev, Skipped: bridge"
		continue;
	fi

	# Skip VFs for now since we don't have a reliable way
	# to break them.
	if [ -e "/sys/bus/pci/devices/$dev/physfn" ] ; then
		echo "$dev, Skipped: virtfn"
		continue;
	fi

	# Don't inject errors into an already-frozen PE. This happens with
	# PEs that contain multiple PCI devices (e.g. multi-function cards)
	# and injecting new errors during the recovery process will probably
	# result in the recovery failing and the device being marked as
	# failed.
	if ! pe_ok "$dev" ; then
		echo "$dev, Skipped: Bad initial PE state"
		continue;
	fi

	echo "$dev, Added"

	# Add to the list of devices to check
	devices="$devices $dev"
	dev_count="$((dev_count + 1))"
done

echo "Found ${dev_count} breakable devices..."

failed=0
# $devices is deliberately unquoted: it is a space separated list.
for dev in $devices ; do
	echo "Breaking $dev..."

	# Re-check the PE state: it may have changed since the list was built.
	if ! pe_ok "$dev" ; then
		echo "Skipping $dev, Initial PE state is not ok"
		failed="$((failed + 1))"
		continue;
	fi

	if ! eeh_one_dev "$dev" ; then
		failed="$((failed + 1))"
	fi
done

echo "$failed devices failed to recover ($dev_count tested)"
lspci | diff -u "$pre_lspci" -
rm -f -- "$pre_lspci"

# Exit statuses are truncated to 8 bits, so a raw count of 256 failures
# would be reported as success. Cap the value to keep it non-zero.
if [ "$failed" -gt 255 ] ; then
	exit 255
fi
exit "$failed"