Skip to content

Commit bc3a9d2

Browse files
committed
ipmi:si: Gracefully handle if the BMC is non-functional
If the BMC is not functional, the driver goes into an error state and starts a 1 second timer. When the timer times out, it will attempt a simple message. If the BMC interacts correctly, the driver will start accepting messages again. If not, it remains in error state. If the driver goes into error state, all current and pending messages will return with an error. This should more gracefully handle when the BMC becomes non-operational, as opposed to trying each message individually and failing them. Signed-off-by: Corey Minyard <corey@minyard.net>
1 parent 3bc54ab commit bc3a9d2

1 file changed

Lines changed: 23 additions & 6 deletions

File tree

drivers/char/ipmi/ipmi_si_intf.c

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@
5353
#define SI_TIMEOUT_JIFFIES (SI_TIMEOUT_TIME_USEC/SI_USEC_PER_JIFFY)
5454
#define SI_SHORT_TIMEOUT_USEC 250 /* .25ms when the SM request a
5555
short timeout */
56+
#define SI_TIMEOUT_HOSED (HZ) /* 1 second when in hosed state. */
5657

5758
enum si_intf_state {
5859
SI_NORMAL,
@@ -61,7 +62,8 @@ enum si_intf_state {
6162
SI_CLEARING_FLAGS,
6263
SI_GETTING_MESSAGES,
6364
SI_CHECKING_ENABLES,
64-
SI_SETTING_ENABLES
65+
SI_SETTING_ENABLES,
66+
SI_HOSED
6567
/* FIXME - add watchdog stuff. */
6668
};
6769

@@ -753,6 +755,8 @@ static void handle_transaction_done(struct smi_info *smi_info)
753755
}
754756
break;
755757
}
758+
case SI_HOSED: /* Shouldn't happen. */
759+
break;
756760
}
757761
}
758762

@@ -767,6 +771,10 @@ static enum si_sm_result smi_event_handler(struct smi_info *smi_info,
767771
enum si_sm_result si_sm_result;
768772

769773
restart:
774+
if (smi_info->si_state == SI_HOSED)
775+
/* Just in case, hosed state is only left from the timeout. */
776+
return SI_SM_HOSED;
777+
770778
/*
771779
* There used to be a loop here that waited a little while
772780
* (around 25us) before giving up. That turned out to be
@@ -790,18 +798,20 @@ static enum si_sm_result smi_event_handler(struct smi_info *smi_info,
790798

791799
/*
792800
* Do the before return_hosed_msg, because that
793-
* releases the lock.
801+
* releases the lock. We just disable operations for
802+
* a while and retry in hosed state.
794803
*/
795-
smi_info->si_state = SI_NORMAL;
804+
smi_info->si_state = SI_HOSED;
796805
if (smi_info->curr_msg != NULL) {
797806
/*
798807
* If we were handling a user message, format
799808
* a response to send to the upper layer to
800809
* tell it about the error.
801810
*/
802-
return_hosed_msg(smi_info, IPMI_ERR_UNSPECIFIED);
811+
return_hosed_msg(smi_info, IPMI_BUS_ERR);
803812
}
804-
goto restart;
813+
smi_mod_timer(smi_info, jiffies + SI_TIMEOUT_HOSED);
814+
goto out;
805815
}
806816

807817
/*
@@ -899,7 +909,7 @@ static void flush_messages(void *send_info)
899909
* mode. This means we are single-threaded, no need for locks.
900910
*/
901911
result = smi_event_handler(smi_info, 0);
902-
while (result != SI_SM_IDLE) {
912+
while (result != SI_SM_IDLE && result != SI_SM_HOSED) {
903913
udelay(SI_SHORT_TIMEOUT_USEC);
904914
result = smi_event_handler(smi_info, SI_SHORT_TIMEOUT_USEC);
905915
}
@@ -912,6 +922,9 @@ static int sender(void *send_info, struct ipmi_smi_msg *msg)
912922

913923
debug_timestamp(smi_info, "Enqueue");
914924

925+
if (smi_info->si_state == SI_HOSED)
926+
return IPMI_BUS_ERR;
927+
915928
if (smi_info->run_to_completion) {
916929
/*
917930
* If we are running to completion, start it. Upper
@@ -1092,6 +1105,10 @@ static void smi_timeout(struct timer_list *t)
10921105
spin_lock_irqsave(&(smi_info->si_lock), flags);
10931106
debug_timestamp(smi_info, "Timer");
10941107

1108+
if (smi_info->si_state == SI_HOSED)
1109+
/* Try something to see if the BMC is now operational. */
1110+
start_get_flags(smi_info);
1111+
10951112
jiffies_now = jiffies;
10961113
time_diff = (((long)jiffies_now - (long)smi_info->last_timeout_jiffies)
10971114
* SI_USEC_PER_JIFFY);

0 commit comments

Comments
 (0)