Skip to content

Commit 7ace53e

Browse files
committed
fix a race in communication between Task and Motion
Fixes #2386 Before this commit, communication of the emcmot_command_t from Task to Motion had a race condition. The race could cause Motion to process a command buffer consisting of some old fields from the previous command and some new fields from the current command. The race is demonstrated by the following sequence of events: 1. Motion starts `emcmotCommandHandler()`. It gets to command.c line 421 and reads the (old) `head` and `tail`, sees that they are the same, and believes that the command buffer is in a consistent state (i.e., not in the process of being updated). 2. Task starts `usrmotWriteEmcmotCommand()`. It gets to usrmotintf.cc line 96 and starts writing a new command to shared memory. It writes `head` and `commandNum`, but nothing after that. 3. Motion continues, gets to command.c line 425, sees the new `commandNum`, and believes that it has a new command. Motion starts processing the command buffer, even though most of the fields haven't been updated by Task yet. 4. Task finishes copying the command into the shared memory buffer, but it's too late; Motion has processed an inconsistent command. This commit fixes the race by doing away with the broken `head`/`tail` synchronization mechanism and replacing it with a mutex in the emcmot struct, which protects the command struct. Task locks the mutex before starting to copy the new command to shared memory. Motion trylocks the mutex before accessing the command shared memory. If the trylock fails, Motion gives up for now and tries again on the next invocation. I'm not sure why this bug doesn't cause us more problems - i don't think i've ever noticed Motion executing a command incorrectly. The window for the race is very small, maybe we've just been getting lucky? Seems fishy... Also, the head/tail synchronization mechanism probably worked fine back when it was written, before multi-processor systems were common.
1 parent b8996e2 commit 7ace53e

6 files changed

Lines changed: 31 additions & 17 deletions

File tree

src/emc/motion-logger/motion-logger.c

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -312,12 +312,11 @@ int main(int argc, char* argv[]) {
312312
init_comm_buffers();
313313

314314
while (1) {
315-
if (c->commandNum != c->tail) {
316-
// "split read"
317-
continue;
318-
}
315+
rtapi_mutex_get(&emcmotStruct->command_mutex);
316+
319317
if (c->commandNum == emcmotStatus->commandNumEcho) {
320318
// nothing new
319+
rtapi_mutex_give(&emcmotStruct->command_mutex);
321320
maybe_reopen_logfile();
322321
usleep(10 * 1000);
323322
continue;
@@ -706,6 +705,8 @@ int main(int argc, char* argv[]) {
706705
emcmotStatus->commandNumEcho = c->commandNum;
707706
emcmotStatus->commandStatus = EMCMOT_COMMAND_OK;
708707
emcmotStatus->tail = emcmotStatus->head;
708+
709+
rtapi_mutex_give(&emcmotStruct->command_mutex);
709710
}
710711

711712
return 0;

src/emc/motion/command.c

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -401,11 +401,14 @@ STATIC int is_feed_type(int motion_type)
401401
}
402402
}
403403

404+
404405
/*
405406
emcmotCommandHandler() is called each main cycle to read the
406407
shared memory buffer
408+
409+
This function runs with the emcmotCommand struct locked.
407410
*/
408-
void emcmotCommandHandler(void *arg, long servo_period)
411+
void emcmotCommandHandler_locked(void *arg, long servo_period)
409412
{
410413
int joint_num, axis_num, spindle_num;
411414
int n,s0,s1;
@@ -417,11 +420,6 @@ void emcmotCommandHandler(void *arg, long servo_period)
417420
int abort = 0;
418421
char* emsg = "";
419422

420-
/* check for split read */
421-
if (emcmotCommand->head != emcmotCommand->tail) {
422-
emcmotDebug->split++;
423-
return; /* not really an error */
424-
}
425423
if (emcmotCommand->commandNum != emcmotStatus->commandNumEcho) {
426424
/* increment head count-- we'll be modifying emcmotStatus */
427425
emcmotStatus->head++;
@@ -2021,3 +2019,15 @@ void emcmotCommandHandler(void *arg, long servo_period)
20212019

20222020
return;
20232021
}
2022+
2023+
2024+
void emcmotCommandHandler(void *arg, long servo_period) {
2025+
if (rtapi_mutex_try(&emcmotStruct->command_mutex) != 0) {
2026+
// Failed to take the mutex, because it is held by Task.
2027+
// This means Task is in the process of updating the command.
2028+
// Give up for now, and try again on the next invocation.
2029+
return;
2030+
}
2031+
emcmotCommandHandler_locked(arg, servo_period);
2032+
rtapi_mutex_give(&emcmotStruct->command_mutex);
2033+
}

src/emc/motion/motion.c

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -692,10 +692,8 @@ static int init_comm_buffers(void)
692692
emcmotErrorInit(emcmotError);
693693

694694
/* init command struct */
695-
emcmotCommand->head = 0;
696695
emcmotCommand->command = 0;
697696
emcmotCommand->commandNum = 0;
698-
emcmotCommand->tail = 0;
699697

700698
/* init status struct */
701699
emcmotStatus->head = 0;

src/emc/motion/motion.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -214,7 +214,6 @@ extern "C" {
214214
memory, and all commands from higher level code come thru it.
215215
*/
216216
typedef struct emcmot_command_t {
217-
unsigned char head; /* flag count for mutex detect */
218217
cmd_code_t command; /* command code (enum) */
219218
int commandNum; /* increment this for new command */
220219
double motor_offset; /* offset from joint to motor position */
@@ -265,7 +264,6 @@ extern "C" {
265264
char direction; /* CANON_DIRECTION flag for spindle orient */
266265
double timeout; /* of wait for spindle orient to complete */
267266
unsigned char wait_for_spindle_at_speed; // EMCMOT_SPINDLE_ON now carries this, for next feed move
268-
unsigned char tail; /* flag count for mutex detect */
269267
int arcBlendOptDepth;
270268
int arcBlendEnable;
271269
int arcBlendFallbackEnable;

src/emc/motion/motion_struct.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,14 @@
1212
#ifndef MOTION_STRUCT_H
1313
#define MOTION_STRUCT_H
1414

15+
#include <rtapi_mutex.h>
16+
17+
1518
/* big comm structure, for upper memory */
1619
typedef struct emcmot_struct_t {
20+
rtapi_mutex_t command_mutex; // Used to protect access to `command`.
1721
struct emcmot_command_t command; /* struct used to pass commands/data from Task to Motion */
22+
1823
struct emcmot_status_t status; /* Struct used to store RT status */
1924
struct emcmot_config_t config; /* Struct used to store RT config */
2025
struct emcmot_internal_t internal; /*! \todo FIXME - doesn't need to be in

src/emc/motion/usrmotintf.cc

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -76,24 +76,26 @@ int usrmotWriteEmcmotCommand(emcmot_command_t * c)
7676
{
7777
emcmot_status_t s;
7878
static int commandNum = 0;
79-
static unsigned char headCount = 0;
8079
double end;
8180

8281
if (!MOTION_ID_VALID(c->id)) {
8382
rcs_print("USRMOT: ERROR: invalid motion id: %d\n",c->id);
8483
return EMCMOT_COMM_INVALID_MOTION_ID;
8584
}
86-
c->head = ++headCount;
87-
c->tail = c->head;
85+
8886
c->commandNum = ++commandNum;
8987

9088
/* check for mapped mem still around */
9189
if (0 == emcmotCommand) {
9290
rcs_print("USRMOT: ERROR: can't connect to shared memory\n");
9391
return EMCMOT_COMM_ERROR_CONNECT;
9492
}
93+
9594
/* copy entire command structure to shared memory */
95+
rtapi_mutex_get(&emcmotStruct->command_mutex);
9696
*emcmotCommand = *c;
97+
rtapi_mutex_give(&emcmotStruct->command_mutex);
98+
9799
/* poll for receipt of command */
98100
/* set timeout for comm failure, now + timeout */
99101
end = etime() + EMCMOT_COMM_TIMEOUT;

0 commit comments

Comments
 (0)