Skip to content

Commit 09cba8b

Browse files
author
Thananon Patinyasakdikul
committed
pml/ob1: fixed out of sequence bug.
This commit fixes open-mpi#4795. - Fixed a typo that sometimes caused a deadlock during a change of protocol. - Redesigned the out-of-sequence ordering and addressed the overflow of the uint16_t sequence number. Signed-off-by: Thananon Patinyasakdikul <tpatinya@utk.edu>
1 parent 7d0e023 commit 09cba8b

File tree

6 files changed

+241
-73
lines changed

6 files changed

+241
-73
lines changed

ompi/mca/pml/ob1/pml_ob1.c

Lines changed: 31 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
* Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
44
* University Research and Technology
55
* Corporation. All rights reserved.
6-
* Copyright (c) 2004-2012 The University of Tennessee and The University
6+
* Copyright (c) 2004-2018 The University of Tennessee and The University
77
* of Tennessee Research Foundation. All rights
88
* reserved.
99
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@@ -250,9 +250,9 @@ int mca_pml_ob1_add_comm(ompi_communicator_t* comm)
250250
continue;
251251
}
252252

253-
add_fragment_to_unexpected:
254-
255253
if (((uint16_t)hdr->hdr_seq) == ((uint16_t)pml_proc->expected_sequence) ) {
254+
255+
add_fragment_to_unexpected:
256256
/* We're now expecting the next sequence number. */
257257
pml_proc->expected_sequence++;
258258
opal_list_append( &pml_proc->unexpected_frags, (opal_list_item_t*)frag );
@@ -264,17 +264,16 @@ int mca_pml_ob1_add_comm(ompi_communicator_t* comm)
264264
* situation as the cant_match is only checked when a new fragment is received from
265265
* the network.
266266
*/
267-
OPAL_LIST_FOREACH(frag, &pml_proc->frags_cant_match, mca_pml_ob1_recv_frag_t) {
268-
hdr = &frag->hdr.hdr_match;
269-
/* If the message has the next expected seq from that proc... */
270-
if(hdr->hdr_seq != pml_proc->expected_sequence)
271-
continue;
272-
273-
opal_list_remove_item(&pml_proc->frags_cant_match, (opal_list_item_t*)frag);
274-
goto add_fragment_to_unexpected;
275-
}
267+
if( NULL != pml_proc->frags_cant_match ) {
268+
frag = check_cantmatch_for_match(pml_proc);
269+
if( NULL != frag ) {
270+
hdr = &frag->hdr.hdr_match;
271+
goto add_fragment_to_unexpected;
272+
}
273+
}
276274
} else {
277-
opal_list_append( &pml_proc->frags_cant_match, (opal_list_item_t*)frag );
275+
append_frag_to_ordered_list(&pml_proc->frags_cant_match, frag,
276+
pml_proc->expected_sequence);
278277
}
279278
}
280279
return OMPI_SUCCESS;
@@ -561,6 +560,23 @@ static void mca_pml_ob1_dump_frag_list(opal_list_t* queue, bool is_req)
561560
}
562561
}
563562

563+
/**
 * Dump the header of every fragment reachable from the out-of-sequence
 * (cant_match) queue.
 *
 * The walk starts at `queue`, follows the opal_list_next links, and stops
 * only when a link leads back to `queue`, so the chain is expected to be
 * circular. For each entry whose `range` pointer is non-NULL, the fragments
 * chained from `range` are walked the same way.
 *
 * @param queue first fragment of the queue. Its fields are read without a
 *              NULL check, so the caller must pass a non-NULL pointer.
 */
void mca_pml_ob1_dump_cant_match(mca_pml_ob1_recv_frag_t* queue)
564+
{
565+
mca_pml_ob1_recv_frag_t* item = queue;
566+
567+
/* Outer walk: one iteration per top-level entry of the queue. */
do {
568+
mca_pml_ob1_dump_hdr( &item->hdr );
569+
/* An entry may carry a secondary chain of fragments in `range`. */
if( NULL != item->range ) {
570+
mca_pml_ob1_recv_frag_t* frag = item->range;
571+
/* Inner walk: ends when the links lead back to item->range. */
do {
572+
mca_pml_ob1_dump_hdr( &frag->hdr );
573+
frag = (mca_pml_ob1_recv_frag_t*)frag->super.super.opal_list_next;
574+
} while( frag != item->range );
575+
}
576+
/* Advance to the next top-level entry. */
item = (mca_pml_ob1_recv_frag_t*)item->super.super.opal_list_next;
577+
} while( item != queue );
578+
}
579+
564580
int mca_pml_ob1_dump(struct ompi_communicator_t* comm, int verbose)
565581
{
566582
struct mca_pml_comm_t* pml_comm = comm->c_pml_comm;
@@ -596,9 +612,9 @@ int mca_pml_ob1_dump(struct ompi_communicator_t* comm, int verbose)
596612
opal_output(0, "expected specific receives\n");
597613
mca_pml_ob1_dump_frag_list(&proc->specific_receives, true);
598614
}
599-
if( opal_list_get_size(&proc->frags_cant_match) ) {
615+
if( NULL != proc->frags_cant_match ) {
600616
opal_output(0, "out of sequence\n");
601-
mca_pml_ob1_dump_frag_list(&proc->frags_cant_match, false);
617+
mca_pml_ob1_dump_cant_match(proc->frags_cant_match);
602618
}
603619
if( opal_list_get_size(&proc->unexpected_frags) ) {
604620
opal_output(0, "unexpected frag\n");

ompi/mca/pml/ob1/pml_ob1.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
44
* University Research and Technology
55
* Corporation. All rights reserved.
6-
* Copyright (c) 2004-2016 The University of Tennessee and The University
6+
* Copyright (c) 2004-2018 The University of Tennessee and The University
77
* of Tennessee Research Foundation. All rights
88
* reserved.
99
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,

ompi/mca/pml/ob1/pml_ob1_comm.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
33
* University Research and Technology
44
* Corporation. All rights reserved.
5-
* Copyright (c) 2004-2006 The University of Tennessee and The University
5+
* Copyright (c) 2004-2018 The University of Tennessee and The University
66
* of Tennessee Research Foundation. All rights
77
* reserved.
88
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@@ -29,15 +29,15 @@ static void mca_pml_ob1_comm_proc_construct(mca_pml_ob1_comm_proc_t* proc)
2929
proc->ompi_proc = NULL;
3030
proc->expected_sequence = 1;
3131
proc->send_sequence = 0;
32-
OBJ_CONSTRUCT(&proc->frags_cant_match, opal_list_t);
32+
proc->frags_cant_match = NULL;
3333
OBJ_CONSTRUCT(&proc->specific_receives, opal_list_t);
3434
OBJ_CONSTRUCT(&proc->unexpected_frags, opal_list_t);
3535
}
3636

3737

3838
static void mca_pml_ob1_comm_proc_destruct(mca_pml_ob1_comm_proc_t* proc)
3939
{
40-
OBJ_DESTRUCT(&proc->frags_cant_match);
40+
assert(NULL == proc->frags_cant_match);
4141
OBJ_DESTRUCT(&proc->specific_receives);
4242
OBJ_DESTRUCT(&proc->unexpected_frags);
4343
if (proc->ompi_proc) {

ompi/mca/pml/ob1/pml_ob1_comm.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
44
* University Research and Technology
55
* Corporation. All rights reserved.
6-
* Copyright (c) 2004-2016 The University of Tennessee and The University
6+
* Copyright (c) 2004-2018 The University of Tennessee and The University
77
* of Tennessee Research Foundation. All rights
88
* reserved.
99
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
@@ -40,7 +40,7 @@ struct mca_pml_ob1_comm_proc_t {
4040
#else
4141
int32_t send_sequence; /**< send side sequence number */
4242
#endif
43-
opal_list_t frags_cant_match; /**< out-of-order fragment queues */
43+
struct mca_pml_ob1_recv_frag_t* frags_cant_match; /**< out-of-order fragment queues */
4444
opal_list_t specific_receives; /**< queues of unmatched specific receives */
4545
opal_list_t unexpected_frags; /**< unexpected fragment queues */
4646
};

0 commit comments

Comments
 (0)