Skip to content

Commit 37cd0c7

Browse files
committed
Fixes a case of ungraceful daemon restart + unreachable store
For ungraceful daemon restarts, libnetwork has sandbox cleanup logic to remove any stale and dangling resources. But if the store is down during the daemon restart, the cleanup logic cannot perform a complete cleanup. Previously, the sandbox was removed anyway in such cases. With this fix, we retain the sandbox if the store is down and the endpoint couldn't be cleaned up. When the container is later restarted in the docker daemon, we will perform a sandbox cleanup, and that will complete the cleanup round. Signed-off-by: Madhu Venugopal <madhu@docker.com>
1 parent 20351a8 commit 37cd0c7

3 files changed

Lines changed: 28 additions & 9 deletions

File tree

drivers/overlay/joinleave.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,12 @@ func (d *driver) Leave(nid, eid string) error {
118118
return fmt.Errorf("could not find network with id %s", nid)
119119
}
120120

121+
ep := n.endpoint(eid)
122+
123+
if ep == nil {
124+
return types.InternalMaskableErrorf("could not find endpoint with id %s", eid)
125+
}
126+
121127
if d.notifyCh != nil {
122128
d.notifyCh <- ovNotify{
123129
action: "leave",

sandbox.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,7 @@ func (sb *sandbox) Delete() error {
168168
c := sb.controller
169169

170170
// Detach from all endpoints
171+
retain := false
171172
for _, ep := range sb.getConnectedEndpoints() {
172173
// endpoint in the Gateway network will be cleaned up
173174
// when the sandbox no longer needs external connectivity
@@ -176,14 +177,22 @@ func (sb *sandbox) Delete() error {
176177
}
177178

178179
if err := ep.Leave(sb); err != nil {
180+
retain = true
179181
log.Warnf("Failed detaching sandbox %s from endpoint %s: %v\n", sb.ID(), ep.ID(), err)
180182
}
181183

182184
if err := ep.Delete(); err != nil {
185+
retain = true
183186
log.Warnf("Failed deleting endpoint %s: %v\n", ep.ID(), err)
184187
}
185188
}
186189

190+
if retain {
191+
sb.Lock()
192+
sb.inDelete = false
193+
sb.Unlock()
194+
return fmt.Errorf("could not cleanup all the endpoints in container %s / sandbox %s", sb.containerID, sb.id)
195+
}
187196
// Container is going away. Path cache in etchosts is most
188197
// likely not required any more. Drop it.
189198
etchosts.Drop(sb.config.hostsPath)

sandbox_store.go

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package libnetwork
33
import (
44
"container/heap"
55
"encoding/json"
6+
"sync"
67

78
"github.com/Sirupsen/logrus"
89
"github.com/docker/libnetwork/datastore"
@@ -119,8 +120,9 @@ func (sbs *sbState) DataScope() string {
119120

120121
func (sb *sandbox) storeUpdate() error {
121122
sbs := &sbState{
122-
c: sb.controller,
123-
ID: sb.id,
123+
c: sb.controller,
124+
ID: sb.id,
125+
Cid: sb.containerID,
124126
}
125127

126128
retry:
@@ -197,15 +199,17 @@ func (c *controller) sandboxCleanup() {
197199

198200
for _, eps := range sbs.Eps {
199201
n, err := c.getNetworkFromStore(eps.Nid)
202+
var ep *endpoint
200203
if err != nil {
201204
logrus.Errorf("getNetworkFromStore for nid %s failed while trying to build sandbox for cleanup: %v", eps.Nid, err)
202-
continue
203-
}
204-
205-
ep, err := n.getEndpointFromStore(eps.Eid)
206-
if err != nil {
207-
logrus.Errorf("getEndpointFromStore for eid %s failed while trying to build sandbox for cleanup: %v", eps.Eid, err)
208-
continue
205+
n = &network{id: eps.Nid, ctrlr: c, drvOnce: &sync.Once{}}
206+
ep = &endpoint{id: eps.Eid, network: n}
207+
} else {
208+
ep, err = n.getEndpointFromStore(eps.Eid)
209+
if err != nil {
210+
logrus.Errorf("getEndpointFromStore for eid %s failed while trying to build sandbox for cleanup: %v", eps.Eid, err)
211+
ep = &endpoint{id: eps.Eid, network: n}
212+
}
209213
}
210214

211215
heap.Push(&sb.endpoints, ep)

0 commit comments

Comments
 (0)