Skip to content

Commit 86ae3cf

Browse files
authored
Merge pull request #1796 from fcrisciani/name-resolution-race
Service discovery hardening
2 parents eb57059 + d64e71e commit 86ae3cf

12 files changed

Lines changed: 660 additions & 158 deletions

agent.go

Lines changed: 84 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -583,7 +583,7 @@ func (ep *endpoint) deleteDriverInfoFromCluster() error {
583583
return nil
584584
}
585585

586-
func (ep *endpoint) addServiceInfoToCluster() error {
586+
func (ep *endpoint) addServiceInfoToCluster(sb *sandbox) error {
587587
if ep.isAnonymous() && len(ep.myAliases) == 0 || ep.Iface().Address() == nil {
588588
return nil
589589
}
@@ -593,24 +593,49 @@ func (ep *endpoint) addServiceInfoToCluster() error {
593593
return nil
594594
}
595595

596+
sb.Service.Lock()
597+
defer sb.Service.Unlock()
598+
logrus.Debugf("addServiceInfoToCluster START for %s %s", ep.svcName, ep.ID())
599+
600+
// Check that the endpoint is still present on the sandbox before adding it to the service discovery.
601+
// This is to handle a race between the EnableService and the sbLeave
602+
// It is possible that the EnableService starts, fetches the list of the endpoints and
603+
// by the time the addServiceInfoToCluster is called the endpoint got removed from the sandbox
604+
// The risk is that the deleteServiceInfoToCluster happens before the addServiceInfoToCluster.
605+
// This check under the Service lock of the sandbox ensure the correct behavior.
606+
// If the addServiceInfoToCluster arrives first may find or not the endpoint and will proceed or exit
607+
// but in any case the deleteServiceInfoToCluster will follow doing the cleanup if needed.
608+
// In case the deleteServiceInfoToCluster arrives first, this one is happening after the endpoint is
609+
// removed from the list, in this situation the delete will bail out not finding any data to cleanup
610+
// and the add will bail out not finding the endpoint on the sandbox.
611+
if e := sb.getEndpoint(ep.ID()); e == nil {
612+
logrus.Warnf("addServiceInfoToCluster suppressing service resolution ep is not anymore in the sandbox %s", ep.ID())
613+
return nil
614+
}
615+
596616
c := n.getController()
597617
agent := c.getAgent()
598618

619+
name := ep.Name()
620+
if ep.isAnonymous() {
621+
name = ep.MyAliases()[0]
622+
}
623+
599624
var ingressPorts []*PortConfig
600625
if ep.svcID != "" {
626+
// This is a task part of a service
601627
// Gossip ingress ports only in ingress network.
602628
if n.ingress {
603629
ingressPorts = ep.ingressPorts
604630
}
605-
606-
if err := c.addServiceBinding(ep.svcName, ep.svcID, n.ID(), ep.ID(), ep.virtualIP, ingressPorts, ep.svcAliases, ep.Iface().Address().IP); err != nil {
631+
if err := c.addServiceBinding(ep.svcName, ep.svcID, n.ID(), ep.ID(), name, ep.virtualIP, ingressPorts, ep.svcAliases, ep.myAliases, ep.Iface().Address().IP, "addServiceInfoToCluster"); err != nil {
632+
return err
633+
}
634+
} else {
635+
// This is a container simply attached to an attachable network
636+
if err := c.addContainerNameResolution(n.ID(), ep.ID(), name, ep.myAliases, ep.Iface().Address().IP, "addServiceInfoToCluster"); err != nil {
607637
return err
608638
}
609-
}
610-
611-
name := ep.Name()
612-
if ep.isAnonymous() {
613-
name = ep.MyAliases()[0]
614639
}
615640

616641
buf, err := proto.Marshal(&EndpointRecord{
@@ -634,10 +659,12 @@ func (ep *endpoint) addServiceInfoToCluster() error {
634659
}
635660
}
636661

662+
logrus.Debugf("addServiceInfoToCluster END for %s %s", ep.svcName, ep.ID())
663+
637664
return nil
638665
}
639666

640-
func (ep *endpoint) deleteServiceInfoFromCluster() error {
667+
func (ep *endpoint) deleteServiceInfoFromCluster(sb *sandbox, method string) error {
641668
if ep.isAnonymous() && len(ep.myAliases) == 0 {
642669
return nil
643670
}
@@ -647,17 +674,33 @@ func (ep *endpoint) deleteServiceInfoFromCluster() error {
647674
return nil
648675
}
649676

677+
sb.Service.Lock()
678+
defer sb.Service.Unlock()
679+
logrus.Debugf("deleteServiceInfoFromCluster from %s START for %s %s", method, ep.svcName, ep.ID())
680+
650681
c := n.getController()
651682
agent := c.getAgent()
652683

653-
if ep.svcID != "" && ep.Iface().Address() != nil {
654-
var ingressPorts []*PortConfig
655-
if n.ingress {
656-
ingressPorts = ep.ingressPorts
657-
}
684+
name := ep.Name()
685+
if ep.isAnonymous() {
686+
name = ep.MyAliases()[0]
687+
}
658688

659-
if err := c.rmServiceBinding(ep.svcName, ep.svcID, n.ID(), ep.ID(), ep.virtualIP, ingressPorts, ep.svcAliases, ep.Iface().Address().IP); err != nil {
660-
return err
689+
if ep.Iface().Address() != nil {
690+
if ep.svcID != "" {
691+
// This is a task part of a service
692+
var ingressPorts []*PortConfig
693+
if n.ingress {
694+
ingressPorts = ep.ingressPorts
695+
}
696+
if err := c.rmServiceBinding(ep.svcName, ep.svcID, n.ID(), ep.ID(), name, ep.virtualIP, ingressPorts, ep.svcAliases, ep.myAliases, ep.Iface().Address().IP, "deleteServiceInfoFromCluster"); err != nil {
697+
return err
698+
}
699+
} else {
700+
// This is a container simply attached to an attachable network
701+
if err := c.delContainerNameResolution(n.ID(), ep.ID(), name, ep.myAliases, ep.Iface().Address().IP, "deleteServiceInfoFromCluster"); err != nil {
702+
return err
703+
}
661704
}
662705
}
663706

@@ -667,6 +710,8 @@ func (ep *endpoint) deleteServiceInfoFromCluster() error {
667710
}
668711
}
669712

713+
logrus.Debugf("deleteServiceInfoFromCluster from %s END for %s %s", method, ep.svcName, ep.ID())
714+
670715
return nil
671716
}
672717

@@ -814,58 +859,56 @@ func (c *controller) handleEpTableEvent(ev events.Event) {
814859
value = event.Value
815860
case networkdb.UpdateEvent:
816861
logrus.Errorf("Unexpected update service table event = %#v", event)
817-
}
818-
819-
nw, err := c.NetworkByID(nid)
820-
if err != nil {
821-
logrus.Errorf("Could not find network %s while handling service table event: %v", nid, err)
822862
return
823863
}
824-
n := nw.(*network)
825864

826-
err = proto.Unmarshal(value, &epRec)
865+
err := proto.Unmarshal(value, &epRec)
827866
if err != nil {
828867
logrus.Errorf("Failed to unmarshal service table value: %v", err)
829868
return
830869
}
831870

832-
name := epRec.Name
871+
containerName := epRec.Name
833872
svcName := epRec.ServiceName
834873
svcID := epRec.ServiceID
835874
vip := net.ParseIP(epRec.VirtualIP)
836875
ip := net.ParseIP(epRec.EndpointIP)
837876
ingressPorts := epRec.IngressPorts
838-
aliases := epRec.Aliases
839-
taskaliases := epRec.TaskAliases
877+
serviceAliases := epRec.Aliases
878+
taskAliases := epRec.TaskAliases
840879

841-
if name == "" || ip == nil {
880+
if containerName == "" || ip == nil {
842881
logrus.Errorf("Invalid endpoint name/ip received while handling service table event %s", value)
843882
return
844883
}
845884

846885
if isAdd {
886+
logrus.Debugf("handleEpTableEvent ADD %s R:%v", isAdd, eid, epRec)
847887
if svcID != "" {
848-
if err := c.addServiceBinding(svcName, svcID, nid, eid, vip, ingressPorts, aliases, ip); err != nil {
849-
logrus.Errorf("Failed adding service binding for value %s: %v", value, err)
888+
// This is a remote task part of a service
889+
if err := c.addServiceBinding(svcName, svcID, nid, eid, containerName, vip, ingressPorts, serviceAliases, taskAliases, ip, "handleEpTableEvent"); err != nil {
890+
logrus.Errorf("failed adding service binding for %s epRec:%v err:%s", eid, epRec, err)
850891
return
851892
}
852-
}
853-
854-
n.addSvcRecords(name, ip, nil, true)
855-
for _, alias := range taskaliases {
856-
n.addSvcRecords(alias, ip, nil, true)
893+
} else {
894+
// This is a remote container simply attached to an attachable network
895+
if err := c.addContainerNameResolution(nid, eid, containerName, taskAliases, ip, "handleEpTableEvent"); err != nil {
896+
logrus.Errorf("failed adding service binding for %s epRec:%v err:%s", eid, epRec, err)
897+
}
857898
}
858899
} else {
900+
logrus.Debugf("handleEpTableEvent DEL %s R:%v", isAdd, eid, epRec)
859901
if svcID != "" {
860-
if err := c.rmServiceBinding(svcName, svcID, nid, eid, vip, ingressPorts, aliases, ip); err != nil {
861-
logrus.Errorf("Failed adding service binding for value %s: %v", value, err)
902+
// This is a remote task part of a service
903+
if err := c.rmServiceBinding(svcName, svcID, nid, eid, containerName, vip, ingressPorts, serviceAliases, taskAliases, ip, "handleEpTableEvent"); err != nil {
904+
logrus.Errorf("failed removing service binding for %s epRec:%v err:%s", eid, epRec, err)
862905
return
863906
}
864-
}
865-
866-
n.deleteSvcRecords(name, ip, nil, true)
867-
for _, alias := range taskaliases {
868-
n.deleteSvcRecords(alias, ip, nil, true)
907+
} else {
908+
// This is a remote container simply attached to an attachable network
909+
if err := c.delContainerNameResolution(nid, eid, containerName, taskAliases, ip, "handleEpTableEvent"); err != nil {
910+
logrus.Errorf("failed adding service binding for %s epRec:%v err:%s", eid, epRec, err)
911+
}
869912
}
870913
}
871914
}

agent.proto

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ option (gogoproto.goproto_stringer_all) = false;
1414
// EndpointRecord specifies all the endpoint specific information that
1515
// needs to gossiped to nodes participating in the network.
1616
message EndpointRecord {
17-
// Name of the endpoint
17+
// Name of the container
1818
string name = 1;
1919

2020
// Service name of the service to which this endpoint belongs.

common/setmatrix.go

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
package common
2+
3+
import (
4+
"sync"
5+
6+
mapset "github.com/deckarep/golang-set"
7+
)
8+
9+
// SetMatrix is a map of Sets
10+
type SetMatrix interface {
11+
// Get returns the members of the set for a specific key as a slice.
12+
Get(key string) ([]interface{}, bool)
13+
// Contains is used to verify is an element is in a set for a specific key
14+
// returns true if the element is in the set
15+
// returns true if there is a set for the key
16+
Contains(key string, value interface{}) (bool, bool)
17+
// Insert inserts the mapping between the IP and the endpoint identifier
18+
// returns true if the mapping was not present, false otherwise
19+
// returns also the number of endpoints associated to the IP
20+
Insert(key string, value interface{}) (bool, int)
21+
// Remove removes the mapping between the IP and the endpoint identifier
22+
// returns true if the mapping was deleted, false otherwise
23+
// returns also the number of endpoints associated to the IP
24+
Remove(key string, value interface{}) (bool, int)
25+
// Cardinality returns the number of elements in the set of a specfic key
26+
// returns false if the key is not in the map
27+
Cardinality(key string) (int, bool)
28+
// String returns the string version of the set, empty otherwise
29+
// returns false if the key is not in the map
30+
String(key string) (string, bool)
31+
}
32+
33+
type setMatrix struct {
34+
matrix map[string]mapset.Set
35+
36+
sync.Mutex
37+
}
38+
39+
// NewSetMatrix creates a new set matrix object
40+
func NewSetMatrix() SetMatrix {
41+
s := &setMatrix{}
42+
s.init()
43+
return s
44+
}
45+
46+
func (s *setMatrix) init() {
47+
s.matrix = make(map[string]mapset.Set)
48+
}
49+
50+
func (s *setMatrix) Get(key string) ([]interface{}, bool) {
51+
s.Lock()
52+
defer s.Unlock()
53+
set, ok := s.matrix[key]
54+
if !ok {
55+
return nil, ok
56+
}
57+
return set.ToSlice(), ok
58+
}
59+
60+
func (s *setMatrix) Contains(key string, value interface{}) (bool, bool) {
61+
s.Lock()
62+
defer s.Unlock()
63+
set, ok := s.matrix[key]
64+
if !ok {
65+
return false, ok
66+
}
67+
return set.Contains(value), ok
68+
}
69+
70+
func (s *setMatrix) Insert(key string, value interface{}) (bool, int) {
71+
s.Lock()
72+
defer s.Unlock()
73+
set, ok := s.matrix[key]
74+
if !ok {
75+
s.matrix[key] = mapset.NewSet()
76+
s.matrix[key].Add(value)
77+
return true, 1
78+
}
79+
80+
return set.Add(value), set.Cardinality()
81+
}
82+
83+
func (s *setMatrix) Remove(key string, value interface{}) (bool, int) {
84+
s.Lock()
85+
defer s.Unlock()
86+
set, ok := s.matrix[key]
87+
if !ok {
88+
return false, 0
89+
}
90+
91+
var removed bool
92+
if set.Contains(value) {
93+
set.Remove(value)
94+
removed = true
95+
// If the set is empty remove it from the matrix
96+
if set.Cardinality() == 0 {
97+
delete(s.matrix, key)
98+
}
99+
}
100+
101+
return removed, set.Cardinality()
102+
}
103+
104+
func (s *setMatrix) Cardinality(key string) (int, bool) {
105+
s.Lock()
106+
defer s.Unlock()
107+
set, ok := s.matrix[key]
108+
if !ok {
109+
return 0, ok
110+
}
111+
112+
return set.Cardinality(), ok
113+
}
114+
115+
func (s *setMatrix) String(key string) (string, bool) {
116+
s.Lock()
117+
defer s.Unlock()
118+
set, ok := s.matrix[key]
119+
if !ok {
120+
return "", ok
121+
}
122+
return set.String(), ok
123+
}

0 commit comments

Comments
 (0)