Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,8 @@ IsolationGroup defines the name of the zone as well as attributes for the zone configuration
| ----- | ----------- | ------ | -------- |
| name | Name is the value that will be used in StatefulSet labels, pod labels, and M3DB placement \"isolationGroup\" fields. | string | true |
| nodeAffinityTerms | NodeAffinityTerms is an array of NodeAffinityTerm requirements, which are ANDed together to indicate what nodes an isolation group can be assigned to. | [][NodeAffinityTerm](#nodeaffinityterm) | false |
| usePodAntiAffinity | UsePodAntiAffinity enables M3DB pod anti-affinity by using M3DB pod component labels to prevent multiple M3DB pods from being scheduled in the same failure domain, determined by podAffinityToplogyKey. | bool | false |
| podAffinityToplogyKey | PodAffinityToplogyKey defines the node label used for pod anti-affinity. This parameter is required when usePodAntiAffinity is set to true. | string | false |
| numInstances | NumInstances defines the number of instances. | int32 | true |
| storageClassName | StorageClassName is the name of the StorageClass to use for this isolation group. This allows ensuring that PVs will be created in the same zone as the pinned statefulset on Kubernetes < 1.12 (when topology aware volume scheduling was introduced). Only has effect if the clusters `dataDirVolumeClaimTemplate` is non-nil. If set, the volume claim template will have its storageClassName field overridden per-isolationgroup. If unset the storageClassName of the volumeClaimTemplate will be used. | string | false |

Expand Down
9 changes: 9 additions & 0 deletions pkg/apis/m3dboperator/v1alpha1/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -359,6 +359,15 @@ type IsolationGroup struct {
// to.
NodeAffinityTerms []NodeAffinityTerm `json:"nodeAffinityTerms,omitempty"`

// UsePodAntiAffinity enables M3DB pod anti-affinity by using M3DB pod
// component labels to prevent multiple M3DB pods from being scheduled in the
// same failure domain, determined by podAffinityToplogyKey.
UsePodAntiAffinity bool `json:"usePodAntiAffinity,omitempty"`

// PodAffinityToplogyKey defines the node label used for pod anti-affinity.
// This parameter is required when usePodAntiAffinity is set to true.
PodAffinityToplogyKey string `json:"podAffinityToplogyKey,omitempty"`

// NumInstances defines the number of instances.
NumInstances int32 `json:"numInstances"`

Expand Down
14 changes: 14 additions & 0 deletions pkg/apis/m3dboperator/v1alpha1/openapi_generated.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

73 changes: 62 additions & 11 deletions pkg/k8sops/m3db/statefulset.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,9 @@ const (
)

var (
errEmptyNodeAffinityKey = errors.New("node affinity term key cannot be empty")
errEmptyNodeAffinityValues = errors.New("node affinity term values cannot be empty")
errEmptyNodeAffinityKey = errors.New("node affinity term key cannot be empty")
errEmptyNodeAffinityValues = errors.New("node affinity term values cannot be empty")
errEmptyPodAffinityToplogyKey = errors.New("pod affinity toplogy key cannot be empty")
)

// NewBaseStatefulSet returns a base configured stateful set.
Expand Down Expand Up @@ -219,9 +220,39 @@ func generateDownwardAPIVolumeMount() v1.VolumeMount {
}
}

// GenerateStatefulSetAffinity generates a node affinity requiring a strict match for
// GenerateStatefulSetPodAntiAffinity builds the pod anti-affinity for the pods
// of an isolation group. The resulting term selects pods whose
// labels.Component label is "In" the single value labels.ComponentM3DBNode and
// uses isoGroup.PodAffinityToplogyKey as the topology key.
//
// It returns (nil, nil) when isoGroup.UsePodAntiAffinity is false, and
// errEmptyPodAffinityToplogyKey when anti-affinity is requested but no
// topology key was provided.
func GenerateStatefulSetPodAntiAffinity(isoGroup myspec.IsolationGroup) (*v1.PodAntiAffinity, error) {
	switch {
	case !isoGroup.UsePodAntiAffinity:
		// Anti-affinity not requested for this group: nothing to generate.
		return nil, nil
	case isoGroup.PodAffinityToplogyKey == "":
		// Anti-affinity requested, but there is no topology key to apply it to.
		return nil, errEmptyPodAffinityToplogyKey
	}

	componentSelector := &metav1.LabelSelector{
		MatchExpressions: []metav1.LabelSelectorRequirement{
			{
				Key:      labels.Component,
				Operator: "In",
				Values:   []string{labels.ComponentM3DBNode},
			},
		},
	}

	requiredTerm := v1.PodAffinityTerm{
		LabelSelector: componentSelector,
		TopologyKey:   isoGroup.PodAffinityToplogyKey,
	}

	antiAffinity := &v1.PodAntiAffinity{
		RequiredDuringSchedulingIgnoredDuringExecution: []v1.PodAffinityTerm{requiredTerm},
	}
	return antiAffinity, nil
}

// GenerateStatefulSetNodeAffinity generates a node affinity requiring a strict match for
// given key and values.
func GenerateStatefulSetAffinity(isoGroup myspec.IsolationGroup) (*v1.Affinity, error) {
func GenerateStatefulSetNodeAffinity(isoGroup myspec.IsolationGroup) (*v1.NodeAffinity, error) {
if len(isoGroup.NodeAffinityTerms) == 0 {
return nil, nil
}
Expand All @@ -242,19 +273,39 @@ func GenerateStatefulSetAffinity(isoGroup myspec.IsolationGroup) (*v1.Affinity,
}
}

return &v1.Affinity{
NodeAffinity: &v1.NodeAffinity{
RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{
NodeSelectorTerms: []v1.NodeSelectorTerm{
{
MatchExpressions: expressions,
},
return &v1.NodeAffinity{
RequiredDuringSchedulingIgnoredDuringExecution: &v1.NodeSelector{
NodeSelectorTerms: []v1.NodeSelectorTerm{
{
MatchExpressions: expressions,
},
},
},
}, nil
}

// GenerateStatefulSetAffinity assembles the affinity settings for the
// statefulset of an isolation group by combining the results of
// GenerateStatefulSetNodeAffinity and GenerateStatefulSetPodAntiAffinity.
//
// It returns (nil, nil) when the group has no node affinity terms and does not
// use pod anti-affinity. The node affinity is generated first, then the pod
// anti-affinity; the first error encountered is returned with a nil affinity.
func GenerateStatefulSetAffinity(isoGroup myspec.IsolationGroup) (*v1.Affinity, error) {
	hasNodeTerms := len(isoGroup.NodeAffinityTerms) > 0
	if !hasNodeTerms && !isoGroup.UsePodAntiAffinity {
		return nil, nil
	}

	var (
		combined v1.Affinity
		err      error
	)

	if combined.NodeAffinity, err = GenerateStatefulSetNodeAffinity(isoGroup); err != nil {
		return nil, err
	}

	if combined.PodAntiAffinity, err = GenerateStatefulSetPodAntiAffinity(isoGroup); err != nil {
		return nil, err
	}

	return &combined, nil
}

// GenerateOwnerRef generates an owner reference to a given m3db cluster.
func GenerateOwnerRef(cluster *myspec.M3DBCluster) *metav1.OwnerReference {
return metav1.NewControllerRef(cluster, schema.GroupVersionKind{
Expand Down
77 changes: 73 additions & 4 deletions pkg/k8sops/m3db/statefulset_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,10 @@ import (

myspec "github.com/m3db/m3db-operator/pkg/apis/m3dboperator/v1alpha1"

"github.com/m3db/m3db-operator/pkg/k8sops/labels"

corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
Expand Down Expand Up @@ -73,7 +76,7 @@ func TestGenerateDownwardAPIVolumePath(t *testing.T) {
assert.Equal(t, exp, vm)
}

func TestGenerateStatefulSetAffinity(t *testing.T) {
func TestGenerateStatefulSetNodeAffinity(t *testing.T) {
type expTerm struct {
key string
values []string
Expand Down Expand Up @@ -155,18 +158,18 @@ func TestGenerateStatefulSetAffinity(t *testing.T) {
}

for _, test := range tests {
affinity, err := GenerateStatefulSetAffinity(test.isoGroup)
nodeaffinity, err := GenerateStatefulSetNodeAffinity(test.isoGroup)
if test.expErr != nil {
assert.Equal(t, test.expErr, err)
continue
}

if len(test.expTerms) == 0 {
assert.Nil(t, affinity)
assert.Nil(t, nodeaffinity)
continue
}

terms := affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms
terms := nodeaffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms
assert.Len(t, terms, 1)

expTerms := make([]corev1.NodeSelectorRequirement, len(test.expTerms))
Expand All @@ -181,3 +184,69 @@ func TestGenerateStatefulSetAffinity(t *testing.T) {
assert.Equal(t, expTerms, terms[0].MatchExpressions)
}
}

// TestGenerateStatefulSetPodAntiAffinity checks GenerateStatefulSetPodAntiAffinity
// against isolation groups with pod anti-affinity unset, explicitly disabled,
// enabled with a topology key, and enabled without a topology key.
//
// For every case the returned error is checked. When an anti-affinity is
// expected, the test verifies that exactly one required term is produced, that
// its label selector matches the M3DB node component label, and that its
// topology key is the one configured on the isolation group.
func TestGenerateStatefulSetPodAntiAffinity(t *testing.T) {
	tests := []struct {
		isoGroup myspec.IsolationGroup
		// expBool reports whether pod anti-affinity is requested by isoGroup.
		expBool bool
		// expErr is the error expected from the generator, if any.
		expErr error
	}{
		{
			isoGroup: myspec.IsolationGroup{
				Name: "group1",
			},
			expBool: false,
		},
		{
			isoGroup: myspec.IsolationGroup{
				Name:               "group2",
				UsePodAntiAffinity: false,
			},
			expBool: false,
		},
		{
			isoGroup: myspec.IsolationGroup{
				Name:                  "group3",
				UsePodAntiAffinity:    true,
				PodAffinityToplogyKey: "hostname",
			},
			expBool: true,
		},
		{
			isoGroup: myspec.IsolationGroup{
				Name:               "group4",
				UsePodAntiAffinity: true,
			},
			expBool: true,
			expErr:  errEmptyPodAffinityToplogyKey,
		},
	}

	for _, test := range tests {
		antiaffinity, err := GenerateStatefulSetPodAntiAffinity(test.isoGroup)

		if test.expErr != nil {
			assert.Equal(t, test.expErr, err)
			// No partially built result should accompany an error.
			assert.Nil(t, antiaffinity)
			continue
		}

		// Every remaining case must succeed; previously err went unchecked here.
		require.NoError(t, err)

		if !test.expBool {
			assert.Nil(t, antiaffinity)
			continue
		}

		// Guard the dereference and indexing below so a regression is reported
		// as a test failure rather than a panic.
		require.NotNil(t, antiaffinity)
		terms := antiaffinity.RequiredDuringSchedulingIgnoredDuringExecution
		require.Len(t, terms, 1)

		expSelector := &metav1.LabelSelector{
			MatchExpressions: []metav1.LabelSelectorRequirement{
				{
					Key:      labels.Component,
					Operator: "In",
					Values:   []string{labels.ComponentM3DBNode},
				},
			},
		}

		assert.Equal(t, expSelector, terms[0].LabelSelector)
		assert.Equal(t, test.isoGroup.PodAffinityToplogyKey, terms[0].TopologyKey)
	}
}