initial cruise-control

double16 · double16 · commit c583a49ff0f7 · 2018-10-31T15:55:23.000-05:00
diff --git a/cruise-control/10broker-cruise-control-reporter-config.yml b/cruise-control/10broker-cruise-control-reporter-config.yml
@@ -0,0 +1,19 @@
+# ConfigMap that ships the init script which installs the Cruise Control
+# metrics reporter jar and enables it in the broker's server.properties.
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: broker-cruise-control-reporter-config
+  namespace: kafka
+data:
+  cruise-control-reporter-init.sh: |-
+    #!/bin/sh
+    # POSIX sh only: the StatefulSet patch in this change runs the script via /bin/sh.
+    # -x traces each command, -e aborts on the first failing command.
+    set -xe
+    VERSION=2.0.6
+    JAR=/opt/kafka/libs/extensions/cruise-control-metrics-reporter.jar
+    # --fail: exit non-zero on an HTTP error instead of saving the error page as the jar.
+    curl --fail -L -o "${JAR}" "https://linkedin.jfrog.io/linkedin/cruise-control/com/linkedin/cruisecontrol/cruise-control-metrics-reporter/${VERSION}/cruise-control-metrics-reporter-${VERSION}.jar"
+    # Append the reporter setting; printf emits the same bytes `echo -e` did, portably.
+    printf '\n\nmetric.reporters = com.linkedin.kafka.cruisecontrol.metricsreporter.CruiseControlMetricsReporter\n' >> /etc/kafka/server.properties
diff --git a/cruise-control/11cruise-control-config.yml b/cruise-control/11cruise-control-config.yml
@@ -0,0 +1,311 @@
+kind: ConfigMap
+metadata:
+  name: broker-cruise-control-config
+  namespace: kafka
+apiVersion: v1
+data:
+  cruisecontrol.properties: |-
+    #
+    # Copyright 2017 LinkedIn Corp. Licensed under the BSD 2-Clause License (the "License"). See License in the project root for license information.
+    #
+
+    # This is an example property file for Kafka Cruise Control. See KafkaCruiseControlConfig for more details.
+
+    # Configuration for the metadata client.
+    # =======================================
+
+    # The Kafka cluster to control.
+    bootstrap.servers=bootstrap:9092
+
+    # The maximum interval in milliseconds between two metadata refreshes.
+    #metadata.max.age.ms=300000
+
+    # Client id for the Cruise Control. It is used for the metadata client.
+    #client.id=kafka-cruise-control
+
+    # The size in bytes of the TCP send buffer for the metadata client.
+    #send.buffer.bytes=131072
+
+    # The size in bytes of the TCP receive buffer for the metadata client.
+    #receive.buffer.bytes=131072
+
+    # The time to wait before disconnecting an idle TCP connection.
+    #connections.max.idle.ms=540000
+
+    # The time to wait before reconnecting to a given host.
+    #reconnect.backoff.ms=50
+
+    # The time to wait for a response from a host after sending a request.
+    #request.timeout.ms=30000
+
+
+    # Configurations for the load monitor
+    # =======================================
+
+    # The number of metric fetcher threads used to fetch metrics for the Kafka cluster
+    num.metric.fetchers=1
+
+    # The metric sampler class
+    metric.sampler.class=com.linkedin.kafka.cruisecontrol.monitor.sampling.CruiseControlMetricsReporterSampler
+    # Configurations for CruiseControlMetricsReporterSampler
+    metric.reporter.topic.pattern=__CruiseControlMetrics
+
+    # The sample store class name
+    sample.store.class=com.linkedin.kafka.cruisecontrol.monitor.sampling.KafkaSampleStore
+
+    # The config for the Kafka sample store to save the partition metric samples
+    partition.metric.sample.store.topic=__KafkaCruiseControlPartitionMetricSamples
+
+    # The config for the Kafka sample store to save the model training samples
+    broker.metric.sample.store.topic=__KafkaCruiseControlModelTrainingSamples
+
+    # The replication factor of Kafka metric sample store topic
+    sample.store.topic.replication.factor=2
+
+    # The config for the number of Kafka sample store consumer threads
+    num.sample.loading.threads=8
+
+    # The partition assignor class for the metric samplers
+    metric.sampler.partition.assignor.class=com.linkedin.kafka.cruisecontrol.monitor.sampling.DefaultMetricSamplerPartitionAssignor
+
+    # The metric sampling interval in milliseconds
+    metric.sampling.interval.ms=120000
+
+    # The partition metrics window size in milliseconds
+    partition.metrics.window.ms=300000
+
+    # The number of partition metric windows to keep in memory
+    num.partition.metrics.windows=1
+
+    # The minimum partition metric samples required for a partition in each window
+    min.samples.per.partition.metrics.window=1
+
+    # The broker metrics window size in milliseconds
+    broker.metrics.window.ms=300000
+
+    # The number of broker metric windows to keep in memory
+    num.broker.metrics.windows=20
+
+    # The minimum broker metric samples required for a broker in each window
+    min.samples.per.broker.metrics.window=1
+
+    # The configuration for the BrokerCapacityConfigFileResolver (supports JBOD and non-JBOD broker capacities)
+    capacity.config.file=config/capacity.json
+    #capacity.config.file=config/capacityJBOD.json
+
+    # Configurations for the analyzer
+    # =======================================
+
+    # The list of goals to optimize the Kafka cluster for with pre-computed proposals
+    default.goals=com.linkedin.kafka.cruisecontrol.analyzer.goals.RackAwareGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.ReplicaCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.DiskCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkInboundCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkOutboundCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.CpuCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.ReplicaDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.PotentialNwOutGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.DiskUsageDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkInboundUsageDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkOutboundUsageDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.CpuUsageDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.TopicReplicaDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.LeaderBytesInDistributionGoal
+
+    # The list of supported goals
+    goals=com.linkedin.kafka.cruisecontrol.analyzer.goals.RackAwareGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.ReplicaCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.DiskCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkInboundCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkOutboundCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.CpuCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.ReplicaDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.PotentialNwOutGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.DiskUsageDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkInboundUsageDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkOutboundUsageDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.CpuUsageDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.TopicReplicaDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.LeaderBytesInDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.kafkaassigner.KafkaAssignerDiskUsageDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.kafkaassigner.KafkaAssignerEvenRackAwareGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.PreferredLeaderElectionGoal
+
+    # The list of supported hard goals
+    hard.goals=com.linkedin.kafka.cruisecontrol.analyzer.goals.RackAwareGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.ReplicaCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.DiskCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkInboundCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkOutboundCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.CpuCapacityGoal
+
+    # The minimum percentage of well monitored partitions out of all the partitions
+    min.monitored.partition.percentage=0.95
+
+    # The balance threshold for CPU
+    cpu.balance.threshold=1.1
+
+    # The balance threshold for disk
+    disk.balance.threshold=1.1
+
+    # The balance threshold for network inbound utilization
+    network.inbound.balance.threshold=1.1
+
+    # The balance threshold for network outbound utilization
+    network.outbound.balance.threshold=1.1
+
+    # The balance threshold for the replica count
+    replica.count.balance.threshold=1.1
+
+    # The capacity threshold for CPU in percentage
+    cpu.capacity.threshold=0.8
+
+    # The capacity threshold for disk in percentage
+    disk.capacity.threshold=0.8
+
+    # The capacity threshold for network inbound utilization in percentage
+    network.inbound.capacity.threshold=0.8
+
+    # The capacity threshold for network outbound utilization in percentage
+    network.outbound.capacity.threshold=0.8
+
+    # The threshold to define the cluster to be in a low CPU utilization state
+    cpu.low.utilization.threshold=0.0
+
+    # The threshold to define the cluster to be in a low disk utilization state
+    disk.low.utilization.threshold=0.0
+
+    # The threshold to define the cluster to be in a low network inbound utilization state
+    network.inbound.low.utilization.threshold=0.0
+
+    # The threshold to define the cluster to be in a low network outbound utilization state
+    network.outbound.low.utilization.threshold=0.0
+
+    # The metric anomaly percentile upper threshold
+    metric.anomaly.percentile.upper.threshold=90.0
+
+    # The metric anomaly percentile lower threshold
+    metric.anomaly.percentile.lower.threshold=10.0
+
+    # How often should the cached proposal be expired and recalculated if necessary
+    proposal.expiration.ms=60000
+
+    # The maximum number of replicas that can reside on a broker at any given time.
+    max.replicas.per.broker=10000
+
+    # The number of threads to use for proposal candidate precomputing.
+    num.proposal.precompute.threads=1
+
+    # The topics that should be excluded from partition movement.
+    #topics.excluded.from.partition.movement
+
+    # Configurations for the executor
+    # =======================================
+
+    # The zookeeper connect of the Kafka cluster
+    zookeeper.connect=zookeeper:2181
+
+    # The max number of partitions to move in/out on a given broker at a given time.
+    num.concurrent.partition.movements.per.broker=10
+
+    # The interval between two execution progress checks.
+    execution.progress.check.interval.ms=10000
+
+
+    # Configurations for anomaly detector
+    # =======================================
+
+    # The goal violation notifier class
+    anomaly.notifier.class=com.linkedin.kafka.cruisecontrol.detector.notifier.SelfHealingNotifier
+
+    # The metric anomaly finder class
+    metric.anomaly.finder.class=com.linkedin.kafka.cruisecontrol.detector.KafkaMetricAnomalyFinder
+
+    # The anomaly detection interval
+    anomaly.detection.interval.ms=10000
+
+    # The goal violation to detect.
+    anomaly.detection.goals=com.linkedin.kafka.cruisecontrol.analyzer.goals.RackAwareGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.ReplicaCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.DiskCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkInboundCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkOutboundCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.CpuCapacityGoal
+
+    # The metrics of interest for the metric anomaly analyzer.
+    metric.anomaly.analyzer.metrics=BROKER_PRODUCE_LOCAL_TIME_MS_MAX,BROKER_PRODUCE_LOCAL_TIME_MS_MEAN,BROKER_CONSUMER_FETCH_LOCAL_TIME_MS_MAX,BROKER_CONSUMER_FETCH_LOCAL_TIME_MS_MEAN,BROKER_FOLLOWER_FETCH_LOCAL_TIME_MS_MAX,BROKER_FOLLOWER_FETCH_LOCAL_TIME_MS_MEAN,BROKER_LOG_FLUSH_TIME_MS_MAX,BROKER_LOG_FLUSH_TIME_MS_MEAN
+
+    # The zk path to store failed broker information.
+    failed.brokers.zk.path=/CruiseControlBrokerList
+
+    # Topic config provider class
+    topic.config.provider.class=com.linkedin.kafka.cruisecontrol.config.KafkaTopicConfigProvider
+
+    # The cluster configurations for the KafkaTopicConfigProvider
+    cluster.configs.file=config/clusterConfigs.json
+
+    # Enable self healing for all anomaly detectors, unless the particular anomaly detector is explicitly disabled
+    self.healing.enabled=true
+
+    # Enable self healing for broker failure detector
+    #self.healing.broker.failure.enabled=true
+
+    # Enable self healing for goal violation detector
+    #self.healing.goal.violation.enabled=true
+
+    # Enable self healing for metric anomaly detector
+    #self.healing.metric.anomaly.enabled=true
+
+  capacityJBOD.json: |-  # JBOD (per-logDir) capacities; the capacity.config.file line naming this file is commented out above
+    {
+      "brokerCapacities":[
+        {
+          "brokerId": "-1",
+          "capacity": {
+            "DISK": {"/tmp/kafka-logs-1": "100000", "/tmp/kafka-logs-2": "100000", "/tmp/kafka-logs-3": "50000",
+              "/tmp/kafka-logs-4": "50000", "/tmp/kafka-logs-5": "150000", "/tmp/kafka-logs-6": "50000"},
+            "CPU": "100",
+            "NW_IN": "10000",
+            "NW_OUT": "10000"
+          },
+          "doc": "The default capacity for a broker with multiple logDirs each on a separate heterogeneous disk."
+        },
+        {
+          "brokerId": "0",
+          "capacity": {
+            "DISK": {"/tmp/kafka-logs": "500000"},
+            "CPU": "100",
+            "NW_IN": "50000",
+            "NW_OUT": "50000"
+          },
+          "doc": "This overrides the capacity for broker 0. This broker is not a JBOD broker."
+        },
+        {
+          "brokerId": "1",
+          "capacity": {
+            "DISK": {"/tmp/kafka-logs-1": "250000", "/tmp/kafka-logs-2": "250000"},
+            "CPU": "100",
+            "NW_IN": "50000",
+            "NW_OUT": "50000"
+          },
+          "doc": "This overrides the capacity for broker 1. This broker is a JBOD broker."
+        }
+      ]
+    }
+
+  capacity.json: |-  # broker capacities; presumably the file named by capacity.config.file=config/capacity.json (confirm the mount path)
+    {
+      "brokerCapacities":[
+        {
+          "brokerId": "-1",
+          "capacity": {
+            "DISK": "100000",
+            "CPU": "100",
+            "NW_IN": "10000",
+            "NW_OUT": "10000"
+          },
+          "doc": "This is the default capacity. Capacity unit used for disk is in MB, cpu is in percentage, network throughput is in KB."
+        },
+        {
+          "brokerId": "0",
+          "capacity": {
+            "DISK": "500000",
+            "CPU": "100",
+            "NW_IN": "50000",
+            "NW_OUT": "50000"
+          },
+          "doc": "This overrides the capacity for broker 0."
+        }
+      ]
+    }
+
+  clusterConfigs.json: |-  # presumably the file named by cluster.configs.file=config/clusterConfigs.json (confirm the mount path)
+    {
+      "min.insync.replicas": 1,
+      "an.example.cluster.config": false
+    }
+
+  log4j2.xml: |-  # Log4j 2 configuration: root logger at info level, written to standard output
+    <?xml version="1.0" encoding="UTF-8"?>
+    <Configuration status="INFO">
+        <Appenders>
+            <Console name="Console" target="SYSTEM_OUT"> <!-- write to stdout directly instead of opening /dev/stdout as a file -->
+                <PatternLayout pattern="%d{yyyy-MM-dd HH:mm:ss.SSS} [%t] %-5level %logger{36} - %msg%n"/>
+            </Console>
+        </Appenders>
+        <Loggers>
+            <Root level="info">
+                <AppenderRef ref="Console" />
+            </Root>
+        </Loggers>
+    </Configuration>
+
+  log4j.properties: |-
+    log4j.rootLogger = INFO, stdout
+    # ConsoleAppender writes to System.out directly; no /dev/stdout file handle is needed.
+    log4j.appender.stdout=org.apache.log4j.ConsoleAppender
+    log4j.appender.stdout.Target=System.out
+    # Layout columns: elapsed ms, thread, level, logger, NDC, message.
+    log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
+    log4j.appender.stdout.layout.conversionPattern=%-6r [%15.15t] %-5p %30.30c %x - %m%n
diff --git a/cruise-control/20kafka-broker-reporter-patch.yml b/cruise-control/20kafka-broker-reporter-patch.yml
@@ -0,0 +1,44 @@
+# Strategic-merge patch for the `kafka` StatefulSet. Meant to be applied using:
+#   kubectl --namespace kafka patch statefulset kafka --patch "$(cat cruise-control/20kafka-broker-reporter-patch.yml )"
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  namespace: kafka
+  name: kafka
+spec:
+  template:
+    spec:
+      initContainers:
+        # Runs the init script from the ConfigMap; it can write to the config and extensions volumes.
+        - name: cruise-control-reporter
+          image: hortonworks/alpine-curl:3.1
+          command:
+            - /bin/sh
+            - /tmp/cruise-control-reporter-configmap/cruise-control-reporter-init.sh
+          volumeMounts:
+            - name: cruiseconfigmap
+              mountPath: /tmp/cruise-control-reporter-configmap
+            # `config` is not declared in this patch; presumably the base StatefulSet defines it.
+            - name: config
+              mountPath: /etc/kafka
+            - name: extensions
+              mountPath: /opt/kafka/libs/extensions
+      # Patch directive giving the init container order: init-config, then cruise-control-reporter.
+      $setElementOrder/initContainers:
+        - name: init-config
+        - name: cruise-control-reporter
+      containers:
+        - name: broker
+          env:
+            - name: CLASSPATH
+              value: /opt/kafka/libs/extensions/cruise-control-metrics-reporter.jar
+          volumeMounts:
+            - name: extensions
+              mountPath: /opt/kafka/libs/extensions
+      volumes:
+        - name: cruiseconfigmap
+          configMap:
+            name: broker-cruise-control-reporter-config
+        # emptyDir shared between the init container and the broker container.
+        - name: extensions
+          emptyDir: {}
diff --git a/cruise-control/40cruise-control-service.yml b/cruise-control/40cruise-control-service.yml
@@ -0,0 +1,14 @@
+# Service `cruise-control` in namespace `kafka`: forwards TCP port 8090 to
+# port 8090 of the pods labelled app=cruise-control.
+apiVersion: v1
+kind: Service
+metadata:
+  namespace: kafka
+  name: cruise-control
+spec:
+  ports:
+    - port: 8090
+      targetPort: 8090
+      protocol: TCP
+  selector:
+    app: cruise-control
diff --git a/cruise-control/50cruise-control.yml b/cruise-control/50cruise-control.yml