Skip to content

Commit c805f9b

Browse files
authored
Merge pull request #218 from double16/cruise-control
Add example of linkedin/cruise-control setup, fixes #100
2 parents 72a5116 + 1d52996 commit c805f9b

File tree

7 files changed

+530
-0
lines changed

7 files changed

+530
-0
lines changed
Lines changed: 380 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,380 @@
1+
kind: ConfigMap
2+
metadata:
3+
name: broker-cruise-control-config
4+
namespace: kafka
5+
apiVersion: v1
6+
data:
7+
cruisecontrol.properties: |-
8+
#
9+
# Copyright 2017 LinkedIn Corp. Licensed under the BSD 2-Clause License (the "License"). See License in the project root for license information.
10+
#
11+
12+
# This is an example property file for Kafka Cruise Control. See KafkaCruiseControlConfig for more details.
13+
14+
# Configuration for the metadata client.
15+
# =======================================
16+
17+
# The Kafka cluster to control.
18+
bootstrap.servers=bootstrap:9092
19+
20+
# The maximum interval in milliseconds between two metadata refreshes.
21+
#metadata.max.age.ms=300000
22+
23+
# Client id for the Cruise Control. It is used for the metadata client.
24+
#client.id=kafka-cruise-control
25+
26+
# The size of TCP send buffer bytes for the metadata client.
27+
#send.buffer.bytes=131072
28+
29+
# The size of TCP receive buffer size for the metadata client.
30+
#receive.buffer.bytes=131072
31+
32+
# The time to wait before disconnecting an idle TCP connection.
33+
#connections.max.idle.ms=540000
34+
35+
# The time to wait before reconnect to a given host.
36+
#reconnect.backoff.ms=50
37+
38+
# The time to wait for a response from a host after sending a request.
39+
#request.timeout.ms=30000
40+
41+
42+
# Configurations for the load monitor
43+
# =======================================
44+
45+
# The number of metric fetcher threads to fetch metrics for the Kafka cluster
46+
num.metric.fetchers=1
47+
48+
# The metric sampler class
49+
metric.sampler.class=com.linkedin.kafka.cruisecontrol.monitor.sampling.CruiseControlMetricsReporterSampler
50+
# Configurations for CruiseControlMetricsReporterSampler
51+
metric.reporter.topic.pattern=__CruiseControlMetrics
52+
53+
# The sample store class name
54+
sample.store.class=com.linkedin.kafka.cruisecontrol.monitor.sampling.KafkaSampleStore
55+
56+
# The config for the Kafka sample store to save the partition metric samples
57+
partition.metric.sample.store.topic=__KafkaCruiseControlPartitionMetricSamples
58+
59+
# The config for the Kafka sample store to save the model training samples
60+
broker.metric.sample.store.topic=__KafkaCruiseControlModelTrainingSamples
61+
62+
# The replication factor of Kafka metric sample store topic
63+
sample.store.topic.replication.factor=2
64+
65+
# The config for the number of Kafka sample store consumer threads
66+
num.sample.loading.threads=8
67+
68+
# The partition assignor class for the metric samplers
69+
metric.sampler.partition.assignor.class=com.linkedin.kafka.cruisecontrol.monitor.sampling.DefaultMetricSamplerPartitionAssignor
70+
71+
# The metric sampling interval in milliseconds
72+
metric.sampling.interval.ms=120000
73+
74+
# The partition metrics window size in milliseconds
75+
partition.metrics.window.ms=300000
76+
77+
# The number of partition metric windows to keep in memory
78+
num.partition.metrics.windows=1
79+
80+
# The minimum partition metric samples required for a partition in each window
81+
min.samples.per.partition.metrics.window=1
82+
83+
# The broker metrics window size in milliseconds
84+
broker.metrics.window.ms=300000
85+
86+
# The number of broker metric windows to keep in memory
87+
num.broker.metrics.windows=20
88+
89+
# The minimum broker metric samples required for a partition in each window
90+
min.samples.per.broker.metrics.window=1
91+
92+
# The configuration for the BrokerCapacityConfigFileResolver (supports JBOD and non-JBOD broker capacities)
93+
capacity.config.file=config/capacity.json
94+
#capacity.config.file=config/capacityJBOD.json
95+
96+
# Configurations for the analyzer
97+
# =======================================
98+
99+
# The list of goals to optimize the Kafka cluster for with pre-computed proposals
100+
default.goals=com.linkedin.kafka.cruisecontrol.analyzer.goals.RackAwareGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.ReplicaCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.DiskCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkInboundCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkOutboundCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.CpuCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.ReplicaDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.PotentialNwOutGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.DiskUsageDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkInboundUsageDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkOutboundUsageDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.CpuUsageDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.TopicReplicaDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.LeaderBytesInDistributionGoal
101+
102+
# The list of supported goals
103+
goals=com.linkedin.kafka.cruisecontrol.analyzer.goals.RackAwareGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.ReplicaCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.DiskCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkInboundCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkOutboundCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.CpuCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.ReplicaDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.PotentialNwOutGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.DiskUsageDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkInboundUsageDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkOutboundUsageDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.CpuUsageDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.TopicReplicaDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.LeaderBytesInDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.kafkaassigner.KafkaAssignerDiskUsageDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.kafkaassigner.KafkaAssignerEvenRackAwareGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.PreferredLeaderElectionGoal
104+
105+
# The list of supported hard goals
106+
hard.goals=com.linkedin.kafka.cruisecontrol.analyzer.goals.RackAwareGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.ReplicaCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.DiskCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkInboundCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkOutboundCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.CpuCapacityGoal
107+
108+
# The minimum percentage of well monitored partitions out of all the partitions
109+
min.monitored.partition.percentage=0.95
110+
111+
# The balance threshold for CPU
112+
cpu.balance.threshold=1.1
113+
114+
# The balance threshold for disk
115+
disk.balance.threshold=1.1
116+
117+
# The balance threshold for network inbound utilization
118+
network.inbound.balance.threshold=1.1
119+
120+
# The balance threshold for network outbound utilization
121+
network.outbound.balance.threshold=1.1
122+
123+
# The balance threshold for the replica count
124+
replica.count.balance.threshold=1.1
125+
126+
# The capacity threshold for CPU in percentage
127+
cpu.capacity.threshold=0.8
128+
129+
# The capacity threshold for disk in percentage
130+
disk.capacity.threshold=0.8
131+
132+
# The capacity threshold for network inbound utilization in percentage
133+
network.inbound.capacity.threshold=0.8
134+
135+
# The capacity threshold for network outbound utilization in percentage
136+
network.outbound.capacity.threshold=0.8
137+
138+
# The threshold to define the cluster to be in a low CPU utilization state
139+
cpu.low.utilization.threshold=0.0
140+
141+
# The threshold to define the cluster to be in a low disk utilization state
142+
disk.low.utilization.threshold=0.0
143+
144+
# The threshold to define the cluster to be in a low network inbound utilization state
145+
network.inbound.low.utilization.threshold=0.0
146+
147+
# The threshold to define the cluster to be in a low network outbound utilization state
148+
network.outbound.low.utilization.threshold=0.0
149+
150+
# The metric anomaly percentile upper threshold
151+
metric.anomaly.percentile.upper.threshold=90.0
152+
153+
# The metric anomaly percentile lower threshold
154+
metric.anomaly.percentile.lower.threshold=10.0
155+
156+
# How often should the cached proposal be expired and recalculated if necessary
157+
proposal.expiration.ms=60000
158+
159+
# The maximum number of replicas that can reside on a broker at any given time.
160+
max.replicas.per.broker=10000
161+
162+
# The number of threads to use for proposal candidate precomputing.
163+
num.proposal.precompute.threads=1
164+
165+
# the topics that should be excluded from the partition movement.
166+
#topics.excluded.from.partition.movement
167+
168+
# Configurations for the executor
169+
# =======================================
170+
171+
# The zookeeper connect of the Kafka cluster
172+
zookeeper.connect=zookeeper:2181/
173+
174+
# The max number of partitions to move in/out on a given broker at a given time.
175+
num.concurrent.partition.movements.per.broker=10
176+
177+
# The interval between two execution progress checks.
178+
execution.progress.check.interval.ms=10000
179+
180+
181+
# Configurations for anomaly detector
182+
# =======================================
183+
184+
# The goal violation notifier class
185+
anomaly.notifier.class=com.linkedin.kafka.cruisecontrol.detector.notifier.SelfHealingNotifier
186+
187+
# The metric anomaly finder class
188+
metric.anomaly.finder.class=com.linkedin.kafka.cruisecontrol.detector.KafkaMetricAnomalyFinder
189+
190+
# The anomaly detection interval
191+
anomaly.detection.interval.ms=10000
192+
193+
# The goal violation to detect.
194+
anomaly.detection.goals=com.linkedin.kafka.cruisecontrol.analyzer.goals.RackAwareGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.ReplicaCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.DiskCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkInboundCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkOutboundCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.CpuCapacityGoal
195+
196+
# The interested metrics for metric anomaly analyzer.
197+
metric.anomaly.analyzer.metrics=BROKER_PRODUCE_LOCAL_TIME_MS_MAX,BROKER_PRODUCE_LOCAL_TIME_MS_MEAN,BROKER_CONSUMER_FETCH_LOCAL_TIME_MS_MAX,BROKER_CONSUMER_FETCH_LOCAL_TIME_MS_MEAN,BROKER_FOLLOWER_FETCH_LOCAL_TIME_MS_MAX,BROKER_FOLLOWER_FETCH_LOCAL_TIME_MS_MEAN,BROKER_LOG_FLUSH_TIME_MS_MAX,BROKER_LOG_FLUSH_TIME_MS_MEAN
198+
199+
## Adjust accordingly if your metrics reporter is an older version and does not produce these metrics.
200+
#metric.anomaly.analyzer.metrics=BROKER_PRODUCE_LOCAL_TIME_MS_50TH,BROKER_PRODUCE_LOCAL_TIME_MS_999TH,BROKER_CONSUMER_FETCH_LOCAL_TIME_MS_50TH,BROKER_CONSUMER_FETCH_LOCAL_TIME_MS_999TH,BROKER_FOLLOWER_FETCH_LOCAL_TIME_MS_50TH,BROKER_FOLLOWER_FETCH_LOCAL_TIME_MS_999TH,BROKER_LOG_FLUSH_TIME_MS_50TH,BROKER_LOG_FLUSH_TIME_MS_999TH
201+
202+
# The zk path to store failed broker information.
203+
failed.brokers.zk.path=/CruiseControlBrokerList
204+
205+
# Topic config provider class
206+
topic.config.provider.class=com.linkedin.kafka.cruisecontrol.config.KafkaTopicConfigProvider
207+
208+
# The cluster configurations for the KafkaTopicConfigProvider
209+
cluster.configs.file=config/clusterConfigs.json
210+
211+
# The maximum time in milliseconds to store the response and access details of a completed user task.
212+
completed.user.task.retention.time.ms=21600000
213+
214+
# The maximum time in milliseconds to retain the demotion history of brokers.
215+
demotion.history.retention.time.ms=86400000
216+
217+
# The maximum number of completed user tasks for which the response and access details will be cached.
218+
max.cached.completed.user.tasks=100
219+
220+
# The maximum number of user tasks for concurrently running in async endpoints across all users.
221+
max.active.user.tasks=5
222+
223+
# Enable self healing for all anomaly detectors, unless the particular anomaly detector is explicitly disabled
224+
self.healing.enabled=true
225+
226+
# Enable self healing for broker failure detector
227+
#self.healing.broker.failure.enabled=true
228+
229+
# Enable self healing for goal violation detector
230+
#self.healing.goal.violation.enabled=true
231+
232+
# Enable self healing for metric anomaly detector
233+
#self.healing.metric.anomaly.enabled=true
234+
235+
236+
# configurations for the webserver
237+
# ================================
238+
239+
# HTTP listen port
240+
webserver.http.port=9090
241+
242+
# HTTP listen address
243+
webserver.http.address=0.0.0.0
244+
245+
# Whether CORS support is enabled for API or not
246+
webserver.http.cors.enabled=false
247+
248+
# Value for Access-Control-Allow-Origin
249+
webserver.http.cors.origin=http://localhost:8080/
250+
251+
# Value for Access-Control-Request-Method
252+
webserver.http.cors.allowmethods=OPTIONS,GET,POST
253+
254+
# Headers that should be exposed to the Browser (Webapp)
255+
# This is a special header that is used by the
256+
# User Tasks subsystem and should be explicitly
257+
# Enabled when CORS mode is used as part of the
258+
# Admin Interface
259+
webserver.http.cors.exposeheaders=User-Task-ID
260+
261+
# REST API default prefix
262+
# (don't forget the ending *)
263+
webserver.api.urlprefix=/kafkacruisecontrol/*
264+
265+
# Location where the Cruise Control frontend is deployed
266+
webserver.ui.diskpath=./cruise-control-ui/dist/
267+
268+
# URL path prefix for UI
269+
# (don't forget the ending *)
270+
webserver.ui.urlprefix=/*
271+
272+
# Time After which request is converted to Async
273+
webserver.request.maxBlockTimeMs=10000
274+
275+
# Default Session Expiry Period
276+
webserver.session.maxExpiryTimeMs=60000
277+
278+
# Session cookie path
279+
webserver.session.path=/
280+
281+
# Server Access Logs
282+
webserver.accesslog.enabled=true
283+
284+
# Location of HTTP Request Logs
285+
webserver.accesslog.path=access.log
286+
287+
# HTTP Request Log retention days
288+
webserver.accesslog.retention.days=14
289+
290+
capacityJBOD.json: |-
291+
{
292+
"brokerCapacities":[
293+
{
294+
"brokerId": "-1",
295+
"capacity": {
296+
"DISK": {"/tmp/kafka-logs-1": "100000", "/tmp/kafka-logs-2": "100000", "/tmp/kafka-logs-3": "50000",
297+
"/tmp/kafka-logs-4": "50000", "/tmp/kafka-logs-5": "150000", "/tmp/kafka-logs-6": "50000"},
298+
"CPU": "100",
299+
"NW_IN": "10000",
300+
"NW_OUT": "10000"
301+
},
302+
"doc": "The default capacity for a broker with multiple logDirs each on a separate heterogeneous disk."
303+
},
304+
{
305+
"brokerId": "0",
306+
"capacity": {
307+
"DISK": {"/tmp/kafka-logs": "500000"},
308+
"CPU": "100",
309+
"NW_IN": "50000",
310+
"NW_OUT": "50000"
311+
},
312+
"doc": "This overrides the capacity for broker 0. This broker is not a JBOD broker."
313+
},
314+
{
315+
"brokerId": "1",
316+
"capacity": {
317+
"DISK": {"/tmp/kafka-logs-1": "250000", "/tmp/kafka-logs-2": "250000"},
318+
"CPU": "100",
319+
"NW_IN": "50000",
320+
"NW_OUT": "50000"
321+
},
322+
"doc": "This overrides the capacity for broker 1. This broker is a JBOD broker."
323+
}
324+
]
325+
}
326+
327+
capacity.json: |-
328+
{
329+
"brokerCapacities":[
330+
{
331+
"brokerId": "-1",
332+
"capacity": {
333+
"DISK": "100000",
334+
"CPU": "100",
335+
"NW_IN": "10000",
336+
"NW_OUT": "10000"
337+
},
338+
"doc": "This is the default capacity. Capacity unit used for disk is in MB, cpu is in percentage, network throughput is in KB."
339+
},
340+
{
341+
"brokerId": "0",
342+
"capacity": {
343+
"DISK": "500000",
344+
"CPU": "100",
345+
"NW_IN": "50000",
346+
"NW_OUT": "50000"
347+
},
348+
"doc": "This overrides the capacity for broker 0."
349+
}
350+
]
351+
}
352+
353+
clusterConfigs.json: |-
354+
{
355+
"min.insync.replicas": 2
356+
}
357+
358+
log4j2.xml: |-
359+
<?xml version="1.0" encoding="UTF-8"?>
360+
<Configuration status="INFO">
361+
<Appenders>
362+
<File name="Console" fileName="/dev/stdout">
363+
<PatternLayout pattern="%d{yyy-MM-dd HH:mm:ss.SSS} [%t] %-5level %logger{36} - %msg%n"/>
364+
</File>
365+
</Appenders>
366+
<Loggers>
367+
<Root level="info">
368+
<AppenderRef ref="Console" />
369+
</Root>
370+
</Loggers>
371+
</Configuration>
372+
373+
log4j.properties: |-
374+
log4j.rootLogger = INFO, FILE
375+
376+
log4j.appender.FILE=org.apache.log4j.FileAppender
377+
log4j.appender.FILE.File=/dev/stdout
378+
379+
log4j.appender.FILE.layout=org.apache.log4j.PatternLayout
380+
log4j.appender.FILE.layout.conversionPattern=%-6r [%15.15t] %-5p %30.30c %x - %m%n

0 commit comments

Comments
 (0)