1
1
from __future__ import absolute_import
2
2
3
3
from collections import defaultdict
4
+ from copy import copy
4
5
from itertools import izip_longest , repeat
6
+ from kazoo .client import KazooClient
5
7
import logging
6
- import time
7
- from threading import Lock
8
8
from multiprocessing import Process , Queue as MPQueue , Event , Value
9
+ import os
9
10
from Queue import Empty , Queue
11
+ import time
12
+ from threading import Lock
10
13
11
14
from kafka .common import (
12
15
ErrorMapping , FetchRequest ,
30
33
ITER_TIMEOUT_SECONDS = 60
31
34
NO_MESSAGES_WAIT_TIME_SECONDS = 0.1
32
35
36
+ CONSUMERS_DIR = "consumers"
37
+ OFFSETS_DIR = "offsets"
33
38
34
39
class FetchContext (object ):
35
40
"""
@@ -71,7 +76,8 @@ class Consumer(object):
71
76
"""
72
77
def __init__ (self , client , group , topic , partitions = None , auto_commit = True ,
73
78
auto_commit_every_n = AUTO_COMMIT_MSG_COUNT ,
74
- auto_commit_every_t = AUTO_COMMIT_INTERVAL ):
79
+ auto_commit_every_t = AUTO_COMMIT_INTERVAL ,
80
+ zk_hosts = None , zk_chroot = '/' ):
75
81
76
82
self .client = client
77
83
self .topic = topic
@@ -89,34 +95,80 @@ def __init__(self, client, group, topic, partitions=None, auto_commit=True,
89
95
self .auto_commit = auto_commit
90
96
self .auto_commit_every_n = auto_commit_every_n
91
97
self .auto_commit_every_t = auto_commit_every_t
98
+ self .partitions_to_commit = set ()
99
+
100
+ for partition in partitions :
101
+ self .offsets [partition ] = 0
102
+
103
+ # Zookeeper
104
+ self ._init_zk (zk_hosts , zk_chroot )
105
+
106
+ # Uncomment for 0.8.1
107
+ # def get_or_init_offset_callback(resp):
108
+ # if resp.error == ErrorMapping.NO_ERROR:
109
+ # return resp.offset
110
+ # elif resp.error == ErrorMapping.UNKNOWN_TOPIC_OR_PARTITON:
111
+ # return 0
112
+ # else:
113
+ # raise Exception("OffsetFetchRequest for topic=%s, "
114
+ # "partition=%d failed with errorcode=%s" % (
115
+ # resp.topic, resp.partition, resp.error))
116
+ #
117
+ # for partition in partitions:
118
+ # req = OffsetFetchRequest(topic, partition)
119
+ # (offset,) = self.client.send_offset_fetch_request(group, [req],
120
+ # callback=get_or_init_offset_callback,
121
+ # fail_on_error=False)
122
+ # self.offsets[partition] = offset
92
123
93
124
# Set up the auto-commit timer
94
125
if auto_commit is True and auto_commit_every_t is not None :
95
126
self .commit_timer = ReentrantTimer (auto_commit_every_t ,
96
127
self .commit )
97
128
self .commit_timer .start ()
98
129
99
- def get_or_init_offset_callback (resp ):
100
- if resp .error == ErrorMapping .NO_ERROR :
101
- return resp .offset
102
- elif resp .error == ErrorMapping .UNKNOWN_TOPIC_OR_PARTITON :
103
- return 0
104
- else :
105
- raise Exception ("OffsetFetchRequest for topic=%s, "
106
- "partition=%d failed with errorcode=%s" % (
107
- resp .topic , resp .partition , resp .error ))
130
def _get_offsets_dir(self):
    """Return the zookeeper node path under which this consumer
    group's per-partition offsets for the topic are kept."""
    components = (self.zk_chroot, CONSUMERS_DIR, self.group,
                  OFFSETS_DIR, self.topic)
    return os.path.join(*components)
133
+ def _init_zk (self , zk_hosts , zk_chroot ):
134
+ if zk_hosts :
135
+ self .zk = KazooClient ("," .join (zk_hosts ))
136
+ self .zk .start ()
137
+ self .zk_chroot = zk_chroot
138
+ self .offset_path = self ._get_offsets_dir ()
139
+ self .zk_offset_counters = {}
140
+ self ._fetch_offsets ()
141
+ else :
142
+ self .zk = None
143
+ self .zk_chroot = None
144
+ self .offset_path = None
145
+ self .zk_offset_counters = None
108
146
109
- # Uncomment for 0.8.1
110
- #
111
- #for partition in partitions:
112
- # req = OffsetFetchRequest(topic, partition)
113
- # (offset,) = self.client.send_offset_fetch_request(group, [req],
114
- # callback=get_or_init_offset_callback,
115
- # fail_on_error=False)
116
- # self.offsets[partition] = offset
147
+ def _get_partition_offsets (self , partitions , offset_time ):
148
+ reqs = []
149
+ for partition in partitions :
150
+ reqs .append (OffsetRequest (self .topic , partition , offset_time , 1 ))
151
+ return self .client .send_offset_request (reqs )
152
+
153
+ def _fetch_offsets (self , partitions = None ):
154
+ if self .zk is None :
155
+ raise Exception ("Cannot fetch offsets without zookeeper" )
156
+
157
+ if partitions is None :
158
+ partitions = self .offsets .keys ()
159
+ start_offsets = {}
160
+ for resp in self ._get_partition_offsets (partitions , - 2 ):
161
+ start_offsets [resp .partition ] = resp .offsets [0 ]
162
+
163
+ self .zk_offset_counters = {}
117
164
118
165
for partition in partitions :
119
- self .offsets [partition ] = 0
166
+ partition_offset_path = os .path .join (self .offset_path ,
167
+ str (partition ))
168
+ counter = self .zk .Counter (partition_offset_path ,
169
+ default = start_offsets [partition ])
170
+ self .zk_offset_counters [partition ] = counter
171
+ self .offsets [partition ] = counter .value
120
172
121
173
def commit(self, partitions=None):
    """
    Commit the in-memory offsets to the per-partition zookeeper
    counters so they survive consumer restarts.

    partitions: list of partitions to commit, default is to commit
                all of them

    Raises Exception when the consumer was built without zookeeper.
    """
    if self.zk is None:
        raise Exception("Cannot commit offsets without zookeeper")

    if partitions is None:
        partitions = self.offsets.keys()

    with self.commit_lock:
        for partition in partitions:
            offset = self.offsets[partition]
            counter = self.zk_offset_counters[partition]
            if counter.value == offset:
                # Nothing changed for this partition
                continue
            log.debug("Commit offset %d for consumer group=%s, topic=%s, "
                      "partition=%s", offset, self.group, self.topic,
                      partition)
            # The zookeeper counter only supports increments, so write
            # the delta between its stored value and our offset, then
            # sync so subsequent reads see the committed value.
            # NOTE(review): counter.value is re-read without any zk
            # lock, so the delta is race-prone with concurrent
            # writers — presumably one committer per partition; confirm.
            counter += offset - counter.value
            self.zk.sync(counter.path)

    # Do this instead for kafka 0.8.1
    # reqs = []
    # if not partitions:  # commit all partitions
    #     partitions = self.offsets.keys()
    #
    # for partition in partitions:
    #     offset = self.offsets[partition]
    #     log.debug("Commit offset %d in SimpleConsumer: "
    #               "group=%s, topic=%s, partition=%s" %
    #               (offset, self.group, self.topic, partition))
    #
    #     reqs.append(OffsetCommitRequest(self.topic, partition,
    #                                     offset, None))
    #
    # resps = self.client.send_offset_commit_request(self.group, reqs)
    # for resp in resps:
    #     assert resp.error == 0
158
216
159
217
def _auto_commit (self ):
160
218
"""
161
219
Check if we have to commit based on number of messages and commit
162
220
"""
163
221
164
222
# Check if we are supposed to do an auto-commit
165
- if not self .auto_commit or self .auto_commit_every_n is None :
223
+ if (self .zk is None or not self .auto_commit or
224
+ self .auto_commit_every_n is None ):
166
225
return
167
226
168
- if self .count_since_commit > self .auto_commit_every_n :
227
+ if self .count_since_commit >= self .auto_commit_every_n :
169
228
self .commit ()
229
+ self .count_since_commit = 0
170
230
171
231
def stop(self):
    """
    Shut the consumer down: cancel the auto-commit timer, flush any
    uncommitted offsets, and only then release the zookeeper client.

    Fix: the previous version called self.zk.stop() *before* the final
    self.commit(); commit() writes to zookeeper, so the shutdown
    commit ran against an already-stopped client and was lost.
    """
    if self.commit_timer is not None:
        # Stop the timer first so no auto-commit races with the final
        # explicit commit below.
        self.commit_timer.stop()
        self.commit()
    if self.zk:
        self.zk.stop()
@@ -234,7 +296,8 @@ def __init__(self, client, group, topic, auto_commit=True, partitions=None,
234
296
fetch_size_bytes = FETCH_MIN_BYTES ,
235
297
buffer_size = FETCH_BUFFER_SIZE_BYTES ,
236
298
max_buffer_size = MAX_FETCH_BUFFER_SIZE_BYTES ,
237
- iter_timeout = None ):
299
+ iter_timeout = None , zk_hosts = None ,
300
+ zk_chroot = '/' ):
238
301
239
302
if max_buffer_size is not None and buffer_size > max_buffer_size :
240
303
raise ValueError ("buffer_size (%d) is greater than "
@@ -250,11 +313,11 @@ def __init__(self, client, group, topic, auto_commit=True, partitions=None,
250
313
self .queue = Queue ()
251
314
252
315
super (SimpleConsumer , self ).__init__ (
253
- client , group , topic ,
254
- partitions = partitions ,
316
+ client , group , topic , partitions = partitions ,
255
317
auto_commit = auto_commit ,
256
318
auto_commit_every_n = auto_commit_every_n ,
257
- auto_commit_every_t = auto_commit_every_t )
319
+ auto_commit_every_t = auto_commit_every_t ,
320
+ zk_hosts = zk_hosts , zk_chroot = zk_chroot )
258
321
259
322
def provide_partition_info (self ):
260
323
"""
@@ -272,7 +335,7 @@ def seek(self, offset, whence):
272
335
1 is relative to the current offset
273
336
2 is relative to the latest known offset (tail)
274
337
"""
275
-
338
+ partitions = self . offsets . keys ()
276
339
if whence == 1 : # relative to current position
277
340
for partition , _offset in self .offsets .items ():
278
341
self .offsets [partition ] = _offset + offset
@@ -281,20 +344,15 @@ def seek(self, offset, whence):
281
344
# distribute the remained evenly
282
345
(delta , rem ) = divmod (offset , len (self .offsets ))
283
346
deltas = {}
284
- for partition , r in izip_longest (self . offsets . keys () ,
347
+ for partition , r in izip_longest (partitions ,
285
348
repeat (1 , rem ), fillvalue = 0 ):
286
349
deltas [partition ] = delta + r
287
350
288
- reqs = []
289
- for partition in self .offsets .keys ():
290
- if whence == 0 :
291
- reqs .append (OffsetRequest (self .topic , partition , - 2 , 1 ))
292
- elif whence == 2 :
293
- reqs .append (OffsetRequest (self .topic , partition , - 1 , 1 ))
294
- else :
295
- pass
351
+ if whence == 0 :
352
+ resps = self ._get_partition_offsets (partitions , - 2 )
353
+ else :
354
+ resps = self ._get_partition_offsets (partitions , - 1 )
296
355
297
- resps = self .client .send_offset_request (reqs )
298
356
for resp in resps :
299
357
self .offsets [resp .partition ] = \
300
358
resp .offsets [0 ] + deltas [resp .partition ]
0 commit comments