Line data Source code
1 : /*
2 : Unix SMB/CIFS implementation.
3 :
4 : trivial database library
5 :
6 : Copyright (C) Andrew Tridgell 1999-2005
7 : Copyright (C) Paul `Rusty' Russell 2000
8 : Copyright (C) Jeremy Allison 2000-2003
9 :
10 : ** NOTE! The following LGPL license applies to the tdb
11 : ** library. This does NOT imply that all of Samba is released
12 : ** under the LGPL
13 :
14 : This library is free software; you can redistribute it and/or
15 : modify it under the terms of the GNU Lesser General Public
16 : License as published by the Free Software Foundation; either
17 : version 3 of the License, or (at your option) any later version.
18 :
19 : This library is distributed in the hope that it will be useful,
20 : but WITHOUT ANY WARRANTY; without even the implied warranty of
21 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22 : Lesser General Public License for more details.
23 :
24 : You should have received a copy of the GNU Lesser General Public
25 : License along with this library; if not, see <http://www.gnu.org/licenses/>.
26 : */
27 :
28 : #include "tdb_private.h"
29 :
30 48218 : _PUBLIC_ void tdb_setalarm_sigptr(struct tdb_context *tdb, volatile sig_atomic_t *ptr)
31 : {
32 48218 : tdb->interrupt_sig_ptr = ptr;
33 48218 : }
34 :
35 413008954 : static int fcntl_lock(struct tdb_context *tdb,
36 : int rw, off_t off, off_t len, bool waitflag)
37 : {
38 10754951 : struct flock fl;
39 10754951 : int cmd;
40 :
41 : #ifdef USE_TDB_MUTEX_LOCKING
42 : {
43 10754951 : int ret;
44 413008954 : if (tdb_mutex_lock(tdb, rw, off, len, waitflag, &ret)) {
45 90292533 : return ret;
46 : }
47 : }
48 : #endif
49 :
50 322716421 : fl.l_type = rw;
51 322716421 : fl.l_whence = SEEK_SET;
52 322716421 : fl.l_start = off;
53 322716421 : fl.l_len = len;
54 322716421 : fl.l_pid = 0;
55 :
56 322716421 : cmd = waitflag ? F_SETLKW : F_SETLK;
57 :
58 322716421 : return fcntl(tdb->fd, cmd, &fl);
59 : }
60 :
61 303039490 : static int fcntl_unlock(struct tdb_context *tdb, int rw, off_t off, off_t len)
62 : {
63 8237872 : struct flock fl;
64 : #if 0 /* Check they matched up locks and unlocks correctly. */
65 : char line[80];
66 : FILE *locks;
67 : bool found = false;
68 :
69 : locks = fopen("/proc/locks", "r");
70 :
71 : while (fgets(line, 80, locks)) {
72 : char *p;
73 : int type, start, l;
74 :
75 : /* eg. 1: FLOCK ADVISORY WRITE 2440 08:01:2180826 0 EOF */
76 : p = strchr(line, ':') + 1;
77 : if (strncmp(p, " POSIX ADVISORY ", strlen(" POSIX ADVISORY ")))
78 : continue;
79 : p += strlen(" FLOCK ADVISORY ");
80 : if (strncmp(p, "READ ", strlen("READ ")) == 0)
81 : type = F_RDLCK;
82 : else if (strncmp(p, "WRITE ", strlen("WRITE ")) == 0)
83 : type = F_WRLCK;
84 : else
85 : abort();
86 : p += 6;
87 : if (atoi(p) != getpid())
88 : continue;
89 : p = strchr(strchr(p, ' ') + 1, ' ') + 1;
90 : start = atoi(p);
91 : p = strchr(p, ' ') + 1;
92 : if (strncmp(p, "EOF", 3) == 0)
93 : l = 0;
94 : else
95 : l = atoi(p) - start + 1;
96 :
97 : if (off == start) {
98 : if (len != l) {
99 : fprintf(stderr, "Len %u should be %u: %s",
100 : (int)len, l, line);
101 : abort();
102 : }
103 : if (type != rw) {
104 : fprintf(stderr, "Type %s wrong: %s",
105 : rw == F_RDLCK ? "READ" : "WRITE", line);
106 : abort();
107 : }
108 : found = true;
109 : break;
110 : }
111 : }
112 :
113 : if (!found) {
114 : fprintf(stderr, "Unlock on %u@%u not found!\n",
115 : (int)off, (int)len);
116 : abort();
117 : }
118 :
119 : fclose(locks);
120 : #endif
121 :
122 : #ifdef USE_TDB_MUTEX_LOCKING
123 : {
124 8237872 : int ret;
125 303039490 : if (tdb_mutex_unlock(tdb, rw, off, len, &ret)) {
126 90289877 : return ret;
127 : }
128 : }
129 : #endif
130 :
131 212749613 : fl.l_type = F_UNLCK;
132 212749613 : fl.l_whence = SEEK_SET;
133 212749613 : fl.l_start = off;
134 212749613 : fl.l_len = len;
135 212749613 : fl.l_pid = 0;
136 :
137 212749613 : return fcntl(tdb->fd, F_SETLKW, &fl);
138 : }
139 :
140 : /*
141 : * Calculate the lock offset for a list
142 : *
143 : * list -1 is the freelist, otherwise a hash chain.
144 : *
145 : * Note that we consistently (but without real reason) lock hash chains at an
146 : * offset that is 4 bytes below the real offset of the corresponding list head
147 : * in the db.
148 : *
149 : * This is the memory layout of the hashchain array:
150 : *
151 : * FREELIST_TOP + 0 = freelist
152 : * FREELIST_TOP + 4 = hashtable list 0
153 : * FREELIST_TOP + 8 = hashtable list 1
154 : * ...
155 : *
156 : * Otoh lock_offset computes:
157 : *
158 : * freelist = FREELIST_TOP - 4
159 : * list 0 = FREELIST_TOP + 0
160 : * list 1 = FREELIST_TOP + 4
161 : * ...
162 : *
163 : * Unfortunately we can't change this calculation in order to align the locking
164 : * offset with the memory layout, as that would make the locking incompatible
165 : * between different tdb versions.
166 : */
167 2052971310 : static tdb_off_t lock_offset(int list)
168 : {
169 2052971310 : return FREELIST_TOP + 4*list;
170 : }
171 :
172 : /* a byte range locking function - return 0 on success
173 : this functions locks/unlocks "len" byte at the specified offset.
174 :
175 : On error, errno is also set so that errors are passed back properly
176 : through tdb_open().
177 :
178 : note that a len of zero means lock to end of file
179 : */
180 436137227 : int tdb_brlock(struct tdb_context *tdb,
181 : int rw_type, tdb_off_t offset, size_t len,
182 : enum tdb_lock_flags flags)
183 : {
184 12595399 : int ret;
185 :
186 436137227 : if (tdb->flags & TDB_NOLOCK) {
187 21287820 : return 0;
188 : }
189 :
190 413008959 : if (flags & TDB_LOCK_MARK_ONLY) {
191 5 : return 0;
192 : }
193 :
194 413008954 : if ((rw_type == F_WRLCK) && (tdb->read_only || tdb->traverse_read)) {
195 0 : tdb->ecode = TDB_ERR_RDONLY;
196 0 : return -1;
197 : }
198 :
199 10754951 : do {
200 413008954 : ret = fcntl_lock(tdb, rw_type, offset, len,
201 402254003 : flags & TDB_LOCK_WAIT);
202 : /* Check for a sigalarm break. */
203 413008954 : if (ret == -1 && errno == EINTR &&
204 0 : tdb->interrupt_sig_ptr &&
205 0 : *tdb->interrupt_sig_ptr) {
206 0 : break;
207 : }
208 413008954 : } while (ret == -1 && errno == EINTR);
209 :
210 413008954 : if (ret == -1) {
211 63399 : tdb->ecode = TDB_ERR_LOCK;
212 : /* Generic lock error. errno set by fcntl.
213 : * EAGAIN is an expected return from non-blocking
214 : * locks. */
215 63399 : if (!(flags & TDB_LOCK_PROBE) && errno != EAGAIN) {
216 0 : TDB_LOG((tdb, TDB_DEBUG_TRACE,"tdb_brlock failed (fd=%d) at offset %u rw_type=%d flags=%d len=%zu\n",
217 : tdb->fd, offset, rw_type, flags, len));
218 : }
219 63399 : return -1;
220 : }
221 402191602 : return 0;
222 : }
223 :
224 326167756 : int tdb_brunlock(struct tdb_context *tdb,
225 : int rw_type, tdb_off_t offset, size_t len)
226 : {
227 10078320 : int ret;
228 :
229 326167756 : if (tdb->flags & TDB_NOLOCK) {
230 21287818 : return 0;
231 : }
232 :
233 8237872 : do {
234 303039490 : ret = fcntl_unlock(tdb, rw_type, offset, len);
235 303039481 : } while (ret == -1 && errno == EINTR);
236 :
237 303039481 : if (ret == -1) {
238 147 : TDB_LOG((tdb, TDB_DEBUG_TRACE,"tdb_brunlock failed (fd=%d) at offset %u rw_type=%u len=%zu\n",
239 : tdb->fd, offset, rw_type, len));
240 : }
241 294801609 : return ret;
242 : }
243 :
244 : /*
245 : * Do a tdb_brlock in a loop. Some OSes (such as solaris) have too
246 : * conservative deadlock detection and claim a deadlock when progress can be
247 : * made. For those OSes we may loop for a while.
248 : */
249 :
250 1148241 : static int tdb_brlock_retry(struct tdb_context *tdb,
251 : int rw_type, tdb_off_t offset, size_t len,
252 : enum tdb_lock_flags flags)
253 : {
254 1148241 : int count = 1000;
255 :
256 1148241 : while (count--) {
257 12017 : struct timeval tv;
258 12017 : int ret;
259 :
260 1148241 : ret = tdb_brlock(tdb, rw_type, offset, len, flags);
261 1148241 : if (ret == 0) {
262 1148239 : return 0;
263 : }
264 2 : if (errno != EDEADLK) {
265 2 : break;
266 : }
267 : /* sleep for as short a time as we can - more portable than usleep() */
268 0 : tv.tv_sec = 0;
269 0 : tv.tv_usec = 1;
270 0 : select(0, NULL, NULL, NULL, &tv);
271 : }
272 2 : return -1;
273 : }
274 :
275 : /*
276 : upgrade a read lock to a write lock.
277 : */
278 1148241 : int tdb_allrecord_upgrade(struct tdb_context *tdb)
279 : {
280 12017 : int ret;
281 :
282 1148241 : if (tdb->allrecord_lock.count != 1) {
283 0 : TDB_LOG((tdb, TDB_DEBUG_ERROR,
284 : "tdb_allrecord_upgrade failed: count %u too high\n",
285 : tdb->allrecord_lock.count));
286 0 : tdb->ecode = TDB_ERR_LOCK;
287 0 : return -1;
288 : }
289 :
290 1148241 : if (tdb->allrecord_lock.off != 1) {
291 0 : TDB_LOG((tdb, TDB_DEBUG_ERROR,
292 : "tdb_allrecord_upgrade failed: already upgraded?\n"));
293 0 : tdb->ecode = TDB_ERR_LOCK;
294 0 : return -1;
295 : }
296 :
297 1148241 : if (tdb_have_mutexes(tdb)) {
298 2 : ret = tdb_mutex_allrecord_upgrade(tdb);
299 2 : if (ret == -1) {
300 0 : goto fail;
301 : }
302 2 : ret = tdb_brlock_retry(tdb, F_WRLCK, lock_offset(tdb->hash_size),
303 : 0, TDB_LOCK_WAIT|TDB_LOCK_PROBE);
304 2 : if (ret == -1) {
305 0 : tdb_mutex_allrecord_downgrade(tdb);
306 : }
307 : } else {
308 1148239 : ret = tdb_brlock_retry(tdb, F_WRLCK, FREELIST_TOP, 0,
309 : TDB_LOCK_WAIT|TDB_LOCK_PROBE);
310 : }
311 :
312 1148241 : if (ret == 0) {
313 1148239 : tdb->allrecord_lock.ltype = F_WRLCK;
314 1148239 : tdb->allrecord_lock.off = 0;
315 1148239 : return 0;
316 : }
317 2 : fail:
318 2 : TDB_LOG((tdb, TDB_DEBUG_TRACE,"tdb_allrecord_upgrade failed\n"));
319 2 : return -1;
320 : }
321 :
322 581359267 : static struct tdb_lock_type *find_nestlock(struct tdb_context *tdb,
323 : tdb_off_t offset)
324 : {
325 : int i;
326 :
327 892398556 : for (i=0; i<tdb->num_lockrecs; i++) {
328 589469738 : if (tdb->lockrecs[i].off == offset) {
329 286705227 : return &tdb->lockrecs[i];
330 : }
331 : }
332 294654040 : return NULL;
333 : }
334 :
335 : /* lock an offset in the database. */
336 513873092 : int tdb_nest_lock(struct tdb_context *tdb, uint32_t offset, int ltype,
337 : enum tdb_lock_flags flags)
338 : {
339 25107127 : struct tdb_lock_type *new_lck;
340 :
341 513873092 : if (offset >= lock_offset(tdb->hash_size)) {
342 0 : tdb->ecode = TDB_ERR_LOCK;
343 0 : TDB_LOG((tdb, TDB_DEBUG_ERROR,"tdb_lock: invalid offset %u for ltype=%d\n",
344 : offset, ltype));
345 0 : return -1;
346 : }
347 513873092 : if (tdb->flags & TDB_NOLOCK)
348 253525408 : return 0;
349 :
350 241662718 : new_lck = find_nestlock(tdb, offset);
351 241662718 : if (new_lck) {
352 52153414 : if ((new_lck->ltype == F_RDLCK) && (ltype == F_WRLCK)) {
353 1 : if (!tdb_have_mutexes(tdb)) {
354 0 : int ret;
355 : /*
356 : * Upgrade the underlying fcntl
357 : * lock. Mutexes don't do readlocks,
358 : * so this only applies to fcntl
359 : * locking.
360 : */
361 1 : ret = tdb_brlock(tdb, ltype, offset, 1, flags);
362 1 : if (ret != 0) {
363 1 : return ret;
364 : }
365 : }
366 0 : new_lck->ltype = F_WRLCK;
367 : }
368 : /*
369 : * Just increment the in-memory struct, posix locks
370 : * don't stack.
371 : */
372 52153413 : new_lck->count++;
373 52153413 : return 0;
374 : }
375 :
376 189509304 : if (tdb->num_lockrecs == tdb->lockrecs_array_length) {
377 2677808 : new_lck = (struct tdb_lock_type *)realloc(
378 2677808 : tdb->lockrecs,
379 2677808 : sizeof(*tdb->lockrecs) * (tdb->num_lockrecs+1));
380 2677808 : if (new_lck == NULL) {
381 0 : errno = ENOMEM;
382 0 : return -1;
383 : }
384 2677808 : tdb->lockrecs_array_length = tdb->num_lockrecs+1;
385 2677808 : tdb->lockrecs = new_lck;
386 : }
387 :
388 : /* Since fcntl locks don't nest, we do a lock for the first one,
389 : and simply bump the count for future ones */
390 189509304 : if (tdb_brlock(tdb, ltype, offset, 1, flags)) {
391 54420 : return -1;
392 : }
393 :
394 189453894 : new_lck = &tdb->lockrecs[tdb->num_lockrecs];
395 :
396 189453894 : new_lck->off = offset;
397 189453894 : new_lck->count = 1;
398 189453894 : new_lck->ltype = ltype;
399 189453894 : tdb->num_lockrecs++;
400 :
401 189453894 : return 0;
402 : }
403 :
404 64 : static int tdb_lock_and_recover(struct tdb_context *tdb)
405 : {
406 0 : int ret;
407 :
408 : /* We need to match locking order in transaction commit. */
409 64 : if (tdb_brlock(tdb, F_WRLCK, FREELIST_TOP, 0, TDB_LOCK_WAIT)) {
410 0 : return -1;
411 : }
412 :
413 64 : if (tdb_brlock(tdb, F_WRLCK, OPEN_LOCK, 1, TDB_LOCK_WAIT)) {
414 0 : tdb_brunlock(tdb, F_WRLCK, FREELIST_TOP, 0);
415 0 : return -1;
416 : }
417 :
418 64 : ret = tdb_transaction_recover(tdb);
419 :
420 64 : tdb_brunlock(tdb, F_WRLCK, OPEN_LOCK, 1);
421 64 : tdb_brunlock(tdb, F_WRLCK, FREELIST_TOP, 0);
422 :
423 64 : return ret;
424 : }
425 :
426 476436484 : static bool have_data_locks(const struct tdb_context *tdb)
427 : {
428 : int i;
429 :
430 607086065 : for (i = 0; i < tdb->num_lockrecs; i++) {
431 194766323 : if (tdb->lockrecs[i].off >= lock_offset(-1))
432 87754300 : return true;
433 : }
434 388682184 : return false;
435 : }
436 :
437 : /*
438 : * A allrecord lock allows us to avoid per chain locks. Check if the allrecord
439 : * lock is strong enough.
440 : */
441 2526756584 : static int tdb_lock_covered_by_allrecord_lock(struct tdb_context *tdb,
442 : int ltype)
443 : {
444 2526756584 : if (ltype == F_RDLCK) {
445 : /*
446 : * The allrecord_lock is equal (F_RDLCK) or stronger
447 : * (F_WRLCK). Pass.
448 : */
449 1988609386 : return 0;
450 : }
451 :
452 518440950 : if (tdb->allrecord_lock.ltype == F_RDLCK) {
453 : /*
454 : * We ask for ltype==F_WRLCK, but the allrecord_lock
455 : * is too weak. We can't upgrade here, so fail.
456 : */
457 0 : tdb->ecode = TDB_ERR_LOCK;
458 0 : return -1;
459 : }
460 :
461 : /*
462 : * Asking for F_WRLCK, allrecord is F_WRLCK as well. Pass.
463 : */
464 505003032 : return 0;
465 : }
466 :
467 1764689804 : static int tdb_lock_list(struct tdb_context *tdb, int list, int ltype,
468 : enum tdb_lock_flags waitflag)
469 : {
470 41447111 : int ret;
471 1764689804 : bool check = false;
472 :
473 1764689804 : if (tdb->allrecord_lock.count) {
474 1270097251 : return tdb_lock_covered_by_allrecord_lock(tdb, ltype);
475 : }
476 :
477 : /*
478 : * Check for recoveries: Someone might have kill -9'ed a process
479 : * during a commit.
480 : */
481 501311512 : check = !have_data_locks(tdb);
482 501311512 : ret = tdb_nest_lock(tdb, lock_offset(list), ltype, waitflag);
483 :
484 501311512 : if (ret == 0 && check && tdb_needs_recovery(tdb)) {
485 16 : tdb_nest_unlock(tdb, lock_offset(list), ltype, false);
486 :
487 16 : if (tdb_lock_and_recover(tdb) == -1) {
488 0 : return -1;
489 : }
490 16 : return tdb_lock_list(tdb, list, ltype, waitflag);
491 : }
492 476436468 : return ret;
493 : }
494 :
495 : /* lock a list in the database. list -1 is the alloc list */
496 1763205146 : int tdb_lock(struct tdb_context *tdb, int list, int ltype)
497 : {
498 41437483 : int ret;
499 :
500 1763205146 : ret = tdb_lock_list(tdb, list, ltype, TDB_LOCK_WAIT);
501 1763205146 : if (ret) {
502 0 : TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_lock failed on list %d "
503 : "ltype=%d (%s)\n", list, ltype, strerror(errno)));
504 : }
505 1763205146 : return ret;
506 : }
507 :
508 : /* lock a list in the database. list -1 is the alloc list. non-blocking lock */
509 : _PUBLIC_ int tdb_lock_nonblock(struct tdb_context *tdb, int list, int ltype);
510 1484642 : _PUBLIC_ int tdb_lock_nonblock(struct tdb_context *tdb, int list, int ltype)
511 : {
512 1484642 : return tdb_lock_list(tdb, list, ltype, TDB_LOCK_NOWAIT);
513 : }
514 :
515 :
516 514900403 : int tdb_nest_unlock(struct tdb_context *tdb, uint32_t offset, int ltype,
517 : bool mark_lock)
518 : {
519 514900403 : int ret = -1;
520 25215484 : struct tdb_lock_type *lck;
521 :
522 514900403 : if (tdb->flags & TDB_NOLOCK)
523 259480418 : return 0;
524 :
525 : /* Sanity checks */
526 236583643 : if (offset >= lock_offset(tdb->hash_size)) {
527 0 : TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_unlock: offset %u invalid (%d)\n", offset, tdb->hash_size));
528 0 : return ret;
529 : }
530 :
531 236583643 : lck = find_nestlock(tdb, offset);
532 236583643 : if ((lck == NULL) || (lck->count == 0)) {
533 0 : TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_unlock: count is 0\n"));
534 0 : return -1;
535 : }
536 :
537 236583643 : if (lck->count > 1) {
538 52153413 : lck->count--;
539 52153413 : return 0;
540 : }
541 :
542 : /*
543 : * This lock has count==1 left, so we need to unlock it in the
544 : * kernel. We don't bother with decrementing the in-memory array
545 : * element, we're about to overwrite it with the last array element
546 : * anyway.
547 : */
548 :
549 184430230 : if (mark_lock) {
550 2 : ret = 0;
551 : } else {
552 184430228 : ret = tdb_brunlock(tdb, ltype, offset, 1);
553 : }
554 :
555 : /*
556 : * Shrink the array by overwriting the element just unlocked with the
557 : * last array element.
558 : */
559 184430230 : *lck = tdb->lockrecs[--tdb->num_lockrecs];
560 :
561 : /*
562 : * We don't bother with realloc when the array shrinks, but if we have
563 : * a completely idle tdb we should get rid of the locked array.
564 : */
565 :
566 184430230 : if (ret)
567 0 : TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_unlock: An error occurred unlocking!\n"));
568 178728008 : return ret;
569 : }
570 :
571 : _PUBLIC_ int tdb_unlock(struct tdb_context *tdb, int list, int ltype);
572 1764687126 : _PUBLIC_ int tdb_unlock(struct tdb_context *tdb, int list, int ltype)
573 : {
574 : /* a global lock allows us to avoid per chain locks */
575 1764687126 : if (tdb->allrecord_lock.count) {
576 1270097251 : return tdb_lock_covered_by_allrecord_lock(tdb, ltype);
577 : }
578 :
579 501308834 : return tdb_nest_unlock(tdb, lock_offset(list), ltype, false);
580 : }
581 :
582 : /*
583 : get the transaction lock
584 : */
585 9180154 : int tdb_transaction_lock(struct tdb_context *tdb, int ltype,
586 : enum tdb_lock_flags lockflags)
587 : {
588 9180154 : return tdb_nest_lock(tdb, TRANSACTION_LOCK, ltype, lockflags);
589 : }
590 :
591 : /*
592 : release the transaction lock
593 : */
594 5816004 : int tdb_transaction_unlock(struct tdb_context *tdb, int ltype)
595 : {
596 5816004 : return tdb_nest_unlock(tdb, TRANSACTION_LOCK, ltype, false);
597 : }
598 :
599 : /* Returns 0 if all done, -1 if error, 1 if ok. */
600 108238976 : static int tdb_allrecord_check(struct tdb_context *tdb, int ltype,
601 : enum tdb_lock_flags flags, bool upgradable)
602 : {
603 : /* There are no locks on read-only dbs */
604 108238976 : if (tdb->read_only || tdb->traverse_read) {
605 0 : tdb->ecode = TDB_ERR_LOCK;
606 0 : return -1;
607 : }
608 :
609 108238976 : if (tdb->allrecord_lock.count &&
610 116 : tdb->allrecord_lock.ltype == (uint32_t)ltype) {
611 116 : tdb->allrecord_lock.count++;
612 116 : return 0;
613 : }
614 :
615 108238860 : if (tdb->allrecord_lock.count) {
616 : /* a global lock of a different type exists */
617 0 : tdb->ecode = TDB_ERR_LOCK;
618 0 : return -1;
619 : }
620 :
621 108238860 : if (tdb_have_extra_locks(tdb)) {
622 : /* can't combine global and chain locks */
623 58 : tdb->ecode = TDB_ERR_LOCK;
624 58 : return -1;
625 : }
626 :
627 108238802 : if (upgradable && ltype != F_RDLCK) {
628 : /* tdb error: you can't upgrade a write lock! */
629 0 : tdb->ecode = TDB_ERR_LOCK;
630 0 : return -1;
631 : }
632 105746947 : return 1;
633 : }
634 :
635 : /* We only need to lock individual bytes, but Linux merges consecutive locks
636 : * so we lock in contiguous ranges. */
637 108253409 : static int tdb_chainlock_gradual(struct tdb_context *tdb,
638 : int ltype, enum tdb_lock_flags flags,
639 : size_t off, size_t len)
640 : {
641 2491871 : int ret;
642 108253409 : enum tdb_lock_flags nb_flags = (flags & ~TDB_LOCK_WAIT);
643 :
644 108253409 : if (len <= 4) {
645 : /* Single record. Just do blocking lock. */
646 7441 : return tdb_brlock(tdb, ltype, off, len, flags);
647 : }
648 :
649 : /* First we try non-blocking. */
650 108245968 : ret = tdb_brlock(tdb, ltype, off, len, nb_flags);
651 108245968 : if (ret == 0) {
652 105746798 : return 0;
653 : }
654 :
655 : /* Try locking first half, then second. */
656 7311 : ret = tdb_chainlock_gradual(tdb, ltype, flags, off, len / 2);
657 7311 : if (ret == -1)
658 2 : return -1;
659 :
660 7309 : ret = tdb_chainlock_gradual(tdb, ltype, flags,
661 7301 : off + len / 2, len - len / 2);
662 7309 : if (ret == -1) {
663 0 : tdb_brunlock(tdb, ltype, off, len / 2);
664 0 : return -1;
665 : }
666 7301 : return 0;
667 : }
668 :
669 : /* lock/unlock entire database. It can only be upgradable if you have some
670 : * other way of guaranteeing exclusivity (ie. transaction write lock).
671 : * We do the locking gradually to avoid being starved by smaller locks. */
672 108238976 : int tdb_allrecord_lock(struct tdb_context *tdb, int ltype,
673 : enum tdb_lock_flags flags, bool upgradable)
674 : {
675 2491862 : int ret;
676 :
677 108238976 : switch (tdb_allrecord_check(tdb, ltype, flags, upgradable)) {
678 58 : case -1:
679 58 : return -1;
680 116 : case 0:
681 116 : return 0;
682 : }
683 :
684 : /* We cover two kinds of locks:
685 : * 1) Normal chain locks. Taken for almost all operations.
686 : * 2) Individual records locks. Taken after normal or free
687 : * chain locks.
688 : *
689 : * It is (1) which cause the starvation problem, so we're only
690 : * gradual for that. */
691 :
692 108238802 : if (tdb_have_mutexes(tdb)) {
693 13 : ret = tdb_mutex_allrecord_lock(tdb, ltype, flags);
694 : } else {
695 108238789 : ret = tdb_chainlock_gradual(tdb, ltype, flags, FREELIST_TOP,
696 108238789 : tdb->hash_size * 4);
697 : }
698 :
699 108238802 : if (ret == -1) {
700 3 : return -1;
701 : }
702 :
703 : /* Grab individual record locks. */
704 108238799 : if (tdb_brlock(tdb, ltype, lock_offset(tdb->hash_size), 0,
705 : flags) == -1) {
706 0 : if (tdb_have_mutexes(tdb)) {
707 0 : tdb_mutex_allrecord_unlock(tdb);
708 : } else {
709 0 : tdb_brunlock(tdb, ltype, FREELIST_TOP,
710 0 : tdb->hash_size * 4);
711 : }
712 0 : return -1;
713 : }
714 :
715 108238799 : tdb->allrecord_lock.count = 1;
716 : /* If it's upgradable, it's actually exclusive so we can treat
717 : * it as a write lock. */
718 108238799 : tdb->allrecord_lock.ltype = upgradable ? F_WRLCK : ltype;
719 108238799 : tdb->allrecord_lock.off = upgradable;
720 :
721 108238799 : if (tdb_needs_recovery(tdb)) {
722 48 : bool mark = flags & TDB_LOCK_MARK_ONLY;
723 48 : tdb_allrecord_unlock(tdb, ltype, mark);
724 48 : if (mark) {
725 0 : tdb->ecode = TDB_ERR_LOCK;
726 0 : TDB_LOG((tdb, TDB_DEBUG_ERROR,
727 : "tdb_lockall_mark cannot do recovery\n"));
728 0 : return -1;
729 : }
730 48 : if (tdb_lock_and_recover(tdb) == -1) {
731 40 : return -1;
732 : }
733 8 : return tdb_allrecord_lock(tdb, ltype, flags, upgradable);
734 : }
735 :
736 105746896 : return 0;
737 : }
738 :
739 :
740 :
741 : /* unlock entire db */
742 108238463 : int tdb_allrecord_unlock(struct tdb_context *tdb, int ltype, bool mark_lock)
743 : {
744 : /* There are no locks on read-only dbs */
745 108238463 : if (tdb->read_only || tdb->traverse_read) {
746 0 : tdb->ecode = TDB_ERR_LOCK;
747 0 : return -1;
748 : }
749 :
750 108238463 : if (tdb->allrecord_lock.count == 0) {
751 0 : tdb->ecode = TDB_ERR_LOCK;
752 0 : return -1;
753 : }
754 :
755 : /* Upgradable locks are marked as write locks. */
756 108238463 : if (tdb->allrecord_lock.ltype != (uint32_t)ltype
757 8 : && (!tdb->allrecord_lock.off || ltype != F_RDLCK)) {
758 0 : tdb->ecode = TDB_ERR_LOCK;
759 0 : return -1;
760 : }
761 :
762 108238463 : if (tdb->allrecord_lock.count > 1) {
763 116 : tdb->allrecord_lock.count--;
764 116 : return 0;
765 : }
766 :
767 108238347 : if (!mark_lock) {
768 2491852 : int ret;
769 :
770 108238345 : if (tdb_have_mutexes(tdb)) {
771 7 : ret = tdb_mutex_allrecord_unlock(tdb);
772 7 : if (ret == 0) {
773 7 : ret = tdb_brunlock(tdb, ltype,
774 7 : lock_offset(tdb->hash_size),
775 : 0);
776 : }
777 : } else {
778 108238338 : ret = tdb_brunlock(tdb, ltype, FREELIST_TOP, 0);
779 : }
780 :
781 108238342 : if (ret != 0) {
782 45 : TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_unlockall failed "
783 : "(%s)\n", strerror(errno)));
784 45 : return -1;
785 : }
786 : }
787 :
788 108238299 : tdb->allrecord_lock.count = 0;
789 108238299 : tdb->allrecord_lock.ltype = 0;
790 :
791 108238299 : return 0;
792 : }
793 :
794 : /* lock entire database with write lock */
795 744 : _PUBLIC_ int tdb_lockall(struct tdb_context *tdb)
796 : {
797 18 : tdb_trace(tdb, "tdb_lockall");
798 744 : return tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_WAIT, false);
799 : }
800 :
801 : /* lock entire database with write lock - mark only */
802 2 : _PUBLIC_ int tdb_lockall_mark(struct tdb_context *tdb)
803 : {
804 0 : tdb_trace(tdb, "tdb_lockall_mark");
805 2 : return tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_MARK_ONLY, false);
806 : }
807 :
808 : /* unlock entire database with write lock - unmark only */
809 2 : _PUBLIC_ int tdb_lockall_unmark(struct tdb_context *tdb)
810 : {
811 0 : tdb_trace(tdb, "tdb_lockall_unmark");
812 2 : return tdb_allrecord_unlock(tdb, F_WRLCK, true);
813 : }
814 :
815 : /* lock entire database with write lock - nonblocking variant */
816 2 : _PUBLIC_ int tdb_lockall_nonblock(struct tdb_context *tdb)
817 : {
818 2 : int ret = tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_NOWAIT, false);
819 0 : tdb_trace_ret(tdb, "tdb_lockall_nonblock", ret);
820 2 : return ret;
821 : }
822 :
823 : /* unlock entire database with write lock */
824 668 : _PUBLIC_ int tdb_unlockall(struct tdb_context *tdb)
825 : {
826 16 : tdb_trace(tdb, "tdb_unlockall");
827 668 : return tdb_allrecord_unlock(tdb, F_WRLCK, false);
828 : }
829 :
830 : /* lock entire database with read lock */
831 104874071 : _PUBLIC_ int tdb_lockall_read(struct tdb_context *tdb)
832 : {
833 2474029 : tdb_trace(tdb, "tdb_lockall_read");
834 104874071 : return tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false);
835 : }
836 :
837 : /* lock entire database with read lock - nonblock variant */
838 0 : _PUBLIC_ int tdb_lockall_read_nonblock(struct tdb_context *tdb)
839 : {
840 0 : int ret = tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_NOWAIT, false);
841 0 : tdb_trace_ret(tdb, "tdb_lockall_read_nonblock", ret);
842 0 : return ret;
843 : }
844 :
845 : /* unlock entire database with read lock */
846 104873601 : _PUBLIC_ int tdb_unlockall_read(struct tdb_context *tdb)
847 : {
848 2474029 : tdb_trace(tdb, "tdb_unlockall_read");
849 104873601 : return tdb_allrecord_unlock(tdb, F_RDLCK, false);
850 : }
851 :
852 : /* lock/unlock one hash chain. This is meant to be used to reduce
853 : contention - it cannot guarantee how many records will be locked */
854 10150674 : _PUBLIC_ int tdb_chainlock(struct tdb_context *tdb, TDB_DATA key)
855 : {
856 10150674 : int ret = tdb_lock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK);
857 156326 : tdb_trace_1rec(tdb, "tdb_chainlock", key);
858 10150674 : return ret;
859 : }
860 :
861 : /* lock/unlock one hash chain, non-blocking. This is meant to be used
862 : to reduce contention - it cannot guarantee how many records will be
863 : locked */
864 9 : _PUBLIC_ int tdb_chainlock_nonblock(struct tdb_context *tdb, TDB_DATA key)
865 : {
866 9 : int ret = tdb_lock_nonblock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK);
867 0 : tdb_trace_1rec_ret(tdb, "tdb_chainlock_nonblock", key, ret);
868 9 : return ret;
869 : }
870 :
871 : /* mark a chain as locked without actually locking it. Warning! use with great caution! */
872 2 : _PUBLIC_ int tdb_chainlock_mark(struct tdb_context *tdb, TDB_DATA key)
873 : {
874 2 : int ret = tdb_nest_lock(tdb, lock_offset(BUCKET(tdb->hash_fn(&key))),
875 : F_WRLCK, TDB_LOCK_MARK_ONLY);
876 0 : tdb_trace_1rec(tdb, "tdb_chainlock_mark", key);
877 2 : return ret;
878 : }
879 :
880 : /* unmark a chain as locked without actually locking it. Warning! use with great caution! */
881 2 : _PUBLIC_ int tdb_chainlock_unmark(struct tdb_context *tdb, TDB_DATA key)
882 : {
883 0 : tdb_trace_1rec(tdb, "tdb_chainlock_unmark", key);
884 2 : return tdb_nest_unlock(tdb, lock_offset(BUCKET(tdb->hash_fn(&key))),
885 : F_WRLCK, true);
886 : }
887 :
888 10150672 : _PUBLIC_ int tdb_chainunlock(struct tdb_context *tdb, TDB_DATA key)
889 : {
890 156325 : tdb_trace_1rec(tdb, "tdb_chainunlock", key);
891 10150672 : return tdb_unlock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK);
892 : }
893 :
894 250 : _PUBLIC_ int tdb_chainlock_read(struct tdb_context *tdb, TDB_DATA key)
895 : {
896 0 : int ret;
897 250 : ret = tdb_lock(tdb, BUCKET(tdb->hash_fn(&key)), F_RDLCK);
898 0 : tdb_trace_1rec(tdb, "tdb_chainlock_read", key);
899 250 : return ret;
900 : }
901 :
902 248 : _PUBLIC_ int tdb_chainunlock_read(struct tdb_context *tdb, TDB_DATA key)
903 : {
904 0 : tdb_trace_1rec(tdb, "tdb_chainunlock_read", key);
905 248 : return tdb_unlock(tdb, BUCKET(tdb->hash_fn(&key)), F_RDLCK);
906 : }
907 :
908 0 : _PUBLIC_ int tdb_chainlock_read_nonblock(struct tdb_context *tdb, TDB_DATA key)
909 : {
910 0 : int ret = tdb_lock_nonblock(tdb, BUCKET(tdb->hash_fn(&key)), F_RDLCK);
911 0 : tdb_trace_1rec_ret(tdb, "tdb_chainlock_read_nonblock", key, ret);
912 0 : return ret;
913 : }
914 :
915 : /* record lock stops delete underneath */
916 568611486 : int tdb_lock_record(struct tdb_context *tdb, tdb_off_t off)
917 : {
918 568611486 : if (tdb->allrecord_lock.count) {
919 537527119 : return 0;
920 : }
921 27499878 : return off ? tdb_brlock(tdb, F_RDLCK, off, 1, TDB_LOCK_WAIT) : 0;
922 : }
923 :
924 : /*
925 : Write locks override our own fcntl readlocks, so check it here.
926 : Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
927 : an error to fail to get the lock here.
928 : */
929 3006949 : int tdb_write_lock_record(struct tdb_context *tdb, tdb_off_t off)
930 : {
931 208613 : struct tdb_traverse_lock *i;
932 3006949 : if (tdb == NULL) {
933 0 : return -1;
934 : }
935 6089794 : for (i = &tdb->travlocks; i; i = i->next)
936 3170217 : if (i->off == off)
937 52360 : return -1;
938 2919577 : if (tdb->allrecord_lock.count) {
939 1812133 : if (tdb->allrecord_lock.ltype == F_WRLCK) {
940 1641991 : return 0;
941 : }
942 0 : return -1;
943 : }
944 1107444 : return tdb_brlock(tdb, F_WRLCK, off, 1, TDB_LOCK_NOWAIT|TDB_LOCK_PROBE);
945 : }
946 :
947 2918903 : int tdb_write_unlock_record(struct tdb_context *tdb, tdb_off_t off)
948 : {
949 2918903 : if (tdb->allrecord_lock.count) {
950 1641991 : return 0;
951 : }
952 1106770 : return tdb_brunlock(tdb, F_WRLCK, off, 1);
953 : }
954 :
955 : /* fcntl locks don't stack: avoid unlocking someone else's */
956 568611516 : int tdb_unlock_record(struct tdb_context *tdb, tdb_off_t off)
957 : {
958 5426800 : struct tdb_traverse_lock *i;
959 568611516 : uint32_t count = 0;
960 :
961 568611516 : if (tdb->allrecord_lock.count) {
962 537527115 : return 0;
963 : }
964 :
965 27499912 : if (off == 0)
966 12 : return 0;
967 82499438 : for (i = &tdb->travlocks; i; i = i->next)
968 54999560 : if (i->off == off)
969 27499880 : count++;
970 27499878 : return (count == 1 ? tdb_brunlock(tdb, F_RDLCK, off, 1) : 0);
971 : }
972 :
973 113973100 : bool tdb_have_extra_locks(struct tdb_context *tdb)
974 : {
975 113973100 : unsigned int extra = tdb->num_lockrecs;
976 :
977 : /* A transaction holds the lock for all records. */
978 113973100 : if (!tdb->transaction && tdb->allrecord_lock.count) {
979 0 : return true;
980 : }
981 :
982 : /* We always hold the active lock if CLEAR_IF_FIRST. */
983 113973100 : if (find_nestlock(tdb, ACTIVE_LOCK)) {
984 553586 : extra--;
985 : }
986 :
987 : /* In a transaction, we expect to hold the transaction lock */
988 114002932 : if (tdb->transaction && find_nestlock(tdb, TRANSACTION_LOCK)) {
989 4512392 : extra--;
990 : }
991 :
992 113973100 : return extra;
993 : }
994 :
995 : /* The transaction code uses this to remove all locks. */
996 3364147 : void tdb_release_transaction_locks(struct tdb_context *tdb)
997 : {
998 17814 : int i;
999 3364147 : unsigned int active = 0;
1000 :
1001 3364147 : if (tdb->allrecord_lock.count != 0) {
1002 3364141 : tdb_allrecord_unlock(tdb, tdb->allrecord_lock.ltype, false);
1003 3364138 : tdb->allrecord_lock.count = 0;
1004 : }
1005 :
1006 7881506 : for (i=0;i<tdb->num_lockrecs;i++) {
1007 4517368 : struct tdb_lock_type *lck = &tdb->lockrecs[i];
1008 :
1009 : /* Don't release the active lock! Copy it to first entry. */
1010 4517368 : if (lck->off == ACTIVE_LOCK) {
1011 4982 : tdb->lockrecs[active++] = *lck;
1012 : } else {
1013 4512386 : tdb_brunlock(tdb, lck->ltype, lck->off, 1);
1014 : }
1015 : }
1016 3364138 : tdb->num_lockrecs = active;
1017 3364138 : }
1018 :
1019 : /* Following functions are added specifically to support CTDB. */
1020 :
1021 : /* Don't do actual fcntl locking, just mark tdb locked */
1022 : _PUBLIC_ int tdb_transaction_write_lock_mark(struct tdb_context *tdb);
1023 0 : _PUBLIC_ int tdb_transaction_write_lock_mark(struct tdb_context *tdb)
1024 : {
1025 0 : return tdb_transaction_lock(tdb, F_WRLCK, TDB_LOCK_MARK_ONLY);
1026 : }
1027 :
1028 : /* Don't do actual fcntl unlocking, just mark tdb unlocked */
1029 : _PUBLIC_ int tdb_transaction_write_lock_unmark(struct tdb_context *tdb);
1030 0 : _PUBLIC_ int tdb_transaction_write_lock_unmark(struct tdb_context *tdb)
1031 : {
1032 0 : return tdb_nest_unlock(tdb, TRANSACTION_LOCK, F_WRLCK, true);
1033 : }
|