forked from minio/minio
/
format-config-v1.go
613 lines (564 loc) · 17.7 KB
/
format-config-v1.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
/*
* Minio Cloud Storage, (C) 2016 Minio, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package main
import (
"encoding/json"
"errors"
"fmt"
"strings"
"sync"
"github.com/skyrings/skyring-common/tools/uuid"
)
// fsFormat - structure holding 'fs' format.
type fsFormat struct {
Version string `json:"version"`
}
// xlFormat - structure holding 'xl' format.
type xlFormat struct {
Version string `json:"version"` // Version of 'xl' format.
Disk string `json:"disk"` // Disk field carries assigned disk uuid.
// JBOD field carries the input disk order generated the first
// time when fresh disks were supplied.
JBOD []string `json:"jbod"`
}
// formatConfigV1 - structure holds format config version '1'.
type formatConfigV1 struct {
Version string `json:"version"` // Version of the format config.
// Format indicates the backend format type, supports two values 'xl' and 'fs'.
Format string `json:"format"`
FS *fsFormat `json:"fs,omitempty"` // FS field holds fs format.
XL *xlFormat `json:"xl,omitempty"` // XL field holds xl format.
}
/*
All disks online
-----------------
- All Unformatted - format all and return success.
- Some Unformatted - format all and return success.
- Any JBOD inconsistent - return failure // Requires deep inspection, phase2.
- Some are corrupt (missing format.json) - return failure // Requires deep inspection, phase2.
- Any unrecognized disks - return failure
Some disks are offline and we have quorum.
-----------------
- Some unformatted - no heal, return success.
- Any JBOD inconsistent - return failure // Requires deep inspection, phase2.
- Some are corrupt (missing format.json) - return failure // Requires deep inspection, phase2.
- Any unrecognized disks - return failure
No read quorum
-----------------
failure for all cases.
// Pseudo code for managing `format.json`.
// Generic checks.
if (no quorum) return error
if (any disk is corrupt) return error // phase2
if (jbod inconsistent) return error // phase2
if (disks not recognized) // Always error.
// Specific checks.
if (all disks online)
if (all disks return format.json)
if (jbod consistent)
if (all disks recognized)
return
else
if (all disks return format.json not found)
(initialize format)
return
else (some disks return format.json not found)
(heal format)
return
fi
fi
else // No healing at this point forward, some disks are offline or dead.
if (some disks return format.json not found)
if (with force)
// Offline disks are marked as dead.
(heal format) // Offline disks should be marked as dead.
return success
else (without force)
// --force is necessary to heal few drives, because some drives
// are offline. Offline disks will be marked as dead.
return error
fi
fi
*/
var errSomeDiskUnformatted = errors.New("some disks are found to be unformatted")
var errSomeDiskOffline = errors.New("some disks are offline")
// Returns error slice into understandable errors.
func reduceFormatErrs(errs []error, diskCount int) error {
var errUnformattedDiskCount = 0
var errDiskNotFoundCount = 0
for _, err := range errs {
if err == errUnformattedDisk {
errUnformattedDiskCount++
} else if err == errDiskNotFound {
errDiskNotFoundCount++
}
}
// Returns errUnformattedDisk if all disks report unFormattedDisk.
if errUnformattedDiskCount == diskCount {
return errUnformattedDisk
} else if errUnformattedDiskCount < diskCount && errDiskNotFoundCount == 0 {
// Only some disks return unFormattedDisk and all disks are online.
return errSomeDiskUnformatted
} else if errUnformattedDiskCount < diskCount && errDiskNotFoundCount > 0 {
// Only some disks return unFormattedDisk and some disks are
// offline as well.
return errSomeDiskOffline
}
return nil
}
// loadAllFormats - load all format config from all input disks in parallel.
func loadAllFormats(bootstrapDisks []StorageAPI) ([]*formatConfigV1, []error) {
// Initialize sync waitgroup.
var wg = &sync.WaitGroup{}
// Initialize list of errors.
var sErrs = make([]error, len(bootstrapDisks))
// Initialize format configs.
var formatConfigs = make([]*formatConfigV1, len(bootstrapDisks))
// Make a volume entry on all underlying storage disks.
for index, disk := range bootstrapDisks {
wg.Add(1)
// Make a volume inside a go-routine.
go func(index int, disk StorageAPI) {
defer wg.Done()
formatConfig, lErr := loadFormat(disk)
if lErr != nil {
sErrs[index] = lErr
return
}
formatConfigs[index] = formatConfig
}(index, disk)
}
// Wait for all make vol to finish.
wg.Wait()
for _, err := range sErrs {
if err != nil {
// Return all formats and errors.
return formatConfigs, sErrs
}
}
// Return all formats and nil
return formatConfigs, nil
}
// genericFormatCheck - validates and returns error.
// if (no quorum) return error
// if (any disk is corrupt) return error // phase2
// if (jbod inconsistent) return error // phase2
// if (disks not recognized) // Always error.
func genericFormatCheck(formatConfigs []*formatConfigV1, sErrs []error) (err error) {
// Calculate the errors.
var (
errCorruptFormatCount = 0
errCount = 0
)
// Through all errors calculate the actual errors.
for _, lErr := range sErrs {
if lErr == nil {
continue
}
// These errors are good conditions, means disk is online.
if lErr == errUnformattedDisk || lErr == errVolumeNotFound {
continue
}
if lErr == errCorruptedFormat {
errCorruptFormatCount++
} else {
errCount++
}
}
// Calculate read quorum.
readQuorum := len(formatConfigs)/2 + 1
// Validate the err count under tolerant limit.
if errCount > len(formatConfigs)-readQuorum {
return errXLReadQuorum
}
// One of the disk has corrupt format, return error.
if errCorruptFormatCount > 0 {
return errCorruptedFormat
}
// Validates if format and JBOD are consistent across all disks.
if err = checkFormatXL(formatConfigs); err != nil {
return err
}
// Success..
return nil
}
// errDiskOrderMismatch - returned when disk UUID is not in consistent JBOD order.
var errDiskOrderMismatch = errors.New("disk order mismatch")
// isSavedUUIDInOrder - validates if disk uuid is present and valid in all
// available format config JBOD. This function also validates if the disk UUID
// is always available on all JBOD under the same order.
func isSavedUUIDInOrder(uuid string, formatConfigs []*formatConfigV1) bool {
var orderIndexes []int
// Validate each for format.json for relevant uuid.
for _, formatConfig := range formatConfigs {
if formatConfig == nil {
continue
}
// Validate if UUID is present in JBOD.
uuidIndex := findDiskIndex(uuid, formatConfig.XL.JBOD)
if uuidIndex == -1 {
// UUID not found.
errorIf(errDiskNotFound, "Disk %s not found in JBOD list", uuid)
return false
}
// Save the position of UUID present in JBOD.
orderIndexes = append(orderIndexes, uuidIndex+1)
}
// Once uuid is found, verify if the uuid
// present in same order across all format configs.
prevOrderIndex := orderIndexes[0]
for _, orderIndex := range orderIndexes {
if prevOrderIndex != orderIndex {
errorIf(errDiskOrderMismatch, "Disk %s is in wrong order wanted %d, saw %d ", uuid, prevOrderIndex, orderIndex)
return false
}
}
// Returns success, when we have verified if uuid
// is consistent and valid across all format configs.
return true
}
// checkDisksConsistency - checks if all disks are consistent with all JBOD entries on all disks.
func checkDisksConsistency(formatConfigs []*formatConfigV1) error {
var disks = make([]string, len(formatConfigs))
// Collect currently available disk uuids.
for index, formatConfig := range formatConfigs {
if formatConfig == nil {
disks[index] = ""
continue
}
disks[index] = formatConfig.XL.Disk
}
// Validate collected uuids and verify JBOD.
for _, uuid := range disks {
if uuid == "" {
continue
}
// Is uuid present on all JBOD ?.
if !isSavedUUIDInOrder(uuid, formatConfigs) {
return fmt.Errorf("%s disk not found in JBOD", uuid)
}
}
return nil
}
// checkJBODConsistency - validate xl jbod order if they are consistent.
func checkJBODConsistency(formatConfigs []*formatConfigV1) error {
var jbodStr string
// Extract first valid JBOD.
for _, format := range formatConfigs {
if format == nil {
continue
}
jbodStr = strings.Join(format.XL.JBOD, ".")
break
}
for _, format := range formatConfigs {
if format == nil {
continue
}
savedJBODStr := strings.Join(format.XL.JBOD, ".")
if jbodStr != savedJBODStr {
return errors.New("Inconsistent JBOD found.")
}
}
return nil
}
// findDiskIndex returns position of disk in JBOD.
func findDiskIndex(disk string, jbod []string) int {
for index, uuid := range jbod {
if uuid == disk {
return index
}
}
return -1
}
// reorderDisks - reorder disks in JBOD order.
func reorderDisks(bootstrapDisks []StorageAPI, formatConfigs []*formatConfigV1) ([]StorageAPI, error) {
var savedJBOD []string
for _, format := range formatConfigs {
if format == nil {
continue
}
savedJBOD = format.XL.JBOD
break
}
// Pick the first JBOD list to verify the order and construct new set of disk slice.
var newDisks = make([]StorageAPI, len(bootstrapDisks))
for fIndex, format := range formatConfigs {
if format == nil {
continue
}
jIndex := findDiskIndex(format.XL.Disk, savedJBOD)
if jIndex == -1 {
return nil, errors.New("Unrecognized uuid " + format.XL.Disk + " found")
}
newDisks[jIndex] = bootstrapDisks[fIndex]
}
return newDisks, nil
}
// loadFormat - load format from disk.
func loadFormat(disk StorageAPI) (format *formatConfigV1, err error) {
var buffer []byte
buffer, err = readAll(disk, minioMetaBucket, formatConfigFile)
if err != nil {
// 'file not found' and 'volume not found' as
// same. 'volume not found' usually means its a fresh disk.
if err == errFileNotFound || err == errVolumeNotFound {
var vols []VolInfo
vols, err = disk.ListVols()
if err != nil {
return nil, err
}
if len(vols) > 1 {
// 'format.json' not found, but we found user data.
return nil, errCorruptedFormat
}
// No other data found, its a fresh disk.
return nil, errUnformattedDisk
}
return nil, err
}
format = &formatConfigV1{}
err = json.Unmarshal(buffer, format)
if err != nil {
return nil, err
}
return format, nil
}
// Heals any missing format.json on the drives. Returns error only for unexpected errors
// as regular errors can be ignored since there might be enough quorum to be operational.
func healFormatXL(bootstrapDisks []StorageAPI) error {
needHeal := make([]bool, len(bootstrapDisks)) // Slice indicating which drives needs healing.
formatConfigs := make([]*formatConfigV1, len(bootstrapDisks))
var referenceConfig *formatConfigV1
successCount := 0 // Tracks if we have successfully loaded all `format.json` from all disks.
formatNotFoundCount := 0 // Tracks if we `format.json` is not found on all disks.
// Loads `format.json` from all disks.
for index, disk := range bootstrapDisks {
formatXL, err := loadFormat(disk)
if err != nil {
if err == errUnformattedDisk {
// format.json is missing, should be healed.
needHeal[index] = true
formatNotFoundCount++
continue
} else if err == errDiskNotFound { // Is a valid case we
// can proceed without healing.
return nil
}
// Return error for unsupported errors.
return err
} // Success.
formatConfigs[index] = formatXL
successCount++
}
// All `format.json` has been read successfully, previously completed.
if successCount == len(bootstrapDisks) {
// Return success.
return nil
}
// All disks are fresh, format.json will be written by initFormatXL()
if formatNotFoundCount == len(bootstrapDisks) {
return initFormatXL(bootstrapDisks)
}
// Validate format configs for consistency in JBOD and disks.
if err := checkFormatXL(formatConfigs); err != nil {
return err
}
if referenceConfig == nil {
// This config will be used to update the drives missing format.json.
for _, formatConfig := range formatConfigs {
if formatConfig == nil {
continue
}
referenceConfig = formatConfig
break
}
}
uuidUsage := make([]struct {
uuid string // Disk uuid
inUse bool // indicates if the uuid is used by
// any disk
}, len(bootstrapDisks))
// Returns any unused drive UUID.
getUnusedUUID := func() string {
for index := range uuidUsage {
if !uuidUsage[index].inUse {
uuidUsage[index].inUse = true
return uuidUsage[index].uuid
}
}
return ""
}
// From reference config update UUID's not be in use.
for index, diskUUID := range referenceConfig.XL.JBOD {
uuidUsage[index].uuid = diskUUID
uuidUsage[index].inUse = false
}
// For all config formats validate if they are in use and
// update the uuidUsage values.
for _, config := range formatConfigs {
if config == nil {
continue
}
for index := range uuidUsage {
if config.XL.Disk == uuidUsage[index].uuid {
uuidUsage[index].inUse = true
break
}
}
}
// This section heals the format.json and updates the fresh disks
// by apply a new UUID for all the fresh disks.
for index, heal := range needHeal {
if !heal {
continue
}
config := &formatConfigV1{}
*config = *referenceConfig
config.XL.Disk = getUnusedUUID()
if config.XL.Disk == "" {
// getUnusedUUID() should have
// returned an unused uuid, it
// is an unexpected error.
return errUnexpected
}
formatBytes, err := json.Marshal(config)
if err != nil {
return err
}
// Fresh disk without format.json
_ = bootstrapDisks[index].AppendFile(minioMetaBucket, formatConfigFile, formatBytes)
// Ignore any error from AppendFile() as
// quorum might still be there to be operational.
}
return nil
}
// loadFormatXL - loads XL `format.json` and returns back properly
// ordered storage slice based on `format.json`.
func loadFormatXL(bootstrapDisks []StorageAPI) (disks []StorageAPI, err error) {
var unformattedDisksFoundCnt = 0
var diskNotFoundCount = 0
formatConfigs := make([]*formatConfigV1, len(bootstrapDisks))
// Try to load `format.json` bootstrap disks.
for index, disk := range bootstrapDisks {
var formatXL *formatConfigV1
formatXL, err = loadFormat(disk)
if err != nil {
if err == errUnformattedDisk {
unformattedDisksFoundCnt++
continue
} else if err == errDiskNotFound {
diskNotFoundCount++
continue
}
return nil, err
}
// Save valid formats.
formatConfigs[index] = formatXL
}
// If all disks indicate that 'format.json' is not available
// return 'errUnformattedDisk'.
if unformattedDisksFoundCnt == len(bootstrapDisks) {
return nil, errUnformattedDisk
} else if diskNotFoundCount == len(bootstrapDisks) {
return nil, errDiskNotFound
} else if diskNotFoundCount > len(bootstrapDisks)-(len(bootstrapDisks)/2+1) {
return nil, errXLReadQuorum
} else if unformattedDisksFoundCnt > len(bootstrapDisks)-(len(bootstrapDisks)/2+1) {
return nil, errXLReadQuorum
}
// Validate the format configs read are correct.
if err = checkFormatXL(formatConfigs); err != nil {
return nil, err
}
// Erasure code requires disks to be presented in the same order each time.
return reorderDisks(bootstrapDisks, formatConfigs)
}
// checkFormatXL - verifies if format.json format is intact.
func checkFormatXL(formatConfigs []*formatConfigV1) error {
for _, formatXL := range formatConfigs {
if formatXL == nil {
continue
}
// Validate format version and format type.
if formatXL.Version != "1" {
return fmt.Errorf("Unsupported version of backend format [%s] found.", formatXL.Version)
}
if formatXL.Format != "xl" {
return fmt.Errorf("Unsupported backend format [%s] found.", formatXL.Format)
}
if formatXL.XL.Version != "1" {
return fmt.Errorf("Unsupported XL backend format found [%s]", formatXL.XL.Version)
}
if len(formatConfigs) != len(formatXL.XL.JBOD) {
return fmt.Errorf("Number of disks %d did not match the backend format %d", len(formatConfigs), len(formatXL.XL.JBOD))
}
}
if err := checkJBODConsistency(formatConfigs); err != nil {
return err
}
return checkDisksConsistency(formatConfigs)
}
// initFormatXL - save XL format configuration on all disks.
func initFormatXL(storageDisks []StorageAPI) (err error) {
var (
jbod = make([]string, len(storageDisks))
formats = make([]*formatConfigV1, len(storageDisks))
saveFormatErrCnt = 0
)
for index, disk := range storageDisks {
if err = disk.MakeVol(minioMetaBucket); err != nil {
if err != errVolumeExists {
saveFormatErrCnt++
// Check for write quorum.
if saveFormatErrCnt <= len(storageDisks)-(len(storageDisks)/2+3) {
continue
}
return errXLWriteQuorum
}
}
var u *uuid.UUID
u, err = uuid.New()
if err != nil {
saveFormatErrCnt++
// Check for write quorum.
if saveFormatErrCnt <= len(storageDisks)-(len(storageDisks)/2+3) {
continue
}
return err
}
formats[index] = &formatConfigV1{
Version: "1",
Format: "xl",
XL: &xlFormat{
Version: "1",
Disk: u.String(),
},
}
jbod[index] = formats[index].XL.Disk
}
for index, disk := range storageDisks {
formats[index].XL.JBOD = jbod
formatBytes, err := json.Marshal(formats[index])
if err != nil {
return err
}
if err = disk.AppendFile(minioMetaBucket, formatConfigFile, formatBytes); err != nil {
return err
}
}
return nil
}