diff --git a/cmd-car-split.go b/cmd-car-split.go index ef7c086c..22fa0de8 100644 --- a/cmd-car-split.go +++ b/cmd-car-split.go @@ -10,6 +10,7 @@ import ( "fmt" "io" "io/fs" + "net/http" "os" "path/filepath" "sort" @@ -44,7 +45,8 @@ var ( Roots: []cid.Cid{CBOR_SHA256_DUMMY_CID}, // placeholder Version: 1, } - hdrSize, _ = car.HeaderSize(hdr) + hdrSize, _ = car.HeaderSize(hdr) + maxSectionSize = 2 << 20 // 2 MiB ) const maxLinks = 432000 / 18 // 18 subsets @@ -532,11 +534,13 @@ func readHeader(streamBuf *bufio.Reader) ([]byte, int64, error) { return headerBuf.Bytes(), streamLen, nil } -func SortCarFiles(carFiles []string) ([]string, error) { - type carFileInfo struct { - path string - firstSlot int64 - } +type carFileInfo struct { + name string + firstSlot int64 + size int64 +} + +func SortCarFiles(carFiles []string) ([]carFileInfo, error) { var fileInfos []carFileInfo @@ -584,9 +588,15 @@ func SortCarFiles(carFiles []string) ([]string, error) { return nil, fmt.Errorf("failed to find root node in file %s", path) } + fi, err := file.Stat() + if err != nil { + return nil, fmt.Errorf("failed to get file info for %s: %w", path, err) + } + fileInfos = append(fileInfos, carFileInfo{ - path: path, + name: path, firstSlot: int64(subset.First), + size: fi.Size(), }) } @@ -595,11 +605,139 @@ func SortCarFiles(carFiles []string) ([]string, error) { return fileInfos[i].firstSlot < fileInfos[j].firstSlot }) - // Extract the sorted file paths - sortedFiles := make([]string, len(fileInfos)) - for i, info := range fileInfos { - sortedFiles[i] = info.path + return fileInfos, nil +} + +func SortCarURLs(carURLs []string) ([]carFileInfo, error) { + var urlInfos []carFileInfo + + for _, url := range carURLs { + firstSlot, size, err := getSlotAndSizeFromURL(url) + if err != nil { + return nil, fmt.Errorf("failed to get first slot from URL %s: %w", url, err) + } + + urlInfos = append(urlInfos, carFileInfo{ + name: url, + firstSlot: firstSlot, + size: size, + }) + } + + // Sort the URL 
infos based on the firstSlot + sort.Slice(urlInfos, func(i, j int) bool { + return urlInfos[i].firstSlot < urlInfos[j].firstSlot + }) + + return urlInfos, nil +} + +func getSlotAndSizeFromURL(url string) (int64, int64, error) { + fileSize, err := splitcarfetcher.GetContentSizeWithHeadOrZeroRange(url) + if err != nil { + return 0, 0, fmt.Errorf("failed to get file size: %w", err) + } + + rootCID, err := getRootCid(url) + if err != nil { + return 0, 0, fmt.Errorf("failed to get root CID: %w", err) + } + + endOffset := getEndOffset(fileSize) + + partialContent, err := fetchFromOffset(url, endOffset) + if err != nil { + return 0, 0, fmt.Errorf("failed to fetch partial content: %w", err) + } + + cidBytes := rootCID.Bytes() + index := bytes.LastIndex(partialContent, cidBytes) + if index == -1 { + return 0, 0, fmt.Errorf("CID block not found in the last 2MiB of the file") + } + blockData := partialContent[index-2:] + r := bufio.NewReader(bytes.NewBuffer(blockData)) + cid, _, data, err := carreader.ReadNodeInfoWithData(r) + if err != nil { + return 0, 0, fmt.Errorf("failed to read node info: %w", err) + } + if cid != rootCID { + return 0, 0, fmt.Errorf("expected CID %s, got %s", rootCID, cid) + } + + // Decode the Subset + subset, err := iplddecoders.DecodeSubset(data) + if err != nil { + return 0, 0, fmt.Errorf("failed to decode Subset from block: %w", err) + } + + return int64(subset.First), fileSize, nil +} + +func getRootCid(url string) (cid.Cid, error) { + // Make a GET request for the beginning of the file + req, err := http.NewRequest("GET", url, nil) + if err != nil { + return cid.Undef, fmt.Errorf("failed to create request: %w", err) + } + + // Request only the first hdrSize bytes + req.Header.Set("Range", fmt.Sprintf("bytes=0-%d", hdrSize)) + + resp, err := http.DefaultClient.Do(req) + if err != nil { + return cid.Undef, fmt.Errorf("failed to fetch CAR file header: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusPartialContent { + 
// fetchFromOffset downloads the resource at url from byte position offset
// through the end of the resource, using an open-ended HTTP Range request
// ("bytes=<offset>-"), and returns the received bytes.
//
// It returns an error when offset is negative, when the request cannot be
// built or sent, when the server answers with anything other than
// 206 Partial Content, or when the body cannot be read. A 200 response is
// rejected on purpose: it means the server ignored the range and is sending
// the whole resource.
func fetchFromOffset(url string, offset int64) ([]byte, error) {
	// A negative value would produce the malformed header "bytes=-N-".
	if offset < 0 {
		return nil, fmt.Errorf("invalid negative offset %d", offset)
	}

	req, err := http.NewRequest(http.MethodGet, url, nil)
	if err != nil {
		return nil, fmt.Errorf("failed to create request: %w", err)
	}
	req.Header.Set("Range", fmt.Sprintf("bytes=%d-", offset))

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return nil, fmt.Errorf("failed to fetch CAR file: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusPartialContent {
		// Include the real status so that a 404 or 500 is not mistaken for a
		// server that merely lacks range support.
		return nil, fmt.Errorf("range request from offset %d returned status %q (want 206 Partial Content); the server may not support range requests", offset, resp.Status)
	}

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("failed to read response body: %w", err)
	}
	return body, nil
}
3 { + t.Fatalf("unexpected result length: %d", len(result)) + } + + expectedResults := []struct { + name string + firstSlot int64 + size int64 + }{ + {filepath.Join(fixturesDir, "epoch-0-1.car"), 0, 96932}, + {filepath.Join(fixturesDir, "epoch-0-2.car"), 10, 100027}, + {filepath.Join(fixturesDir, "epoch-0-3.car"), 20, 99487}, + } + + for i, expected := range expectedResults { + if result[i].name != expected.name { + t.Fatalf("unexpected name: %s", result[i].name) + } + if result[i].firstSlot != expected.firstSlot { + t.Fatalf("unexpected firstSlot: %d", result[i].firstSlot) + } + if result[i].size != expected.size { + t.Fatalf("unexpected size: %d", result[i].size) + } + } + +} + +func TestSortCarURLs(t *testing.T) { + // Create a test server + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + // Get the filename from the URL path + filename := filepath.Base(r.URL.Path) + fixturesDir := "fixtures" + filePath := filepath.Join(fixturesDir, filename) + + // Open the local CAR file + file, err := os.Open(filePath) + if err != nil { + t.Fatalf("failed to open fixture file %s: %v", filePath, err) + } + defer file.Close() + + // Get file info for Content-Length header + fileInfo, err := file.Stat() + if err != nil { + t.Fatalf("failed to get file info: %v", err) + } + + // Handle HEAD requests + if r.Method == "HEAD" { + w.Header().Set("Content-Length", fmt.Sprintf("%d", fileInfo.Size())) + return + } + + // Handle range requests + if rangeHeader := r.Header.Get("Range"); rangeHeader != "" { + // Parse the range header + start, end := int64(0), fileInfo.Size() + fmt.Sscanf(rangeHeader, "bytes=%d-", &start) + if start < 0 { + start = 0 + } + if end > fileInfo.Size() { + end = fileInfo.Size() + } + + // Seek to the start position + _, err = file.Seek(start, 0) + if err != nil { + t.Fatalf("failed to seek in file: %v", err) + } + + // Set response headers + w.Header().Set("Content-Range", fmt.Sprintf("bytes %d-%d/%d", start, 
end-1, fileInfo.Size())) + w.WriteHeader(http.StatusPartialContent) + + // Copy the requested range to the response + _, err = io.Copy(w, file) + if err != nil { + t.Fatalf("failed to copy file content: %v", err) + } + return + } + + // Handle regular GET requests + http.ServeFile(w, r, filePath) + })) + defer server.Close() + + // Create URLs for our test files + carURLs := []string{ + server.URL + "/epoch-0-3.car", + server.URL + "/epoch-0-2.car", + server.URL + "/epoch-0-1.car", + } + + // Call SortCarURLs + result, err := SortCarURLs(carURLs) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + // Verify the results + if len(result) != 3 { + t.Fatalf("unexpected result length: got %d, want 3", len(result)) + } + + expectedResults := []struct { + name string + firstSlot int64 + size int64 + }{ + {server.URL + "/epoch-0-1.car", 0, 96932}, + {server.URL + "/epoch-0-2.car", 10, 100027}, + {server.URL + "/epoch-0-3.car", 20, 99487}, + } + + for i, expected := range expectedResults { + if result[i].name != expected.name { + t.Errorf("unexpected name at index %d: got %s, want %s", i, result[i].name, expected.name) + } + if result[i].firstSlot != expected.firstSlot { + t.Errorf("unexpected firstSlot at index %d: got %d, want %d", i, result[i].firstSlot, expected.firstSlot) + } + if result[i].size != expected.size { + t.Errorf("unexpected size at index %d: got %d, want %d", i, result[i].size, expected.size) + } + } +} + +func TestSortCarFiles_EmptyInput(t *testing.T) { + result, err := SortCarFiles([]string{}) + + if err != nil { + t.Fatalf("unexpected error for empty input: %s", err) + } + + if len(result) != 0 { + t.Fatalf("expected empty result for empty input, got %d items", len(result)) + } +} + +func TestSortCarFiles_NonExistentFile(t *testing.T) { + nonExistentFile := filepath.Join("fixtures", "non-existent.car") + _, err := SortCarFiles([]string{nonExistentFile}) + + if err == nil { + t.Fatal("expected error for non-existent file, got nil") + } + + 
if !strings.Contains(err.Error(), "no such file or directory") { + t.Fatalf("unexpected error message: %s", err) + } +} + +func TestSortCarFiles_InvalidCAR(t *testing.T) { + invalidCarFile := filepath.Join("fixtures", "invalid.car") + + // Create an invalid CAR file for testing + err := os.WriteFile(invalidCarFile, []byte("invalid car content"), 0644) + if err != nil { + t.Fatalf("failed to create invalid CAR file: %s", err) + } + defer os.Remove(invalidCarFile) + + _, err = SortCarFiles([]string{invalidCarFile}) + + if err == nil { + t.Fatal("expected error for invalid CAR file, got nil") + } + + if !strings.Contains(err.Error(), "failed to create CarReader") { + t.Fatalf("unexpected error message: %s", err) + } +} + +func TestSortCarURLs_EmptyInput(t *testing.T) { + result, err := SortCarURLs([]string{}) + + if err != nil { + t.Fatalf("unexpected error for empty input: %s", err) + } + + if len(result) != 0 { + t.Fatalf("expected empty result for empty input, got %d items", len(result)) + } +} + +func TestSortCarURLs_InvalidURL(t *testing.T) { + invalidURL := "http://invalid.url/non-existent.car" + _, err := SortCarURLs([]string{invalidURL}) + + if err == nil { + t.Fatal("expected error for invalid URL, got nil") + } + + if !strings.Contains(err.Error(), "failed to get first slot from URL") { + t.Fatalf("unexpected error message: %s", err) + } +} + +func TestSortCarURLs_MixedValidAndInvalidURLs(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + // Serve a valid CAR file for testing + fixturesDir := "fixtures" + filePath := filepath.Join(fixturesDir, "epoch-0-1.car") + http.ServeFile(w, r, filePath) + })) + defer server.Close() + + validURL := server.URL + "/valid.car" + invalidURL := "http://invalid.url/non-existent.car" + + result, err := SortCarURLs([]string{validURL, invalidURL}) + + if err == nil { + t.Fatal("expected error for mixed valid and invalid URLs, got nil") + } + + if 
!strings.Contains(err.Error(), "failed to get first slot from URL") { + t.Fatalf("unexpected error message: %s", err) + } + + if len(result) != 0 { + t.Fatalf("expected empty result for error case, got %d items", len(result)) + } +} diff --git a/fixtures/epoch-0-1.car b/fixtures/epoch-0-1.car new file mode 100644 index 00000000..58ddd4f8 Binary files /dev/null and b/fixtures/epoch-0-1.car differ diff --git a/fixtures/epoch-0-2.car b/fixtures/epoch-0-2.car new file mode 100644 index 00000000..dc3abd1e Binary files /dev/null and b/fixtures/epoch-0-2.car differ diff --git a/fixtures/epoch-0-3.car b/fixtures/epoch-0-3.car new file mode 100644 index 00000000..b9d50d10 Binary files /dev/null and b/fixtures/epoch-0-3.car differ