Files
CleanMM/cmd/analyze/scanner.go

743 lines
18 KiB
Go
Raw Normal View History

2025-11-18 18:23:33 +08:00
package main
import (
"bytes"
"container/heap"
2025-11-18 18:23:33 +08:00
"context"
"fmt"
"io/fs"
"os"
"os/exec"
"path/filepath"
"runtime"
"strconv"
"strings"
"sync"
"sync/atomic"
"syscall"
"time"
"golang.org/x/sync/singleflight"
)
var scanGroup singleflight.Group
// trySend attempts to send an item to a channel with a timeout.
// Returns true if the item was sent, false if the timeout was reached.
func trySend[T any](ch chan<- T, item T, timeout time.Duration) bool {
if timeout <= 0 {
select {
case ch <- item:
return true
default:
return false
}
}
select {
case ch <- item:
return true
default:
}
timer := time.NewTimer(timeout)
defer func() {
if !timer.Stop() {
select {
case <-timer.C:
default:
}
}
}()
select {
case ch <- item:
return true
case <-timer.C:
return false
}
}
func scanPathConcurrent(root string, filesScanned, dirsScanned, bytesScanned *int64, currentPath *atomic.Value) (scanResult, error) {
2025-11-18 18:23:33 +08:00
children, err := os.ReadDir(root)
if err != nil {
return scanResult{}, err
}
var total int64
var localFilesScanned int64
var localBytesScanned int64
// Keep Top N heaps.
entriesHeap := &entryHeap{}
heap.Init(entriesHeap)
largeFilesHeap := &largeFileHeap{}
heap.Init(largeFilesHeap)
largeFileMinSize := int64(largeFileWarmupMinSize)
2025-11-18 18:23:33 +08:00
// Worker pool sized for I/O-bound scanning.
numWorkers := max(min(max(runtime.NumCPU()*cpuMultiplier, minWorkers), maxWorkers, len(children)), 1)
2025-11-18 22:00:05 +08:00
sem := make(chan struct{}, numWorkers)
dirSem := make(chan struct{}, min(runtime.NumCPU()*2, maxDirWorkers))
duSem := make(chan struct{}, min(4, runtime.NumCPU())) // limits concurrent du processes
duQueueSem := make(chan struct{}, min(4, runtime.NumCPU())*2) // limits how many goroutines may be waiting to run du
2025-11-18 18:23:33 +08:00
var wg sync.WaitGroup
// Collect results via channels.
// Cap buffer size to prevent memory spikes with huge directories.
entryBufSize := max(min(len(children), 4096), 1)
entryChan := make(chan dirEntry, entryBufSize)
2025-11-18 18:23:33 +08:00
largeFileChan := make(chan fileEntry, maxLargeFiles*2)
var collectorWg sync.WaitGroup
collectorWg.Add(2)
go func() {
defer collectorWg.Done()
for entry := range entryChan {
if entriesHeap.Len() < maxEntries {
heap.Push(entriesHeap, entry)
} else if entry.Size > (*entriesHeap)[0].Size {
heap.Pop(entriesHeap)
heap.Push(entriesHeap, entry)
}
2025-11-18 18:23:33 +08:00
}
}()
go func() {
defer collectorWg.Done()
for file := range largeFileChan {
if largeFilesHeap.Len() < maxLargeFiles {
heap.Push(largeFilesHeap, file)
if largeFilesHeap.Len() == maxLargeFiles {
atomic.StoreInt64(&largeFileMinSize, (*largeFilesHeap)[0].Size)
}
} else if file.Size > (*largeFilesHeap)[0].Size {
heap.Pop(largeFilesHeap)
heap.Push(largeFilesHeap, file)
atomic.StoreInt64(&largeFileMinSize, (*largeFilesHeap)[0].Size)
}
2025-11-18 18:23:33 +08:00
}
}()
isRootDir := root == "/"
home := os.Getenv("HOME")
isHomeDir := home != "" && root == home
2025-11-18 18:23:33 +08:00
for _, child := range children {
fullPath := filepath.Join(root, child.Name())
// Skip symlinks to avoid following unexpected targets.
2025-11-20 20:14:08 +08:00
if child.Type()&fs.ModeSymlink != 0 {
targetInfo, err := os.Stat(fullPath)
isDir := false
if err == nil && targetInfo.IsDir() {
isDir = true
}
// Count link size only to avoid double-counting targets.
2025-11-20 20:14:08 +08:00
info, err := child.Info()
if err != nil {
continue
}
size := getActualFileSize(fullPath, info)
atomic.AddInt64(&total, size)
trySend(entryChan, dirEntry{
Name: child.Name() + " →",
2025-11-20 20:14:08 +08:00
Path: fullPath,
Size: size,
IsDir: isDir,
2025-11-20 20:14:08 +08:00
LastAccess: getLastAccessTimeFromInfo(info),
}, 100*time.Millisecond)
2025-11-20 20:14:08 +08:00
continue
2025-11-20 20:14:08 +08:00
}
2025-11-18 18:23:33 +08:00
if child.IsDir() {
if defaultSkipDirs[child.Name()] {
continue
}
// Skip system dirs at root.
2025-11-18 18:23:33 +08:00
if isRootDir && skipSystemDirs[child.Name()] {
continue
}
// ~/Library is scanned separately; reuse cache when possible.
if isHomeDir && child.Name() == "Library" {
sem <- struct{}{}
wg.Add(1)
go func(name, path string) {
defer wg.Done()
defer func() { <-sem }()
var size int64
if cached, err := loadStoredOverviewSize(path); err == nil && cached > 0 {
size = cached
} else if cached, err := loadCacheFromDisk(path); err == nil {
size = cached.TotalSize
} else {
size = calculateDirSizeConcurrent(path, largeFileChan, &largeFileMinSize, dirSem, duSem, duQueueSem, filesScanned, dirsScanned, bytesScanned, currentPath)
}
atomic.AddInt64(&total, size)
atomic.AddInt64(dirsScanned, 1)
trySend(entryChan, dirEntry{
Name: name,
Path: path,
Size: size,
IsDir: true,
LastAccess: time.Time{},
}, 100*time.Millisecond)
}(child.Name(), fullPath)
continue
}
// Folded dirs: fast size without expanding.
2025-11-18 18:23:33 +08:00
if shouldFoldDirWithPath(child.Name(), fullPath) {
duQueueSem <- struct{}{}
2025-11-18 18:23:33 +08:00
wg.Add(1)
go func(name, path string) {
defer wg.Done()
defer func() { <-duQueueSem }()
2025-11-18 18:23:33 +08:00
size, err := func() (int64, error) {
duSem <- struct{}{}
defer func() { <-duSem }()
return getDirectorySizeFromDu(path)
}()
2025-11-18 22:00:05 +08:00
if err != nil || size <= 0 {
2025-11-18 18:23:33 +08:00
size = calculateDirSizeFast(path, filesScanned, dirsScanned, bytesScanned, currentPath)
}
atomic.AddInt64(&total, size)
atomic.AddInt64(dirsScanned, 1)
trySend(entryChan, dirEntry{
Name: name,
Path: path,
Size: size,
IsDir: true,
LastAccess: time.Time{},
}, 100*time.Millisecond)
2025-11-18 18:23:33 +08:00
}(child.Name(), fullPath)
continue
}
sem <- struct{}{}
2025-11-18 18:23:33 +08:00
wg.Add(1)
go func(name, path string) {
defer wg.Done()
defer func() { <-sem }()
size := calculateDirSizeConcurrent(path, largeFileChan, &largeFileMinSize, dirSem, duSem, duQueueSem, filesScanned, dirsScanned, bytesScanned, currentPath)
2025-11-18 18:23:33 +08:00
atomic.AddInt64(&total, size)
atomic.AddInt64(dirsScanned, 1)
trySend(entryChan, dirEntry{
Name: name,
Path: path,
Size: size,
IsDir: true,
LastAccess: time.Time{},
}, 100*time.Millisecond)
2025-11-18 18:23:33 +08:00
}(child.Name(), fullPath)
continue
}
info, err := child.Info()
if err != nil {
continue
}
// Actual disk usage for sparse/cloud files.
2025-11-18 18:23:33 +08:00
size := getActualFileSize(fullPath, info)
atomic.AddInt64(&total, size)
localFilesScanned++
localBytesScanned += size
2025-11-18 18:23:33 +08:00
trySend(entryChan, dirEntry{
Name: child.Name(),
Path: fullPath,
Size: size,
IsDir: false,
LastAccess: getLastAccessTimeFromInfo(info),
}, 100*time.Millisecond)
// Track large files only.
if !shouldSkipFileForLargeTracking(fullPath) {
minSize := atomic.LoadInt64(&largeFileMinSize)
if size >= minSize {
trySend(largeFileChan, fileEntry{Name: child.Name(), Path: fullPath, Size: size}, 100*time.Millisecond)
}
2025-11-18 18:23:33 +08:00
}
}
if localFilesScanned > 0 {
atomic.AddInt64(filesScanned, localFilesScanned)
}
if localBytesScanned > 0 {
atomic.AddInt64(bytesScanned, localBytesScanned)
}
2025-11-18 18:23:33 +08:00
wg.Wait()
// Close channels and wait for collectors.
2025-11-18 18:23:33 +08:00
close(entryChan)
close(largeFileChan)
collectorWg.Wait()
// Convert heaps to sorted slices (descending).
entries := make([]dirEntry, entriesHeap.Len())
for i := len(entries) - 1; i >= 0; i-- {
entries[i] = heap.Pop(entriesHeap).(dirEntry)
}
largeFiles := make([]fileEntry, largeFilesHeap.Len())
for i := len(largeFiles) - 1; i >= 0; i-- {
largeFiles[i] = heap.Pop(largeFilesHeap).(fileEntry)
2025-11-18 18:23:33 +08:00
}
// Use Spotlight for large files when it expands the list.
if spotlightFiles := findLargeFilesWithSpotlight(root, spotlightMinFileSize); len(spotlightFiles) > len(largeFiles) {
2025-11-18 18:23:33 +08:00
largeFiles = spotlightFiles
}
return scanResult{
Entries: entries,
LargeFiles: largeFiles,
TotalSize: total,
TotalFiles: atomic.LoadInt64(filesScanned),
2025-11-18 18:23:33 +08:00
}, nil
}
func shouldFoldDirWithPath(name, path string) bool {
if foldDirs[name] {
return true
}
// Handle npm cache structure.
2025-11-18 18:23:33 +08:00
if strings.Contains(path, "/.npm/") || strings.Contains(path, "/.tnpm/") {
parent := filepath.Base(filepath.Dir(path))
if parent == ".npm" || parent == ".tnpm" || strings.HasPrefix(parent, "_") {
return true
}
if len(name) == 1 {
return true
}
}
return false
}
func shouldSkipFileForLargeTracking(path string) bool {
ext := strings.ToLower(filepath.Ext(path))
return skipExtensions[ext]
}
// calculateDirSizeFast performs concurrent dir sizing using os.ReadDir.
func calculateDirSizeFast(root string, filesScanned, dirsScanned, bytesScanned *int64, currentPath *atomic.Value) int64 {
2025-11-18 18:23:33 +08:00
var total int64
var wg sync.WaitGroup
2025-11-18 18:23:33 +08:00
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
defer cancel()
2026-01-06 11:52:05 +02:00
concurrency := min(runtime.NumCPU()*4, 64)
sem := make(chan struct{}, concurrency)
var walk func(string)
walk = func(dirPath string) {
2025-11-18 18:23:33 +08:00
select {
case <-ctx.Done():
return
2025-11-18 18:23:33 +08:00
default:
}
if currentPath != nil && atomic.LoadInt64(filesScanned)%int64(batchUpdateSize) == 0 {
currentPath.Store(dirPath)
}
entries, err := os.ReadDir(dirPath)
2025-11-18 18:23:33 +08:00
if err != nil {
return
2025-11-18 18:23:33 +08:00
}
var localBytes, localFiles int64
for _, entry := range entries {
if entry.IsDir() {
subDir := filepath.Join(dirPath, entry.Name())
atomic.AddInt64(dirsScanned, 1)
select {
case sem <- struct{}{}:
wg.Add(1)
go func(p string) {
defer wg.Done()
defer func() { <-sem }()
walk(p)
}(subDir)
default:
// Fallback to synchronous traversal to avoid semaphore deadlock under high fan-out.
walk(subDir)
}
} else {
info, err := entry.Info()
if err == nil {
size := getActualFileSize(filepath.Join(dirPath, entry.Name()), info)
localBytes += size
localFiles++
}
2025-11-18 18:23:33 +08:00
}
}
if localBytes > 0 {
atomic.AddInt64(&total, localBytes)
atomic.AddInt64(bytesScanned, localBytes)
2025-11-18 18:23:33 +08:00
}
if localFiles > 0 {
atomic.AddInt64(filesScanned, localFiles)
2025-11-18 18:23:33 +08:00
}
}
walk(root)
wg.Wait()
2025-11-18 18:23:33 +08:00
return total
}
// Use Spotlight (mdfind) to quickly find large files.
2025-11-18 18:23:33 +08:00
func findLargeFilesWithSpotlight(root string, minSize int64) []fileEntry {
query := fmt.Sprintf("kMDItemFSSize >= %d", minSize)
2025-11-18 22:00:05 +08:00
ctx, cancel := context.WithTimeout(context.Background(), mdlsTimeout)
defer cancel()
cmd := exec.CommandContext(ctx, "mdfind", "-onlyin", root, query)
2025-11-18 18:23:33 +08:00
output, err := cmd.Output()
if err != nil {
return nil
}
h := &largeFileHeap{}
heap.Init(h)
2025-11-18 18:23:33 +08:00
2026-01-06 11:52:05 +02:00
for line := range strings.Lines(strings.TrimSpace(string(output))) {
2025-11-18 18:23:33 +08:00
if line == "" {
continue
}
// Filter code files first (cheap).
2025-11-18 18:23:33 +08:00
if shouldSkipFileForLargeTracking(line) {
continue
}
// Filter folded directories (cheap string check).
2025-11-18 18:23:33 +08:00
if isInFoldedDir(line) {
continue
}
info, err := os.Lstat(line)
if err != nil {
continue
}
if info.IsDir() || info.Mode()&os.ModeSymlink != 0 {
continue
}
// Actual disk usage for sparse/cloud files.
2025-11-18 18:23:33 +08:00
actualSize := getActualFileSize(line, info)
candidate := fileEntry{
Name: filepath.Base(line),
Path: line,
Size: actualSize,
}
2025-11-18 18:23:33 +08:00
if h.Len() < maxLargeFiles {
heap.Push(h, candidate)
} else if candidate.Size > (*h)[0].Size {
heap.Pop(h)
heap.Push(h, candidate)
}
}
2025-11-18 18:23:33 +08:00
files := make([]fileEntry, h.Len())
for i := len(files) - 1; i >= 0; i-- {
files[i] = heap.Pop(h).(fileEntry)
2025-11-18 18:23:33 +08:00
}
return files
}
// isInFoldedDir checks if a path is inside a folded directory.
2025-11-18 18:23:33 +08:00
func isInFoldedDir(path string) bool {
2026-01-06 11:52:05 +02:00
parts := strings.SplitSeq(path, string(os.PathSeparator))
for part := range parts {
2025-11-18 18:23:33 +08:00
if foldDirs[part] {
return true
}
}
return false
}
func calculateDirSizeConcurrent(root string, largeFileChan chan<- fileEntry, largeFileMinSize *int64, dirSem, duSem, duQueueSem chan struct{}, filesScanned, dirsScanned, bytesScanned *int64, currentPath *atomic.Value) int64 {
2025-11-18 18:23:33 +08:00
children, err := os.ReadDir(root)
if err != nil {
return 0
}
var total int64
var localFilesScanned int64
var localDirsScanned int64
var localBytesScanned int64
2025-11-18 18:23:33 +08:00
var wg sync.WaitGroup
for _, child := range children {
fullPath := filepath.Join(root, child.Name())
2025-11-20 20:14:08 +08:00
if child.Type()&fs.ModeSymlink != 0 {
info, err := child.Info()
if err != nil {
continue
}
size := getActualFileSize(fullPath, info)
total += size
localFilesScanned++
localBytesScanned += size
2025-11-20 20:14:08 +08:00
continue
}
2025-11-18 18:23:33 +08:00
if child.IsDir() {
localDirsScanned++
2025-11-18 18:23:33 +08:00
if shouldFoldDirWithPath(child.Name(), fullPath) {
duQueueSem <- struct{}{}
2025-11-18 18:23:33 +08:00
wg.Add(1)
go func(path string) {
defer wg.Done()
defer func() { <-duQueueSem }()
size, err := func() (int64, error) {
duSem <- struct{}{}
defer func() { <-duSem }()
return getDirectorySizeFromDu(path)
}()
if err != nil || size <= 0 {
size = calculateDirSizeFast(path, filesScanned, dirsScanned, bytesScanned, currentPath)
} else {
2025-11-18 18:23:33 +08:00
atomic.AddInt64(bytesScanned, size)
}
atomic.AddInt64(&total, size)
2025-11-18 18:23:33 +08:00
}(fullPath)
continue
}
select {
case dirSem <- struct{}{}:
wg.Add(1)
go func(path string) {
defer wg.Done()
defer func() { <-dirSem }()
2025-11-18 18:23:33 +08:00
size := calculateDirSizeConcurrent(path, largeFileChan, largeFileMinSize, dirSem, duSem, duQueueSem, filesScanned, dirsScanned, bytesScanned, currentPath)
atomic.AddInt64(&total, size)
}(fullPath)
default:
size := calculateDirSizeConcurrent(fullPath, largeFileChan, largeFileMinSize, dirSem, duSem, duQueueSem, filesScanned, dirsScanned, bytesScanned, currentPath)
2025-11-18 18:23:33 +08:00
atomic.AddInt64(&total, size)
}
2025-11-18 18:23:33 +08:00
continue
}
info, err := child.Info()
if err != nil {
continue
}
size := getActualFileSize(fullPath, info)
total += size
localFilesScanned++
localBytesScanned += size
2025-11-18 18:23:33 +08:00
if !shouldSkipFileForLargeTracking(fullPath) && largeFileMinSize != nil {
minSize := atomic.LoadInt64(largeFileMinSize)
if size >= minSize {
trySend(largeFileChan, fileEntry{Name: child.Name(), Path: fullPath, Size: size}, 100*time.Millisecond)
}
2025-11-18 18:23:33 +08:00
}
// Update current path occasionally to prevent UI jitter.
if currentPath != nil && localFilesScanned%int64(batchUpdateSize) == 0 {
currentPath.Store(fullPath)
2025-11-18 18:23:33 +08:00
}
}
wg.Wait()
if localFilesScanned > 0 {
atomic.AddInt64(filesScanned, localFilesScanned)
}
if localBytesScanned > 0 {
atomic.AddInt64(bytesScanned, localBytesScanned)
}
if localDirsScanned > 0 {
atomic.AddInt64(dirsScanned, localDirsScanned)
}
2025-11-18 18:23:33 +08:00
return total
}
// measureOverviewSize calculates the size of a directory using multiple strategies.
// When scanning Home, it excludes ~/Library to avoid duplicate counting.
2025-11-18 18:23:33 +08:00
func measureOverviewSize(path string) (int64, error) {
if path == "" {
return 0, fmt.Errorf("empty path")
}
path = filepath.Clean(path)
if !filepath.IsAbs(path) {
return 0, fmt.Errorf("path must be absolute: %s", path)
}
if _, err := os.Stat(path); err != nil {
return 0, fmt.Errorf("cannot access path: %v", err)
}
// Determine if we should exclude ~/Library (when scanning Home)
home := os.Getenv("HOME")
excludePath := ""
if home != "" && path == home {
excludePath = filepath.Join(home, "Library")
}
if duSize, err := getDirectorySizeFromDuWithExclude(path, excludePath); err == nil && duSize > 0 {
2025-11-18 18:23:33 +08:00
_ = storeOverviewSize(path, duSize)
return duSize, nil
}
if logicalSize, err := getDirectoryLogicalSizeWithExclude(path, excludePath); err == nil && logicalSize > 0 {
2025-11-18 18:23:33 +08:00
_ = storeOverviewSize(path, logicalSize)
return logicalSize, nil
}
if cached, err := loadCacheFromDisk(path); err == nil {
_ = storeOverviewSize(path, cached.TotalSize)
return cached.TotalSize, nil
}
return 0, fmt.Errorf("unable to measure directory size with fast methods")
}
func getDirectorySizeFromDu(path string) (int64, error) {
return getDirectorySizeFromDuWithExclude(path, "")
}
func getDirectorySizeFromDuWithExclude(path string, excludePath string) (int64, error) {
runDuSize := func(target string) (int64, error) {
if _, err := os.Stat(target); err != nil {
return 0, err
}
2025-11-18 18:23:33 +08:00
ctx, cancel := context.WithTimeout(context.Background(), duTimeout)
defer cancel()
cmd := exec.CommandContext(ctx, "du", "-skP", target)
var stdout, stderr bytes.Buffer
cmd.Stdout = &stdout
cmd.Stderr = &stderr
2025-11-18 18:23:33 +08:00
if err := cmd.Run(); err != nil {
if ctx.Err() == context.DeadlineExceeded {
return 0, fmt.Errorf("du timeout after %v", duTimeout)
}
if stderr.Len() > 0 {
return 0, fmt.Errorf("du failed: %v, %s", err, stderr.String())
}
return 0, fmt.Errorf("du failed: %v", err)
2025-11-18 18:23:33 +08:00
}
fields := strings.Fields(stdout.String())
if len(fields) == 0 {
return 0, fmt.Errorf("du output empty")
2025-11-18 18:23:33 +08:00
}
kb, err := strconv.ParseInt(fields[0], 10, 64)
if err != nil {
return 0, fmt.Errorf("failed to parse du output: %v", err)
}
if kb <= 0 {
return 0, fmt.Errorf("du size invalid: %d", kb)
}
return kb * 1024, nil
2025-11-18 18:23:33 +08:00
}
// When excluding a path (e.g., ~/Library), subtract only that exact directory instead of ignoring every "Library"
if excludePath != "" {
totalSize, err := runDuSize(path)
if err != nil {
return 0, err
}
excludeSize, err := runDuSize(excludePath)
if err != nil {
if !os.IsNotExist(err) {
return 0, err
}
excludeSize = 0
}
if excludeSize > totalSize {
excludeSize = 0
}
return totalSize - excludeSize, nil
2025-11-18 18:23:33 +08:00
}
return runDuSize(path)
}
func getDirectoryLogicalSizeWithExclude(path string, excludePath string) (int64, error) {
2025-11-18 18:23:33 +08:00
var total int64
err := filepath.WalkDir(path, func(p string, d fs.DirEntry, err error) error {
if err != nil {
if os.IsPermission(err) {
return filepath.SkipDir
}
return nil
}
// Skip excluded path
if excludePath != "" && p == excludePath {
return filepath.SkipDir
}
2025-11-18 18:23:33 +08:00
if d.IsDir() {
return nil
}
info, err := d.Info()
if err != nil {
return nil
}
total += getActualFileSize(p, info)
return nil
})
if err != nil && err != filepath.SkipDir {
return 0, err
}
return total, nil
}
func getActualFileSize(_ string, info fs.FileInfo) int64 {
stat, ok := info.Sys().(*syscall.Stat_t)
if !ok {
return info.Size()
}
actualSize := stat.Blocks * 512
if actualSize < info.Size() {
return actualSize
}
return info.Size()
}
func getLastAccessTimeFromInfo(info fs.FileInfo) time.Time {
stat, ok := info.Sys().(*syscall.Stat_t)
if !ok {
return time.Time{}
}
return time.Unix(stat.Atimespec.Sec, stat.Atimespec.Nsec)
}