Skip to content

Commit 2cc3a63

Browse files
authored
Add cron method to gc LFS MetaObjects (#22385)
This PR adds a task to the cron service to allow garbage collection of LFS meta objects. As repositories may have a large number of LFSMetaObjects, an updated column is added to this table and it is used to perform a generational GC to attempt to reduce the amount of work. (There may need to be a bit more work here but this is probably enough for the moment.) Fix #7045 Signed-off-by: Andrew Thornton <art27@cantab.net>
1 parent 04c97aa commit 2cc3a63

File tree

9 files changed

+251
-31
lines changed

9 files changed

+251
-31
lines changed

custom/conf/app.example.ini

+22
Original file line numberDiff line numberDiff line change
@@ -2213,6 +2213,28 @@ ROUTER = console
22132213
;SCHEDULE = @every 168h
22142214
;OLDER_THAN = 8760h
22152215

2216+
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2217+
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2218+
;; Garbage collect LFS pointers in repositories
2219+
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2220+
;[cron.gc_lfs]
2221+
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2222+
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2223+
;ENABLED = false
2224+
;; Garbage collect LFS pointers in repositories (default false)
2225+
;RUN_AT_START = false
2226+
;; Interval as a duration between each gc run (default every 24h)
2227+
;SCHEDULE = @every 24h
2228+
;; Only attempt to garbage collect LFSMetaObjects older than this (default 7 days)
2229+
;OLDER_THAN = 168h
2230+
;; Only attempt to garbage collect LFSMetaObjects that have not been attempted to be garbage collected for this long (default 3 days)
2231+
;LAST_UPDATED_MORE_THAN_AGO = 72h
2232+
;; Minimum number of stale LFSMetaObjects to check per repo. Set to `0` to always check all.
2233+
;NUMBER_TO_CHECK_PER_REPO = 100
2234+
;; Check at least this proportion of LFSMetaObjects per repo. (This may cause all stale LFSMetaObjects to be checked.)
2235+
;PROPORTION_TO_CHECK_PER_REPO = 0.6
2236+
2237+
22162238
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
22172239
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
22182240
;; Git Operation timeout in seconds

docs/content/doc/advanced/config-cheat-sheet.en-us.md

+10
Original file line numberDiff line numberDiff line change
@@ -1039,6 +1039,16 @@ Default templates for project boards:
10391039
- `SCHEDULE`: **@every 168h**: Cron syntax to set how often to check.
10401040
- `OLDER_THAN`: **8760h**: any system notice older than this expression will be deleted from database.
10411041

1042+
#### Cron - Garbage collect LFS pointers in repositories (`cron.gc_lfs`)
1043+
1044+
- `ENABLED`: **false**: Enable service.
1045+
- `RUN_AT_START`: **false**: Run tasks at start up time (if ENABLED).
1046+
- `SCHEDULE`: **@every 24h**: Cron syntax to set how often to check.
1047+
- `OLDER_THAN`: **168h**: Only attempt to garbage collect LFSMetaObjects older than this (default 7 days)
1048+
- `LAST_UPDATED_MORE_THAN_AGO`: **72h**: Only attempt to garbage collect LFSMetaObjects that have not been attempted to be garbage collected for this long (default 3 days)
1049+
- `NUMBER_TO_CHECK_PER_REPO`: **100**: Minimum number of stale LFSMetaObjects to check per repo. Set to `0` to always check all.
1050+
- `PROPORTION_TO_CHECK_PER_REPO`: **0.6**: Check at least this proportion of LFSMetaObjects per repo. (This may cause all stale LFSMetaObjects to be checked.)
1051+
10421052
## Git (`git`)
10431053

10441054
- `PATH`: **""**: The path of Git executable. If empty, Gitea searches through the PATH environment.

models/git/lfs.go

+66-3
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,7 @@ type LFSMetaObject struct {
115115
RepositoryID int64 `xorm:"UNIQUE(s) INDEX NOT NULL"`
116116
Existing bool `xorm:"-"`
117117
CreatedUnix timeutil.TimeStamp `xorm:"created"`
118+
UpdatedUnix timeutil.TimeStamp `xorm:"INDEX updated"`
118119
}
119120

120121
func init() {
@@ -334,8 +335,45 @@ func GetRepoLFSSize(ctx context.Context, repoID int64) (int64, error) {
334335
return lfsSize, nil
335336
}
336337

338+
// IterateRepositoryIDsWithLFSMetaObjects iterates across the repositories that have LFSMetaObjects
339+
func IterateRepositoryIDsWithLFSMetaObjects(ctx context.Context, f func(ctx context.Context, repoID, count int64) error) error {
340+
batchSize := setting.Database.IterateBufferSize
341+
sess := db.GetEngine(ctx)
342+
id := int64(0)
343+
type RepositoryCount struct {
344+
RepositoryID int64
345+
Count int64
346+
}
347+
for {
348+
counts := make([]*RepositoryCount, 0, batchSize)
349+
sess.Select("repository_id, COUNT(id) AS count").
350+
Table("lfs_meta_object").
351+
Where("repository_id > ?", id).
352+
GroupBy("repository_id").
353+
OrderBy("repository_id ASC")
354+
355+
if err := sess.Limit(batchSize, 0).Find(&counts); err != nil {
356+
return err
357+
}
358+
if len(counts) == 0 {
359+
return nil
360+
}
361+
362+
for _, count := range counts {
363+
if err := f(ctx, count.RepositoryID, count.Count); err != nil {
364+
return err
365+
}
366+
}
367+
id = counts[len(counts)-1].RepositoryID
368+
}
369+
}
370+
371+
// IterateLFSMetaObjectsForRepoOptions provides options for IterateLFSMetaObjectsForRepo
type IterateLFSMetaObjectsForRepoOptions struct {
	// OlderThan, when non-zero, restricts iteration to LFSMetaObjects whose
	// created_unix is before this time.
	OlderThan time.Time
	// UpdatedLessRecentlyThan, when non-zero, restricts iteration to
	// LFSMetaObjects whose updated_unix is before this time.
	UpdatedLessRecentlyThan time.Time
	// OrderByUpdated orders results by updated_unix ascending. When false,
	// results are ordered by id ascending and each batch is additionally
	// filtered to ids greater than the last id of the previous batch.
	OrderByUpdated bool
	// LoopFunctionAlwaysUpdates, when true, stops the batch offset from being
	// advanced between batches.
	// NOTE(review): presumably meant for callbacks that touch updated_unix on
	// every object so that processed rows fall out of the
	// UpdatedLessRecentlyThan filter on their own - confirm against callers.
	LoopFunctionAlwaysUpdates bool
}
340378

341379
// IterateLFSMetaObjectsForRepo provides an iterator for LFSMetaObjects per Repo
@@ -348,28 +386,53 @@ func IterateLFSMetaObjectsForRepo(ctx context.Context, repoID int64, f func(cont
348386
LFSMetaObject
349387
}
350388

389+
id := int64(0)
390+
351391
for {
352392
beans := make([]*CountLFSMetaObject, 0, batchSize)
353-
// SELECT `lfs_meta_object`.*, COUNT(`l1`.id) as `count` FROM lfs_meta_object INNER JOIN lfs_meta_object AS l1 ON l1.oid = lfs_meta_object.oid WHERE lfs_meta_object.repository_id = ? GROUP BY lfs_meta_object.id
354393
sess := engine.Select("`lfs_meta_object`.*, COUNT(`l1`.oid) AS `count`").
355394
Join("INNER", "`lfs_meta_object` AS l1", "`lfs_meta_object`.oid = `l1`.oid").
356395
Where("`lfs_meta_object`.repository_id = ?", repoID)
357396
if !opts.OlderThan.IsZero() {
358397
sess.And("`lfs_meta_object`.created_unix < ?", opts.OlderThan)
359398
}
399+
if !opts.UpdatedLessRecentlyThan.IsZero() {
400+
sess.And("`lfs_meta_object`.updated_unix < ?", opts.UpdatedLessRecentlyThan)
401+
}
360402
sess.GroupBy("`lfs_meta_object`.id")
403+
if opts.OrderByUpdated {
404+
sess.OrderBy("`lfs_meta_object`.updated_unix ASC")
405+
} else {
406+
sess.And("`lfs_meta_object`.id > ?", id)
407+
sess.OrderBy("`lfs_meta_object`.id ASC")
408+
}
361409
if err := sess.Limit(batchSize, start).Find(&beans); err != nil {
362410
return err
363411
}
364412
if len(beans) == 0 {
365413
return nil
366414
}
367-
start += len(beans)
415+
if !opts.LoopFunctionAlwaysUpdates {
416+
start += len(beans)
417+
}
368418

369419
for _, bean := range beans {
370420
if err := f(ctx, &bean.LFSMetaObject, bean.Count); err != nil {
371421
return err
372422
}
373423
}
424+
id = beans[len(beans)-1].ID
425+
}
426+
}
427+
428+
// MarkLFSMetaObject updates the updated time for the provided LFSMetaObject
429+
func MarkLFSMetaObject(ctx context.Context, id int64) error {
430+
obj := &LFSMetaObject{
431+
UpdatedUnix: timeutil.TimeStampNow(),
432+
}
433+
count, err := db.GetEngine(ctx).ID(id).Update(obj)
434+
if count != 1 {
435+
log.Error("Unexpectedly updated %d LFSMetaObjects with ID: %d", count, id)
374436
}
437+
return err
375438
}

models/migrations/migrations.go

+5
Original file line numberDiff line numberDiff line change
@@ -432,6 +432,9 @@ var migrations = []Migration{
432432
NewMigration("Update counts of all open milestones", v1_18.UpdateOpenMilestoneCounts),
433433
// v230 -> v231
434434
NewMigration("Add ConfidentialClient column (default true) to OAuth2Application table", v1_18.AddConfidentialClientColumnToOAuth2ApplicationTable),
435+
436+
// Gitea 1.18.0 ends at v231
437+
435438
// v231 -> v232
436439
NewMigration("Add index for hook_task", v1_19.AddIndexForHookTask),
437440
// v232 -> v233
@@ -446,6 +449,8 @@ var migrations = []Migration{
446449
NewMigration("Create secrets table", v1_19.CreateSecretsTable),
447450
// v237 -> v238
448451
NewMigration("Drop ForeignReference table", v1_19.DropForeignReferenceTable),
452+
// v238 -> v239
453+
NewMigration("Add updated unix to LFSMetaObject", v1_19.AddUpdatedUnixToLFSMetaObject),
449454
}
450455

451456
// GetCurrentDBVersion returns the current db version

models/migrations/v1_19/v238.go

+27
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
// Copyright 2022 The Gitea Authors. All rights reserved.
2+
// SPDX-License-Identifier: MIT
3+
4+
package v1_19 //nolint
5+
6+
import (
7+
"code.gitea.io/gitea/modules/timeutil"
8+
9+
"xorm.io/xorm"
10+
)
11+
12+
// AddUpdatedUnixToLFSMetaObject adds an updated column to the LFSMetaObject to allow for garbage collection
13+
func AddUpdatedUnixToLFSMetaObject(x *xorm.Engine) error {
14+
// Drop the table introduced in `v211`, it's considered badly designed and doesn't look like to be used.
15+
// See: https://github.com/go-gitea/gitea/issues/21086#issuecomment-1318217453
16+
// LFSMetaObject stores metadata for LFS tracked files.
17+
type LFSMetaObject struct {
18+
ID int64 `xorm:"pk autoincr"`
19+
Oid string `json:"oid" xorm:"UNIQUE(s) INDEX NOT NULL"`
20+
Size int64 `json:"size" xorm:"NOT NULL"`
21+
RepositoryID int64 `xorm:"UNIQUE(s) INDEX NOT NULL"`
22+
CreatedUnix timeutil.TimeStamp `xorm:"created"`
23+
UpdatedUnix timeutil.TimeStamp `xorm:"INDEX updated"`
24+
}
25+
26+
return x.Sync(new(LFSMetaObject))
27+
}

modules/doctor/lfs.go

+15-1
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ package doctor
66
import (
77
"context"
88
"fmt"
9+
"time"
910

1011
"code.gitea.io/gitea/modules/log"
1112
"code.gitea.io/gitea/modules/setting"
@@ -29,7 +30,20 @@ func garbageCollectLFSCheck(ctx context.Context, logger log.Logger, autofix bool
2930
return fmt.Errorf("LFS support is disabled")
3031
}
3132

32-
if err := repository.GarbageCollectLFSMetaObjects(ctx, logger, autofix); err != nil {
33+
if err := repository.GarbageCollectLFSMetaObjects(ctx, repository.GarbageCollectLFSMetaObjectsOptions{
34+
Logger: logger,
35+
AutoFix: autofix,
36+
// Only attempt to garbage collect lfs meta objects older than a week as the order of git lfs upload
37+
// and git object upload is not necessarily guaranteed. It's possible to imagine a situation whereby
38+
// an LFS object is uploaded but the git branch is not uploaded immediately, or there are some rapid
39+
// changes in new branches that might lead to lfs objects becoming temporarily unassociated with git
40+
// objects.
41+
//
42+
// It is likely that a week is potentially excessive but it should definitely be enough that any
43+
// unassociated LFS object is genuinely unassociated.
44+
OlderThan: time.Now().Add(-24 * time.Hour * 7),
45+
// We don't set the UpdatedLessRecentlyThan because we want to do a full GC
46+
}); err != nil {
3347
return err
3448
}
3549

options/locale/locale_en-US.ini

+1
Original file line numberDiff line numberDiff line change
@@ -2554,6 +2554,7 @@ dashboard.delete_old_actions = Delete all old actions from database
25542554
dashboard.delete_old_actions.started = Delete all old actions from database started.
25552555
dashboard.update_checker = Update checker
25562556
dashboard.delete_old_system_notices = Delete all old system notices from database
2557+
dashboard.gc_lfs = Garbage collect LFS meta objects
25572558
25582559
users.user_manage_panel = User Account Management
25592560
users.new_account = Create User Account

services/cron/tasks_extended.go

+43
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,48 @@ func registerDeleteOldSystemNotices() {
175175
})
176176
}
177177

178+
func registerGCLFS() {
179+
if !setting.LFS.StartServer {
180+
return
181+
}
182+
type GCLFSConfig struct {
183+
OlderThanConfig
184+
LastUpdatedMoreThanAgo time.Duration
185+
NumberToCheckPerRepo int64
186+
ProportionToCheckPerRepo float64
187+
}
188+
189+
RegisterTaskFatal("gc_lfs", &GCLFSConfig{
190+
OlderThanConfig: OlderThanConfig{
191+
BaseConfig: BaseConfig{
192+
Enabled: false,
193+
RunAtStart: false,
194+
Schedule: "@every 24h",
195+
},
196+
// Only attempt to garbage collect lfs meta objects older than a week as the order of git lfs upload
197+
// and git object upload is not necessarily guaranteed. It's possible to imagine a situation whereby
198+
// an LFS object is uploaded but the git branch is not uploaded immediately, or there are some rapid
199+
// changes in new branches that might lead to lfs objects becoming temporarily unassociated with git
200+
// objects.
201+
//
202+
// It is likely that a week is potentially excessive but it should definitely be enough that any
203+
// unassociated LFS object is genuinely unassociated.
204+
OlderThan: 24 * time.Hour * 7,
205+
},
206+
// Only GC things that haven't been looked at in the past 3 days
207+
LastUpdatedMoreThanAgo: 24 * time.Hour * 3,
208+
NumberToCheckPerRepo: 100,
209+
ProportionToCheckPerRepo: 0.6,
210+
}, func(ctx context.Context, _ *user_model.User, config Config) error {
211+
gcLFSConfig := config.(*GCLFSConfig)
212+
return repo_service.GarbageCollectLFSMetaObjects(ctx, repo_service.GarbageCollectLFSMetaObjectsOptions{
213+
AutoFix: true,
214+
OlderThan: time.Now().Add(-gcLFSConfig.OlderThan),
215+
UpdatedLessRecentlyThan: time.Now().Add(-gcLFSConfig.LastUpdatedMoreThanAgo),
216+
})
217+
})
218+
}
219+
178220
func initExtendedTasks() {
179221
registerDeleteInactiveUsers()
180222
registerDeleteRepositoryArchives()
@@ -188,4 +230,5 @@ func initExtendedTasks() {
188230
registerDeleteOldActions()
189231
registerUpdateGiteaChecker()
190232
registerDeleteOldSystemNotices()
233+
registerGCLFS()
191234
}

0 commit comments

Comments
 (0)