From bbfee14e1c2537e6eaf7bc9a47ab5e2c3d126c17 Mon Sep 17 00:00:00 2001 From: Jack Jackson Date: Mon, 3 Mar 2025 16:29:56 -0800 Subject: [PATCH] Refactor (in support of testing) --- TODO.txt | 8 +- src/git.ts | 151 ++++++++++++++++++++++++++++++++++++ src/main.ts | 211 ++++++++------------------------------------------- src/types.ts | 4 +- 4 files changed, 191 insertions(+), 183 deletions(-) create mode 100644 src/git.ts diff --git a/TODO.txt b/TODO.txt index 0821a6f..bd22df8 100644 --- a/TODO.txt +++ b/TODO.txt @@ -10,4 +10,10 @@ - [ ] Blog about this ;) - [ ] Tests! - [ ] ... - - [ ] Profit? \ No newline at end of file + - [ ] Profit? +- [ ] Migrate into Gitea's own [Issue Tracker](https://gitea.scubbo.org/scubbo/commit-report-sync/issues) +- [ ] Use a more fully-featured logging system than `console.log` (at least with different logging levels!) + +# Done + +- [X] Remove `parentHashes`, never ended up being needed diff --git a/src/git.ts b/src/git.ts new file mode 100644 index 0000000..3885301 --- /dev/null +++ b/src/git.ts @@ -0,0 +1,151 @@ +// Abstract-away Git interactions, so they can be mocked in tests + +import { execSync } from "child_process"; +import { Commit, ExecSyncError, RepoId } from "./types"; +import { mkdirSync } from "fs"; +import { format } from 'date-fns'; + +export function gitClone(dir: string, url: string) { + execSync(`git clone ${url} ${dir}`, { cwd: dir }); +} + +export function getNLatestCommits(dir: string, n: number): Commit[] { + const logOutput = execSync( + // If you want to copy this formatting for debugging, it's: + // + // --pretty=format:'{"hash":"%h","author_name":"%an","author_email":"%ae","date":"%ai","message":"%s"}' + // + // TODO - return to this and figure out if these are _actually_ "useless escapes" or not - got a couple layers + // of string-parsing to consider here, I wouldn't want to bet without testing! + //eslint-disable-next-line no-useless-escape + `git log --max-count=${n} --pretty=format:'{\"hash\":\"%h\",\"author_name\":\"%an\",\"author_email\":\"%ae\",\"date\":\"%ai\",\"message\":\"%s\"}'`, + { cwd: dir } + ); + const logLines = logOutput.toString().split('\n') + return logLines.map(commitLine => { + // https://gist.github.com/textarcana/1306223 + const parsed = JSON.parse(commitLine) + return { + hash: parsed.hash, + author_name: parsed.author_name, + author_email: parsed.author_email, + repo_path: dir, + date: parsed.date, + message: parsed.message + } + }); +} + +export function getCommitsSinceLatestBeforeGivenDate(dir: string, date: Date): Commit[] { + try { + const countingLogOutput = execSync( + `git log --since=${date.toISOString()} --pretty=oneline`, + { cwd: dir } + ); + const countedNumber = countingLogOutput.toString().split('\n').length; + console.log(`DEBUG - countedNumber (how many commits in target repo since oldest source commit) is: ${countedNumber}`); + // TODO - return to this and figure out if these are _actually_ "useless escapes" or not - got a couple layers + // of string-parsing to consider here, I wouldn't want to bet without testing! + const logOutput = execSync( + //eslint-disable-next-line no-useless-escape + `git log --max-count=${countedNumber+1} --pretty=format:'{\"hash\":\"%h\",\"author_name\":\"%an\",\"author_email\":\"%ae\",\"date\":\"%ai\",\"message\":\"%s\"}'`, + { cwd: dir } + ); + const logLines = logOutput.toString().split('\n'); + return logLines.map(commitLine => { + const parsed = JSON.parse(commitLine) + return { + hash: parsed.hash, + author_name: parsed.author_name, + author_email: parsed.author_email, + repo_path: dir, + date: parsed.date, + message: parsed.message + } + }); + } catch (e) { + const error = e as ExecSyncError + // No commits in the target repo - return an empty array, which will result in the first representative commit + // being made as the first commit. And then we can iterate as normal (recalling that the target history is + // refreshed _from local repo_ - incurring no network charges) from there on. + const errorOutputAsString = '' + error.output[2] + if (!errorOutputAsString.includes('does not have any commits yet')) { + console.log(`Unexpected error: ${errorOutputAsString}`); + throw Error(`Unexpected error while building target commit history`, { + cause: error + }) + } + // Fresh target repo - just write into it (by returning an empty array of target commits as target history) + // (i.e. doing nothing) + return []; + } +} + +export function insertRepresentativeCommit(dir: string,sourceRepo: RepoId, sourceCommit: Commit, targetCommit: Commit | undefined, followOnTargetCommit: Commit | undefined): void { + // If there is a target commit, + if (targetCommit != undefined) { + execSync(`git reset --hard ${targetCommit.hash}`, { + cwd: dir + }) + } + + createRepresentativeCommit(dir,sourceRepo, sourceCommit); + // Then, if there is a follow-on target commit, we need to cherry-pick it onto the source commit: + if (followOnTargetCommit != undefined) { + execSync(`git cherry-pick ${followOnTargetCommit.hash}`, { + cwd: dir + }) + }// else - nothing to cherry-pick back on top +} + +export function gitPush(dir: string, tokenForTargetRepo: string, targetRepoId: RepoId) { + // Note that it must be a `-f`, because we are literally rewriting history. + execSync(`git push -f https://unused-username:${tokenForTargetRepo}@${targetRepoId.domain}/${targetRepoId.owner}/${targetRepoId.name}`, { + cwd: dir + }) + // TODO - it'd be nice - before this `git push` is probably best - to add a `README.md` comment acknowledging + // the sync +} + +function createRepresentativeCommit(dir: string, sourceRepo: RepoId,sourceCommit: Commit) { + // Create a commit that represents the source commit, but with a filename that is generated from the source commit's + // metadata. + // + // This is guaranteed to not cause conflicts with other commits, because the filename is generated from the source + // commit's metadata, and no two source commits will have the same metadata. + // (OK sure _technically_ these could have a collision, but...like...what are the odds?) + // TODO - figure out what the odds actually are, that'd be fun :P + const filename = `${sourceRepo.owner}/${sourceRepo.name}/${sourceCommit.hash}` + mkdirSync(dir + '/' + sourceRepo.owner + '/' + sourceRepo.name, { recursive: true }); + execSync(`touch ${filename}`, { + cwd: dir + }) + execSync(`git add ${filename}`, { + cwd: dir + }) + + // Seems like setting `--author` on `git commit` is not sufficient - still need to set `user` as well (I guess those + // are the difference between `comitted by` and `written by`?) + // Confirmed by following the instructions [here](https://docs.github.com/en/account-and-profile/setting-up-and-managing-your-github-profile/managing-contribution-settings-on-your-profile/why-are-my-contributions-not-showing-up-on-my-profile#your-local-git-commit-email-isnt-connected-to-your-account) + // to check the "made by" address, and confirming that it did not match the email set in the `--author` flag. + // Note that, contrary to advice given by the CLI, this does not use the global config, but the local one - because, + // otherwise, if this was run locally, it would mess up the host system's config. + execSync(`git config user.email "${sourceCommit.author_email}"`, { cwd: dir }); + execSync(`git config user.name "${sourceCommit.author_name}"`, { cwd: dir }); + + try { + // Do _not_ arbitrarily remove the `hash` - it's used for signalling identity in `main()` + const args = `"${sourceRepo.owner}/${sourceRepo.name}: ${sourceCommit.message} - ${sourceCommit.hash}" --date="${format(sourceCommit.date, 'yyyy-MM-dd HH:mm:ss')}" --author="${sourceCommit.author_name} <${sourceCommit.author_email}>"`; + console.log(`About to commit with args ${args}`); + // https://github.com/Shpota/github-activity-generator/blob/main/contribute.py#L63 + // "%Y-%m-%d %H:%M:%S" + execSync(`git commit -m ${args}`, { + cwd: dir + }) + } catch (e) { + console.log(e); + const error = e as ExecSyncError; + console.log(`DEBUG - error while creating representative commit: ${'' + error.output[2]} ... ${'' + error.output[1]}`); + throw e; + } +} diff --git a/src/main.ts b/src/main.ts index 05eb204..51f11a7 100644 --- a/src/main.ts +++ b/src/main.ts @@ -1,37 +1,14 @@ -import { Repo, repoString, Commit, ExecSyncError } from './types'; +import { RepoId, repoString, Commit } from './types'; import { execSync } from 'child_process'; import { existsSync, mkdirSync } from 'fs'; - -import { format } from 'date-fns'; +import { getCommitsSinceLatestBeforeGivenDate, getNLatestCommits, gitClone, gitPush, insertRepresentativeCommit } from './git'; const WORKING_DIR = './working'; const SOURCE_DIR = WORKING_DIR + '/source'; const TARGET_DIR = WORKING_DIR + '/target'; -export async function main(sourceRepo: Repo, targetRepo: Repo, dryRun: boolean, tokenForTargetRepo: string) { - // It _shouldn't_ ever exist, but it if did, something weird is going on. - if (existsSync(WORKING_DIR) || existsSync(SOURCE_DIR) || existsSync(TARGET_DIR)) { - throw new Error('Working directory already exists/populated'); - } - - if (tokenForTargetRepo == '') { - throw new Error('token_for_target_repo is required'); - } - - mkdirSync(WORKING_DIR); - mkdirSync(SOURCE_DIR); - mkdirSync(TARGET_DIR); - - console.log(`DEBUG - sourceRepoPath: ${repoString(sourceRepo)}`) - - console.log(`DEBUG - targetRepoPath: ${repoString(targetRepo)}`) - - // TODO - allow parameterizing how far back in history to checkout (because it might take a long time for older - // repos and, once synced initially, it won't have to go back further than a single one in most cases) - const sourceRepoCloneCommand = `git clone https://${repoString(sourceRepo)} ${SOURCE_DIR}` - console.log(`DEBUG - sourceRepoCloneCommand: ${sourceRepoCloneCommand}`); - execSync(sourceRepoCloneCommand); - execSync(`git clone https://${repoString(targetRepo)} ${TARGET_DIR}`); +export async function main(sourceRepoId: RepoId, targetRepoId: RepoId, dryRun: boolean, tokenForTargetRepo: string) { + setPreconditions(tokenForTargetRepo, sourceRepoId, targetRepoId); // Logic: // * Go back as far in source commit history as the given number of commits @@ -53,7 +30,7 @@ export async function main(sourceRepo: Repo, targetRepo: Repo, dryRun: boolean, // only been alive so many years - there's a hard limit on the rate of code I could possibly have generated, which // is small compared to, y'know, _companies_. And I don't see organizations of that size caring about GitHub // contribution history at whole-org scale - and if they do, it'd be proportionally simple for them to implement it. - const sourceCommitHistory = buildSourceCommitHistory(SOURCE_DIR, 10); + const sourceCommitHistory = getNLatestCommits(SOURCE_DIR, 10); // Calling `doSomethingTo(sourceCommitHistory.reverse()); doSomethingElseTo(sourceCommitHistory.reverse());` results // in the second invocation receiving the double-reversed array. @@ -66,12 +43,13 @@ export async function main(sourceRepo: Repo, targetRepo: Repo, dryRun: boolean, // than abandoning the target tree after the insertion point and trusting in later operation to rebuild it - because // the target repo's tree will have representations of commits from _other_ (source)repos too, which we cannot // recreate without their context) - let targetCommitHistory = buildTargetCommitHistory(TARGET_DIR, reversedSourceCommitHistory[0].date); + let targetCommitHistory = getCommitsSinceLatestBeforeGivenDate(TARGET_DIR, reversedSourceCommitHistory[0].date); for (const sourceCommit of reversedSourceCommitHistory) { // "(Index of) First Target Commit that is earlier than (or equal to) the source commit" const targetCommitIndex = targetCommitHistory.findIndex(c => c.date <= sourceCommit.date); console.log(`DEBUG - targetCommitIndex: ${targetCommitIndex}. targetCommitHistory: ${JSON.stringify(targetCommitHistory)}`); + // TODO - refactor this to use guard clauses more than nested-ifs if (targetCommitIndex != -1) { const targetCommit = targetCommitHistory[targetCommitIndex]; // If the target commit is a representation of the source commit, we can skip it @@ -90,7 +68,7 @@ export async function main(sourceRepo: Repo, targetRepo: Repo, dryRun: boolean, } else { followOnTargetCommit = targetCommitHistory[targetCommitIndex - 1]; } - insertRepresentativeCommit(sourceRepo, sourceCommit, targetCommit, followOnTargetCommit); + insertRepresentativeCommit(TARGET_DIR,sourceRepoId, sourceCommit, targetCommit, followOnTargetCommit); // And then regenerate the target commit history // Thankfully, we only need to do this back to immediately preceding the _just processed_ source // commit (since we know that all the rest of the source commits to be processed will be after it), @@ -99,171 +77,44 @@ export async function main(sourceRepo: Repo, targetRepo: Repo, dryRun: boolean, // compared), albeit approximately-halved - but I'm gambling on the fact that that should still take // negligible practical time at usual repo sizes - at least, the ones I'm // this quadratic portion should be negligible, though - and, even if it isn't, it definitely will - targetCommitHistory = buildTargetCommitHistory(TARGET_DIR, sourceCommit.date); + targetCommitHistory = getCommitsSinceLatestBeforeGivenDate(TARGET_DIR, sourceCommit.date); } } console.log(`DEBUG - targetCommit: ${targetCommit.hash}`); } else { console.log(`DEBUG - could not find a targetCommit that is earlier than or equal to the sourceCommit ${sourceCommit.hash} - therefore, writing the source commit's representation onto the current HEAD of target repo`); - insertRepresentativeCommit(sourceRepo, sourceCommit, undefined, undefined); + insertRepresentativeCommit(TARGET_DIR, sourceRepoId, sourceCommit, undefined, undefined); // As above, have to regenerate history after mutation - targetCommitHistory = buildTargetCommitHistory(TARGET_DIR, sourceCommit.date); + targetCommitHistory = getCommitsSinceLatestBeforeGivenDate(TARGET_DIR, sourceCommit.date); } } // OK, that's it - we've processed all the source commits, and we've inserted all the necessary target commits. // We can just `git push` to the target repo now. - // - // Note that it must be a `-f`, because we are literally rewriting history. if (!dryRun) { - execSync(`git push -f https://unused-username:${tokenForTargetRepo}@${targetRepo.domain}/${targetRepo.owner}/${targetRepo.name}`, { - cwd: TARGET_DIR - }) - // TODO - it'd be nice - before this `git push` is probably best - to add a `README.md` comment acknowledging - // the sync + gitPush(TARGET_DIR, tokenForTargetRepo, targetRepoId); } return - } -export function buildSourceCommitHistory(path: string, numCommits: number): Commit[] { - console.log(`DEBUG - building source commit history for ${path} with max count ${numCommits}`); - const output: Commit[] = []; - - const logOutput = execSync( - // If you want to copy this formatting for debugging, it's: - // - // --pretty=format:'{"hash":"%h","author_name":"%an","author_email":"%ae","date":"%ai","message":"%s"}' - // - // TODO - return to this and figure out if these are _actually_ "useless escapes" or not - got a couple layers - // of string-parsing to consider here, I wouldn't want to bet without testing! - //eslint-disable-next-line no-useless-escape - `git log --max-count=${numCommits} --pretty=format:'{\"hash\":\"%h\",\"author_name\":\"%an\",\"author_email\":\"%ae\",\"date\":\"%ai\",\"message\":\"%s\"}'`, - { cwd: path } - ); - const logLines = logOutput.toString().split('\n'); - for (const line of logLines) { - const commit = parseCommit(path, line); - output.push(commit); +function setPreconditions(tokenForTargetRepo: string, sourceRepoId: RepoId, targetRepoId: RepoId) { + // It _shouldn't_ ever exist, but it if did, something weird is going on. + if (existsSync(WORKING_DIR) || existsSync(SOURCE_DIR) || existsSync(TARGET_DIR)) { + throw new Error('Working directory already exists/populated'); } - return output; + if (tokenForTargetRepo == '') { + throw new Error('token_for_target_repo is required'); + } + + mkdirSync(WORKING_DIR); + mkdirSync(SOURCE_DIR); + mkdirSync(TARGET_DIR); + + console.log(`DEBUG - sourceRepoPath: ${repoString(sourceRepoId)}`) + console.log(`DEBUG - targetRepoPath: ${repoString(targetRepoId)}`) + + // TODO - allow parameterizing how far back in history to checkout (because it might take a long time for older + // repos and, once synced initially, it won't have to go back further than a single one in most cases) + gitClone(SOURCE_DIR, `https://${repoString(sourceRepoId)}`); + gitClone(TARGET_DIR, `https://${repoString(targetRepoId)}`); } - -export function buildTargetCommitHistory(path: string, oldestDateInSourceCommitHistory: Date): Commit[] { - console.log(`DEBUG - building target commit history for ${path} with oldest date ${oldestDateInSourceCommitHistory.toISOString()}`); - const output: Commit[] = []; - - try { - const countingLogOutput = execSync( - `git log --since=${oldestDateInSourceCommitHistory.toISOString()} --pretty=oneline`, - { cwd: path } - ); - const countedNumber = countingLogOutput.toString().split('\n').length; - console.log(`DEBUG - countedNumber (how many commits in target repo since oldest source commit) is: ${countedNumber}`); - // TODO - return to this and figure out if these are _actually_ "useless escapes" or not - got a couple layers - // of string-parsing to consider here, I wouldn't want to bet without testing! - const logOutput = execSync( - //eslint-disable-next-line no-useless-escape - `git log --max-count=${countedNumber+1} --pretty=format:'{\"hash\":\"%h\",\"author_name\":\"%an\",\"author_email\":\"%ae\",\"date\":\"%ai\",\"message\":\"%s\"}'`, - { cwd: path } - ); - const logLines = logOutput.toString().split('\n'); - for (const line of logLines) { - const commit = parseCommit(path, line); - output.push(commit); - } - } catch (e) { - const error = e as ExecSyncError - // Now you can safely access properties - // No commits in the target repo - return an empty array, which will result in the first representative commit - // being made as the first commit. And then we can iterate as normal (recalling that the target history is - // refreshed _from local repo_ - incurring no network charges) from there on. - const errorOutputAsString = '' + error.output[2] - if (!errorOutputAsString.includes('does not have any commits yet')) { - console.log(`Unexpected error: ${errorOutputAsString}`); - throw Error(`Unexpected error while building target commit history`, { - cause: error - }) - } - // Fresh target repo - just write into it (by returning an empty array of target commits as target history) - // (i.e. doing nothing) - } - console.log(`As final output of buildTargetCommitHistory, preceding ${oldestDateInSourceCommitHistory.toISOString()}, output is ${JSON.stringify(output)}`); - return output; -} - -// https://gist.github.com/textarcana/1306223 -function parseCommit(repo_path: string, line: string): Commit { - console.log(`DEBUG - line: ${line}, for path ${repo_path}`); - const parsed = JSON.parse(line) - return { - hash: parsed['hash'], - author_name: parsed['author_name'], - author_email: parsed['author_email'], - repo_path: repo_path, - date: new Date(parsed['date']), - message: parsed['message'], - } -} - -function insertRepresentativeCommit(sourceRepo: Repo,sourceCommit: Commit, targetCommit: Commit | undefined, followOnTargetCommit: Commit | undefined): void { - // If there is a target commit, - if (targetCommit != undefined) { - execSync(`git reset --hard ${targetCommit.hash}`, { - cwd: TARGET_DIR - }) - } - - createRepresentativeCommit(sourceRepo, sourceCommit); - // Then, if there is a follow-on target commit, we need to cherry-pick it onto the source commit: - if (followOnTargetCommit != undefined) { - execSync(`git cherry-pick ${followOnTargetCommit.hash}`, { - cwd: TARGET_DIR - }) - }// else - nothing to cherry-pick back on top - -} - -function createRepresentativeCommit(sourceRepo: Repo,sourceCommit: Commit) { - // Create a commit that represents the source commit, but with a filename that is generated from the source commit's - // metadata. - // - // This is guaranteed to not cause conflicts with other commits, because the filename is generated from the source - // commit's metadata, and no two source commits will have the same metadata. - // (OK sure _technically_ these could have a collision, but...like...what are the odds?) - // TODO - figure out what the odds actually are, that'd be fun :P - const filename = `${sourceRepo.owner}/${sourceRepo.name}/${sourceCommit.hash}` - mkdirSync(TARGET_DIR + '/' + sourceRepo.owner + '/' + sourceRepo.name, { recursive: true }); - execSync(`touch ${filename}`, { - cwd: TARGET_DIR - }) - execSync(`git add ${filename}`, { - cwd: TARGET_DIR - }) - - // Seems like setting `--author` on `git commit` is not sufficient - still need to set `user` as well (I guess those - // are the difference between `comitted by` and `written by`?) - // Confirmed by following the instructions [here](https://docs.github.com/en/account-and-profile/setting-up-and-managing-your-github-profile/managing-contribution-settings-on-your-profile/why-are-my-contributions-not-showing-up-on-my-profile#your-local-git-commit-email-isnt-connected-to-your-account) - // to check the "made by" address, and confirming that it did not match the email set in the `--author` flag. - // Note that, contrary to advice given by the CLI, this does not use the global config, but the local one - because, - // otherwise, if this was run locally, it would mess up the host system's config. - execSync(`git config user.email "${sourceCommit.author_email}"`, { cwd: TARGET_DIR }); - execSync(`git config user.name "${sourceCommit.author_name}"`, { cwd: TARGET_DIR }); - - try { - // Do _not_ arbitrarily remove the `hash` - it's used for signalling identity in `main()` - const args = `"${sourceRepo.owner}/${sourceRepo.name}: ${sourceCommit.message} - ${sourceCommit.hash}" --date="${format(sourceCommit.date, 'yyyy-MM-dd HH:mm:ss')}" --author="${sourceCommit.author_name} <${sourceCommit.author_email}>"`; - console.log(`About to commit with args ${args}`); - // https://github.com/Shpota/github-activity-generator/blob/main/contribute.py#L63 - // "%Y-%m-%d %H:%M:%S" - execSync(`git commit -m ${args}`, { - cwd: TARGET_DIR - }) - } catch (e) { - console.log(e); - const error = e as ExecSyncError; - console.log(`DEBUG - error while creating representative commit: ${'' + error.output[2]} ... ${'' + error.output[1]}`); - throw e; - } - -} \ No newline at end of file diff --git a/src/types.ts b/src/types.ts index 210222e..ba096d2 100644 --- a/src/types.ts +++ b/src/types.ts @@ -1,11 +1,11 @@ // TODO - parametrize the scheme -export type Repo = { +export type RepoId = { domain: string; owner: string; name: string; } -export function repoString(repo: Repo): string { +export function repoString(repo: RepoId): string { return `${repo.domain}/${repo.owner}/${repo.name}`; }