import { Repo, repoString, Commit } from './types';
import { execFileSync, execSync } from 'child_process';
import { existsSync, mkdirSync } from 'fs';
import { format } from 'date-fns';

const WORKING_DIR = './working';
const SOURCE_DIR = WORKING_DIR + '/source';
const TARGET_DIR = WORKING_DIR + '/target';

// `git log` fields are separated by the ASCII unit separator (0x1f) rather than being wrapped in hand-built JSON:
// commit subjects and author names routinely contain `"` or `\` (e.g. `Revert "foo"`), which would make a JSON
// template unparseable.
const FIELD_SEPARATOR = '\x1f';
// hash, author name, author email, author date, subject, parent hashes
const LOG_FORMAT = ['%h', '%an', '%ae', '%ai', '%s', '%p'].join('%x1f');
const LOG_FIELD_COUNT = 6;

/**
 * Mirrors the most recent commits of `sourceRepo` into `targetRepo` as "representative" commits: one new,
 * uniquely-named empty file per source commit, committed with the source commit's author and author date, and
 * inserted into the target history in date order.
 *
 * Algorithm, for each source commit (oldest first):
 *  - Find the newest target commit whose date is earlier than or equal to the source commit's date.
 *  - If that target commit's message contains the source commit's hash, it is already recorded - skip.
 *  - Otherwise reset the target branch to that commit, create the representative commit, and replay every target
 *    commit that used to follow it. Replaying cannot conflict because each representative commit touches its own
 *    file.
 *  - Re-read the target history, because the replay re-hashes everything after the insertion point. (We have to
 *    insert rather than rebuild the tail, because the target may also hold representations of _other_ source repos
 *    which cannot be recreated from here.)
 *
 * This is O(n_source * n_target) - deliberately naive until there is evidence that it matters (YAGNI).
 *
 * @param sourceRepo repo whose history is read
 * @param targetRepo repo that receives the representative commits
 * @param dryRun when true, everything happens locally but nothing is pushed
 * @param tokenForTargetRepo token used to authenticate the final push
 * @param maxSourceCommits how many source commits (counting back from HEAD) to consider; defaults to 10
 * @throws Error if the working directory already exists, if the token is empty, or if a target commit has exactly
 *         the same date as a source commit without being its representation
 */
export async function main(
  sourceRepo: Repo,
  targetRepo: Repo,
  dryRun: boolean,
  tokenForTargetRepo: string,
  maxSourceCommits: number = 10,
): Promise<void> {
  // It _shouldn't_ ever exist, but if it did, something weird is going on.
  if (existsSync(WORKING_DIR) || existsSync(SOURCE_DIR) || existsSync(TARGET_DIR)) {
    throw new Error('Working directory already exists/populated');
  }
  if (tokenForTargetRepo === '') {
    throw new Error('token_for_target_repo is required');
  }

  mkdirSync(WORKING_DIR);
  mkdirSync(SOURCE_DIR);
  mkdirSync(TARGET_DIR);

  console.log(`DEBUG - sourceRepoPath: ${repoString(sourceRepo)}`);
  console.log(`DEBUG - targetRepoPath: ${repoString(targetRepo)}`);

  // TODO - allow limiting how far back in history the _clone_ goes (a full clone may take a long time for older
  // repos and, once synced initially, a sync rarely has to go back further than a commit or two).
  const sourceRepoCloneCommand = `git clone https://${repoString(sourceRepo)} ${SOURCE_DIR}`;
  console.log(`DEBUG - sourceRepoCloneCommand: ${sourceRepoCloneCommand}`);
  execSync(sourceRepoCloneCommand);
  execSync(`git clone https://${repoString(targetRepo)} ${TARGET_DIR}`);

  // Setting `--author` on `git commit` is not sufficient - git also needs a committer identity (`user.*`).
  // The identity is written to the target clone's own config (not `--global`): every commit and cherry-pick runs
  // inside TARGET_DIR, so this is enough, and it avoids clobbering the invoking user's global git identity.
  execSync(`git config user.email "commit-report-sync-bot@scubbo.org"`, { cwd: TARGET_DIR });
  execSync(`git config user.name "Commit Report Sync Bot"`, { cwd: TARGET_DIR });

  const sourceCommitHistory = buildSourceCommitHistory(SOURCE_DIR, maxSourceCommits);
  if (sourceCommitHistory.length === 0) {
    console.log('DEBUG - source repo has no commits to sync - nothing to do');
    return;
  }
  // `Array.prototype.reverse` mutates in place, so reverse a copy to keep `sourceCommitHistory` newest-first.
  const oldestFirstSourceCommits = [...sourceCommitHistory].reverse();

  // Only have to go back far enough in target history to find the commit that immediately precedes the oldest
  // commit in source history.
  let targetCommitHistory = buildTargetCommitHistory(TARGET_DIR, oldestFirstSourceCommits[0].date);

  for (const sourceCommit of oldestFirstSourceCommits) {
    // "(Index of) first target commit that is earlier than (or equal to) the source commit".
    // The history is newest-first, so this is the newest such commit.
    const targetCommitIndex = targetCommitHistory.findIndex(c => c.date <= sourceCommit.date);
    console.log(`DEBUG - targetCommitIndex: ${targetCommitIndex}. targetCommitHistory: ${JSON.stringify(targetCommitHistory)}`);

    if (targetCommitIndex === -1) {
      console.log(`DEBUG - could not find a targetCommit that is earlier than or equal to the sourceCommit ${sourceCommit.hash} - therefore, writing the source commit's representation onto the current HEAD of target repo`);
      insertRepresentativeCommit(sourceRepo, sourceCommit, undefined, undefined);
      continue;
    }

    const targetCommit = targetCommitHistory[targetCommitIndex];

    // If the target commit is a representation of the source commit, we can skip it.
    if (targetCommit.message.includes(sourceCommit.hash)) {
      console.log(`DEBUG - found a match for ${sourceCommit.hash} in ${targetCommit.hash} - found a representation of the source commit. No need to insert.`);
      continue;
    }

    // Compare timestamps, not the Date objects themselves: `==` on two Dates compares object identity and is
    // therefore always false.
    if (targetCommit.date.getTime() === sourceCommit.date.getTime()) {
      throw new Error(`Target commit ${targetCommit.hash} has the same date as source commit ${sourceCommit.hash}, but they are not representations of one another. This should never happen. This means that two source repos have commits at the exact same time, which is one of the (anti-)pre-requisites of this tool. If you need this feature, let me know.`);
    }

    console.log(`DEBUG - No match for ${sourceCommit.hash} in ${targetCommit.hash} - inserting.`);
    // Index 0 is the newest target commit - in that case there is nothing to replay on top of the insertion.
    const followOnTargetCommit: Commit | undefined =
      targetCommitIndex === 0 ? undefined : targetCommitHistory[targetCommitIndex - 1];
    insertRepresentativeCommit(sourceRepo, sourceCommit, targetCommit, followOnTargetCommit);

    // Regenerate the target history (hashes after the insertion point have changed). We only need to go back to
    // just before the source commit we have just processed, since every remaining source commit is newer.
    targetCommitHistory = buildTargetCommitHistory(TARGET_DIR, sourceCommit.date);
    console.log(`DEBUG - targetCommit: ${targetCommit.hash}`);
  }

  // All source commits are processed. The push must be forced because target history may have been rewritten.
  if (!dryRun) {
    execSync(
      `git push -f https://unused-username:${tokenForTargetRepo}@${targetRepo.domain}/${targetRepo.owner}/${targetRepo.name}`,
      { cwd: TARGET_DIR },
    );
    // TODO - it'd be nice - before this `git push` is probably best - to add a `README.md` comment acknowledging
    // the sync
  }
}

/**
 * Reads up to `numCommits` commits from the repo checked out at `path`, newest first.
 *
 * @param path working directory of the source clone
 * @param numCommits maximum number of commits to read, counting back from HEAD
 * @returns the parsed commits in `git log` order (newest first); empty if `git log` printed nothing
 */
export function buildSourceCommitHistory(path: string, numCommits: number): Commit[] {
  console.log(`DEBUG - building source commit history for ${path} with max count ${numCommits}`);
  const logOutput = execSync(
    `git log --max-count=${numCommits} --pretty=format:'${LOG_FORMAT}'`,
    { cwd: path },
  );
  return parseLogOutput(path, logOutput.toString());
}

/**
 * Reads the tail of the target repo's history: the commits `git log --since` reports for the given date, plus
 * at least one older commit so the caller can find the commit immediately preceding the insertion point.
 *
 * @param path working directory of the target clone
 * @param oldestDateInSourceCommitHistory date of the oldest source commit still to be processed
 * @returns the parsed commits, newest first; an empty array if the target repo has no commits yet
 * @throws Error (with the original failure as `cause`) on any failure other than "repo has no commits yet"
 */
export function buildTargetCommitHistory(path: string, oldestDateInSourceCommitHistory: Date): Commit[] {
  console.log(`DEBUG - building target commit history for ${path} with oldest date ${oldestDateInSourceCommitHistory.toISOString()}`);
  let output: Commit[] = [];
  try {
    const countingLogOutput = execSync(
      `git log --since=${oldestDateInSourceCommitHistory.toISOString()} --pretty=oneline`,
      { cwd: path },
    );
    // Counts newline-separated chunks (including a possibly-empty trailing one), so this may over-count by one;
    // that only means reading one more commit than strictly necessary.
    const countedNumber = countingLogOutput.toString().split('\n').length;
    console.log(`DEBUG - countedNumber (how many commits in target repo since oldest source commit) is: ${countedNumber}`);

    const logOutput = execSync(
      `git log --max-count=${countedNumber + 1} --pretty=format:'${LOG_FORMAT}'`,
      { cwd: path },
    );
    output = parseLogOutput(path, logOutput.toString());
  } catch (e) {
    const errorOutputAsString = capturedOutput(e, 'stderr');
    if (!errorOutputAsString.includes('does not have any commits yet')) {
      console.log(`Unexpected error: ${errorOutputAsString}`);
      throw new Error(`Unexpected error while building target commit history`, { cause: e });
    }
    // Fresh target repo - return an empty history, which makes the caller write the first representative commit
    // as the very first commit. Later refreshes read the local clone, so they cost no network traffic.
  }
  console.log(`As final output of buildTargetCommitHistory, preceding ${oldestDateInSourceCommitHistory.toISOString()}, output is ${JSON.stringify(output)}`);
  return output;
}

/** Parses every non-empty line of `git log --pretty=format:<LOG_FORMAT>` output into a Commit. */
function parseLogOutput(repo_path: string, logOutput: string): Commit[] {
  return logOutput
    .split('\n')
    .filter(line => line !== '')
    .map(line => parseCommit(repo_path, line));
}

/**
 * Parses one line produced by `git log --pretty=format:<LOG_FORMAT>`.
 *
 * @param repo_path working directory the line was read from (recorded on the returned commit)
 * @param line a single log line whose fields are separated by FIELD_SEPARATOR
 * @throws Error if the line does not contain exactly the expected number of fields
 */
function parseCommit(repo_path: string, line: string): Commit {
  console.log(`DEBUG - line: ${line}, for path ${repo_path}`);
  const fields = line.split(FIELD_SEPARATOR);
  if (fields.length !== LOG_FIELD_COUNT) {
    throw new Error(`Could not parse git log line for ${repo_path}: ${JSON.stringify(line)}`);
  }
  const [hash, author_name, author_email, date, message, parentHashes] = fields;
  return {
    hash: hash,
    author_name: author_name,
    author_email: author_email,
    repo_path: repo_path,
    date: new Date(date),
    message: message,
    parentHashes: parentHashes,
  };
}

/**
 * Returns the captured `stdout`/`stderr` of a failed child_process `*Sync` call as a string, or '' when the thrown
 * value does not carry that stream (i.e. it was not thrown by child_process).
 */
function capturedOutput(e: unknown, stream: 'stdout' | 'stderr'): string {
  if (typeof e !== 'object' || e === null || !(stream in e)) {
    return '';
  }
  const value = (e as Record<string, unknown>)[stream];
  return value == null ? '' : String(value);
}

/**
 * Inserts a representative commit for `sourceCommit` into the target clone.
 *
 * @param sourceRepo repo the source commit belongs to
 * @param sourceCommit commit to represent
 * @param targetCommit target commit to insert directly after; when undefined the representation is simply
 *        committed on top of the current HEAD
 * @param followOnTargetCommit the target commit that used to follow `targetCommit`; when undefined there is
 *        nothing to replay on top of the new commit
 */
function insertRepresentativeCommit(
  sourceRepo: Repo,
  sourceCommit: Commit,
  targetCommit: Commit | undefined,
  followOnTargetCommit: Commit | undefined,
): void {
  let previousHead: string | undefined;
  if (targetCommit != undefined) {
    if (followOnTargetCommit != undefined) {
      // Remember where the branch was: the reset below makes everything after `targetCommit` unreachable from
      // the branch, and _all_ of it (not just the immediate child) has to be replayed afterwards.
      previousHead = execSync(`git rev-parse HEAD`, { cwd: TARGET_DIR }).toString().trim();
    }
    execSync(`git reset --hard ${targetCommit.hash}`, { cwd: TARGET_DIR });
  }

  createRepresentativeCommit(sourceRepo, sourceCommit);

  if (followOnTargetCommit != undefined) {
    // `A..B` replays, oldest first, every commit reachable from B but not from A. This assumes the target history
    // is linear (which it is as long as only this tool writes to it). No conflicts are possible because every
    // representative commit touches a distinct file.
    const commitsToReplay =
      targetCommit != undefined && previousHead != undefined
        ? `${targetCommit.hash}..${previousHead}`
        : followOnTargetCommit.hash;
    execSync(`git cherry-pick ${commitsToReplay}`, { cwd: TARGET_DIR });
  } // else - nothing to cherry-pick back on top
}

/**
 * Creates, on the current HEAD of the target clone, a commit that represents `sourceCommit`: an empty file at
 * `<owner>/<name>/<hash>` committed with the source commit's author and author date.
 *
 * The filename is derived from the source commit's metadata, so two representative commits never touch the same
 * file and therefore never conflict.
 * (OK sure, _technically_ abbreviated hashes could collide, but...like...what are the odds?)
 * TODO - figure out what the odds actually are, that'd be fun :P
 *
 * @throws whatever `git commit` throws, after logging its captured output
 */
function createRepresentativeCommit(sourceRepo: Repo, sourceCommit: Commit) {
  const repoDirectory = `${sourceRepo.owner}/${sourceRepo.name}`;
  const filename = `${repoDirectory}/${sourceCommit.hash}`;
  mkdirSync(`${TARGET_DIR}/${repoDirectory}`, { recursive: true });
  execFileSync('touch', [filename], { cwd: TARGET_DIR });
  execFileSync('git', ['add', filename], { cwd: TARGET_DIR });

  // Commit subjects and author names come from the source repo and may contain quotes, `$`, backticks etc., so
  // they are passed as discrete arguments (no shell involved) instead of being spliced into a command string.
  // Do _not_ arbitrarily remove the `hash` from the message - `main()` uses it to recognise existing
  // representations.
  const commitArgs = [
    'commit',
    '-m',
    `${sourceRepo.owner}/${sourceRepo.name}: ${sourceCommit.message} - ${sourceCommit.hash}`,
    // Same layout as strftime's "%Y-%m-%d %H:%M:%S"
    // (cf. https://github.com/Shpota/github-activity-generator/blob/main/contribute.py#L63)
    `--date=${format(sourceCommit.date, 'yyyy-MM-dd HH:mm:ss')}`,
    `--author=${sourceCommit.author_name} <${sourceCommit.author_email}>`,
  ];
  console.log(`About to commit with args ${JSON.stringify(commitArgs)}`);
  try {
    execFileSync('git', commitArgs, { cwd: TARGET_DIR });
  } catch (e) {
    console.log(e);
    console.log(`DEBUG - error while creating representative commit: ${capturedOutput(e, 'stderr')} ... ${capturedOutput(e, 'stdout')}`);
    throw e;
  }
}