// commit-report-sync/src/main.ts

import { RepoId, repoString, Commit } from './types';
import { existsSync, mkdirSync } from 'fs';
import { getCommitsSinceLatestBeforeGivenDate, getNLatestCommits, gitClone, gitPush, insertRepresentativeCommit } from './git';

// Scratch directory (relative to the current working directory) that holds both clones.
const WORKING_DIR = './working';
// Directory the source repository is cloned into.
const SOURCE_DIR = `${WORKING_DIR}/source`;
// Directory the target repository is cloned into.
const TARGET_DIR = `${WORKING_DIR}/target`;

/**
 * Ensures that each of the most recent commits of the source repo has a "representative" commit in the target repo,
 * inserting any that are missing at the right point in the target's history, and then (unless `dryRun`) pushes the
 * target repo.
 *
 * Algorithm:
 * * Take the latest source commits and walk them oldest-first.
 * * For each source commit, find the target commit that is dated at or before it.
 * * If that target commit's message mentions the source commit's hash, it is already represented - move on.
 * * Otherwise, insert a representative commit after that target commit, replaying the target commit that used to
 *   follow it (if any) on top. Per the original design notes this cannot conflict, because the file touched by a
 *   representative commit is named from the source commit's metadata, so no two of them modify the same file.
 * * If no target commit is dated at or before the source commit, the representation is written onto the target
 *   repo's current HEAD.
 *
 * The target history is re-read after every insertion, which makes this roughly O(n_source * n_target). That is a
 * deliberate YAGNI trade-off for single-person-sized repositories; a moving-pointer/stack walk over both histories
 * would avoid the re-reads if this ever becomes a bottleneck.
 *
 * @param sourceRepoId - Repository whose commits should be represented.
 * @param targetRepoId - Repository that stores the representative commits.
 * @param dryRun - When true, everything is done locally but nothing is pushed.
 * @param tokenForTargetRepo - Token handed to `gitPush` for the target repo; must be non-empty.
 * @returns A promise that resolves once processing (and the optional push) has finished.
 * @throws Rejects with an `Error` if the preconditions fail (see `setPreconditions`), or if a target commit has
 *   the same date as a source commit without being its representation.
 */
export async function main(sourceRepoId: RepoId, targetRepoId: RepoId, dryRun: boolean, tokenForTargetRepo: string): Promise<void> {
    setPreconditions(tokenForTargetRepo, sourceRepoId, targetRepoId);

    const sourceCommitHistory = getNLatestCommits(SOURCE_DIR, 10);
    if (sourceCommitHistory.length === 0) {
        // Without this guard, reading `[0].date` below would fail with an opaque TypeError.
        console.log(`DEBUG - no commits found in the source repo - nothing to synchronize.`);
        return;
    }

    // `Array.prototype.reverse` reverses IN PLACE and returns the same array, so reverse a copy: that way
    // `sourceCommitHistory` keeps its original order and nobody can accidentally observe a double reversal.
    // (Assumes getNLatestCommits returns newest-first, so the copy is oldest-first - TODO confirm in ./git.)
    const oldestFirstSourceCommits = [...sourceCommitHistory].reverse();

    // We only have to go back far enough in target history to find the commit that immediately precedes the oldest
    // source commit under consideration.
    //
    // This has to be re-read after _every_ insertion, because inserting into the target history re-hashes all the
    // child commits. We do have to insert (rather than abandoning the target history after the insertion point and
    // rebuilding it later), because the target repo also holds representations of commits from _other_ source repos,
    // which cannot be recreated without their context.
    let targetCommitHistory = getCommitsSinceLatestBeforeGivenDate(TARGET_DIR, oldestFirstSourceCommits[0].date);

    for (const sourceCommit of oldestFirstSourceCommits) {
        // Index of the first target commit in the array that is dated at or before the source commit.
        const targetCommitIndex = targetCommitHistory.findIndex(c => c.date <= sourceCommit.date);
        console.log(`DEBUG - targetCommitIndex: ${targetCommitIndex}. targetCommitHistory: ${JSON.stringify(targetCommitHistory)}`);

        if (targetCommitIndex === -1) {
            console.log(`DEBUG - could not find a targetCommit that is earlier than or equal to the sourceCommit ${sourceCommit.hash} - therefore, writing the source commit's representation onto the current HEAD of target repo`);
            insertRepresentativeCommit(TARGET_DIR, sourceRepoId, sourceCommit, undefined, undefined);
            // The history was just mutated, so it has to be regenerated.
            targetCommitHistory = getCommitsSinceLatestBeforeGivenDate(TARGET_DIR, sourceCommit.date);
            continue;
        }

        const targetCommit = targetCommitHistory[targetCommitIndex];

        // If the target commit is a representation of the source commit, there is nothing to insert.
        if (targetCommit.message.includes(sourceCommit.hash)) {
            console.log(`DEBUG - found a match for ${sourceCommit.hash} in ${targetCommit.hash} - found a representation of the source commit. No need to insert.`);
            continue;
        }

        // findIndex already established `targetCommit.date <= sourceCommit.date`, so `>=` here means "same date".
        // It is written relationally rather than with `==`/`===` because Commit's date type is not visible from
        // this file: if it is a Date object, equality operators would compare references and never match, whereas
        // for strings/numbers this is equivalent to equality.
        if (targetCommit.date >= sourceCommit.date) {
            throw new Error(`Target commit ${targetCommit.hash} has the same date as source commit ${sourceCommit.hash}, but they are not representations of one another. This should never happen. This means that two source repos have commits at the exact same time, which is one of the (anti-)pre-requisites of this tool. If you need this feature, let me know.`);
        }

        console.log(`DEBUG - No match for ${sourceCommit.hash} in ${targetCommit.hash} - inserting.`);
        // Index 0 is the latest commit in the fetched history, in which case there is nothing to replay on top of
        // the insertion; otherwise the commit one position earlier in the array is the one that currently follows
        // targetCommit.
        const followOnTargetCommit: Commit | undefined =
            targetCommitIndex === 0 ? undefined : targetCommitHistory[targetCommitIndex - 1];
        insertRepresentativeCommit(TARGET_DIR, sourceRepoId, sourceCommit, targetCommit, followOnTargetCommit);

        // Regenerate the target commit history. We only need to go back to just before the source commit that was
        // _just processed_, since every remaining source commit is later than it. That trims the work somewhat,
        // though it still scales as O(n_1 * n_2) in the sizes of the two histories - expected to be negligible at
        // the repo sizes this tool is aimed at.
        targetCommitHistory = getCommitsSinceLatestBeforeGivenDate(TARGET_DIR, sourceCommit.date);
        console.log(`DEBUG - targetCommit: ${targetCommit.hash}`);
    }

    // All source commits have been processed and every missing representation has been inserted; all that is left
    // is to publish the result.
    if (!dryRun) {
        gitPush(TARGET_DIR, tokenForTargetRepo, targetRepoId);
    }
}

/**
 * Validates the starting state, creates the working directories, and clones both repositories into them.
 *
 * @param tokenForTargetRepo - Token for the target repo; only checked for being non-empty here.
 * @param sourceRepoId - Repository cloned into SOURCE_DIR.
 * @param targetRepoId - Repository cloned into TARGET_DIR.
 * @throws Error if any working directory already exists, or if the token is the empty string.
 */
function setPreconditions(tokenForTargetRepo: string, sourceRepoId: RepoId, targetRepoId: RepoId) {
    // Listed parent-first, so the same array can drive both the existence check and the creation loop.
    const workingDirs = [WORKING_DIR, SOURCE_DIR, TARGET_DIR];

    // None of these _should_ ever exist up front; if one does, something unexpected is going on.
    if (workingDirs.some(dir => existsSync(dir))) {
        throw new Error('Working directory already exists/populated');
    }

    if (tokenForTargetRepo === '') {
        throw new Error('token_for_target_repo is required');
    }

    for (const dir of workingDirs) {
        mkdirSync(dir);
    }

    console.log(`DEBUG - sourceRepoPath: ${repoString(sourceRepoId)}`);
    console.log(`DEBUG - targetRepoPath: ${repoString(targetRepoId)}`);

    // TODO - allow parameterizing how far back in history to checkout (it might take a long time for older repos
    // and, once synced initially, it won't have to go back further than a single commit in most cases)
    const cloneUrlFor = (repoId: RepoId) => `https://${repoString(repoId)}`;
    gitClone(SOURCE_DIR, cloneUrlFor(sourceRepoId));
    gitClone(TARGET_DIR, cloneUrlFor(targetRepoId));
}