diff options
Diffstat (limited to 'mnv/src/linematch.c')
| -rw-r--r-- | mnv/src/linematch.c | 493 |
1 files changed, 493 insertions, 0 deletions
diff --git a/mnv/src/linematch.c b/mnv/src/linematch.c new file mode 100644 index 0000000000..8f3b520838 --- /dev/null +++ b/mnv/src/linematch.c @@ -0,0 +1,493 @@ +/* vi:set ts=8 sts=4 sw=4 noet: + * + * MNV - MNV is not Vim by Bram Moolenaar + * + * Do ":help uganda" in MNV to read copying and usage conditions. + * Do ":help credits" in MNV to see a list of people who contributed. + * See README.txt for an overview of the MNV source code. + */ + +#include "mnv.h" + +#define LN_MAX_BUFS 8 +#define LN_DECISION_MAX 255 // pow(2, LN_MAX_BUFS(8)) - 1 = 255 + +// struct for running the diff linematch algorithm +typedef struct diffcmppath_S diffcmppath_T; +struct diffcmppath_S +{ + // to keep track of the total score of this path + int df_lev_score; + size_t df_path_n; // current index of this path + int df_choice_mem[LN_DECISION_MAX + 1]; + int df_choice[LN_DECISION_MAX]; + // to keep track of this path traveled + diffcmppath_T *df_decision[LN_DECISION_MAX]; + size_t df_optimal_choice; +}; + +static int matching_chars(const mmfile_t *m1, const mmfile_t *m2); +static size_t unwrap_indexes(const int *values, const int *diff_len, const size_t ndiffs); +static size_t test_charmatch_paths(diffcmppath_T *node, int lastdecision); + + static size_t +line_len(const mmfile_t *m) +{ + char *s = m->ptr; + char *end; + + end = memchr(s, '\n', (size_t)m->size); + return end ? (size_t)(end - s) : (size_t)m->size; +} + +#define MATCH_CHAR_MAX_LEN 800 + +/// Same as matching_chars but ignore whitespace +/// +/// @param s1 +/// @param s2 + static int +matching_chars_iwhite(const mmfile_t *s1, const mmfile_t *s2) +{ + // the newly processed strings that will be compared + // delete the white space characters + mmfile_t sp[2]; + char p[2][MATCH_CHAR_MAX_LEN]; + + for (int k = 0; k < 2; k++) + { + const mmfile_t *s = k == 0 ? s1 : s2; + size_t pi = 0; + size_t slen = MIN(MATCH_CHAR_MAX_LEN - 1, line_len(s)); + + for (size_t i = 0; i <= slen; i++) + { + char e = s->ptr[i]; + + if (e != ' ' && e != '\t') + { + p[k][pi] = e; + pi++; + } + } + + sp[k].ptr = p[k]; + sp[k].size = (int)pi; + } + + return matching_chars(&sp[0], &sp[1]); +} + +/// Return matching characters between "s1" and "s2" whilst respecting sequence +/// order. +/// Consider the case of two strings 'AAACCC' and 'CCCAAA', the +/// return value from this function will be 3, either to match +/// the 3 C's, or the 3 A's. +/// +/// Examples: +/// matching_chars("aabc", "acba") -> 2 // 'a' and 'b' in common +/// matching_chars("123hello567", "he123ll567o") -> 8 // '123', 'll' and '567' in common +/// matching_chars("abcdefg", "gfedcba") -> 1 // all characters in common, +/// // but only at most 1 in sequence +/// +/// @param m1 +/// @param m2 + static int +matching_chars(const mmfile_t *m1, const mmfile_t *m2) +{ + size_t s1len = MIN(MATCH_CHAR_MAX_LEN - 1, line_len(m1)); + size_t s2len = MIN(MATCH_CHAR_MAX_LEN - 1, line_len(m2)); + char *s1 = m1->ptr; + char *s2 = m2->ptr; + int matrix[2][MATCH_CHAR_MAX_LEN] = { 0 }; + int icur = 1; // save space by storing only two rows for i axis + + for (size_t i = 0; i < s1len; i++) + { + icur = (icur == 1 ? 0 : 1); + int *e1 = matrix[icur]; + int *e2 = matrix[!icur]; + + for (size_t j = 0; j < s2len; j++) + { + // skip char in s1 + if (e2[j + 1] > e1[j + 1]) + e1[j + 1] = e2[j + 1]; + // skip char in s2 + if (e1[j] > e1[j + 1]) + e1[j + 1] = e1[j]; + // compare char in s1 and s2 + if ((s1[i] == s2[j]) && (e2[j] + 1) > e1[j + 1]) + e1[j + 1] = e2[j] + 1; + } + } + + return matrix[icur][s2len]; +} + +/// count the matching characters between a variable number of strings "sp" +/// mark the strings that have already been compared to extract them later +/// without re-running the character match counting. +/// @param sp +/// @param fomvals +/// @param n + static int +count_n_matched_chars(mmfile_t **sp, const size_t n, int iwhite) +{ + int matched_chars = 0; + int matched = 0; + + for (size_t i = 0; i < n; i++) + { + for (size_t j = i + 1; j < n; j++) + { + if (sp[i]->ptr != NULL && sp[j]->ptr != NULL) + { + matched++; + // TODO(lewis6991): handle whitespace ignoring higher up in the + // stack + matched_chars += iwhite ? matching_chars_iwhite(sp[i], sp[j]) + : matching_chars(sp[i], sp[j]); + } + } + } + + // prioritize a match of 3 (or more lines) equally to a match of 2 lines + if (matched >= 2) + { + matched_chars *= 2; + matched_chars /= matched; + } + + return matched_chars; +} + + static mmfile_t +fastforward_buf_to_lnum(mmfile_t s, linenr_T lnum) +{ + for (int i = 0; i < lnum - 1; i++) + { + char *line_end; + + line_end = memchr(s.ptr, '\n', (size_t)s.size); + s.size = line_end ? (int)(s.size - (line_end - s.ptr)) : 0; + s.ptr = line_end; + if (!s.ptr) + break; + s.ptr++; + s.size--; + } + + return s; +} + +/// try all the different ways to compare these lines and use the one that +/// results in the most matching characters +/// @param df_iters +/// @param paths +/// @param npaths +/// @param path_idx +/// @param choice +/// @param diffcmppath +/// @param diff_len +/// @param ndiffs +/// @param diff_blk + static void +try_possible_paths( + const int *df_iters, + const size_t *paths, + const int npaths, + const int path_idx, + int *choice, + diffcmppath_T *diffcmppath, + const int *diff_len, + const size_t ndiffs, + const mmfile_t **diff_blk, + int iwhite) +{ + if (path_idx == npaths) + { + if ((*choice) > 0) + { + int from_vals[LN_MAX_BUFS] = { 0 }; + const int *to_vals = df_iters; + + mmfile_t mm[LN_MAX_BUFS]; // stack memory for current_lines + mmfile_t *current_lines[LN_MAX_BUFS]; + for (size_t k = 0; k < ndiffs; k++) + { + from_vals[k] = df_iters[k]; + // get the index at all of the places + if ((*choice) & (1 << k)) + { + from_vals[k]--; + mm[k] = fastforward_buf_to_lnum(*diff_blk[k], df_iters[k]); + } + else + CLEAR_FIELD(mm[k]); + current_lines[k] = &mm[k]; + } + size_t unwrapped_idx_from = unwrap_indexes(from_vals, diff_len, ndiffs); + size_t unwrapped_idx_to = unwrap_indexes(to_vals, diff_len, ndiffs); + int matched_chars = count_n_matched_chars(current_lines, ndiffs, iwhite); + int score = diffcmppath[unwrapped_idx_from].df_lev_score + matched_chars; + + if (score > diffcmppath[unwrapped_idx_to].df_lev_score) + { + diffcmppath[unwrapped_idx_to].df_path_n = 1; + diffcmppath[unwrapped_idx_to].df_decision[0] = + &diffcmppath[unwrapped_idx_from]; + diffcmppath[unwrapped_idx_to].df_choice[0] = *choice; + diffcmppath[unwrapped_idx_to].df_lev_score = score; + } + else if (score == diffcmppath[unwrapped_idx_to].df_lev_score) + { + size_t k = diffcmppath[unwrapped_idx_to].df_path_n++; + diffcmppath[unwrapped_idx_to].df_decision[k] = + &diffcmppath[unwrapped_idx_from]; + diffcmppath[unwrapped_idx_to].df_choice[k] = *choice; + } + } + return; + } + + size_t bit_place = paths[path_idx]; + *(choice) |= (1 << bit_place); // set it to 1 + try_possible_paths(df_iters, paths, npaths, path_idx + 1, choice, + diffcmppath, diff_len, ndiffs, diff_blk, iwhite); + *(choice) &= ~(1 << bit_place); // set it to 0 + try_possible_paths(df_iters, paths, npaths, path_idx + 1, choice, + diffcmppath, diff_len, ndiffs, diff_blk, iwhite); +} + +/// unwrap indexes to access n dimensional tensor +/// @param values +/// @param diff_len +/// @param ndiffs + static size_t +unwrap_indexes(const int *values, const int *diff_len, const size_t ndiffs) +{ + size_t num_unwrap_scalar = 1; + + for (size_t k = 0; k < ndiffs; k++) + num_unwrap_scalar *= (size_t)diff_len[k] + 1; + + size_t path_idx = 0; + for (size_t k = 0; k < ndiffs; k++) + { + num_unwrap_scalar /= (size_t)diff_len[k] + 1; + + int n = values[k]; + path_idx += num_unwrap_scalar * (size_t)n; + } + + return path_idx; +} + +/// populate the values of the linematch algorithm tensor, and find the best +/// decision for how to compare the relevant lines from each of the buffers at +/// each point in the tensor +/// @param df_iters +/// @param ch_dim +/// @param diffcmppath +/// @param diff_len +/// @param ndiffs +/// @param diff_blk + static void +populate_tensor( + int *df_iters, + const size_t ch_dim, + diffcmppath_T *diffcmppath, + const int *diff_len, + const size_t ndiffs, + const mmfile_t **diff_blk, + int iwhite) +{ + if (ch_dim == ndiffs) + { + int npaths = 0; + size_t paths[LN_MAX_BUFS]; + + for (size_t j = 0; j < ndiffs; j++) + { + if (df_iters[j] > 0) + { + paths[npaths] = j; + npaths++; + } + } + + int choice = 0; + size_t unwrapper_idx_to = unwrap_indexes(df_iters, diff_len, ndiffs); + + diffcmppath[unwrapper_idx_to].df_lev_score = -1; + try_possible_paths(df_iters, paths, npaths, 0, &choice, diffcmppath, + diff_len, ndiffs, diff_blk, iwhite); + return; + } + + for (int i = 0; i <= diff_len[ch_dim]; i++) + { + df_iters[ch_dim] = i; + populate_tensor(df_iters, ch_dim + 1, diffcmppath, diff_len, + ndiffs, diff_blk, iwhite); + } +} + +/// algorithm to find an optimal alignment of lines of a diff block with 2 or +/// more files. The algorithm is generalized to work for any number of files +/// which corresponds to another dimension added to the tensor used in the +/// algorithm +/// +/// for questions and information about the linematch algorithm please contact +/// Jonathon White (jonathonwhite@protonmail.com) +/// +/// for explanation, a summary of the algorithm in 3 dimensions (3 files +/// compared) follows +/// +/// The 3d case (for 3 buffers) of the algorithm implemented when diffopt +/// 'linematch' is enabled. The algorithm constructs a 3d tensor to +/// compare a diff between 3 buffers. The dimensions of the tensor are +/// the length of the diff in each buffer plus 1 A path is constructed by +/// moving from one edge of the cube/3d tensor to the opposite edge. +/// Motions from one cell of the cube to the next represent decisions. In +/// a 3d cube, there are a total of 7 decisions that can be made, +/// represented by the enum df_path3_choice which is defined in +/// buffer_defs.h a comparison of buffer 0 and 1 represents a motion +/// toward the opposite edge of the cube with components along the 0 and +/// 1 axes. a comparison of buffer 0, 1, and 2 represents a motion +/// toward the opposite edge of the cube with components along the 0, 1, +/// and 2 axes. A skip of buffer 0 represents a motion along only the 0 +/// axis. For each action, a point value is awarded, and the path is +/// saved for reference later, if it is found to have been the optimal +/// path. The optimal path has the highest score. The score is +/// calculated as the summation of the total characters matching between +/// all of the lines which were compared. The structure of the algorithm +/// is that of a dynamic programming problem. We can calculate a point +/// i,j,k in the cube as a function of i-1, j-1, and k-1. To find the +/// score and path at point i,j,k, we must determine which path we want +/// to use, this is done by looking at the possibilities and choosing +/// the one which results in the local highest score. The total highest +/// scored path is, then in the end represented by the cell in the +/// opposite corner from the start location. The entire algorithm +/// consists of populating the 3d cube with the optimal paths from which +/// it may have came. +/// +/// Optimizations: +/// As the function to calculate the cell of a tensor at point i,j,k is a +/// function of the cells at i-1, j-1, k-1, the whole tensor doesn't need +/// to be stored in memory at once. In the case of the 3d cube, only two +/// slices (along k and j axis) are stored in memory. For the 2d matrix +/// (for 2 files), only two rows are stored at a time. The next/previous +/// slice (or row) is always calculated from the other, and they alternate +/// at each iteration. +/// In the 3d case, 3 arrays are populated to memorize the score (matched +/// characters) of the 3 buffers, so a redundant calculation of the +/// scores does not occur +/// @param diff_blk +/// @param diff_len +/// @param ndiffs +/// @param [out] [allocated] decisions +/// @return the length of decisions + size_t +linematch_nbuffers( + const mmfile_t **diff_blk, + const int *diff_len, + const size_t ndiffs, + int **decisions, + int iwhite) +{ + assert(ndiffs <= LN_MAX_BUFS); + + size_t memsize = 1; + size_t memsize_decisions = 0; + for (size_t i = 0; i < ndiffs; i++) + { + assert(diff_len[i] >= 0); + memsize *= (size_t)(diff_len[i] + 1); + memsize_decisions += (size_t)diff_len[i]; + } + + // create the flattened path matrix + diffcmppath_T *diffcmppath = lalloc(sizeof(diffcmppath_T) * memsize, TRUE); + if (diffcmppath == NULL) + return 0; + + // allocate memory here + size_t n = (size_t)pow(2.0, (double)ndiffs); + for (size_t i = 0; i < memsize; i++) + { + diffcmppath[i].df_lev_score = 0; + diffcmppath[i].df_path_n = 0; + for (size_t j = 0; j < n; j++) + diffcmppath[i].df_choice_mem[j] = -1; + } + + // memory for avoiding repetitive calculations of score + int df_iters[LN_MAX_BUFS]; + populate_tensor(df_iters, 0, diffcmppath, diff_len, ndiffs, diff_blk, + iwhite); + + const size_t u = unwrap_indexes(diff_len, diff_len, ndiffs); + diffcmppath_T *startNode = &diffcmppath[u]; + + *decisions = lalloc(sizeof(int) * memsize_decisions, TRUE); + if (*decisions == NULL) + { + mnv_free(diffcmppath); + return 0; + } + + size_t n_optimal = 0; + test_charmatch_paths(startNode, 0); + while (startNode->df_path_n > 0) + { + size_t j = startNode->df_optimal_choice; + (*decisions)[n_optimal++] = startNode->df_choice[j]; + startNode = startNode->df_decision[j]; + } + // reverse array + for (size_t i = 0; i < (n_optimal / 2); i++) + { + int tmp = (*decisions)[i]; + (*decisions)[i] = (*decisions)[n_optimal - 1 - i]; + (*decisions)[n_optimal - 1 - i] = tmp; + } + + mnv_free(diffcmppath); + + return n_optimal; +} + +// returns the minimum amount of path changes from start to end + static size_t +test_charmatch_paths(diffcmppath_T *node, int lastdecision) +{ + // memoization + if (node->df_choice_mem[lastdecision] == -1) + { + if (node->df_path_n == 0) + // we have reached the end of the tree + node->df_choice_mem[lastdecision] = 0; + else + { + // the minimum amount of turns required to reach the end + size_t minimum_turns = SIZE_MAX; + for (size_t i = 0; i < node->df_path_n; i++) + { + // recurse + size_t t = test_charmatch_paths(node->df_decision[i], + node->df_choice[i]) + + (lastdecision != node->df_choice[i] ? 1 : 0); + if (t < minimum_turns) + { + node->df_optimal_choice = i; + minimum_turns = t; + } + } + node->df_choice_mem[lastdecision] = (int)minimum_turns; + } + } + + return (size_t)node->df_choice_mem[lastdecision]; +} |
