Fix broken spans in diffs (#14678)
Gitea runs diff on highlighted code fragment for each line in order to provide code highlight diffs. Unfortunately this diff algorithm is not aware that span tags and entities are atomic and cannot be split. The current fixup code makes some attempt to fix these broken tags however, it cannot handle situations where a tag is split over multiple blocks. This PR provides a more algorithmic fixup mechanism whereby spans and entities are completely coalesced into their respective blocks. This may result in a incompletely reduced diff but - it will definitely prevent the broken entities and spans that are currently possible. As a result of this fixup several inconsistencies were discovered in our testcases and these were also fixed. Fix #14231 Signed-off-by: Andrew Thornton <art27@cantab.net>
This commit is contained in:
		
							parent
							
								
									f3847c9d82
								
							
						
					
					
						commit
						beb2058186
					
				| @ -182,6 +182,8 @@ var ( | ||||
| 	removedCodePrefix = []byte(`<span class="removed-code">`) | ||||
| 	codeTagSuffix     = []byte(`</span>`) | ||||
| ) | ||||
| 
 | ||||
| var unfinishedtagRegex = regexp.MustCompile(`<[^>]*$`) | ||||
| var trailingSpanRegex = regexp.MustCompile(`<span\s*[[:alpha:]="]*?[>]?$`) | ||||
| var entityRegex = regexp.MustCompile(`&[#]*?[0-9[:alpha:]]*$`) | ||||
| 
 | ||||
| @ -196,10 +198,218 @@ func shouldWriteInline(diff diffmatchpatch.Diff, lineType DiffLineType) bool { | ||||
| 	return false | ||||
| } | ||||
| 
 | ||||
| func fixupBrokenSpans(diffs []diffmatchpatch.Diff) []diffmatchpatch.Diff { | ||||
| 
 | ||||
| 	// Create a new array to store our fixed up blocks
 | ||||
| 	fixedup := make([]diffmatchpatch.Diff, 0, len(diffs)) | ||||
| 
 | ||||
| 	// semantically label some numbers
 | ||||
| 	const insert, delete, equal = 0, 1, 2 | ||||
| 
 | ||||
| 	// record the positions of the last type of each block in the fixedup blocks
 | ||||
| 	last := []int{-1, -1, -1} | ||||
| 	operation := []diffmatchpatch.Operation{diffmatchpatch.DiffInsert, diffmatchpatch.DiffDelete, diffmatchpatch.DiffEqual} | ||||
| 
 | ||||
| 	// create a writer for insert and deletes
 | ||||
| 	toWrite := []strings.Builder{ | ||||
| 		{}, | ||||
| 		{}, | ||||
| 	} | ||||
| 
 | ||||
| 	// make some flags for insert and delete
 | ||||
| 	unfinishedTag := []bool{false, false} | ||||
| 	unfinishedEnt := []bool{false, false} | ||||
| 
 | ||||
| 	// store stores the provided text in the writer for the typ
 | ||||
| 	store := func(text string, typ int) { | ||||
| 		(&(toWrite[typ])).WriteString(text) | ||||
| 	} | ||||
| 
 | ||||
| 	// hasStored returns true if there is stored content
 | ||||
| 	hasStored := func(typ int) bool { | ||||
| 		return (&toWrite[typ]).Len() > 0 | ||||
| 	} | ||||
| 
 | ||||
| 	// stored will return that content
 | ||||
| 	stored := func(typ int) string { | ||||
| 		return (&toWrite[typ]).String() | ||||
| 	} | ||||
| 
 | ||||
| 	// empty will empty the stored content
 | ||||
| 	empty := func(typ int) { | ||||
| 		(&toWrite[typ]).Reset() | ||||
| 	} | ||||
| 
 | ||||
| 	// pop will remove the stored content appending to a diff block for that typ
 | ||||
| 	pop := func(typ int, fixedup []diffmatchpatch.Diff) []diffmatchpatch.Diff { | ||||
| 		if hasStored(typ) { | ||||
| 			if last[typ] > last[equal] { | ||||
| 				fixedup[last[typ]].Text += stored(typ) | ||||
| 			} else { | ||||
| 				fixedup = append(fixedup, diffmatchpatch.Diff{ | ||||
| 					Type: operation[typ], | ||||
| 					Text: stored(typ), | ||||
| 				}) | ||||
| 			} | ||||
| 			empty(typ) | ||||
| 		} | ||||
| 		return fixedup | ||||
| 	} | ||||
| 
 | ||||
| 	// Now we walk the provided diffs and check the type of each block in turn
 | ||||
| 	for _, diff := range diffs { | ||||
| 
 | ||||
| 		typ := delete // flag for handling insert or delete typs
 | ||||
| 		switch diff.Type { | ||||
| 		case diffmatchpatch.DiffEqual: | ||||
| 			// First check if there is anything stored
 | ||||
| 			if hasStored(insert) || hasStored(delete) { | ||||
| 				// There are two reasons for storing content:
 | ||||
| 				// 1. Unfinished Entity <- Could be more efficient here by not doing this if we're looking for a tag
 | ||||
| 				if unfinishedEnt[insert] || unfinishedEnt[delete] { | ||||
| 					// we look for a ';' to finish an entity
 | ||||
| 					idx := strings.IndexRune(diff.Text, ';') | ||||
| 					if idx >= 0 { | ||||
| 						// if we find a ';' store the preceding content to both insert and delete
 | ||||
| 						store(diff.Text[:idx+1], insert) | ||||
| 						store(diff.Text[:idx+1], delete) | ||||
| 
 | ||||
| 						// and remove it from this block
 | ||||
| 						diff.Text = diff.Text[idx+1:] | ||||
| 
 | ||||
| 						// reset the ent flags
 | ||||
| 						unfinishedEnt[insert] = false | ||||
| 						unfinishedEnt[delete] = false | ||||
| 					} else { | ||||
| 						// otherwise store it all on insert and delete
 | ||||
| 						store(diff.Text, insert) | ||||
| 						store(diff.Text, delete) | ||||
| 						// and empty this block
 | ||||
| 						diff.Text = "" | ||||
| 					} | ||||
| 				} | ||||
| 				// 2. Unfinished Tag
 | ||||
| 				if unfinishedTag[insert] || unfinishedTag[delete] { | ||||
| 					// we look for a '>' to finish a tag
 | ||||
| 					idx := strings.IndexRune(diff.Text, '>') | ||||
| 					if idx >= 0 { | ||||
| 						store(diff.Text[:idx+1], insert) | ||||
| 						store(diff.Text[:idx+1], delete) | ||||
| 						diff.Text = diff.Text[idx+1:] | ||||
| 						unfinishedTag[insert] = false | ||||
| 						unfinishedTag[delete] = false | ||||
| 					} else { | ||||
| 						store(diff.Text, insert) | ||||
| 						store(diff.Text, delete) | ||||
| 						diff.Text = "" | ||||
| 					} | ||||
| 				} | ||||
| 
 | ||||
| 				// If we've completed the required tag/entities
 | ||||
| 				if !(unfinishedTag[insert] || unfinishedTag[delete] || unfinishedEnt[insert] || unfinishedEnt[delete]) { | ||||
| 					// pop off the stack
 | ||||
| 					fixedup = pop(insert, fixedup) | ||||
| 					fixedup = pop(delete, fixedup) | ||||
| 				} | ||||
| 
 | ||||
| 				// If that has left this diff block empty then shortcut
 | ||||
| 				if len(diff.Text) == 0 { | ||||
| 					continue | ||||
| 				} | ||||
| 			} | ||||
| 
 | ||||
| 			// check if this block ends in an unfinished tag?
 | ||||
| 			idx := unfinishedtagRegex.FindStringIndex(diff.Text) | ||||
| 			if idx != nil { | ||||
| 				unfinishedTag[insert] = true | ||||
| 				unfinishedTag[delete] = true | ||||
| 			} else { | ||||
| 				// otherwise does it end in an unfinished entity?
 | ||||
| 				idx = entityRegex.FindStringIndex(diff.Text) | ||||
| 				if idx != nil { | ||||
| 					unfinishedEnt[insert] = true | ||||
| 					unfinishedEnt[delete] = true | ||||
| 				} | ||||
| 			} | ||||
| 
 | ||||
| 			// If there is an unfinished component
 | ||||
| 			if idx != nil { | ||||
| 				// Store the fragment
 | ||||
| 				store(diff.Text[idx[0]:], insert) | ||||
| 				store(diff.Text[idx[0]:], delete) | ||||
| 				// and remove it from this block
 | ||||
| 				diff.Text = diff.Text[:idx[0]] | ||||
| 			} | ||||
| 
 | ||||
| 			// If that hasn't left the block empty
 | ||||
| 			if len(diff.Text) > 0 { | ||||
| 				// store the position of the last equal block and store it in our diffs
 | ||||
| 				last[equal] = len(fixedup) | ||||
| 				fixedup = append(fixedup, diff) | ||||
| 			} | ||||
| 			continue | ||||
| 		case diffmatchpatch.DiffInsert: | ||||
| 			typ = insert | ||||
| 			fallthrough | ||||
| 		case diffmatchpatch.DiffDelete: | ||||
| 			// First check if there is anything stored for this type
 | ||||
| 			if hasStored(typ) { | ||||
| 				// if there is prepend it to this block, empty the storage and reset our flags
 | ||||
| 				diff.Text = stored(typ) + diff.Text | ||||
| 				empty(typ) | ||||
| 				unfinishedEnt[typ] = false | ||||
| 				unfinishedTag[typ] = false | ||||
| 			} | ||||
| 
 | ||||
| 			// check if this block ends in an unfinished tag
 | ||||
| 			idx := unfinishedtagRegex.FindStringIndex(diff.Text) | ||||
| 			if idx != nil { | ||||
| 				unfinishedTag[typ] = true | ||||
| 			} else { | ||||
| 				// otherwise does it end in an unfinished entity
 | ||||
| 				idx = entityRegex.FindStringIndex(diff.Text) | ||||
| 				if idx != nil { | ||||
| 					unfinishedEnt[typ] = true | ||||
| 				} | ||||
| 			} | ||||
| 
 | ||||
| 			// If there is an unfinished component
 | ||||
| 			if idx != nil { | ||||
| 				// Store the fragment
 | ||||
| 				store(diff.Text[idx[0]:], typ) | ||||
| 				// and remove it from this block
 | ||||
| 				diff.Text = diff.Text[:idx[0]] | ||||
| 			} | ||||
| 
 | ||||
| 			// If that hasn't left the block empty
 | ||||
| 			if len(diff.Text) > 0 { | ||||
| 				// if the last block of this type was after the last equal block
 | ||||
| 				if last[typ] > last[equal] { | ||||
| 					// store this blocks content on that block
 | ||||
| 					fixedup[last[typ]].Text += diff.Text | ||||
| 				} else { | ||||
| 					// otherwise store the position of the last block of this type and store the block
 | ||||
| 					last[typ] = len(fixedup) | ||||
| 					fixedup = append(fixedup, diff) | ||||
| 				} | ||||
| 			} | ||||
| 			continue | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	// pop off any remaining stored content
 | ||||
| 	fixedup = pop(insert, fixedup) | ||||
| 	fixedup = pop(delete, fixedup) | ||||
| 
 | ||||
| 	return fixedup | ||||
| } | ||||
| 
 | ||||
| func diffToHTML(fileName string, diffs []diffmatchpatch.Diff, lineType DiffLineType) template.HTML { | ||||
| 	buf := bytes.NewBuffer(nil) | ||||
| 	match := "" | ||||
| 
 | ||||
| 	diffs = fixupBrokenSpans(diffs) | ||||
| 
 | ||||
| 	for _, diff := range diffs { | ||||
| 		if shouldWriteInline(diff, lineType) { | ||||
| 			if len(match) > 0 { | ||||
|  | ||||
| @ -15,6 +15,7 @@ import ( | ||||
| 
 | ||||
| 	"code.gitea.io/gitea/models" | ||||
| 	"code.gitea.io/gitea/modules/git" | ||||
| 	"code.gitea.io/gitea/modules/highlight" | ||||
| 	"code.gitea.io/gitea/modules/setting" | ||||
| 	dmp "github.com/sergi/go-diff/diffmatchpatch" | ||||
| 	"github.com/stretchr/testify/assert" | ||||
| @ -23,7 +24,7 @@ import ( | ||||
| 
 | ||||
| func assertEqual(t *testing.T, s1 string, s2 template.HTML) { | ||||
| 	if s1 != string(s2) { | ||||
| 		t.Errorf("%s should be equal %s", s2, s1) | ||||
| 		t.Errorf("Did not receive expected results:\nExpected: %s\nActual:   %s", s1, s2) | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| @ -61,7 +62,7 @@ func TestDiffToHTML(t *testing.T) { | ||||
| 		{Type: dmp.DiffEqual, Text: "</span><span class=\"p\">)</span>"}, | ||||
| 	}, DiffLineDel)) | ||||
| 
 | ||||
| 	assertEqual(t, "<span class=\"nx\">r</span><span class=\"p\">.</span><span class=\"nf\">WrapperRenderer</span><span class=\"p\">(</span><span class=\"nx\">w</span><span class=\"p\">,</span> <span class=\"removed-code\"><span class=\"nx\">language</span></span><span class=\"removed-code\"><span class=\"p\">,</span> <span class=\"kc\">true</span><span class=\"p\">,</span> <span class=\"nx\">attrs</span></span><span class=\"p\">,</span> <span class=\"kc\">false</span><span class=\"p\">)</span>", diffToHTML("", []dmp.Diff{ | ||||
| 	assertEqual(t, "<span class=\"nx\">r</span><span class=\"p\">.</span><span class=\"nf\">WrapperRenderer</span><span class=\"p\">(</span><span class=\"nx\">w</span><span class=\"p\">,</span> <span class=\"removed-code\"><span class=\"nx\">language</span><span class=\"p\">,</span> <span class=\"kc\">true</span><span class=\"p\">,</span> <span class=\"nx\">attrs</span></span><span class=\"p\">,</span> <span class=\"kc\">false</span><span class=\"p\">)</span>", diffToHTML("", []dmp.Diff{ | ||||
| 		{Type: dmp.DiffEqual, Text: "<span class=\"nx\">r</span><span class=\"p\">.</span><span class=\"nf\">WrapperRenderer</span><span class=\"p\">(</span><span class=\"nx\">w</span><span class=\"p\">,</span> <span class=\"nx\">"}, | ||||
| 		{Type: dmp.DiffDelete, Text: "language</span><span "}, | ||||
| 		{Type: dmp.DiffEqual, Text: "c"}, | ||||
| @ -69,14 +70,14 @@ func TestDiffToHTML(t *testing.T) { | ||||
| 		{Type: dmp.DiffEqual, Text: "</span><span class=\"p\">,</span> <span class=\"kc\">false</span><span class=\"p\">)</span>"}, | ||||
| 	}, DiffLineDel)) | ||||
| 
 | ||||
| 	assertEqual(t, "<span class=\"added-code\">language</span></span><span class=\"added-code\"><span class=\"p\">,</span> <span class=\"kc\">true</span><span class=\"p\">,</span> <span class=\"nx\">attrs</span></span><span class=\"p\">,</span> <span class=\"kc\">false</span><span class=\"p\">)</span>", diffToHTML("", []dmp.Diff{ | ||||
| 	assertEqual(t, "<span class=\"added-code\">language</span><span class=\"p\">,</span> <span class=\"kc\">true</span><span class=\"p\">,</span> <span class=\"nx\">attrs</span></span><span class=\"p\">,</span> <span class=\"kc\">false</span><span class=\"p\">)</span>", diffToHTML("", []dmp.Diff{ | ||||
| 		{Type: dmp.DiffInsert, Text: "language</span><span "}, | ||||
| 		{Type: dmp.DiffEqual, Text: "c"}, | ||||
| 		{Type: dmp.DiffInsert, Text: "lass=\"p\">,</span> <span class=\"kc\">true</span><span class=\"p\">,</span> <span class=\"nx\">attrs"}, | ||||
| 		{Type: dmp.DiffEqual, Text: "</span><span class=\"p\">,</span> <span class=\"kc\">false</span><span class=\"p\">)</span>"}, | ||||
| 	}, DiffLineAdd)) | ||||
| 
 | ||||
| 	assertEqual(t, "<span class=\"k\">print</span><span class=\"added-code\"></span><span class=\"added-code\"><span class=\"p\">(</span></span><span class=\"sa\"></span><span class=\"s2\">"</span><span class=\"s2\">// </span><span class=\"s2\">"</span><span class=\"p\">,</span> <span class=\"n\">sys</span><span class=\"o\">.</span><span class=\"n\">argv</span><span class=\"added-code\"><span class=\"p\">)</span></span>", diffToHTML("", []dmp.Diff{ | ||||
| 	assertEqual(t, "<span class=\"k\">print</span><span class=\"added-code\"><span class=\"p\">(</span></span><span class=\"sa\"></span><span class=\"s2\">"</span><span class=\"s2\">// </span><span class=\"s2\">"</span><span class=\"p\">,</span> <span class=\"n\">sys</span><span class=\"o\">.</span><span class=\"n\">argv</span><span class=\"added-code\"><span class=\"p\">)</span></span>", diffToHTML("", []dmp.Diff{ | ||||
| 		{Type: dmp.DiffEqual, Text: "<span class=\"k\">print</span>"}, | ||||
| 		{Type: dmp.DiffInsert, Text: "<span"}, | ||||
| 		{Type: dmp.DiffEqual, Text: " "}, | ||||
| @ -85,14 +86,14 @@ func TestDiffToHTML(t *testing.T) { | ||||
| 		{Type: dmp.DiffInsert, Text: "<span class=\"p\">)</span>"}, | ||||
| 	}, DiffLineAdd)) | ||||
| 
 | ||||
| 	assertEqual(t, "sh <span class=\"added-code\">'useradd -u $(stat -c "%u" .gitignore) jenkins</span>'", diffToHTML("", []dmp.Diff{ | ||||
| 	assertEqual(t, "sh <span class=\"added-code\">'useradd -u $(stat -c "%u" .gitignore) jenkins'</span>", diffToHTML("", []dmp.Diff{ | ||||
| 		{Type: dmp.DiffEqual, Text: "sh "}, | ||||
| 		{Type: dmp.DiffDelete, Text: "4;useradd -u 111 jenkins""}, | ||||
| 		{Type: dmp.DiffInsert, Text: "9;useradd -u $(stat -c "%u" .gitignore) jenkins'"}, | ||||
| 		{Type: dmp.DiffEqual, Text: ";"}, | ||||
| 	}, DiffLineAdd)) | ||||
| 
 | ||||
| 	assertEqual(t, "<span class=\"x\">							<h<span class=\"added-code\">4 class=</span><span class=\"added-code\">"release-list-title df ac"</span>></span>", diffToHTML("", []dmp.Diff{ | ||||
| 	assertEqual(t, "<span class=\"x\">							<h<span class=\"added-code\">4 class="release-list-title df ac"</span>></span>", diffToHTML("", []dmp.Diff{ | ||||
| 		{Type: dmp.DiffEqual, Text: "<span class=\"x\">							<h"}, | ||||
| 		{Type: dmp.DiffInsert, Text: "4 class=&#"}, | ||||
| 		{Type: dmp.DiffEqual, Text: "3"}, | ||||
| @ -462,3 +463,14 @@ func TestGetDiffRangeWithWhitespaceBehavior(t *testing.T) { | ||||
| 		} | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| func TestDiffToHTML_14231(t *testing.T) { | ||||
| 	setting.Cfg = ini.Empty() | ||||
| 	diffRecord := diffMatchPatch.DiffMain(highlight.Code("main.v", "		run()\n"), highlight.Code("main.v", "		run(db)\n"), true) | ||||
| 	diffRecord = diffMatchPatch.DiffCleanupEfficiency(diffRecord) | ||||
| 
 | ||||
| 	expected := `		<span class="n">run</span><span class="added-code"><span class="o">(</span><span class="n">db</span></span><span class="o">)</span>` | ||||
| 	output := diffToHTML("main.v", diffRecord, DiffLineAdd) | ||||
| 
 | ||||
| 	assertEqual(t, expected, output) | ||||
| } | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user