Package groupthink :: Module stringtree
[hide private]
[frames] | no frames]

Source Code for Module groupthink.stringtree

  1  """ 
  2  a module containing L{SimpleStringTree}, which represents an editable string 
  3  as a dependency tree of edits.  This module exists to support L{UnorderedString}. 
  4   
  5  This module was developed in part as a Google Summer of Code 2009 
  6  project. 
  7  """ 
  8  import random 
  9  random.seed() #Works around some mysterious bug in the Journal or Rainbow that 
 10                #causes the random number generator to be seeded with the same 
 11                #constant value if a Journal entry is re-launched from the 
 12                #Journal. 
 13  from collections import defaultdict 
 14  import dbus # We do dbus (de)serialization in this file to minimize abstraction breakage 
 15  import logging 
 16  import aatree 
 17   
 18  inf = float('inf') 
 19   
20 -class Change:
21 """Each Change represents a change to the StringTree. 22 """
23 - def __init__(self, unique_id, parent, edit):
24 """ 25 @param unique_id: a unique identifier for this Change 26 @param parent: the unique_id that is affected by this 27 Change. Parent unique_id are always associated with an Insertion edit. 28 @param edit: an Insertion, Deletion, or Removal. It's what is to be done to 29 the parent""" 30 self.unique_id = unique_id 31 self.parent = parent 32 self.edit = edit
33
34 - def __repr__(self):
35 return "Change(%d, %d, %s)" % (self.unique_id, self.parent, str(self.edit))
36
37 -class Insertion:
38 """Represents the action of inserting a particular bit of text 39 """
40 - def __init__(self, position, text):
41 """ 42 @param position: the point at which to insert text, in the unmodified 43 coordinates of the parent node (other insertions and deletions 44 to that node do not affect these coordinates) 45 @type position: int 46 @param text: the string to insert 47 @type text: str 48 """ 49 self.position = position 50 self.text = text
51
52 - def __repr__(self):
53 return "Insertion(%d, %s)" % (self.position, self.text)
54
55 -class Deletion:
56 """ 57 Represents the deletion of only those characters present in the parent in 58 this range. Characters that have been inserted into the parent by 59 an Insertion are not affected. 60 """
61 - def __init__(self, position, length):
62 """ 63 @type position: int >= 0 64 @param position: the point, in the unmodified coordinates of the parent 65 node, at which to start deletion 66 @type length: int > 0 67 @param length: the number of characters to delete""" 68 self.position = position 69 self.length = length
70
71 - def __repr__(self):
72 return "Deletion(%d, %d)" % (self.position, self.length)
73
74 -class Removal:
75 """ 76 Not currently used. 77 78 Represents the deletion of all characters ever inserted in this range. 79 Insertions at the endpoints are _not_ included in the Removal, and must 80 be Removed separately. 81 """
82 - def __init__(self, position, length):
83 """ 84 @param position: the point at which to start Removal 85 @param length: the number of points in the unmodified parent coordinate to 86 the end of the Removal. Note that many more than length characters 87 can be removed if there are Insertions in this range.""" 88 self.position = position 89 self.length = length
90
91 - def __repr__(self):
92 return "Removal(%d, %d)" % (self.position, self.length)
93
94 -class Record:
95 """ 96 Each Record is used to store one Change inside the StringTree. 97 The purpose of a Record is contain both the Change itself, and any cached 98 information about the current effect of that Change. 99 """
100 - def __init__(self, change, depth):
101 """ 102 @type change: L{Change} 103 @type depth: int 104 @param depth: the number of parent edits between this edit and the root 105 of the change tree. This state is maintained because the editing 106 algorithm requires that the parent of a new Change not have any 107 remaining descendants that could serve as parent instead. 108 """ 109 self.change = change 110 self.depth = depth
111
112 - def __str__(self):
113 return "Record(%s, %d)" % (str(self.change), self.depth)
114 115 __repr__ = __str__
116
117 -def flatten(L):
118 """Disused. Recursively flattens nested lists.""" 119 o = [] 120 for x in L: 121 if isinstance(x,list): 122 o.extend(flatten(x)) #recursive 123 else: 124 o.append(x) 125 return o
126
127 -def my_rand():
128 """@return: a large random number that will fit into a dbus.Int64""" 129 return random.getrandbits(63)
130
131 -def translator(c, pack):
132 """A translator for L{Change}s""" 133 if pack: 134 if isinstance(c.edit, Insertion): 135 return dbus.Struct((dbus.Int64(c.unique_id), 136 dbus.Int64(c.parent), 137 dbus.Int16(0), 138 dbus.Int64(c.edit.position), 139 dbus.UTF8String(c.edit.text)), 140 signature='xxnxs') 141 elif isinstance(c.edit, Deletion): 142 return dbus.Struct((dbus.Int64(c.unique_id), 143 dbus.Int64(c.parent), 144 dbus.Int16(1), 145 dbus.Int64(c.edit.position), 146 dbus.Int64(c.edit.length)), 147 signature='xxnxx') 148 elif isinstance(c.edit, Removal): 149 return dbus.Struct((dbus.Int64(c.unique_id), 150 dbus.Int64(c.parent), 151 dbus.Int16(2), 152 dbus.Int64(c.edit.position), 153 dbus.Int64(c.edit.length)), 154 signature='xxnxx') 155 else: 156 raise Unimplemented 157 else: 158 if c[2] == 0: 159 ed = Insertion(int(c[3]),str(c[4])) 160 elif c[2] == 1: 161 ed = Deletion(int(c[3]),int(c[4])) 162 elif c[2] == 2: 163 ed = Removal(int(c[3]),int(c[4])) 164 else: 165 raise "unknown type" 166 return Change(int(c[0]), int(c[1]), ed)
167
168 -class EagerHideList:
169 """ 170 EagerHideList provides a list with hidden elements. The standard index 171 considers only the visible elements, but the 'all' index considers all 172 elements. The standard index of an invisible element is considered to be 173 the index of the next visible element. 174 175 EagerHideList has S{Theta}(N) time requirements for random insertion and 176 deletion. L{TreeHideList} can perform these operations in S{Theta}(log(N)) 177 time, and so is generally preferred. EagerHideList is no longer used by 178 SimpleStringTree."""
179 - def __init__(self):
180 self._sourcelist = [] 181 self._poslist = [] 182 self._posmap = {}
183 - def __len__(self):
184 return len(self._sourcelist)
185 - def __iter__(self):
186 return self._sourcelist.__iter__()
187 - def __getitem__(self, s):
188 return self._sourcelist[s]
189 - def index(self, item):
190 x = self._poslist[self._posmap[item]] 191 return x[2]
192 - def hide(self, position, length):
193 """ 194 Makes currently visible elements hidden. 195 @type position: int 196 @param position: a position in the standard index (i.e. excluding 197 hidden elements) 198 @type length: int > 0 199 @param length: the number of elements to hide 200 """ 201 pfirst = self._sourcelist[position] 202 plast = self._sourcelist[position+length-1] 203 ifirst = self._posmap[pfirst] 204 ilast = self._posmap[plast] 205 for i in xrange(ifirst, ilast+1): 206 L = self._poslist[i] 207 L[1] = False #No longer visible, if it was visible before 208 L[2] = position #collapse positions 209 for L in self._poslist[ilast+1:]: 210 L[2] -= length #move all subsequent character up by length 211 212 del self._sourcelist[position:(position+length)]
213 #self._check_invariants()
214 - def getitem_all(self, s):
215 if isinstance(s, int): 216 return self._poslist[s][0] 217 else: 218 return [x[0] for x in self._poslist[s]]
219 - def index_all(self, item):
220 return self._posmap[item]
221 - def is_visible(self, i):
222 return self._poslist[i][1]
223 - def is_visible_item(self, item):
224 return self._poslist[self._posmap[item]][1]
225 - def insert_sequence_all(self, position, sequence, visibility):
226 """Insert sequence with visibility into the all-coordinates at position""" 227 if position > 0: 228 psource = self._poslist[position][2] 229 else: 230 psource = 0 231 length = len(sequence) 232 newlist = [] 233 newlistsource = [] 234 i = psource 235 for elt, viz in zip(sequence, visibility): 236 newlist.append([elt, viz, i]) 237 if viz: 238 newlistsource.append(elt) 239 i += 1 240 self._poslist[position:position] = newlist 241 for i in xrange(position,position+length): 242 L = self._poslist[i] 243 self._posmap[L[0]] = i 244 num_viz = len(newlistsource) 245 for i in xrange(position+length,len(self._poslist)): 246 L = self._poslist[i] 247 L[2] += num_viz 248 self._posmap[L[0]] = i 249 self._sourcelist[psource:psource] = newlistsource
250 #self._check_invariants()
251 - def insert_sequence_leftof(self, target, sequence, visibility):
252 self.insert_sequence_all(self._posmap[target], sequence, visibility)
253
254 - def _check_invariants(self):
255 assert len(self._posmap) == len(self._poslist) 256 for i in xrange(len(self._poslist)): 257 assert self._posmap[self._poslist[i][0]] == i 258 if self._poslist[i][1]: 259 assert self._sourcelist[self._poslist[i][2]] == self._poslist[i][0] 260 if i > 0: 261 if self._poslist[i-1][1]: 262 assert self._poslist[i-1][2] + 1 == self._poslist[i][2] 263 else: 264 assert self._poslist[i-1][2] == self._poslist[i][2]
265
266 -class SimpleStringTree:
267 """ 268 a class implementing an editable string, represented internally as a 269 dependency tree of L{Change}s. 270 271 Purpose 272 ======= 273 274 The logical structure of a SimpleStringTree is derived from common version 275 control systems such as CVS. In a version control system, each edit is 276 represented as a patch, also known as a diff, that can be applied to the 277 previous version of a document to generate the next version. These patches 278 are typically generated on a line-by-line basis, with the line to be edited 279 described either by its absolute position, or by its position relative to 280 some "context" (i.e. nearby lines). 281 282 When two people submit patches that modify the same line, the two patches 283 are said to "conflict", because they cannot both be applied, at least not 284 while retaining a semblance of their original meaning. These conflicts are 285 reported to the user and must be resolved manually. 286 287 When editing computer source code, a conflict typically indicates that if 288 both edits are applied, the 289 program will crash. For human languages, however, the requirements are not 290 so strict. 291 292 In real-time collaborative text editing, conflicts can most 293 easily be resolved by applying both edits and then fixing the text within 294 the editor. This procedure only works if all editors see the same text 295 when a conflict occurs. Thus, the purpose of SimpleStringTree is to ensure 296 that, no matter what sequence of edits is made, all users who have seen all 297 edits are presented with the same string. 298 299 API 300 === 301 SimpleStringTree provides a "native" API based on insert() and delete(). 302 To ease testing, it additionally provides a number of methods such as seek() 303 and write() that mirror the API of python's StringIO module. These methods 304 are in place principally to enable direct testing against python's StringIO 305 module, which should produce identical results when these methods are used. 306 307 The name 308 ======== 309 SimpleStringTree is "Simple" in that it supports only Insertions and 310 Deletions. Handling References, while valuable, has proven quite difficult, 311 and so will not be addressed by this data structure. 312 313 Some code for handling Removals will be left in for the moment, since it is 314 easy enough, even though it is not presently used.""" 315
316 - def __init__(self, initstring=""):
317 self.ROOT = Record(Change(-1, -1, Insertion(0, "")), 0) 318 self._id2rec = dict() #unique_id: Record(change) | change.unique_id = unique_id 319 self._id2rec[-1] = self.ROOT 320 self._parent2children = defaultdict(set) # unique_id: {Record(change) | change.parent == unique_id} 321 self._cursor = 0 322 self._listing = aatree.AATreeHideList() 323 self._listing.insert_sequence_all(0,[(-1,0)],[False]) 324 if initstring: 325 self.insert(initstring, 0)
326
327 - def __repr__(self):
328 return "\n".join((str(v) for v in self._id2rec.values()))
329
330 - def getvalue(self, r = None):
331 if r is None: 332 r = self.ROOT 333 s = "".join(self._id2rec[x[0]].change.edit.text[x[1]] for x in self._listing) 334 return s
335
336 - def next(self):
337 raise
338
339 - def flush(self):
340 # This could be used to implement lazy evaluation of input by not 341 # re-evaluating the string until flush (or read?) 342 pass
343
344 - def close(self):
345 raise
346
347 - def read(self, size=float('inf')):
348 #Efficiency: This method should use size to avoid calling getvalue and rendering the entire string 349 s = self.getvalue() 350 outpoint = min(len(s), self._cursor + size) 351 inpoint = self._cursor 352 self._cursor = outpoint 353 return s[inpoint:outpoint]
354
355 - def readline(self, size=float('inf')):
356 #Efficiency: This method should use size to avoid rendering the whole string 357 s = self.getvalue() 358 outpoint = min(len(s), self._cursor + size) 359 inpoint = self._cursor 360 i = s.find("\n", inpoint, outpoint) 361 if i == -1 or i >= outpoint: 362 self._cursor = outpoint 363 return s[inpoint:outpoint] 364 else: 365 self._cursor = i + 1 366 return s[inpoint:(i+1)]
367
368 - def readlines(self, sizehint=None):
369 #Efficiency: use sizehint 370 s = self.getvalue() 371 t = s[self._cursor:] 372 self._cursor = len(s) 373 return t.splitlines(True)
374
375 - def seek(self, offset, whence=0):
376 if whence == 0: 377 self._cursor = offset 378 elif whence == 1: 379 self._cursor += offset 380 elif whence == 2: 381 self._cursor = len(self._listing) + offset
382
383 - def tell(self):
384 return self._cursor
385
386 - def truncate(self, size=None):
387 if size is None: 388 size = self._cursor 389 return self.delete(size, len(self._listing) - size)
390
391 - def write(self, text):
392 L = min(len(self._listing) - self._cursor, len(text)) 393 changelist = [] 394 if L > 0: 395 changelist.extend(self.delete(self._cursor, L)) 396 changelist.extend(self.insert(text)) 397 return changelist
398
399 - def writelines(self, sequence):
400 s = "".join(sequence) 401 return self.write(s)
402 403 # Non-filelike text editing methods 404
405 - def insert(self, text, k=None):
406 """ 407 Insert text into the string, without overwriting any existing text. 408 @type text: str 409 @param text: the text to insert 410 @type k: int >= 0 411 @param k: the location at which to insert the text. If k is None, 412 then the current seek position is used 413 @raise None: raises an error if k is greater than the length of the 414 string. 415 """ 416 if k is None: 417 k = self._cursor 418 419 if len(self._listing) == 0: 420 r = self.ROOT 421 uid = -1 422 inspoint = 0 423 elif k == 0: 424 (uid, inspoint) = self._listing[k] 425 r = self._id2rec[uid] 426 elif k < len(self._listing): 427 # When not inserting at the endpoints, we have to be sure to 428 # check if we are at the boundary between a parent and one of its 429 # descendants. If so, we must make our insertion in the descendant, 430 # not the parent, because the insertions would "conflict" (occur at 431 # the same location) in the parent, which would produce an ordering 432 # ambiguity, which will resolve in our favor only 50% of the time. 433 pL, pR = self._listing[(k-1):(k+1)] 434 (uidL, inspointL) = pL 435 (uidR, inspointR) = pR 436 rR = self._id2rec[uidR] 437 if uidL == uidR: #There's no boundary here at all (at least not one 438 # of any importance to us. Therefore, we have to insert to the 439 # left of the character at k, as usual. 440 r = rR 441 uid = uidR 442 inspoint = inspointR 443 else: #There's a boundary of some sort here. We always choose to 444 # insert at the deeper node (breaking left in case of a tie). 445 # (In the case that neither node is the ancestor of the other, 446 # either choice would be acceptable, regardless of depth. This 447 # logic is therefore acceptable, and has the advantage of being 448 # simple and fast.) 449 rL = self._id2rec[uidL] 450 if rR.depth > rL.depth: 451 r = rR 452 uid = uidR 453 inspoint = inspointR 454 else: 455 r = rL 456 uid = uidL 457 inspoint = inspointL + 1 458 elif k == len(self._listing): 459 (uid,i) = self._listing[k-1] 460 r = self._id2rec[uid] 461 inspoint = i + 1 462 else: 463 raise 464 465 e = Insertion(inspoint, text) 466 c = Change(my_rand(), r.change.unique_id, e) 467 self._add_change_treeonly(c) 468 target = (uid, inspoint) 469 self._insert_listonly(c.unique_id, target, len(text)) 470 self._cursor = k + len(text) 471 return [c]
472
473 - def delete(self, k, n):
474 """ 475 Starting at a point k (0-indexed), delete n characters 476 @type k: int 477 @param k: the position at which to begin deletion. The character at 478 position k will be deleted. 479 @type n: int 480 @param n: the number of characters to delete 481 @raise None: raises an error if the deletion extends beyond the end of 482 the string.""" 483 if k + n > len(self._listing): 484 raise 485 p = self._listing[k] 486 contigs = [[p[0],p[1],p[1]]] 487 for (uid, index) in self._listing[(k+1):(k+n)]: 488 #This logic produces deletions that span any missing chunks. This 489 #produces a smaller number of deletions than making sure that they 490 #are actually "contiguous", but it might interact badly with a 491 #hypothetical undo system. 492 if contigs[-1][0] == uid: 493 contigs[-1][2] = index 494 else: 495 contigs.append([uid,index,index]) 496 changelist = [Change(my_rand(), c[0], Deletion(c[1],1 + c[2]-c[1])) for c in contigs] 497 for c in changelist: 498 self._add_change_treeonly(c) 499 self._delete_listonly(k,n) 500 return changelist
501
502 - def get_range(self, rec, point, m):
503 todo = [(rec, point, m, None)] # None is a dummy value since p is unused 504 ranges = [] 505 while len(todo) > 0: 506 (rec, point, m, p) = todo[0] 507 h = self._range_helper(point, m) 508 self._step(h, rec) 509 if h.outpoint is not None: 510 ranges.append((rec, h.point_parent, h.outpoint - h.point_parent)) 511 #print rec, h.point_parent, h.outpoint - h.point_parent 512 todo.extend(h.todo) 513 #print todo 514 del todo[0] 515 return ranges
516
517 - def move(self, rempoint, n, inspoint):
518 """ 519 Move a substring. Equivalent to read(), delete(), and insert(). 520 In StringTree, move() would ideally coherently copy a section of text, 521 such that any subsequent edits appear in the new location, not the old. 522 In SimpleStringTree, this is not possible, so move() just falls back to 523 Deletion and Insertion. 524 @type rempoint: int 525 @param rempoint: the point at which to start deletion 526 @type n: int 527 @param n: the number of characters to move 528 @param inspoint: the position at which to insert""" 529 self.seek(rempoint) 530 t = self.read(n) 531 532 if rempoint > inspoint: 533 L = self.delete(rempoint,n) 534 L.extend(self.insert(t, inspoint)) 535 else: 536 L = self.insert(t, inspoint) 537 L.extend(self.delete(rempoint,n)) 538 return L
539 540 # Patch management methods 541
542 - def add_change(self, c):
543 """ 544 Apply a change to the string. 545 @type c: L{Change} 546 @param c: a L{Change} to make to the string. c.parent MUST already 547 have been added. 548 """ 549 if c.unique_id in self._id2rec: 550 return [] 551 if isinstance(c.edit, Insertion): 552 p = self._effective_parent(c.unique_id, c.parent, c.edit.position) 553 i = self._listing.index(p) 554 d = len(c.edit.text) 555 self._insert_listonly(c.unique_id, p, d) 556 flist = [Insertion(i, c.edit.text)] 557 elif isinstance(c.edit, Deletion): 558 flist = [] 559 start = None 560 end = None 561 for i in xrange(c.edit.position,c.edit.position + c.edit.length): 562 p = (c.parent, i) 563 if self._listing.is_visible_item(p): 564 q = self._listing.index(p) 565 if end == q: 566 end += 1 567 else: 568 if end is not None: 569 n = end - start 570 flist.append(Deletion(start,n)) 571 self._delete_listonly(start,n) 572 start = q 573 end = start + 1 574 if end is not None: # the last contig won't be handled by the loop 575 n = end - start 576 flist.append(Deletion(start,n)) 577 self._delete_listonly(start,n) 578 else: 579 raise 580 self._add_change_treeonly(c) 581 return flist
582
583 - def _effective_parent(self, uid, parentid, position):
584 """ 585 The effective parent of an insertion is defined as the (uid, loc) pair 586 that causes it to be inserted in the right location in the string. This 587 is only relevant in the event of a conflict, in which case the conflict 588 edits are required to be ordered from least uid to greatest uid. The 589 effective parent of a conflicted edit, then, is either the original pair, 590 or (u_next, 0), where u_next is the uid of the sibling directly to the right 591 of the input uid. That is to say, it is the least u greater than uid 592 that has the same position.""" 593 if parentid in self._parent2children: 594 u_next = None 595 for r in self._parent2children[parentid]: 596 u = r.change.unique_id 597 p = r.change.edit.position 598 if (p == position) and isinstance(r.change.edit, Insertion) and (u > uid): 599 if (u_next is None) or (u < u_next): 600 u_next = u 601 if u_next is not None: 602 return (u_next, 0) 603 return (parentid, position)
604
605 - def _delete_listonly(self, position, length):
606 """Given the position in _sourcelist, and the length of the deletion, 607 perform the deletion in the lists""" 608 self._listing.hide(position,length)
609
610 - def _insert_listonly(self, uid, target, length):
611 """Make a new insertion into the lists with uid and length at position 612 in _poslist""" 613 elts = [(uid,i) for i in xrange(length+1)] 614 visibility = [True] * length 615 visibility.append(False) 616 self._listing.insert_sequence_leftof(target, elts, visibility)
617
618 - def _add_change_treeonly(self, c):
619 if c.unique_id in self._id2rec: 620 return 621 d = self._id2rec[c.parent].depth + 1 622 r = Record(c,d) 623 self._id2rec[c.unique_id] = r 624 if c.parent not in self._parent2children: 625 self._parent2children[c.parent] = set() 626 self._parent2children[c.parent].add(r)
627
628 - def get_insert(self, k, n = 0, parent=None):
629 """ 630 FIXME: This method could be useful, but it no longer works 631 get_insert finds and returns the youngest insertion containing positions 632 k to k + n in the total coordinates of parent. If parent is unspecified, 633 then it is taken to be the root, meaning the coordinates in question 634 are the current document coordinates. 635 636 The return value is a tuple (rec, k_unmodified, k_modified), meaning 637 the record containing k and k + n, the position of k in rec's 638 unmodified coordinates, and the position of k in rec's 639 modified coordinates. 640 641 "containing" is defined so that a record with n characters 642 (labeled 0...n-1) can still be returned as containing k...k+n. In 643 other words, each Insertion contains both endpoints. However, an 644 Insertion that has been totally deleted is ignored entirely.""" 645 if parent is None: 646 parent = self.ROOT 647 648 h = self._get_insert_helper(k, n) 649 self._step(h, parent) 650 while h.child is not None: 651 parent = h.child 652 h = self._get_insert_helper(h.child_k, n) 653 self._step(h, parent) 654 655 return (parent, h.k_in_parent, h.k)
656
657 - def get_changes(self): #TODO: add arguments to get only changes in a certain range
658 """ 659 get_changes provides a depth-first topologically sorted list of all 660 Changes in this Tree. 661 @rtype: iterable(L{Change}) 662 @return: a list of changes to the tree, ordering according to a 663 topological sort of the dependency tree. This ordering guarantees 664 that no change appears until after all of its dependencies (i.e. 665 ancestors).""" 666 L = [] 667 stack = [-1] 668 while stack: 669 x = self._id2rec[stack.pop()].change 670 L.append(x) 671 if x.unique_id in self._parent2children: 672 stack.extend(r.change.unique_id 673 for r in self._parent2children[x.unique_id]) 674 return L
675
676 - def is_ready(self, c):
677 """ 678 Determin whether a Change c may safely be added 679 to the Tree. Specifically, it checks whether c.parent is already known. 680 @rtype: bool 681 @return: c.parent is already present in the tree.""" 682 return c.parent in self._id2rec
683