Attachment 'e2pathwalker.py'
#!/usr/bin/env python

# Author: Ian Rees (ian.rees@bcm.edu), 08/01/2010
# Copyright (c) 2000-2011 Baylor College of Medicine

# This software is issued under a joint BSD/GNU license. You may use the
# source code in this file under either license. However, note that the
# complete EMAN2 and SPARX software packages have some GPL dependencies,
# so you are responsible for compliance with the licenses of these packages
# if you opt to use BSD licensing. The warranty disclaimer below holds
# in either instance.
#
# This complete copyright notice must be included in any revised version of the
# source code. Additional authorship citations may be added, but existing
# author citations must be preserved.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

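# Overview (summary added for readability; see main() below for the options it
# refers to): e2pathwalker reads C-alpha (or other pseudoatom) coordinates from
# a PDB file, builds a graph whose edge weights penalize deviation from the
# ideal Ca-Ca distance (--average, bounded by --dmin/--dmax), and searches for
# a path visiting every atom, either exhaustively or by writing a TSPLIB
# problem and calling an external TSP solver (Concorde or LKH). The resulting
# path(s) are scored, Ca pseudo-Ramachandran angles are computed, and output is
# written as a multi-MODEL PDB file plus a JSON summary.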

import EMAN2
import collections
import math
import random
import sys
import optparse
import os
import commands
import operator
import copy
import re
import numpy
import json



def cross_product(a, b):
    """Cross product of two 3-d vectors. from http://www-hep.colorado.edu/~fperez/python/python-c/weave_examples.html"""
    cross = [0]*3
    cross[0] = a[1]*b[2] - a[2]*b[1]
    cross[1] = a[2]*b[0] - a[0]*b[2]
    cross[2] = a[0]*b[1] - a[1]*b[0]
    return numpy.array(cross)
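# For example (standard unit vectors): cross_product((1, 0, 0), (0, 1, 0))
# returns array([0, 0, 1]).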


def norm_vector(a, b):
    """Unit vector pointing from b towards a."""
    v1 = (a[0]-b[0], a[1]-b[1], a[2]-b[2])
    lengthV = math.sqrt(v1[0]**2 + v1[1]**2 + v1[2]**2)
    vec = (v1[0]/lengthV, v1[1]/lengthV, v1[2]/lengthV)
    return vec



class PathWalker(object):

    def __init__(self, filename=None, start=None, end=None, edgefile=None, edges=None, dmin=2.0, dmax=4.2, average=3.78, atomtype='CA', chain=None, noise=0, solver=False):

        # Run parameters
        self.dmin = dmin
        self.dmax = dmax
        self.average = average
        self.filename = filename
        self.basename = self.get_basename()
        self.chain = chain
        self.atomtype = atomtype
        self.noise = noise
        self.solver = solver

        if self.atomtype in ['all', 'None', '']:
            self.atomtype = None

        # Read PDB file
        # pointmap is a mapping from the PDB atom # to the Path atom #
        self.pointmap, self.points = self.read_pdb(atomtype=self.atomtype, chain=self.chain)

        # Point graph
        self.itree = {}

        # Calculated distances
        self.distances = {}
        self.weighted = {}

        if self.average > self.dmax:
            self.dmax = self.average
        if self.average < self.dmin:
            self.dmin = self.average

        for i in self.points.keys():
            self.itree[i] = set()

        for count1, point1 in self.points.items():
            for count2, point2 in self.points.items():
                d, w = self.calcweight(point1, point2)
                self.distances[(count1, count2)] = d
                self.weighted[(count1, count2)] = w
                if self.cutoff(d):
                    self.itree[count1].add(count2)
                # print count1, count2, self.distances[(count1, count2)], self.weighted[(count1, count2)]

        # Read an edge fragment file... 1 string of points per line, separated by spaces
        self.fixededges = self.read_fixed(edgefile)

        # ... add any additional edges, and/or start+end
        if edges:
            self.fixededges.extend(edges)

        self.start = min(self.points)
        self.end = max(self.points)
        if start != None:
            self.start = start
        if end != None:
            self.end = end

        print "Note: linking start/end: ", self.start, self.end

        self.fixededges.append((self.start, self.end))

        # Process forced edges
        for link in self.fixededges:
            self.itree[link[0]].add(link[1])
            self.itree[link[1]].add(link[0])
            self.weighted[(link[0], link[1])] = 0
            self.weighted[(link[1], link[0])] = 0

        # Some useful statistics
        self.endpoints = self.find_endpoints()
        self.branches = self.find_branches()
        self.total_branches = len(self.branches)
        self.total_endpoints = len(self.endpoints)
        self.total_nodes = len(self.itree)
        self.total_edges = len(self.distances)


    def run(self):
        # Write the processed PDB file
        self.stats()

        self.write_pdb(self.points.keys(), tree=self.itree)

        if not self.solver:
            return

        ret = self.get_json()
        # print ret

        # Solve paths
        ret['paths'] = self.solve(solver=self.solver)

        # We need to rotate the path so the start atom is first..
        paths = ret.get('paths', {})
        for pathid in sorted(paths.keys()):
            print "--- Evaluating path %s of %s ---"%(pathid, len(paths))
            d = paths[pathid]

            # Rotate the path so the starting element is first in the list
            rot = d['path'].index(self.start or d['path'][0])
            path = collections.deque(d['path'])
            path.rotate(-rot)
            d['path'] = list(path)

            # Evaluate path and calculate CA ramachandran angles
            d.update(self.evaluate_path(d['path']))
            d.update(self.ca_ram(d['path']))

        # Write output
        self.write_pdbs(paths, filename=self.basename+".out.pdb")
        self.write_json(ret)


    def read_fixed(self, edgefile):
        # Edge file format:
        # 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37
        # 39 40 41 42 43
        # ...
        fixededges = []
        if not edgefile:
            return fixededges
        f = open(edgefile)
        fragments = [i.strip() for i in f.readlines()]
        f.close()
        for fragment in fragments:
            fragment = map(int, fragment.split())
            for i in range(len(fragment)-1):
                fixededges.append((fragment[i], fragment[i+1]))
        return fixededges


    def get_json(self):
        keys = [
            'total_nodes',
            'total_edges',
            'total_branches',
            'total_endpoints',
            'edges_fixed',
            'noise',
            'filename',
            'basename',
            'dmax',
            'dmin',
            'average',
            'chain',
            'atomtype',
            'solver',
            'start',
            'end',
            'fixededges',
            'endpoints',
            'branches']

        ret = {}
        for key in keys:
            i = getattr(self, key, None)
            if i != None:
                ret[key] = i

        return ret


    def stats(self):
        print "\n=== Statistics for %s ==="%self.filename
        print "Total Nodes: %s"%len(self.itree)
        print "Total Edges: %s"%len(self.distances)

        print "Endpoints: %s"%self.total_endpoints
        print "\t", self.endpoints

        print "Branches: %s"%self.total_branches
        print "\t", self.branches

        print "Edges:"
        for k in sorted(self.points.keys()):
            print "\t%-10s%s"%(k, self.itree[k])


    def solve(self, solver="exhaustive"):
        solver = solver.lower()
        method = getattr(self, '_solve_%s'%solver, None)
        if method:
            return method()
        # else:
        #     raise ValueError, "Unknown solver: %s"%solver
        return {}



    # def children(start, tree, minlength=1):
    def _solve_exhaustive(self, minlength=1):

        minlength = len(self.itree) - 1
        print "\n=== Exhaustive Search ==="

        endpaths = set()
        to_crawl = collections.deque()
        to_crawl.append((self.start,))

        while to_crawl:
            current = to_crawl.pop()
            children = self.itree.get(current[-1], set())
            # print endpaths

            if not children - set(current) and len(current) > minlength:
                print "Found path of len", len(current)
                endpaths.add(current)
                continue

            for child in children:
                if child not in current:
                    to_crawl.append(current + (child,))

        ret = {}
        for count, path in enumerate(endpaths):
            ret[count] = {'path':path, 'solver':'exhaustive', 'score':0}

        return ret


    def _solve_concorde(self):
        print "\n=== Solving TSP with Concorde ==="

        tspfile = self.basename+".tsp"
        outfile = self.basename+".out"

        print "TSPLib file: %s"%tspfile
        print "Results file: %s"%outfile

        self.write_tsplib(filename=tspfile)
        os.system("concorde -x -m -o %s %s"%(outfile, tspfile))

        return self.readtour_concorde(outfile)


    def _solve_lkh(self):
        print "\n=== Solving TSP with LKH ==="

        tspfile = self.basename+".tsp"
        lkhfile = self.basename+".lkh"
        outfile = self.basename+".out"

        lkh = """PROBLEM_FILE = %s\nOUTPUT_TOUR_FILE = %s\nPRECISION = 100\n"""%(tspfile, outfile)
        f = open(lkhfile, "w")
        f.write(lkh)
        f.close()

        self.write_tsplib(filename=tspfile)
        os.system("LKH %s"%(lkhfile))

        return self.readtour_lkh(outfile)


    def readtour_lkh(self, tourfile):
        # TSP Tour file format..
        f = open(tourfile)
        r = [i.strip() for i in f.readlines()]
        f.close()

        score_tsp = filter(lambda i:i.startswith('COMMENT : Length'), r)
        if score_tsp:
            score_tsp = score_tsp[0].partition('=')[2].strip()
            score_tsp = float(score_tsp)

        path = map(int, r[r.index("TOUR_SECTION")+1:r.index("-1")])
        path = [self.pointmap[i] for i in path]

        # Save output
        ret = {
            'path': path,
            'score': score_tsp,
            'solver': 'lkh'
        }
        return {0:ret}


    def readtour_concorde(self, tourfile):
        # Concorde Tour file format:
        # numpoints
        # point1, point2, point3...
        f = open(tourfile)
        r = f.readlines()
        f.close()

        # Skip the first line (point count), flatten the remaining rows of
        # 0-based node indices, and convert to 1-based point numbers.
        path = [map(int, i.split()) for i in r][1:]
        path = reduce(operator.concat, path)
        path = [i+1 for i in path]

        # Save output
        ret = {
            'path': path,
            'score': 0,
            'solver': 'concorde'
        }
        return {0:ret}


    def evaluate_path(self, path=None):

        print "\n=== Evaluating Path ==="

        def fmt(*args):
            r = ['%-10s'%i for i in args]
            return "".join(r)

        print fmt("PDB Edge", "", "Distance", "Weight")
        breaks = []

        pmin = min(path)
        pmax = max(path)
        distances = []
        r = []
        for i in range(len(path)-1):
            p1 = path[i]
            p2 = path[i+1]
            d1 = '%0.2f'%self.distances.get((p1, p2))
            d2 = self.weighted.get((p1, p2))
            distances.append(self.distances.get((p1, p2)))

            if (path[i] in [pmin, pmax] or path[i] in [path[i+1]+1, path[i+1]-1]):
                r.append(True)
                b = ""
            else:
                r.append(False)
                breaks.append((d1, d2))
                b = "BREAK!"

            print fmt(path[i], path[i+1], p1, p2, d1, d2, b)

        r2 = []
        j = 0
        for i in range(len(r)-1):
            if r[i]==True and r[i+1]==True: j += 1
            else: j = 0
            r2.append(j)

        rmax = max(r2)
        score_fragment = float(rmax) / float(len(r2))

        correctbonds = len(path)-len(breaks)
        score_path = float(correctbonds) / len(path)

        print "\nPath quality statistics:"
        print "\tNoise: %s"%self.noise
        print "\tLongest continuous fragment score: %s"%score_fragment
        print "\tNumber of breaks: %s out of %s bonds"%(len(breaks), len(path))
        print "\tPath score: %0.5f"%(score_path)
        print "\tMinimum distance:", min(distances)
        print "\tMaximum distance:", max(distances)

        # Save output
        ret = {
            'path_length': len(path),
            'path_breaks': len(breaks),
            'score_fragment': score_fragment,
            'score_path': score_path,
            'dmin': min(distances),
            'dmax': max(distances)
        }
        return ret


    def cutoff(self, d):
        return self.dmin <= d <= self.dmax



    def calcweight(self, point1, point2):
        d = self.distance(point1, point2)
        if self.cutoff(d):
            w = int(math.fabs(d-self.average)*100)
            # w = 0
        else:
            w = int((math.fabs(d-self.average)*100)**2)
            if w > 100000:
                w = 100000
        return d, w
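    # For example (illustrative numbers, with the default average of 3.78):
    # a 3.90 A edge inside the cutoff gives w = int(|3.90-3.78|*100) = 12,
    # while a 5.00 A edge outside the cutoff gives w = int((|5.00-3.78|*100)**2) = 14884,
    # so edges far from the ideal Ca-Ca distance are heavily penalized (capped at 100000).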



    def distance(self, point1, point2):
        return math.sqrt(sum([(x[0]-x[1])**2 for x in zip(point1, point2)]))



    def find_endpoints(self):
        """Return points with exactly one connection as possible termini; note: a circular path may have no such endpoints."""
        return [k for k,v in self.itree.items() if len(self.itree.get(k)) == 1]



    def find_branches(self):
        """Return points with > 2 connections"""
        return [k for k,v in self.itree.items() if len(self.itree.get(k)) > 2]


    def get_basename(self):
        basename = os.path.basename(self.filename).split(".")
        basename = ".".join(basename[:basename.index("pdb")])
        r = r"%s-(\d+).tsp"%basename
        r = re.compile(r)
        runs = map(int, [(r.findall(i) or [0])[0] for i in os.listdir(".")])
        if not runs:
            runcount = 0
        else:
            runcount = max(runs)+1

        return "%s-%s"%(basename, runcount)
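    # For example, running on "model.pdb" in a directory that already contains
    # "model-0.tsp" and "model-1.tsp" would yield the basename "model-2"
    # ("model.pdb" here is only an illustrative filename).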



    def write_json(self, soln, filename=None):
        outfile = filename or self.basename + ".json"
        print "Writing solution dictionary to %s"%outfile
        f = open(outfile, 'w')
        json.dump(soln, f, indent=1)
        f.close()


    def write_pdb(self, path, filename=None, tree=None):
        d = {0:{'path':path}}
        return self.write_pdbs(d, filename=filename, tree=tree)



    def write_pdbs(self, paths, filename=None, tree=None):
        outfile = filename or self.basename + ".pdb"
        out = open(outfile, "w")
        chain = "A"

        print "\n=== Writing %s ==="%outfile

        for pathid in sorted(paths.keys()):
            d = paths[pathid]
            path = d['path']
            count = 1
            out.write(
                "MODEL %4d\n"%pathid
            )
            for atom in path:
                point = self.points.get(atom)
                out.write(
                    "ATOM %6d C ALA %s%4d %8.3f%8.3f%8.3f %5.2f 0 S_00 0\n"
                    %(atom, chain, atom, point[0], point[1], point[2], len(self.itree.get(atom, [])))
                )
                count += 1
            out.write("""TER %6d ALA %s%4d\n"""%(count, chain, atom))

            if tree:
                connected = []
                count = 0
                for k,v in tree.items():
                    for v2 in v:
                        if (k,v2) in connected or (v2,k) in connected:
                            continue
                        count += 1
                        out.write('CONECT %4d %4d\n'%(k, v2))
                        connected.append((k,v2))
                print "Wrote %s edges"%count

            # out.write('CONECT %4d %4d\n'%(atoms[0], atoms[-1]))
            out.write("ENDMDL\n")

            print "Wrote %s points"%len(path)

        out.write("END")
        out.close()


    def makenoise(self):
        return random.gauss(self.average, self.noise) - self.average


    def read_pdb(self, atomtype=None, chain=None):
        print "\n=== Reading %s (atom type %s, chain %s) ==="%(self.filename, atomtype, chain)
        points = {}
        pointmap = {}

        pdbfile = open(self.filename, "r")
        lines = pdbfile.readlines()
        pdbfile.close()

        atomtypes = set()
        chains = set()
        for line in (i for i in lines if i.startswith("ATOM ")):
            atomtypes.add(line[12:15].strip() or None)
            chains.add(line[21].strip() or None)

        print "Found chains %s and atomtypes %s"%(chains, atomtypes)
        if len(atomtypes) == 1:
            print "Only one atomtype present; using all atoms in PDB file"
            atomtype = None

        if len(chains) == 1:
            print "Only one chain present"
            chain = None

        count = 1
        for line in (i for i in lines if i.startswith("ATOM ")):
            latomtype = line[12:15].strip()
            lchain = line[21].strip() or None

            if atomtype and atomtype != latomtype:
                continue
            if chain and chain != lchain:
                continue

            pos = int(line[23:27])
            x = [float(line[30:38].strip()), float(line[38:46].strip()), float(line[46:54].strip())]
            points[pos] = tuple([i+self.makenoise() for i in x])
            pointmap[count] = pos
            count += 1

        if len(points) == 0:
            raise Exception, "No atoms found in PDB file! Are chain and atomtype correct?"

        print "Loaded %s C-a points"%len(points)

        return pointmap, points
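    # A typical PDB ATOM record, for reference (purely illustrative coordinates):
    #   ATOM      1  CA  ALA A   1      11.104  13.207   2.100  1.00  0.00
    # The parser above relies on the fixed-column layout: atom name from
    # line[12:15], chain ID from line[21], residue number from line[23:27],
    # and x/y/z from line[30:38], line[38:46], line[46:54].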



    def write_tsplib(self, filename=None):
        filename = filename or self.basename+".tsp"
        fout = open(filename, "w")

        # TSPLib format header
        header = [
            "NAME: %s"%self.filename,
            "TYPE: TSP",
            "COMMENT: %s"%self.filename,
            "DIMENSION: %s"%len(self.points),
            "EDGE_WEIGHT_TYPE: EXPLICIT",
            "EDGE_WEIGHT_FORMAT: FULL_MATRIX",
            ""
        ]

        fout.write("\n".join(header))

        # print "POINTMAP"
        # print self.pointmap

        if self.fixededges:
            fout.write("FIXED_EDGES_SECTION\n")
            for i in self.fixededges:
                fout.write("%s %s\n"%(i[0], i[1]))
            fout.write("-1\n")

        fout.write("EDGE_WEIGHT_SECTION\n")

        keyorder = sorted(self.points.keys())
        for point1 in keyorder:
            row = []
            for point2 in keyorder:
                d = self.weighted.get((point1, point2))
                row.append(d)

            fout.write("%s\n"%" ".join(map(str, row)))

        fout.write("EOF")
        fout.close()

        return filename
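    # The emitted file follows the TSPLIB format; its header looks roughly like
    # this (filename and dimension are examples only):
    #   NAME: model.pdb
    #   TYPE: TSP
    #   COMMENT: model.pdb
    #   DIMENSION: 250
    #   EDGE_WEIGHT_TYPE: EXPLICIT
    #   EDGE_WEIGHT_FORMAT: FULL_MATRIX
    #   FIXED_EDGES_SECTION
    #   ...
    #   EDGE_WEIGHT_SECTION
    #   ...
    #   EOF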



    def ca_ram(self, path, filename=None):

        print "\n=== C-a Ramachandran ==="

        points = map(self.points.get, path)
        # filename = filename or self.basename+".out.angles"

        out = []
        index = 1
        while index < len(points)-2:
            # calculate Ca-Ca*-Ca angle
            v1 = norm_vector(points[index-1], points[index])
            v2 = norm_vector(points[index+1], points[index])
            dp = numpy.dot(v1, v2)
            if dp > 1:
                dp = 1
            if dp < -1:
                dp = -1
            CaCaCa = math.acos(dp)*(180/math.pi)
            # print index+1, CaCaCa,

            # calculate Ca-Ca*-Ca-Ca torsion
            v1 = norm_vector(points[index-1], points[index])
            v2 = norm_vector(points[index+1], points[index])
            xp1 = cross_product(v1, v2)
            lengthxp1 = math.sqrt(xp1[0]**2+xp1[1]**2+xp1[2]**2)
            xp1n = (xp1[0]/lengthxp1, xp1[1]/lengthxp1, xp1[2]/lengthxp1)

            v3 = norm_vector(points[index], points[index+1])
            v4 = norm_vector(points[index+2], points[index+1])
            xp2 = cross_product(v3, v4)
            lengthxp2 = math.sqrt(xp2[0]**2+xp2[1]**2+xp2[2]**2)
            xp2n = (xp2[0]/lengthxp2, xp2[1]/lengthxp2, xp2[2]/lengthxp2)

            dpxps = numpy.dot(xp2n, xp1n)
            if dpxps > 1:
                dpxps = 1
            if dpxps < -1:
                dpxps = -1
            CaCaCaCa = math.acos(dpxps)*(180/math.pi)
            # print CaCaCa, CaCaCaCa
            out.append((CaCaCa, CaCaCaCa))
            index = index + 1

        # f = open(filename, 'w')
        # for i in out:
        #     f.write("%s %s\n"%(i[0],i[1]))
        # f.close()
        return {'ca_angles': out}



    # parse helices
    # a = [i.strip().split() for i in helix.split("\n")]
    # for i in b: print " ".join(map(str,range(int(i[5]), int(i[8])+1)))


def main():
    usage = """%prog [options] <pdb file>
Find paths between two atoms in a PDB model
"""

    parser = optparse.OptionParser(usage=usage, version=EMAN2.EMANVERSION)
    parser.add_option("--start", type="int", help="Start ATOM")
    parser.add_option("--end", type="int", help="End ATOM")
    parser.add_option("--average", type="float", help="Average Ca-Ca length", default=3.78)
    parser.add_option("--dmin", type="float", help="Minimum Ca-Ca length", default=2.0)
    parser.add_option("--dmax", type="float", help="Maximum Ca-Ca length", default=4.2)
    parser.add_option("--noise", type="float", help="Add Gaussian Noise", default=0.0)
    parser.add_option("--solver", type="str", help="Run TSP Solver: concorde or lkh")
    parser.add_option("--atomtype", type="str", help="Load Atom Type. Default: 'CA'. Options: 'C' or 'all'", default="CA")
    parser.add_option("--chain", type="str", help="Load Chain. Default: load all chains")
    parser.add_option("--edgefile", type="str", help="Load fixed fragment file; one sequence of forced connections per line, separated by spaces.")
    parser.add_option("-e", "--edge", action="append", help="Forced edge: e.g. -e1,3")
    parser.add_option("--iterations", type="int", help="Iterations", default=1)
    parser.add_option("--verbose", "-v", dest="verbose", action="store", metavar="n", type="int", default=0, help="verbose level [0-9], higher number means higher level of verboseness")

    (options, args) = parser.parse_args()

    if not args:
        parser.error("A PDB file is required")

    filename = args[0]

    # Parse any forced edges given with -e/--edge, e.g. "-e 1,3" -> (1, 3)
    edges = []
    for e in (options.edge or []):
        edges.append(tuple(map(int, e.split(","))))

    for j in range(options.iterations):
        print "Iteration:", j

        pw = PathWalker(
            filename=filename,
            start=options.start,
            end=options.end,
            edgefile=options.edgefile,
            edges=edges,
            dmin=options.dmin,
            dmax=options.dmax,
            average=options.average,
            noise=options.noise,
            atomtype=options.atomtype,
            chain=options.chain,
            solver=options.solver
        )
        pw.run()



# If executed as a program
if __name__ == '__main__':
    main()
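
# Example invocations (illustrative; "model.pdb" and "helices.txt" are placeholder filenames):
#   python e2pathwalker.py model.pdb --solver lkh
#   python e2pathwalker.py model.pdb --solver concorde --start 10 --end 200 --dmax 4.5
#   python e2pathwalker.py model.pdb --solver lkh --edgefile helices.txt --atomtype CA --chain A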