Main Page   Alphabetical List   Compound List   File List   Compound Members   File Members   Related Pages  

pdbplugin.c

Go to the documentation of this file.
00001 /***************************************************************************
00002  *cr
00003  *cr            (C) Copyright 1995-2006 The Board of Trustees of the
00004  *cr                        University of Illinois
00005  *cr                         All Rights Reserved
00006  *cr
00007  ***************************************************************************/
00008 
00009 /***************************************************************************
00010  * RCS INFORMATION:
00011  *
00012  *      $RCSfile: pdbplugin.c,v $
00013  *      $Author: johns $       $Locker:  $             $State: Exp $
00014  *      $Revision: 1.65 $       $Date: 2006/03/01 19:55:24 $
00015  *
00016  ***************************************************************************/
00017 
00018 /*
00019  * PDB file format specifications:
00020  *   http://www.rcsb.org/pdb/static.do?p=file_formats/pdb/index.html
00021  */
00022 
#include "largefiles.h"   /* platform dependent 64-bit file I/O defines */

#include "molfile_plugin.h"
#include "readpdb.h"
#include "periodic_table.h"
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
00031 
00032 /*
00033  * API functions start here
00034  */
00035 
00036 typedef struct {
00037   FILE *fd;
00038   int first_frame;
00039   int natoms;
00040   molfile_atom_t *atomlist;
00041   molfile_metadata_t *meta;
00042   int nconect;
00043   int nbonds, maxbnum;
00044   int *from, *to, *idxmap;
00045 } pdbdata;
00046 
00047 static void *open_pdb_read(const char *filepath, const char *filetype, 
00048     int *natoms) {
00049   FILE *fd;
00050   pdbdata *pdb;
00051   char pdbstr[PDB_BUFFER_LENGTH];
00052   int indx, nconect;
00053 
00054   fd = fopen(filepath, "r");
00055   if (!fd) 
00056     return NULL;
00057   pdb = (pdbdata *)malloc(sizeof(pdbdata));
00058   pdb->fd = fd;
00059   pdb->meta = (molfile_metadata_t *) malloc(sizeof(molfile_metadata_t));
00060   memset(pdb->meta, 0, sizeof(molfile_metadata_t));
00061 
00062   pdb->meta->remarklen = 0;
00063   pdb->meta->remarks = NULL;
00064 
00065   *natoms=0;
00066   nconect=0;
00067   do {
00068     indx = read_pdb_record(pdb->fd, pdbstr);
00069     if (indx == PDB_ATOM) {
00070       *natoms += 1;
00071     } else if (indx == PDB_CONECT) {
00072       nconect++;
00073     } else if (indx == PDB_HEADER) {
00074       get_pdb_header(pdbstr, pdb->meta->accession, pdb->meta->date, NULL);
00075       if (strlen(pdb->meta->accession) > 0) 
00076         strcpy(pdb->meta->database, "PDB");
00077     } else if (indx == PDB_REMARK || indx == PDB_CONECT || indx == PDB_UNKNOWN) {
00078       int len=strlen(pdbstr);
00079       int newlen = len + pdb->meta->remarklen;
00080 
00081       char *newstr=realloc(pdb->meta->remarks, newlen + 1);
00082       if (newstr != NULL) {
00083         pdb->meta->remarks = newstr;
00084         pdb->meta->remarks[pdb->meta->remarklen] = '\0';
00085         memcpy(pdb->meta->remarks + pdb->meta->remarklen, pdbstr, len);
00086         pdb->meta->remarks[newlen] = '\0';
00087         pdb->meta->remarklen = newlen;
00088       }
00089     }
00090  
00091   } while (indx != PDB_END && indx != PDB_EOF);
00092 
00093   /* If no atoms were found, this is probably not a PDB file! */
00094   if (!*natoms) {
00095     fprintf(stderr, "PDB file '%s' contains no atoms.\n", filepath);
00096     if (pdb->meta->remarks != NULL)
00097       free(pdb->meta->remarks);
00098     if (pdb->meta != NULL)
00099       free(pdb->meta);
00100     free(pdb);
00101     return NULL;
00102   }
00103 
00104   rewind(pdb->fd); /* if ok, rewind file and prepare to parse it for real */
00105   pdb->natoms = *natoms;
00106   pdb->nconect = nconect;
00107   pdb->nbonds = 0;
00108   pdb->maxbnum = 0;
00109   pdb->from = NULL;
00110   pdb->to = NULL;
00111   pdb->idxmap = NULL;
00112   pdb->atomlist = NULL;
00113 
00114 #if defined(VMDUSECONECTRECORDS)
00115   /* allocate atom index translation table if we have 99,999 atoms or less */
00116   /* and we have conect records to process                                 */
00117   if (pdb->natoms < 100000 && pdb->nconect > 0) {
00118     pdb->idxmap = (int *) malloc(100000 * sizeof(int));
00119     memset(pdb->idxmap, 0, 100000 * sizeof(int));
00120   }
00121 #endif
00122  
00123   return pdb; 
00124 }
00125 
00126 static int read_pdb_structure(void *mydata, int *optflags, 
00127     molfile_atom_t *atoms) { 
00128   pdbdata *pdb = (pdbdata *)mydata;
00129   molfile_atom_t *atom;
00130   char pdbrec[PDB_BUFFER_LENGTH];
00131   int i, rectype, atomserial, pteidx;
00132   char ridstr[8];
00133   char elementsymbol[3];
00134   int badptecount = 0;
00135   long fpos = ftell(pdb->fd);
00136 
00137   *optflags = MOLFILE_INSERTION | MOLFILE_OCCUPANCY | MOLFILE_BFACTOR |
00138               MOLFILE_ALTLOC | MOLFILE_ATOMICNUMBER | MOLFILE_BONDSSPECIAL;
00139 
00140   i = 0;
00141   do {
00142     rectype = read_pdb_record(pdb->fd, pdbrec);
00143     switch (rectype) {
00144     case PDB_ATOM:
00145       atom = atoms+i;
00146       get_pdb_fields(pdbrec, strlen(pdbrec), &atomserial, 
00147           atom->name, atom->resname, atom->chain, atom->segid, 
00148           ridstr, atom->insertion, atom->altloc, elementsymbol,
00149           NULL, NULL, NULL, &atom->occupancy, &atom->bfactor);
00150 
00151       if (pdb->idxmap != NULL && atomserial < 100000) {
00152         pdb->idxmap[atomserial] = i; /* record new serial number translation */ 
00153       }
00154  
00155       atom->resid = atoi(ridstr);
00156 
00157       /* determine atomic number from the element symbol */
00158       pteidx = get_pte_idx_from_string(elementsymbol);
00159       atom->atomicnumber = pteidx;
00160       if (pteidx != 0) {
00161         atom->mass = get_pte_mass(pteidx);
00162         atom->radius = get_pte_vdw_radius(pteidx);
00163       } else {
00164         badptecount++; /* unrecognized element */
00165       }
00166  
00167       strcpy(atom->type, atom->name);
00168       i++;
00169       break;
00170 
00171     case PDB_CONECT:
00172       /* only read CONECT records for structures where we know they can */
00173       /* be valid for all of the atoms in the structure                 */
00174       if (pdb->idxmap != NULL) {
00175         get_pdb_conect(pdbrec, pdb->natoms, pdb->idxmap, 
00176                        &pdb->maxbnum, &pdb->nbonds, &pdb->from, &pdb->to);
00177       }
00178       break;
00179 
00180     default:
00181       /* other record types are ignored in the structure callback */
00182       /* and are dealt with in the timestep callback or elsewhere */
00183       break;
00184     }
00185   } while (rectype != PDB_END && rectype != PDB_EOF);
00186 
00187   fseek(pdb->fd, fpos, SEEK_SET);
00188 
00189   /* if all atoms are recognized, set the mass and radius flags too,  */
00190   /* otherwise let VMD guess these for itself using it's own methods  */
00191   if (badptecount == 0) {
00192     *optflags |= MOLFILE_MASS | MOLFILE_RADIUS;
00193   }
00194 
00195   return MOLFILE_SUCCESS;
00196 }
00197 
00198 static int read_bonds(void *v, int *nbonds, int **fromptr, int **toptr, float **
00199 bondorder) {
00200   pdbdata *pdb = (pdbdata *)v;
00201   
00202   *nbonds = 0;
00203   *fromptr = NULL;
00204   *toptr = NULL;
00205   *bondorder = NULL; /* PDB files don't have bond order information */
00206 
00207 // The newest plugin API allows us to return CONECT records as 
00208 // additional bonds above and beyond what the distance search returns.
00209 // Without that feature, we otherwise have to check completeness and
00210 // ignore them if they don't look to be fully specified for this molecule
00211 #if !defined(MOLFILE_BONDSSPECIAL)
00212   if (pdb->natoms >= 100000) {
00213     printf("pdbplugin) Warning: more than 99,999 atoms, ignored CONECT records\n");
00214     return MOLFILE_SUCCESS;
00215   } else if (((float) pdb->nconect / (float) pdb->natoms) <= 0.85) {
00216     printf("pdbplugin) Warning: Probable incomplete bond structure specified,\n");
00217     printf("pdbplugin)          ignoring CONECT records\n");
00218     return MOLFILE_SUCCESS;
00219   } else if (pdb->nconect == 0) {
00220     return MOLFILE_SUCCESS;
00221   }
00222 #endif
00223 
00224   *nbonds = pdb->nbonds;
00225   *fromptr = pdb->from;
00226   *toptr = pdb->to;
00227 
00228   return MOLFILE_SUCCESS;
00229 }
00230 
00231 
00232 /* 
00233  * 
00234  */
00235 static int read_next_timestep(void *v, int natoms, molfile_timestep_t *ts) {
00236   pdbdata *pdb = (pdbdata *)v;
00237   char pdbstr[PDB_BUFFER_LENGTH];
00238   int indx, i;
00239   float *x, *y, *z;
00240   float occup, bfac;
00241   if (pdb->natoms == 0) 
00242     return MOLFILE_ERROR; /* EOF */
00243   if (ts) {
00244     x = ts->coords;
00245     y = x+1;
00246     z = x+2;
00247   } else {
00248     x = y = z = 0;
00249   } 
00250   i = 0;
00251   do {
00252     indx = read_pdb_record(pdb->fd, pdbstr);
00253     if((indx == PDB_END || indx == PDB_EOF) && (i < pdb->natoms)) {
00254       return MOLFILE_ERROR;
00255     } else if(indx == PDB_ATOM) {
00256       if(i++ >= pdb->natoms) {
00257         break;      
00258       }
00259       /* just get the coordinates, and store them */
00260       if (ts) {
00261         get_pdb_coordinates(pdbstr, x, y, z, &occup, &bfac);
00262         x += 3;
00263         y += 3;
00264         z += 3;
00265       } 
00266     } else if (indx == PDB_CRYST1) {
00267       if (ts) {
00268         get_pdb_cryst1(pdbstr, &ts->alpha, &ts->beta, &ts->gamma,
00269                                &ts->A, &ts->B, &ts->C);
00270       }
00271     }
00272   } while(!(indx == PDB_END || indx == PDB_EOF));
00273 
00274   return MOLFILE_SUCCESS;
00275 }
00276 
00277 static void close_pdb_read(void *v) { 
00278   pdbdata *pdb = (pdbdata *)v;
00279   if (pdb->fd != NULL)
00280     fclose(pdb->fd);
00281   if (pdb->idxmap != NULL)
00282     free(pdb->idxmap);
00283   if (pdb->meta->remarks != NULL)
00284     free(pdb->meta->remarks);
00285   if (pdb->meta != NULL) 
00286     free(pdb->meta);
00287   free(pdb);
00288 }
00289 
00290 static void *open_file_write(const char *path, const char *filetype, 
00291     int natoms) {
00292 
00293   FILE *fd;
00294   pdbdata *pdb;
00295   fd = fopen(path, "w");
00296   if (!fd) {
00297     fprintf(stderr, "Unable to open file %s for writing\n", path);
00298     return NULL;
00299   }
00300   pdb = (pdbdata *)malloc(sizeof(pdbdata));
00301   pdb->fd = fd;
00302   pdb->natoms = natoms; 
00303   pdb->atomlist = NULL;
00304   pdb->first_frame = 1;
00305   return pdb;
00306 }
00307  
00308 static int write_structure(void *v, int optflags, 
00309     const molfile_atom_t *atoms) {
00310 
00311   int i;
00312   pdbdata *pdb = (pdbdata *)v;
00313   int natoms = pdb->natoms;
00314   pdb->atomlist = (molfile_atom_t *)malloc(natoms*sizeof(molfile_atom_t));
00315   memcpy(pdb->atomlist, atoms, natoms*sizeof(molfile_atom_t));
00316 
00317   /* If occ, bfactor, and insertion aren't given, we assign defaultvalues. */
00318   if (!(optflags & MOLFILE_OCCUPANCY)) {
00319     for (i=0; i<natoms; i++) pdb->atomlist[i].occupancy = 0.0f;
00320   }
00321   if (!(optflags & MOLFILE_BFACTOR)) {
00322     for (i=0; i<natoms; i++) pdb->atomlist[i].bfactor= 0.0f;
00323   }
00324   if (!(optflags & MOLFILE_INSERTION)) {
00325     for (i=0; i<natoms; i++) {
00326       pdb->atomlist[i].insertion[0] =' ';
00327       pdb->atomlist[i].insertion[1] ='\0';
00328     }
00329   }
00330   if (!(optflags & MOLFILE_ALTLOC)) {
00331     for (i=0; i<natoms; i++) {
00332       pdb->atomlist[i].altloc[0]=' ';
00333       pdb->atomlist[i].altloc[1]='\0';
00334     }
00335   }
00336   if (!(optflags & MOLFILE_ATOMICNUMBER)) {
00337     for (i=0; i<natoms; i++) pdb->atomlist[i].atomicnumber = 0;
00338   }
00339 
00340   /* TODO: put bonds into CONECT records? */
00341   return MOLFILE_SUCCESS;
00342 }
00343 
00344 /* SEQRES records look like this:
00345 
00346 COLUMNS        DATA TYPE       FIELD         DEFINITION
00347 ---------------------------------------------------------------------------------
00348  1 -  6        Record name     "SEQRES"
00349 
00350  9 - 10        Integer         serNum        Serial number of the SEQRES record
00351                                              for the current chain.  Starts at 1
00352                                              and increments by one each line.
00353                                              Reset to 1 for each chain.
00354 
00355 12             Character       chainID       Chain identifier.  This may be any
00356                                              single legal character, including a
00357                                              blank which is used if there is
00358                                              only one chain.
00359 
00360 14 - 17        Integer         numRes        Number of residues in the chain.
00361                                              This value is repeated on every
00362                                              record.
00363 
00364 20 - 22        Residue name    resName       Residue name.
00365 
00366 24 - 26        Residue name    resName       Residue name.
00367 
00368 ... and so forth out to 68-70, for a total of 13 in each line (except possibly
00369 the last).
00370 
00371 source:
00372 http://www.rcsb.org/pdb/file_formats/pdb/pdbguide2.2/part_35.html
00373 */
00374 
00375 /*
00376  * However, we don't use them right now because of several issues that
00377  * can't presently be resolved satisfactorily in VMD:
00378 
00379 According to the RCSB, SEQRES records have to contain all residues, not
00380 just those in the structure, which means VMD will usually produce incorrect
00381 output and there's nothing we can do about it.  The RCSB actually specifies
00382 that all residues in the chain have to be present in the SEQRES records, even
00383 if they're not in the structure.
00384   
00385 We can never know which residues to output.  Our current system of outputting   
00386 everything is just terrible when you have 20,000 waters in your system; we
00387 have to fix this immediately.  We could almost get away with making a hash
00388 table of the names of protein and nucleic acid residues and only write chains
00389 containing those residues.  However, there's this little snippet from the
00390 specification:
00391   
00392 * Heterogens which are integrated into the backbone of the chain are listed
00393   as being part of the chain and are included in the SEQRES records for
00394   that chain.
00395   
00396 That means that we can never know what might appear in the sequence unless we
00397 also read HET records and keep track of them in VMD as well.  We shouldn't 
00398 get people depending on such fallible SEQRES records.
00399   
00400 And of course, there's the fact that no other program that we know of besides   
00401 CE needs these SEQRES records.
00402 
00403  * Uncomment the write_seqres line in write_timestep to turn them back on.
00404  */
00405 
00406 
00407 #if 0
00408 static void write_seqres(FILE * fd, int natoms, const molfile_atom_t *atomlist) {
00409   int i=0;
00410   while (i < natoms) {
00411     int k, serNum;
00412     int j = i;
00413     int ires, nres = 1;
00414     int resid = atomlist[i].resid;
00415     /* Count up the number of residues in the chain */
00416     const char *chain = atomlist[i].chain;
00417     while (j < natoms && !strcmp(chain, atomlist[j].chain)) {
00418       if (resid != atomlist[j].resid) {
00419         nres++;
00420         resid = atomlist[j].resid;
00421       }
00422       j++;
00423     }
00424     /* There are nres residues in the chain, from atoms i to j. */
00425     serNum = 1;
00426     ires = 1;
00427     resid = atomlist[i].resid;
00428     fprintf(fd, "SEQRES  %2d %c %4d  ",  serNum, chain[0], nres);
00429     serNum = 2;
00430     fprintf(fd, "%3s ", atomlist[i].resname);
00431     for (k=i; k<j; k++) {
00432       if (resid != atomlist[k].resid) {
00433         resid = atomlist[k].resid;
00434         if (!(ires % 13)) {
00435           fprintf(fd, "\nSEQRES  %2d %c %4d  ",  serNum, chain[0], nres);
00436           serNum++;
00437         }
00438         fprintf(fd, "%3s ", atomlist[k].resname);
00439         ires++;
00440       }
00441     }
00442     i = j;
00443     fprintf(fd, "\n");
00444   }
00445 }
00446 #endif
00447 
00448 /*
00449 CRYST1 records look like this:
00450 The CRYST1 record presents the unit cell parameters, space group, and Z value. If the structure was not determined by crystallographic means, CRYST1 simply defines a unit cube. 
00451 
00452 
00453 Record Format 
00454 
00455 COLUMNS       DATA TYPE      FIELD         DEFINITION
00456 -------------------------------------------------------------
00457  1 -  6       Record name    "CRYST1"
00458 
00459  7 - 15       Real(9.3)      a             a (Angstroms).
00460 
00461 16 - 24       Real(9.3)      b             b (Angstroms).
00462 
00463 25 - 33       Real(9.3)      c             c (Angstroms).
00464 
00465 34 - 40       Real(7.2)      alpha         alpha (degrees).
00466 
00467 41 - 47       Real(7.2)      beta          beta (degrees).
00468 
00469 48 - 54       Real(7.2)      gamma         gamma (degrees).
00470 
00471 56 - 66       LString        sGroup        Space group.
00472 
00473 67 - 70       Integer        z             Z value.
00474 
00475 * If the coordinate entry describes a structure determined by a technique
00476 other than crystallography, CRYST1 contains a = b = c = 1.0, alpha =
00477 beta = gamma = 90 degrees, space group = P 1, and Z = 1.
00478 
00479 We will use "P 1" and "1" for space group and z value, as recommended, but
00480 we'll populate the other fields with the unit cell information we do have.
00481 
00482 */
00483   
00484 static void write_cryst1(FILE *fd, const molfile_timestep_t *ts) {
00485   fprintf(fd, "CRYST1%9.3f%9.3f%9.3f%7.2f%7.2f%7.2f P 1           1\n", 
00486     ts->A, ts->B, ts->C, ts->alpha, ts->beta, ts->gamma);
00487 }
00488 
00489 
00490 static int write_timestep(void *v, const molfile_timestep_t *ts) {
00491   pdbdata *pdb = (pdbdata *)v; 
00492   const molfile_atom_t *atom;
00493   const float *pos;
00494   int i;
00495   char elementsymbol[3];
00496 
00497   if (pdb->natoms == 0)
00498     return MOLFILE_SUCCESS;
00499 
00500   if (pdb->first_frame) {
00501     /* Turn off SEQRES writing for now; see comments above.
00502     write_seqres(pdb->fd, pdb->natoms, pdb->atomlist);
00503     */
00504     write_cryst1(pdb->fd, ts);
00505     pdb->first_frame = 0;
00506   }
00507   atom = pdb->atomlist;
00508   pos = ts->coords;
00509   for (i=0; i<pdb->natoms; i++) {
00510     /*
00511      * The 8.3 format for position, occupancy, and bfactor permits values 
00512      * only in the range of -999.9994 to 9999.9994 (so that they round
00513      * to the range [-999.999, 9999.999]).  If values fall outside of that
00514      * range, fail and emit an error message rather than generate a
00515      * misformatted PDB file.
00516      */
00517 #define PDBBAD(x) ((x) < -999.9994f || (x) > 9999.9994f)
00518     if (PDBBAD(pos[0]) || PDBBAD(pos[1]) || PDBBAD(pos[2]) ||
00519                 PDBBAD(atom->occupancy) || PDBBAD(atom->bfactor)) {
00520             fprintf(stderr, "PDB WRITE ERROR: Position, occupancy, or b-factor (beta) for atom %d\n", i);
00521       fprintf(stderr, "                 cannot be written in PDB format.\n");
00522       fprintf(stderr, "                 File will be truncated.\n");
00523       return MOLFILE_ERROR;
00524     }
00525 
00526     /* check the atomicnumber and format the atomic element symbol string */
00527     strcpy(elementsymbol, (atom->atomicnumber < 1) ? "  " : get_pte_label(atom->atomicnumber));
00528     elementsymbol[0] = toupper(elementsymbol[0]);
00529     elementsymbol[1] = toupper(elementsymbol[1]);
00530  
00531     if (!write_raw_pdb_record(pdb->fd,  
00532         "ATOM  ", i+1, atom->name, atom->resname, atom->resid, 
00533         atom->insertion, atom->altloc, elementsymbol,
00534         pos[0], pos[1], pos[2], 
00535         atom->occupancy, atom->bfactor, atom->chain, atom->segid)) {
00536       fprintf(stderr, 
00537           "PDB: Error encoutered writing atom %d; file may be incomplete.\n", 
00538           i+1);
00539       return MOLFILE_ERROR;
00540     }
00541     ++atom;
00542     pos += 3;
00543   }
00544   fprintf(pdb->fd, "END\n");
00545 
00546   return MOLFILE_SUCCESS;
00547 }
00548  
00549 static void close_file_write(void *v) {
00550   pdbdata *pdb = (pdbdata *)v; 
00551   fclose(pdb->fd);
00552   free(pdb->atomlist);
00553   free(pdb);
00554 }
00555 
00556 static int read_molecule_metadata(void *v, molfile_metadata_t **metadata) {
00557   pdbdata *pdb = (pdbdata *)v; 
00558   *metadata = pdb->meta;
00559   return MOLFILE_SUCCESS;
00560 }
00561 
00562 /*
00563  * Initialization stuff down here
00564  */
00565 
00566 static molfile_plugin_t plugin = {
00567   vmdplugin_ABIVERSION,                     /* ABI version */
00568   MOLFILE_PLUGIN_TYPE,                      /* type */
00569   "pdb",                                    /* short name */
00570   "PDB",                                    /* pretty name */
00571   "Justin Gullingsrud, John Stone",         /* author */
00572   1,                                        /* major version */
00573   11,                                       /* minor version */
00574   VMDPLUGIN_THREADSAFE,                     /* is_reentrant */
00575   "pdb,ent",                                /* filename extensions */
00576   open_pdb_read,
00577   read_pdb_structure,
00578   read_bonds,                               /* read bond list */
00579   read_next_timestep,
00580   close_pdb_read,
00581   open_file_write,
00582   write_structure,
00583   write_timestep,
00584   close_file_write,
00585   0,
00586   0,
00587   0,
00588   read_molecule_metadata, 
00589 };
00590  
00591 VMDPLUGIN_API int VMDPLUGIN_init() {
00592   return VMDPLUGIN_SUCCESS;
00593 }
00594 
00595 VMDPLUGIN_API int VMDPLUGIN_register(void *v, vmdplugin_register_cb cb) {
00596   (*cb)(v, (vmdplugin_t *)&plugin);
00597   return VMDPLUGIN_SUCCESS;
00598 }
00599 
00600 VMDPLUGIN_API int VMDPLUGIN_fini() {
00601   return VMDPLUGIN_SUCCESS;
00602 }
00603 

Generated on Wed Mar 22 13:15:30 2006 for VMD Plugins (current) by doxygen1.2.14 written by Dimitri van Heesch, © 1997-2002