sci-chemistry/vmd: Fix support for current CUDA versions
author Pacho Ramos <pacho@gentoo.org>
Sun, 8 Mar 2020 18:46:56 +0000 (19:46 +0100)
committer Pacho Ramos <pacho@gentoo.org>
Sun, 8 Mar 2020 18:46:56 +0000 (19:46 +0100)
Package-Manager: Portage-2.3.93, Repoman-2.3.20
Signed-off-by: Pacho Ramos <pacho@gentoo.org>
sci-chemistry/vmd/files/vmd-1.9.3-cuda.patch [new file with mode: 0644]
sci-chemistry/vmd/vmd-1.9.3-r5.ebuild [new file with mode: 0644]

diff --git a/sci-chemistry/vmd/files/vmd-1.9.3-cuda.patch b/sci-chemistry/vmd/files/vmd-1.9.3-cuda.patch
new file mode 100644 (file)
index 0000000..258efb7
--- /dev/null
+++ b/sci-chemistry/vmd/files/vmd-1.9.3-cuda.patch
@@ -0,0 +1,426 @@
+--- a/src/CUDAMarchingCubes.cu 2018-03-30 18:52:25.467189457 +0300
++++ b/src/CUDAMarchingCubes.cu 2018-03-30 18:52:02.387136244 +0300
+@@ -10,7 +10,7 @@
+  *
+  *      $RCSfile: CUDAMarchingCubes.cu,v $
+  *      $Author: johns $        $Locker:  $             $State: Exp $
+- *      $Revision: 1.30 $       $Date: 2016/11/28 03:04:58 $
++ *      $Revision: 1.32 $       $Date: 2018/02/15 05:15:02 $
+  *
+  ***************************************************************************
+  * DESCRIPTION:
+@@ -25,14 +25,17 @@
+ //
+ // Description: This class computes an isosurface for a given density grid
+ //              using a CUDA Marching Cubes (MC) alorithm. 
+-//              The implementation is based on the MC demo from the 
+-//              Nvidia GPU Computing SDK, but has been improved 
+-//              and extended.  This implementation achieves higher 
+-//              performance by reducing the number of temporary memory
+-//              buffers, reduces the number of scan calls by using vector
+-//              integer types, and allows extraction of per-vertex normals 
+-//              optionally computes per-vertex colors if provided with a 
+-//              volumetric texture map.
++//
++//              The implementation is loosely based on the MC demo from 
++//              the Nvidia GPU Computing SDK, but the design has been 
++//              improved and extended in several ways.  
++//
++//              This implementation achieves higher performance
++//              by reducing the number of temporary memory
++//              buffers, reduces the number of scan calls by using 
++//              vector integer types, and allows extraction of 
++//              per-vertex normals and optionally computes 
++//              per-vertex colors if a volumetric texture map is provided.
+ //
+ // Author: Michael Krone <michael.krone@visus.uni-stuttgart.de>
+ //         John Stone <johns@ks.uiuc.edu>
+@@ -48,7 +51,7 @@
+ #include <thrust/functional.h>
+ //
+-// Restrict macro to make it easy to do perf tuning tess
++// Restrict macro to make it easy to do perf tuning tests
+ //
+ #if 0
+ #define RESTRICT __restrict__
+@@ -171,6 +174,11 @@
+ texture<float, 3, cudaReadModeElementType> volumeTex;
+ // sample volume data set at a point p, p CAN NEVER BE OUT OF BOUNDS
++// XXX The sampleVolume() call underperforms vs. peak memory bandwidth
++//     because we don't strictly enforce coalescing requirements in the
++//     layout of the input volume presently.  If we forced X/Y dims to be
++//     warp-multiple it would become possible to use wider fetches and
++//     a few other tricks to improve global memory bandwidth 
+ __device__ float sampleVolume(const float * RESTRICT data, 
+                               uint3 p, uint3 gridSize) {
+     return data[(p.z*gridSize.x*gridSize.y) + (p.y*gridSize.x) + p.x];
+@@ -592,6 +600,30 @@
+     cudaBindTextureToArray(volumeTex, d_vol, desc);
+ }
++#if CUDART_VERSION >= 9000
++//
++// XXX CUDA 9.0RC breaks the usability of Thrust scan() prefix sums when
++//     used with the built-in uint2 vector integer types.  To workaround
++//     the problem we have to define our own type and associated conversion
++//     routines etc.
++//
++
++// XXX workaround for uint2 breakage in CUDA 9.0RC
++struct myuint2 : uint2 {
++  __host__ __device__ myuint2() : uint2(make_uint2(0, 0)) {}
++  __host__ __device__ myuint2(int val) : uint2(make_uint2(val, val)) {}
++  __host__ __device__ myuint2(uint2 val) : uint2(make_uint2(val.x, val.y)) {}
++};
++
++void ThrustScanWrapperUint2(uint2* output, uint2* input, unsigned int numElements) {
++    const uint2 zero = make_uint2(0, 0);
++    thrust::exclusive_scan(thrust::device_ptr<myuint2>((myuint2*)input),
++                           thrust::device_ptr<myuint2>((myuint2*)input + numElements),
++                           thrust::device_ptr<myuint2>((myuint2*)output),
++                           (myuint2) zero);
++}
++
++#else
+ void ThrustScanWrapperUint2(uint2* output, uint2* input, unsigned int numElements) {
+     const uint2 zero = make_uint2(0, 0);
+@@ -601,6 +633,7 @@
+                            zero);
+ }
++#endif
+ void ThrustScanWrapperArea(float* output, float* input, unsigned int numElements) {
+     thrust::inclusive_scan(thrust::device_ptr<float>(input), 
+@@ -639,11 +672,9 @@
+ }
+-///////////////////////////////////////////////////////////////////////////////
+ //
+ // class CUDAMarchingCubes
+ //
+-///////////////////////////////////////////////////////////////////////////////
+ CUDAMarchingCubes::CUDAMarchingCubes() {
+     // initialize values
+@@ -713,9 +744,6 @@
+ }
+-////////////////////////////////////////////////////////////////////////////////
+-//! Run the Cuda part of the computation
+-////////////////////////////////////////////////////////////////////////////////
+ void CUDAMarchingCubes::computeIsosurfaceVerts(float3* vertOut, unsigned int maxverts, dim3 & grid3) {
+     // check if data is available
+     if (!this->setdata)
+
+--- a/src/CUDAMDFF.cu  2016-12-01 10:11:56.000000000 +0300
++++ b/src/CUDAMDFF.cu  2018-03-30 18:56:44.352937599 +0300
+@@ -11,7 +11,7 @@
+  *
+  *      $RCSfile: CUDAMDFF.cu,v $
+  *      $Author: johns $        $Locker:  $             $State: Exp $
+- *      $Revision: 1.75 $      $Date: 2015/04/07 20:41:26 $
++ *      $Revision: 1.78 $      $Date: 2018/02/19 07:10:37 $
+  *
+  ***************************************************************************
+  * DESCRIPTION:
+@@ -28,12 +28,16 @@
+ #include <stdlib.h>
+ #include <string.h>
+ #include <cuda.h>
+-#include <float.h> // FLT_MAX etc
+-
++#if CUDART_VERSION >= 9000
++#include <cuda_fp16.h> // need to explicitly include for CUDA 9.0
++#endif
+ #if CUDART_VERSION < 4000
+ #error The VMD MDFF feature requires CUDA 4.0 or later
+ #endif
++#include <float.h> // FLT_MAX etc
++
++
+ #include "Inform.h"
+ #include "utilities.h"
+ #include "WKFThreads.h"
+@@ -588,6 +592,43 @@
+ }
++
++// #define VMDUSESHUFFLE 1
++#if defined(VMDUSESHUFFLE) && __CUDA_ARCH__ >= 300 && CUDART_VERSION >= 9000
++// New warp shuffle-based CC sum reduction for Kepler and later GPUs.
++inline __device__ void cc_sumreduction(int tid, int totaltb, 
++                                float4 &total_cc_sums,
++                                float &total_lcc,
++                                int &total_lsize,
++                                float4 *tb_cc_sums,
++                                float *tb_lcc,
++                                int *tb_lsize) {
++  total_cc_sums = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
++  total_lcc = 0.0f;
++  total_lsize = 0;
++
++  // use precisely one warp to do the final reduction
++  if (tid < warpSize) {
++    for (int i=tid; i<totaltb; i+=warpSize) {
++      total_cc_sums += tb_cc_sums[i];
++      total_lcc     += tb_lcc[i];
++      total_lsize   += tb_lsize[i];
++    }
++
++    // perform intra-warp parallel reduction...
++    // general loop version of parallel sum-reduction
++    for (int mask=warpSize/2; mask>0; mask>>=1) {
++      total_cc_sums.x += __shfl_xor_sync(0xffffffff, total_cc_sums.x, mask);
++      total_cc_sums.y += __shfl_xor_sync(0xffffffff, total_cc_sums.y, mask);
++      total_cc_sums.z += __shfl_xor_sync(0xffffffff, total_cc_sums.z, mask);
++      total_cc_sums.w += __shfl_xor_sync(0xffffffff, total_cc_sums.w, mask);
++      total_lcc     += __shfl_xor_sync(0xffffffff, total_lcc, mask);
++      total_lsize   += __shfl_xor_sync(0xffffffff, total_lsize, mask);
++    }
++  }
++}
++#else
++// shared memory based CC sum reduction 
+ inline __device__ void cc_sumreduction(int tid, int totaltb, 
+                                 float4 &total_cc_sums,
+                                 float &total_lcc,
+@@ -629,6 +670,7 @@
+   total_lcc = tb_lcc[0];
+   total_lsize = tb_lsize[0];
+ }
++#endif
+ inline __device__ void thread_cc_sum(float ref, float density,
+@@ -750,6 +792,92 @@
+   }
++#if defined(VMDUSESHUFFLE) && __CUDA_ARCH__ >= 300 && CUDART_VERSION >= 9000
++  // all threads write their local sums to shared memory...
++  __shared__ float2 tb_cc_means_s[TOTALBLOCKSZ];
++  __shared__ float2 tb_cc_squares_s[TOTALBLOCKSZ];
++  __shared__ float tb_lcc_s[TOTALBLOCKSZ];
++  __shared__ int tb_lsize_s[TOTALBLOCKSZ];
++
++  tb_cc_means_s[tid] = thread_cc_means;
++  tb_cc_squares_s[tid] = thread_cc_squares;
++  tb_lcc_s[tid] = thread_lcc;
++  tb_lsize_s[tid] = thread_lsize;
++  __syncthreads(); // all threads must hit syncthreads call...
++
++  // use precisely one warp to do the thread-block-wide reduction
++  if (tid < warpSize) {
++    float2 tmp_cc_means = make_float2(0.0f, 0.0f);
++    float2 tmp_cc_squares = make_float2(0.0f, 0.0f);
++    float tmp_lcc = 0.0f;
++    int tmp_lsize = 0;
++    for (int i=tid; i<TOTALBLOCKSZ; i+=warpSize) {
++      tmp_cc_means   += tb_cc_means_s[i];
++      tmp_cc_squares += tb_cc_squares_s[i];
++      tmp_lcc        += tb_lcc_s[i];
++      tmp_lsize      += tb_lsize_s[i];
++    }
++
++    // perform intra-warp parallel reduction...
++    // general loop version of parallel sum-reduction
++    for (int mask=warpSize/2; mask>0; mask>>=1) {
++      tmp_cc_means.x   += __shfl_xor_sync(0xffffffff, tmp_cc_means.x, mask);
++      tmp_cc_means.y   += __shfl_xor_sync(0xffffffff, tmp_cc_means.y, mask);
++      tmp_cc_squares.x += __shfl_xor_sync(0xffffffff, tmp_cc_squares.x, mask);
++      tmp_cc_squares.y += __shfl_xor_sync(0xffffffff, tmp_cc_squares.y, mask);
++      tmp_lcc          += __shfl_xor_sync(0xffffffff, tmp_lcc, mask);
++      tmp_lsize        += __shfl_xor_sync(0xffffffff, tmp_lsize, mask);
++    }
++
++    // write per-thread-block partial sums to global memory,
++    // if a per-thread-block CC output array is provided, write the 
++    // local CC for this thread block out, and finally, check if we 
++    // are the last thread block to finish, and finalize the overall
++    // CC results for the entire grid of thread blocks.
++    if (tid == 0) {   
++      unsigned int bid = blockIdx.z * gridDim.x * gridDim.y +
++                         blockIdx.y * gridDim.x + blockIdx.x;
++
++      tb_cc_sums[bid] = make_float4(tmp_cc_means.x, tmp_cc_means.y,
++                                    tmp_cc_squares.x, tmp_cc_squares.y);
++      tb_lcc[bid]     = tmp_lcc;
++      tb_lsize[bid]   = tmp_lsize;
++
++      if (tb_CC != NULL) {
++        float cc = calc_cc(tb_cc_means_s[0].x, tb_cc_means_s[0].y,
++                           tb_cc_squares_s[0].x, tb_cc_squares_s[0].y,
++                           tb_lsize_s[0], tb_lcc_s[0]);
++
++        // write local per-thread-block CC to global memory
++        tb_CC[bid]   = cc;
++      }
++
++      __threadfence();
++
++      unsigned int value = atomicInc(&tbcatomic[0], totaltb);
++      isLastBlockDone = (value == (totaltb - 1));
++    }
++  }
++  __syncthreads();
++
++  if (isLastBlockDone) {
++    float4 total_cc_sums;
++    float total_lcc;
++    int total_lsize;
++    cc_sumreduction(tid, totaltb, total_cc_sums, total_lcc, total_lsize,
++                    tb_cc_sums, tb_lcc, tb_lsize); 
++
++    if (tid == 0) {
++      tb_cc_sums[totaltb] = total_cc_sums;
++      tb_lcc[totaltb] = total_lcc;
++      tb_lsize[totaltb] = total_lsize;
++    }
++ 
++    reset_atomic_counter(&tbcatomic[0]);
++  }
++
++#else
++
+   // all threads write their local sums to shared memory...
+   __shared__ float2 tb_cc_means_s[TOTALBLOCKSZ];
+   __shared__ float2 tb_cc_squares_s[TOTALBLOCKSZ];
+@@ -794,6 +922,7 @@
+     }
+     __syncthreads(); // all threads must hit syncthreads call...
+   }
++//#endif
+   // write per-thread-block partial sums to global memory,
+   // if a per-thread-block CC output array is provided, write the 
+@@ -847,6 +976,7 @@
+     }
+ #endif
+   }
++#endif
+ }
+
+--- a/src/CUDAQuickSurf.cu     2016-12-01 10:11:56.000000000 +0300
++++ b/src/CUDAQuickSurf.cu     2018-03-30 19:01:38.777196233 +0300
+@@ -11,7 +11,7 @@
+  *
+  *      $RCSfile: CUDAQuickSurf.cu,v $
+  *      $Author: johns $        $Locker:  $             $State: Exp $
+- *      $Revision: 1.81 $      $Date: 2016/04/20 04:57:46 $
++ *      $Revision: 1.84 $      $Date: 2018/02/15 04:59:15 $
+  *
+  ***************************************************************************
+  * DESCRIPTION:
+@@ -22,6 +22,9 @@
+ #include <stdlib.h>
+ #include <string.h>
+ #include <cuda.h>
++#if CUDART_VERSION >= 9000
++#include <cuda_fp16.h>  // need to explicitly include for CUDA 9.0
++#endif
+ #if CUDART_VERSION < 4000
+ #error The VMD QuickSurf feature requires CUDA 4.0 or later
+@@ -130,14 +133,14 @@
+ #define GUNROLL      1
+ #endif
+-#if __CUDA_ARCH__ >= 300
+ #define MAXTHRDENS  ( GBLOCKSZX * GBLOCKSZY * GBLOCKSZZ )
+-#define MINBLOCKDENS 1
++#if __CUDA_ARCH__ >= 600
++#define MINBLOCKDENS 16
++#elif __CUDA_ARCH__ >= 300
++#define MINBLOCKDENS 16
+ #elif __CUDA_ARCH__ >= 200
+-#define MAXTHRDENS  ( GBLOCKSZX * GBLOCKSZY * GBLOCKSZZ )
+ #define MINBLOCKDENS 1
+ #else
+-#define MAXTHRDENS  ( GBLOCKSZX * GBLOCKSZY * GBLOCKSZZ )
+ #define MINBLOCKDENS 1
+ #endif
+@@ -150,7 +153,7 @@
+ //
+ template<class DENSITY, class VOLTEX>
+ __global__ static void 
+-// __launch_bounds__ ( MAXTHRDENS, MINBLOCKDENS )
++__launch_bounds__ ( MAXTHRDENS, MINBLOCKDENS )
+ gaussdensity_fast_tex_norm(int natoms,
+                       const float4 * RESTRICT sorted_xyzr, 
+                       const float4 * RESTRICT sorted_color, 
+@@ -217,6 +220,8 @@
+     for (yab=yabmin; yab<=yabmax; yab++) {
+       for (xab=xabmin; xab<=xabmax; xab++) {
+         int abcellidx = zab * acplanesz + yab * acncells.x + xab;
++        // this is the biggest latency hotspot in the kernel; if we could improve
++        // packing of the grid cell map, we'd likely improve performance 
+         uint2 atomstartend = cellStartEnd[abcellidx];
+         if (atomstartend.x != GRID_CELL_EMPTY) {
+           unsigned int atomid;
+@@ -296,7 +301,7 @@
+ __global__ static void 
+-// __launch_bounds__ ( MAXTHRDENS, MINBLOCKDENS )
++__launch_bounds__ ( MAXTHRDENS, MINBLOCKDENS )
+ gaussdensity_fast_tex3f(int natoms,
+                         const float4 * RESTRICT sorted_xyzr, 
+                         const float4 * RESTRICT sorted_color, 
+@@ -363,6 +368,8 @@
+     for (yab=yabmin; yab<=yabmax; yab++) {
+       for (xab=xabmin; xab<=xabmax; xab++) {
+         int abcellidx = zab * acplanesz + yab * acncells.x + xab;
++        // this is the biggest latency hotspot in the kernel; if we could improve
++        // packing of the grid cell map, we'd likely improve performance 
+         uint2 atomstartend = cellStartEnd[abcellidx];
+         if (atomstartend.x != GRID_CELL_EMPTY) {
+           unsigned int atomid;
+@@ -550,7 +557,6 @@
+ // per-GPU handle with various memory buffer pointers, etc.
+ typedef struct {
+-  /// max grid sizes and attributes the current allocations will support
+   int verbose;
+   long int natoms;
+   int colorperatom;
+@@ -561,18 +567,18 @@
+   int gy;
+   int gz;
+-  CUDAMarchingCubes *mc;     ///< Marching cubes class used to extract surface
++  CUDAMarchingCubes *mc;     
+-  float *devdensity;         ///< density map stored in GPU memory
+-  void *devvoltexmap;        ///< volumetric texture map
+-  float4 *xyzr_d;            ///< atom coords and radii
+-  float4 *sorted_xyzr_d;     ///< cell-sorted coords and radii
+-  float4 *color_d;           ///< colors
+-  float4 *sorted_color_d;    ///< cell-sorted colors
+-
+-  unsigned int *atomIndex_d; ///< cell index for each atom
+-  unsigned int *atomHash_d;  ///<  
+-  uint2 *cellStartEnd_d;     ///< cell start/end indices 
++  float *devdensity;         
++  void *devvoltexmap;        
++  float4 *xyzr_d;            
++  float4 *sorted_xyzr_d;     
++  float4 *color_d;           
++  float4 *sorted_color_d;    
++
++  unsigned int *atomIndex_d; 
++  unsigned int *atomHash_d;  
++  uint2 *cellStartEnd_d;     
+   void *safety;
+   float3 *v3f_d;
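
The first hunks above work around a Thrust regression in the toolkit shipped with CUDA 9.x: exclusive_scan() no longer digests the built-in uint2 vector type directly, so the patch wraps it in a derived type with explicit constructors. Below is a minimal, self-contained sketch of the same pattern; it is not part of the patch, and the names demo_uint2 and the component-wise operator+ are illustrative only (VMD relies on its own uint2 operators defined elsewhere in CUDAMarchingCubes.cu).

// thrust_uint2_scan_demo.cu -- build with: nvcc thrust_uint2_scan_demo.cu
#include <cstdio>
#include <cuda_runtime.h>
#include <thrust/device_vector.h>
#include <thrust/scan.h>

// component-wise sum; the scan's default thrust::plus<> needs operator+ for the element type
__host__ __device__ inline uint2 operator+(uint2 a, uint2 b) {
    return make_uint2(a.x + b.x, a.y + b.y);
}

// same idea as the patch's myuint2: derive from uint2 and supply constructors
// so Thrust can default-construct and convert the type under CUDA >= 9.0
struct demo_uint2 : uint2 {
    __host__ __device__ demo_uint2() : uint2(make_uint2(0, 0)) {}
    __host__ __device__ demo_uint2(uint2 v) : uint2(v) {}
};

int main() {
    thrust::device_vector<demo_uint2> in(4), out(4);
    for (int i = 0; i < 4; ++i)
        in[i] = demo_uint2(make_uint2(i + 1, 2 * (i + 1)));   // (1,2) (2,4) (3,6) (4,8)

    // exclusive prefix sum, the operation ThrustScanWrapperUint2() wraps
    thrust::exclusive_scan(in.begin(), in.end(), out.begin(),
                           demo_uint2(make_uint2(0, 0)));

    demo_uint2 last = out[3];                                  // (1+2+3, 2+4+6) = (6, 12)
    std::printf("out[3] = (%u, %u)\n", last.x, last.y);
    return 0;
}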
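The CUDAMDFF.cu hunks also add a reduction path, currently disabled behind the commented-out VMDUSESHUFFLE define, that uses the CUDA 9 __shfl_xor_sync() intrinsics in place of the older unsynchronized __shfl_xor(). A standalone sketch of that warp-level butterfly sum, independent of VMD (file and kernel names here are made up for illustration):

// shfl_reduction_demo.cu -- warp-wide sum with __shfl_xor_sync(), CUDA >= 9.0
#include <cstdio>
#include <cuda_runtime.h>

__global__ void warp_sum(const float *in, float *out) {
    float v = in[threadIdx.x];                    // one warp, one element per lane

    // butterfly reduction: after log2(warpSize) steps every lane holds the total
    for (int mask = warpSize / 2; mask > 0; mask >>= 1)
        v += __shfl_xor_sync(0xffffffff, v, mask);

    if (threadIdx.x == 0)
        *out = v;
}

int main() {
    const int n = 32;                             // exactly one warp
    float h_in[n], h_out = 0.0f, *d_in, *d_out;
    for (int i = 0; i < n; ++i) h_in[i] = 1.0f;   // expected sum: 32.0

    cudaMalloc(&d_in, n * sizeof(float));
    cudaMalloc(&d_out, sizeof(float));
    cudaMemcpy(d_in, h_in, n * sizeof(float), cudaMemcpyHostToDevice);

    warp_sum<<<1, n>>>(d_in, d_out);
    cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);

    std::printf("warp sum = %.1f\n", h_out);      // prints: warp sum = 32.0
    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}
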
diff --git a/sci-chemistry/vmd/vmd-1.9.3-r5.ebuild b/sci-chemistry/vmd/vmd-1.9.3-r5.ebuild
new file mode 100644 (file)
index 0000000..a4e0e6d
--- /dev/null
+++ b/sci-chemistry/vmd/vmd-1.9.3-r5.ebuild
@@ -0,0 +1,272 @@
+# Copyright 1999-2020 Gentoo Authors
+# Distributed under the terms of the GNU General Public License v2
+
+EAPI=7
+PYTHON_COMPAT=( python2_7 )
+
+inherit cuda desktop flag-o-matic prefix python-single-r1 toolchain-funcs xdg
+
+DESCRIPTION="Visual Molecular Dynamics"
+HOMEPAGE="http://www.ks.uiuc.edu/Research/vmd/"
+SRC_URI="
+       https://dev.gentoo.org/~jlec/distfiles/${P}-gentoo-patches.tar.xz
+       ${P}.src.tar
+"
+
+SLOT="0"
+LICENSE="vmd"
+KEYWORDS="~amd64 ~x86 ~amd64-linux ~x86-linux"
+IUSE="cuda gromacs msms povray sqlite tachyon xinerama"
+REQUIRED_USE="${PYTHON_REQUIRED_USE}"
+
+RESTRICT="fetch"
+
+# currently, tk-8.5* with USE=truetype breaks some
+# tk apps such as Sequence Viewer or Timeline.
+CDEPEND="
+       >=dev-lang/tk-8.6.1:0=
+       dev-lang/perl
+       dev-libs/expat
+       $(python_gen_cond_dep '
+               || (
+                       dev-python/numpy-python2[${PYTHON_MULTI_USEDEP}]
+                       dev-python/numpy[${PYTHON_MULTI_USEDEP}]
+               )
+       ')
+       sci-libs/netcdf:0=
+       virtual/opengl
+       >=x11-libs/fltk-1.1.10-r2:1
+       x11-libs/libXft
+       x11-libs/libXi
+       ${PYTHON_DEPS}
+       cuda? ( >=dev-util/nvidia-cuda-toolkit-4.2.9-r1:= )
+       gromacs? ( >=sci-chemistry/gromacs-5.0.4-r1:0=[tng] )
+       sqlite? ( dev-db/sqlite:3= )
+       tachyon? ( >=media-gfx/tachyon-0.99_beta6 )
+       xinerama? ( x11-libs/libXinerama )
+"
+DEPEND="${CDEPEND}"
+BDEPEND="
+       virtual/pkgconfig
+       dev-lang/swig
+"
+RDEPEND="${CDEPEND}
+       sci-biology/stride
+       sci-chemistry/chemical-mime-data
+       sci-chemistry/surf
+       x11-terms/xterm
+       msms? ( sci-chemistry/msms-bin )
+       povray? ( media-gfx/povray )
+"
+
+VMD_DOWNLOAD="http://www.ks.uiuc.edu/Development/Download/download.cgi?PackageName=VMD"
+# Binary only plugin!!
+QA_PREBUILT="usr/lib*/vmd/plugins/LINUX/tcl/intersurf1.1/bin/intersurf.so"
+QA_FLAGS_IGNORED_amd64=" usr/lib64/vmd/plugins/LINUX/tcl/volutil1.3/volutil"
+QA_FLAGS_IGNORED_x86=" usr/lib/vmd/plugins/LINUX/tcl/volutil1.3/volutil"
+
+pkg_nofetch() {
+       elog "Please download ${P}.src.tar from"
+       elog "${VMD_DOWNLOAD}"
+       elog "after agreeing to the license and get"
+       elog "https://dev.gentoo.org/~jlec/distfiles/${P}-gentoo-patches.tar.xz"
+       elog "Place both into your DISTDIR directory"
+       elog
+       elog "Due to an upstream bug (https://bugs.gentoo.org/640440) sources"
+       elog "file may get downloaded as a compressed tarball or not. In that case"
+       elog "you will need to ensure you uncompress the file and rename it"
+       elog "as ${P}.src.tar"
+}
+
+src_prepare() {
+       xdg_src_prepare
+
+       use cuda && cuda_sanitize
+
+       # Compat with newer CUDA versions (from Arch)
+       eapply "${FILESDIR}"/${P}-cuda.patch
+
+       cd "${WORKDIR}"/plugins || die
+
+       eapply -p2 "${WORKDIR}"/${P}-gentoo-plugins.patch
+
+       [[ ${SILENT} == yes ]] || sed '/^.SILENT/d' -i $(find -name Makefile)
+
+       sed \
+               -e "s:CC = gcc:CC = $(tc-getCC):" \
+               -e "s:CXX = g++:CXX = $(tc-getCXX):" \
+               -e "s:COPTO =.*\":COPTO = -fPIC -o \":" \
+               -e "s:LOPTO = .*\":LOPTO = ${LDFLAGS} -fPIC -o \":" \
+               -e "s:CCFLAGS =.*\":CCFLAGS = ${CFLAGS}\":" \
+               -e "s:CXXFLAGS =.*\":CXXFLAGS = ${CXXFLAGS}\":" \
+               -e "s:SHLD = gcc:SHLD = $(tc-getCC) -shared:" \
+               -e "s:SHXXLD = g++:SHXXLD = $(tc-getCXX) -shared:" \
+               -e "s:-ltcl8.5:-ltcl:" \
+               -i Make-arch || die "Failed to set up plugins Makefile"
+
+       sed \
+               -e '/^AR /s:=:?=:g' \
+               -e '/^RANLIB /s:=:?=:g' \
+               -i ../plugins/*/Makefile || die
+
+       tc-export AR RANLIB
+
+       sed \
+               -e "s:\$(CXXFLAGS)::g" \
+               -i hesstrans/Makefile || die
+
+       # prepare vmd itself
+       cd "${S}" || die
+
+       eapply -p2 "${WORKDIR}"/${P}-gentoo-base.patch
+       eapply "${FILESDIR}"/${P}-configure-libtachyon.patch
+       eapply "${FILESDIR}"/${P}-tmpdir.patch
+
+       # PREFIX
+       sed \
+               -e "s:/usr/include/:${EPREFIX}/usr/include:g" \
+               -i configure || die
+
+       sed \
+               -e "s:gentoo-bindir:${ED}/usr/bin:g" \
+               -e "s:gentoo-libdir:${ED}/usr/$(get_libdir):g" \
+               -e "s:gentoo-opengl-include:${EPREFIX}/usr/include/GL:g" \
+               -e "s:gentoo-opengl-libs:${EPREFIX}/usr/$(get_libdir):g" \
+               -e "s:gentoo-gcc:$(tc-getCC):g" \
+               -e "s:gentoo-g++:$(tc-getCXX):g" \
+               -e "s:gentoo-nvcc:${EPREFIX}/opt/cuda/bin/nvcc:g" \
+               -e "s:gentoo-cflags:${CFLAGS}:g" \
+               -e "s:gentoo-cxxflags:${CXXFLAGS}:g" \
+               -e "s:gentoo-nvflags::g" \
+               -e "s:gentoo-ldflags:${LDFLAGS}:g" \
+               -e "s:gentoo-plugindir:${WORKDIR}/plugins:g" \
+               -e "s:gentoo-fltk-include:$(fltk-config --includedir):g" \
+               -e "s:gentoo-fltk-libs:$(dirname $(fltk-config --libs)) -Wl,-rpath,$(dirname $(fltk-config --libs)):g" \
+               -e "s:gentoo-libtachyon-include:${EPREFIX}/usr/include/tachyon:g" \
+               -e "s:gentoo-libtachyon-libs:${EPREFIX}/usr/$(get_libdir):g" \
+               -e "s:gentoo-netcdf-include:${EPREFIX}/usr/include:g" \
+               -e "s:gentoo-netcdf-libs:${EPREFIX}/usr/$(get_libdir):g" \
+               -i configure || die
+
+       if use cuda; then
+               sed \
+                       -e "s:gentoo-cuda-lib:${EPREFIX}/opt/cuda/$(get_libdir):g" \
+                       -e "/NVCCFLAGS/s:=:= ${NVCCFLAGS}:g" \
+                       -i configure src/Makefile || die
+               sed \
+                       -e '/compute_/d' \
+                       -i configure || die
+               sed \
+                       -e 's:-gencode .*code=sm_..::' \
+                       -i src/Makefile || die
+       fi
+
+       sed \
+               -e "s:LINUXPPC:LINUX:g" \
+               -e "s:LINUXALPHA:LINUX:g" \
+               -e "s:LINUXAMD64:LINUX:g" \
+               -e "s:gentoo-stride:${EPREFIX}/usr/bin/stride:g" \
+               -e "s:gentoo-surf:${EPREFIX}/usr/bin/surf:g" \
+               -e "s:gentoo-tachyon:${EPREFIX}/usr/bin/tachyon:g" \
+               -i "${S}"/bin/vmd.sh || die "failed setting up vmd wrapper script"
+
+       EMAKEOPTS=(
+               TCLINC="-I${EPREFIX}/usr/include"
+               TCLLIB="-L${EPREFIX}/usr/$(get_libdir)"
+               TCLLDFLAGS="-shared"
+               NETCDFLIB="$($(tc-getPKG_CONFIG) --libs-only-L netcdf)${EPREFIX}/usr/$(get_libdir)/libnetcdf.so"
+               NETCDFINC="$($(tc-getPKG_CONFIG) --cflags-only-I netcdf)${EPREFIX}/usr/include"
+               NETCDFLDFLAGS="$($(tc-getPKG_CONFIG) --libs netcdf)"
+               NETCDFDYNAMIC=1
+               EXPATINC="-I${EPREFIX}/usr/include"
+               EXPATLIB="$($(tc-getPKG_CONFIG) --libs expat)"
+               EXPATLDFLAGS="-shared"
+               EXPATDYNAMIC=1
+       )
+       if use gromacs; then
+               EMAKEOPTS+=(
+                       TNGLIB="$($(tc-getPKG_CONFIG) --libs libgromacs)"
+                       TNGINC="-I${EPREFIX}/usr/include"
+                       TNGLDFLAGS="-shared"
+                       TNGDYNAMIC=1
+               )
+       fi
+       if use sqlite; then
+               EMAKEOPTS+=(
+                       SQLITELIB="$($(tc-getPKG_CONFIG) --libs sqlite3)"
+                       SQLITEINC="-I${EPREFIX}/usr/include"
+                       SQLITELDFLAGS="-shared"
+                       SQLITEDYNAMIC=1
+               )
+       fi
+}
+
+src_configure() {
+       local myconf="OPENGL OPENGLPBUFFER COLVARS FLTK TK TCL PTHREADS PYTHON IMD NETCDF NUMPY NOSILENT XINPUT"
+       rm -f configure.options && echo $myconf >> configure.options
+
+       use cuda && myconf+=" CUDA"
+#      use mpi && myconf+=" MPI"
+       use tachyon && myconf+=" LIBTACHYON"
+       use xinerama && myconf+=" XINERAMA"
+
+       export \
+               PYTHON_INCLUDE_DIR="$(python_get_includedir)" \
+               PYTHON_LIBRARY_DIR="$(python_get_library_path)" \
+               PYTHON_LIBRARY="$(python_get_LIBS)" \
+               NUMPY_INCLUDE_DIR="$(python_get_sitedir)/numpy/core/include" \
+               NUMPY_LIBRARY_DIR="$(python_get_sitedir)/numpy/core/include"
+
+       perl ./configure LINUX \
+               ${myconf} || die
+}
+
+src_compile() {
+       # build plugins
+       cd "${WORKDIR}"/plugins || die
+
+       emake \
+               ${EMAKEOPTS[@]} \
+               LINUX
+
+       # build vmd
+       cd "${S}"/src || die
+       emake
+}
+
+src_install() {
+       # install plugins
+       cd "${WORKDIR}"/plugins || die
+       emake \
+                       PLUGINDIR="${ED}/usr/$(get_libdir)/${PN}/plugins" \
+                       distrib
+
+       # install vmd
+       cd "${S}"/src || die
+       emake install
+
+       # install docs
+       cd "${S}" || die
+       dodoc Announcement README doc/ig.pdf doc/ug.pdf
+
+       # remove some of the things we don't want and need in
+       # /usr/lib
+       cd "${ED}"/usr/$(get_libdir)/vmd || die
+       rm -fr doc README Announcement LICENSE || \
+               die "failed to clean up /usr/lib/vmd directory"
+
+       # adjust path in vmd wrapper
+       sed \
+               -e "s:${ED}::" -i "${ED}"/usr/bin/${PN} \
+               -e "/^defaultvmddir/s:^.*$:defaultvmddir=\"${EPREFIX}/usr/$(get_libdir)/${PN}\":g" \
+               || die "failed to set up vmd wrapper script"
+
+       # install icon and generate desktop entry
+       insinto /usr/share/pixmaps
+       doins "${WORKDIR}"/vmd.png
+       eprefixify "${WORKDIR}"/vmd.desktop
+       sed -i '/^Path/d' "${WORKDIR}"/vmd.desktop || die
+       # Open PDB files with VMD
+       echo "MimeType=chemical/x-pdb;" >> "${WORKDIR}"/vmd.desktop || die
+       domenu "${WORKDIR}"/vmd.desktop
+}