/********************************************************************/ /***** GPU Graph Cut ************************************************/ /********************************************************************/ //////////////////////////////////////////////////// // Copyright (c) 2018 Kiyoshi Oguri 2018.02.14 // // Released under the MIT license // // http://opensource.org/licenses/mit-license.php // //////////////////////////////////////////////////// #include #include extern int cost(int S, int H, int W); extern void cut(int S, int H, int W); int LOOP = 1600; int SIZE_S; int SIZE_H; int SIZE_W; int SIZE_HW; int SIZE_SHW; __constant__ int size_s; __constant__ int size_h; __constant__ int size_w; __constant__ int size_hw; __constant__ int size_shw; uint *h_DAT; uint *d_DAT; __constant__ uint *DAT; size_t DAT_SIZE; #define M_HGT (0xffe00000) #define M_HDU (0x001c0000) #define M_WRL (0x00038000) #define M_SFB (0x00007f00) #define M_OVF (0x000000ff) #define S_HGT (21) #define S_HDU (18) #define S_WRL (15) #define S_SFB (8) #define S_OVF (0) #define U_HGT (~M_HGT) #define U_HDU (~M_HDU) #define U_WRL (~M_WRL) #define U_SFB (~M_SFB) #define U_OVF (~M_OVF) inline uint h_ADR(uint S, uint H, uint W) { uint ss = S / 2; uint sss = S % 2; uint hh = H / 4; uint hhh = H % 4; uint ww = W / 4; uint www = W % 4; return ss*32*(SIZE_W/4)*(SIZE_H/4) + hh*32*(SIZE_W/4) + ww*32 + sss*16 + hhh*4 + www; } inline __device__ uint ADR(uint S, uint H, uint W) { uint ss = S / 2; uint sss = S % 2; uint hh = H / 4; uint hhh = H % 4; uint ww = W / 4; uint www = W % 4; return ss*32*(size_w/4)*(size_h/4) + hh*32*(size_w/4) + ww*32 + sss*16 + hhh*4 + www; } __device__ void push_e(uint S, uint H, uint W, uint nS, uint nH, uint nW, uint D) { uint own = DAT[ADR(S, H, W)]; uint nxt = DAT[ADR(nS, nH, nW)]; uint Hgt = nxt>>S_HGT; uint hgt = own>>S_HGT; if (Hgt <= hgt) return; uint edg; if (D == 0) edg = ((own&M_SFB)>>S_SFB); if (D == 2) edg = ((own&M_HDU)>>S_HDU); if (D == 4) edg = ((own&M_WRL)>>S_WRL); if (edg == 0) return; uint ovf = own&M_OVF; if (ovf > 0) { bool q = edg > ovf; uint pp = (q)? ovf: edg; if (D == 0) own -= (pp<>S_HGT; uint hgt = own>>S_HGT; if (Hgt <= hgt) return; uint Edg; if (D == 5) Edg = ((nxt&M_SFB)>>S_SFB); if (D == 1) Edg = ((nxt&M_HDU)>>S_HDU); if (D == 3) Edg = ((nxt&M_WRL)>>S_WRL); uint edg; if (D == 5) edg = 127-Edg; if (D == 1) edg = 6-Edg; if (D == 3) edg = 6-Edg; if (edg == 0) return; uint ovf = own&M_OVF; if (ovf > 0) { bool q = edg > ovf; uint pp = (q)? ovf: edg; own -= pp; if (q) own = (own&U_HGT)|(Hgt<> 4; uint hhh = (PP & 12) >> 2; uint www = (PP & 3); uint S = 2 * SS + sss; uint H = 8 * HH + hh + hhh; uint W = 8 * WW + ww + www; /////////////////////////////// push_e(S, H, W, S+1, H, W, 0); if (W!=size_w-1) push_e(S, H, W, S, H, W+1, 4); if (W!=0) push_o(S, H, W, S, H, W-1, 3); if (H!=size_h-1) push_e(S, H, W, S, H+1, W, 2); if (H!=0) push_o(S, H, W, S, H-1, W, 1); if (S!=0) push_o(S, H, W, S-1, H, W, 5); } __global__ void time_set(uint time) { /////////////////////////////// uint sa = blockDim.x * blockIdx.x + threadIdx.x; uint H = sa / size_w; uint W = sa % size_w; /////////////////////////////// DAT[ADR(size_s, H, W)] = (DAT[ADR(size_s, H, W)]&U_HGT)|(time<>>(time); wave<<< grid, block >>>(0, 4, 0, 4); wave<<< grid, block >>>(0, 4, 4, 0); wave<<< grid, block >>>(4, 0, 0, 4); wave<<< grid, block >>>(4, 0, 4, 0); } } void data_set(int penalty_w, int penalty_h, int inhibit_a, int inhibit_b) { for (int H = 0; H < SIZE_H; H++) { for (int W = 0; W < SIZE_W; W++) { for (int S = 0; S < SIZE_S; S++) { /////////////////////////////// h_DAT[h_ADR(S, H, W)] = 0; /////////////////////////////// h_DAT[h_ADR(S, H, W)] = (cost(S+1, H, W)/6)<= 0; S--) { if (S == 0) cut(S, H, W); else { uint aa = h_DAT[h_ADR(S, H, W)]>>S_HGT; uint bb = h_DAT[h_ADR(S-1, H, W)]>>S_HGT; if (aa > bb) { if (aa - bb > LOOP/10) { cut(S, H, W); break; } } } } } } } int graph_cut(int penalty_w, int penalty_h, int inhibit_a, int inhibit_b) { printf("S-H-W=%d-%d-%d LOOP=%d\n", SIZE_S, SIZE_H, SIZE_W, LOOP); if (SIZE_S% 4) { printf("ERROR! SIZE_S must be the integer multiole of 4\n"); return 0; } if (SIZE_H%16) { printf("ERROR! SIZE_H must be the integer multiole of 16\n"); return 0; } if (SIZE_W%32) { printf("ERROR! SIZE_W must be the integer multiole of 32\n"); return 0; } cudaMemcpyToSymbol(size_s, &SIZE_S, sizeof(int)); cudaMemcpyToSymbol(size_h, &SIZE_H, sizeof(int)); cudaMemcpyToSymbol(size_w, &SIZE_W, sizeof(int)); cudaMemcpyToSymbol(size_hw, &SIZE_HW, sizeof(int)); cudaMemcpyToSymbol(size_shw, &SIZE_SHW, sizeof(int)); DAT_SIZE = sizeof(uint) * (SIZE_S+2)*SIZE_H*SIZE_W; h_DAT = new uint[(SIZE_S+2)*SIZE_H*SIZE_W]; cudaMalloc((void **)&d_DAT, DAT_SIZE); cudaMemcpyToSymbol(DAT, &d_DAT, sizeof(uint*)); data_set(penalty_w, penalty_h, inhibit_a, inhibit_b); //---------------------------// auto start = std::chrono::system_clock::now(); wave_cut(LOOP); auto end = std::chrono::system_clock::now(); auto dur = end - start; int usec = std::chrono::duration_cast(dur).count(); printf("wave time = %dus\n", usec); int flow = sink_chk(); printf("wave flow = %d\n", flow); //---------------------------// dep_set(); cudaFree(d_DAT); delete [] h_DAT; return flow; }