LLVM OpenMP* Runtime Library
kmp_dispatch.cpp
1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 // The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 /*
17  * Dynamic scheduling initialization and dispatch.
18  *
19  * NOTE: __kmp_nth is a constant inside any dispatch loop, however
20  * it may change values between parallel regions. __kmp_max_nth
21  * is the largest value __kmp_nth may take, and 1 is the smallest.
22  *
23  */
24 
25 /* ------------------------------------------------------------------------ */
26 /* ------------------------------------------------------------------------ */
27 
28 #include "kmp.h"
29 #include "kmp_i18n.h"
30 #include "kmp_itt.h"
31 #include "kmp_str.h"
32 #include "kmp_error.h"
33 #include "kmp_stats.h"
34 #if KMP_OS_WINDOWS && KMP_ARCH_X86
35  #include <float.h>
36 #endif
37 
38 #if OMPT_SUPPORT
39 #include "ompt-internal.h"
40 #include "ompt-specific.h"
41 #endif
42 
43 /* ------------------------------------------------------------------------ */
44 /* ------------------------------------------------------------------------ */
45 
46 // template for type limits
47 template< typename T >
48 struct i_maxmin {
49  static const T mx;
50  static const T mn;
51 };
52 template<>
53 struct i_maxmin< int > {
54  static const int mx = 0x7fffffff;
55  static const int mn = 0x80000000;
56 };
57 template<>
58 struct i_maxmin< unsigned int > {
59  static const unsigned int mx = 0xffffffff;
60  static const unsigned int mn = 0x00000000;
61 };
62 template<>
63 struct i_maxmin< long long > {
64  static const long long mx = 0x7fffffffffffffffLL;
65  static const long long mn = 0x8000000000000000LL;
66 };
67 template<>
68 struct i_maxmin< unsigned long long > {
69  static const unsigned long long mx = 0xffffffffffffffffLL;
70  static const unsigned long long mn = 0x0000000000000000LL;
71 };
72 //-------------------------------------------------------------------------
73 
74 #ifdef KMP_STATIC_STEAL_ENABLED
75 
76  // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
77  template< typename T >
78  struct dispatch_private_infoXX_template {
79  typedef typename traits_t< T >::unsigned_t UT;
80  typedef typename traits_t< T >::signed_t ST;
81  UT count; // unsigned
82  T ub;
83  /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
84  T lb;
85  ST st; // signed
86  UT tc; // unsigned
87  T static_steal_counter; // for static_steal only; maybe better to put after ub
88 
89  /* parm[1-4] are used in different ways by different scheduling algorithms */
90 
91  // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
92  // a) parm3 is properly aligned and
93  // b) all parm1-4 are in the same cache line.
94  // Because parm1-4 are used together, performance seems to be better
95  // if they are in the same cache line (not measured, though).
96 
97  struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4
98  T parm1;
99  T parm2;
100  T parm3;
101  T parm4;
102  };
103 
104  UT ordered_lower; // unsigned
105  UT ordered_upper; // unsigned
106  #if KMP_OS_WINDOWS
107  T last_upper;
108  #endif /* KMP_OS_WINDOWS */
109  };
110 
111 #else /* KMP_STATIC_STEAL_ENABLED */
112 
113  // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
114  template< typename T >
115  struct dispatch_private_infoXX_template {
116  typedef typename traits_t< T >::unsigned_t UT;
117  typedef typename traits_t< T >::signed_t ST;
118  T lb;
119  T ub;
120  ST st; // signed
121  UT tc; // unsigned
122 
123  T parm1;
124  T parm2;
125  T parm3;
126  T parm4;
127 
128  UT count; // unsigned
129 
130  UT ordered_lower; // unsigned
131  UT ordered_upper; // unsigned
132  #if KMP_OS_WINDOWS
133  T last_upper;
134  #endif /* KMP_OS_WINDOWS */
135  };
136 
137 #endif /* KMP_STATIC_STEAL_ENABLED */
138 
139 // replaces dispatch_private_info structure and dispatch_private_info_t type
140 template< typename T >
141 struct KMP_ALIGN_CACHE dispatch_private_info_template {
142  // duplicate alignment here, otherwise size of structure is not correct in our compiler
143  union KMP_ALIGN_CACHE private_info_tmpl {
144  dispatch_private_infoXX_template< T > p;
145  dispatch_private_info64_t p64;
146  } u;
147  enum sched_type schedule; /* scheduling algorithm */
148  kmp_uint32 ordered; /* ordered clause specified */
149  kmp_uint32 ordered_bumped;
150  kmp_int32 ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making ordered_iteration scalar
151  dispatch_private_info * next; /* stack of buffers for nest of serial regions */
152  kmp_uint32 nomerge; /* don't merge iters if serialized */
153  kmp_uint32 type_size;
154  enum cons_type pushed_ws;
155 };
156 
157 
158 // replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types
159 template< typename UT >
160 struct dispatch_shared_infoXX_template {
161  /* chunk index under dynamic, number of idle threads under static-steal;
162  iteration index otherwise */
163  volatile UT iteration;
164  volatile UT num_done;
165  volatile UT ordered_iteration;
166  UT ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size after making ordered_iteration scalar
167 };
168 
169 // replaces dispatch_shared_info structure and dispatch_shared_info_t type
170 template< typename UT >
171 struct dispatch_shared_info_template {
172  // we need union here to keep the structure size
173  union shared_info_tmpl {
174  dispatch_shared_infoXX_template< UT > s;
175  dispatch_shared_info64_t s64;
176  } u;
177  volatile kmp_uint32 buffer_index;
178 };
179 
180 /* ------------------------------------------------------------------------ */
181 /* ------------------------------------------------------------------------ */
182 
183 #undef USE_TEST_LOCKS
184 
185 // test_then_add template (general template should NOT be used)
186 template< typename T >
187 static __forceinline T
188 test_then_add( volatile T *p, T d ) { KMP_ASSERT(0); };
189 
190 template<>
191 __forceinline kmp_int32
192 test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d )
193 {
194  kmp_int32 r;
195  r = KMP_TEST_THEN_ADD32( p, d );
196  return r;
197 }
198 
199 template<>
200 __forceinline kmp_int64
201 test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d )
202 {
203  kmp_int64 r;
204  r = KMP_TEST_THEN_ADD64( p, d );
205  return r;
206 }
207 
208 // test_then_inc_acq template (general template should NOT be used)
209 template< typename T >
210 static __forceinline T
211 test_then_inc_acq( volatile T *p ) { KMP_ASSERT(0); };
212 
213 template<>
214 __forceinline kmp_int32
215 test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p )
216 {
217  kmp_int32 r;
218  r = KMP_TEST_THEN_INC_ACQ32( p );
219  return r;
220 }
221 
222 template<>
223 __forceinline kmp_int64
224 test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p )
225 {
226  kmp_int64 r;
227  r = KMP_TEST_THEN_INC_ACQ64( p );
228  return r;
229 }
230 
231 // test_then_inc template (general template should NOT be used)
232 template< typename T >
233 static __forceinline T
234 test_then_inc( volatile T *p ) { KMP_ASSERT(0); };
235 
236 template<>
237 __forceinline kmp_int32
238 test_then_inc< kmp_int32 >( volatile kmp_int32 *p )
239 {
240  kmp_int32 r;
241  r = KMP_TEST_THEN_INC32( p );
242  return r;
243 }
244 
245 template<>
246 __forceinline kmp_int64
247 test_then_inc< kmp_int64 >( volatile kmp_int64 *p )
248 {
249  kmp_int64 r;
250  r = KMP_TEST_THEN_INC64( p );
251  return r;
252 }
253 
254 // compare_and_swap template (general template should NOT be used)
255 template< typename T >
256 static __forceinline kmp_int32
257 compare_and_swap( volatile T *p, T c, T s ) { KMP_ASSERT(0); };
258 
259 template<>
260 __forceinline kmp_int32
261 compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s )
262 {
263  return KMP_COMPARE_AND_STORE_REL32( p, c, s );
264 }
265 
266 template<>
267 __forceinline kmp_int32
268 compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s )
269 {
270  return KMP_COMPARE_AND_STORE_REL64( p, c, s );
271 }
272 
273 /*
274  Spin wait loop that first does pause, then yield.
275  Waits until function returns non-zero when called with *spinner and check.
276  Does NOT put threads to sleep.
277 #if USE_ITT_BUILD
278  Arguments:
279  obj -- the higher-level synchronization object to report to ittnotify. It is used to report
280  locks consistently. For example, if a lock is acquired immediately, its address is
281  reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
282  immediately and the lock routine calls KMP_WAIT_YIELD(), the latter should report the same
283  address, not the address of the low-level spinner.
284 #endif // USE_ITT_BUILD
285 */
286 template< typename UT >
287 // ToDo: make inline function (move to header file for icl)
288 static UT // unsigned 4- or 8-byte type
289 __kmp_wait_yield( volatile UT * spinner,
290  UT checker,
291  kmp_uint32 (* pred)( UT, UT )
292  USE_ITT_BUILD_ARG(void * obj) // Higher-level synchronization object, or NULL.
293  )
294 {
295  // note: we may not belong to a team at this point
296  register volatile UT * spin = spinner;
297  register UT check = checker;
298  register kmp_uint32 spins;
299  register kmp_uint32 (*f) ( UT, UT ) = pred;
300  register UT r;
301 
302  KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
303  KMP_INIT_YIELD( spins );
304  // main wait spin loop
305  while(!f(r = *spin, check))
306  {
307  KMP_FSYNC_SPIN_PREPARE( obj );
308  /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
309  It causes problems with infinite recursion because of exit lock */
310  /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
311  __kmp_abort_thread(); */
312 
313  // If we are oversubscribed,
314  // or have waited a bit (and KMP_LIBRARY=throughput), then yield.
315  // The pause is in the following code.
316  KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
317  KMP_YIELD_SPIN( spins );
318  }
319  KMP_FSYNC_SPIN_ACQUIRED( obj );
320  return r;
321 }
322 
323 template< typename UT >
324 static kmp_uint32 __kmp_eq( UT value, UT checker) {
325  return value == checker;
326 }
327 
328 template< typename UT >
329 static kmp_uint32 __kmp_neq( UT value, UT checker) {
330  return value != checker;
331 }
332 
333 template< typename UT >
334 static kmp_uint32 __kmp_lt( UT value, UT checker) {
335  return value < checker;
336 }
337 
338 template< typename UT >
339 static kmp_uint32 __kmp_ge( UT value, UT checker) {
340  return value >= checker;
341 }
342 
343 template< typename UT >
344 static kmp_uint32 __kmp_le( UT value, UT checker) {
345  return value <= checker;
346 }
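// Illustrative sketch (not part of the original source): how __kmp_wait_yield
// combines with the predicate templates above. The same pattern appears later
// in __kmp_dispatch_init(), where a thread spins until its dispatch buffer
// becomes free:
//
//     __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index,
//                                     __kmp_eq< kmp_uint32 >
//                                     USE_ITT_BUILD_ARG( NULL ) );
//
// i.e. the loop returns once *(&sh->buffer_index) == my_buffer_index, pausing
// and yielding (but never sleeping) while it waits.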
347 
348 
349 /* ------------------------------------------------------------------------ */
350 /* ------------------------------------------------------------------------ */
351 
352 static void
353 __kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
354 {
355  kmp_info_t *th;
356 
357  KMP_DEBUG_ASSERT( gtid_ref );
358 
359  if ( __kmp_env_consistency_check ) {
360  th = __kmp_threads[*gtid_ref];
361  if ( th -> th.th_root -> r.r_active
362  && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) {
363 #if KMP_USE_DYNAMIC_LOCK
364  __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0 );
365 #else
366  __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL );
367 #endif
368  }
369  }
370 }
371 
372 template< typename UT >
373 static void
374 __kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
375 {
376  typedef typename traits_t< UT >::signed_t ST;
377  dispatch_private_info_template< UT > * pr;
378 
379  int gtid = *gtid_ref;
380 // int cid = *cid_ref;
381  kmp_info_t *th = __kmp_threads[ gtid ];
382  KMP_DEBUG_ASSERT( th -> th.th_dispatch );
383 
384  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) );
385  if ( __kmp_env_consistency_check ) {
386  pr = reinterpret_cast< dispatch_private_info_template< UT >* >
387  ( th -> th.th_dispatch -> th_dispatch_pr_current );
388  if ( pr -> pushed_ws != ct_none ) {
389 #if KMP_USE_DYNAMIC_LOCK
390  __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL, 0 );
391 #else
392  __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL );
393 #endif
394  }
395  }
396 
397  if ( ! th -> th.th_team -> t.t_serialized ) {
398  dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
399  ( th -> th.th_dispatch -> th_dispatch_sh_current );
400  UT lower;
401 
402  if ( ! __kmp_env_consistency_check ) {
403  pr = reinterpret_cast< dispatch_private_info_template< UT >* >
404  ( th -> th.th_dispatch -> th_dispatch_pr_current );
405  }
406  lower = pr->u.p.ordered_lower;
407 
408  #if ! defined( KMP_GOMP_COMPAT )
409  if ( __kmp_env_consistency_check ) {
410  if ( pr->ordered_bumped ) {
411  struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
412  __kmp_error_construct2(
413  kmp_i18n_msg_CnsMultipleNesting,
414  ct_ordered_in_pdo, loc_ref,
415  & p->stack_data[ p->w_top ]
416  );
417  }
418  }
419  #endif /* !defined(KMP_GOMP_COMPAT) */
420 
421  KMP_MB();
422  #ifdef KMP_DEBUG
423  {
424  const char * buff;
425  // create format specifiers before the debug output
426  buff = __kmp_str_format(
427  "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n",
428  traits_t< UT >::spec, traits_t< UT >::spec );
429  KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
430  __kmp_str_free( &buff );
431  }
432  #endif
433 
434  __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
435  USE_ITT_BUILD_ARG( NULL )
436  );
437  KMP_MB(); /* is this necessary? */
438  #ifdef KMP_DEBUG
439  {
440  const char * buff;
441  // create format specifiers before the debug output
442  buff = __kmp_str_format(
443  "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n",
444  traits_t< UT >::spec, traits_t< UT >::spec );
445  KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
446  __kmp_str_free( &buff );
447  }
448  #endif
449  }
450  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) );
451 }
452 
453 static void
454 __kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
455 {
456  kmp_info_t *th;
457 
458  if ( __kmp_env_consistency_check ) {
459  th = __kmp_threads[*gtid_ref];
460  if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) {
461  __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref );
462  }
463  }
464 }
465 
466 template< typename UT >
467 static void
468 __kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
469 {
470  typedef typename traits_t< UT >::signed_t ST;
471  dispatch_private_info_template< UT > * pr;
472 
473  int gtid = *gtid_ref;
474 // int cid = *cid_ref;
475  kmp_info_t *th = __kmp_threads[ gtid ];
476  KMP_DEBUG_ASSERT( th -> th.th_dispatch );
477 
478  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) );
479  if ( __kmp_env_consistency_check ) {
480  pr = reinterpret_cast< dispatch_private_info_template< UT >* >
481  ( th -> th.th_dispatch -> th_dispatch_pr_current );
482  if ( pr -> pushed_ws != ct_none ) {
483  __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref );
484  }
485  }
486 
487  if ( ! th -> th.th_team -> t.t_serialized ) {
488  dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
489  ( th -> th.th_dispatch -> th_dispatch_sh_current );
490 
491  if ( ! __kmp_env_consistency_check ) {
492  pr = reinterpret_cast< dispatch_private_info_template< UT >* >
493  ( th -> th.th_dispatch -> th_dispatch_pr_current );
494  }
495 
496  KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration );
497  #if ! defined( KMP_GOMP_COMPAT )
498  if ( __kmp_env_consistency_check ) {
499  if ( pr->ordered_bumped != 0 ) {
500  struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
501  /* How to test it? - OM */
502  __kmp_error_construct2(
503  kmp_i18n_msg_CnsMultipleNesting,
504  ct_ordered_in_pdo, loc_ref,
505  & p->stack_data[ p->w_top ]
506  );
507  }
508  }
509  #endif /* !defined(KMP_GOMP_COMPAT) */
510 
511  KMP_MB(); /* Flush all pending memory write invalidates. */
512 
513  pr->ordered_bumped += 1;
514 
515  KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
516  gtid, pr->ordered_bumped ) );
517 
518  KMP_MB(); /* Flush all pending memory write invalidates. */
519 
520  /* TODO use general release procedure? */
521  test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
522 
523  KMP_MB(); /* Flush all pending memory write invalidates. */
524  }
525  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) );
526 }
527 
528 /* Computes and returns x to the power of y, where y must be a non-negative integer */
529 template< typename UT >
530 static __forceinline long double
531 __kmp_pow(long double x, UT y) {
532  long double s=1.0L;
533 
534  KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
535  //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
536  while(y) {
537  if ( y & 1 )
538  s *= x;
539  x *= x;
540  y >>= 1;
541  }
542  return s;
543 }
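// Illustrative check (not part of the original source): __kmp_pow uses
// exponentiation by squaring, consuming the bits of y from least to most
// significant. For x = 0.5 and y = 5 (binary 101):
//
//     bit 0 set:   s = 1.0 * 0.5    = 0.5,     x -> 0.25
//     bit 1 clear:                             x -> 0.0625
//     bit 2 set:   s = 0.5 * 0.0625 = 0.03125
//
// so __kmp_pow< kmp_uint32 >( 0.5L, 5 ) == 0.03125 == 0.5^5, computed in
// O(log y) multiplications.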
544 
545 /* Computes and returns the number of unassigned iterations after idx chunks have been assigned
546  (the total number of unassigned iterations in chunks with index greater than or equal to idx).
547  __forceinline seems to be broken here: if this function is __forceinline'd, the behavior is wrong
548  (one of the unit tests, sch_guided_analytical_basic.cpp, fails).
549 */
550 template< typename T >
551 static __inline typename traits_t< T >::unsigned_t
552 __kmp_dispatch_guided_remaining(
553  T tc,
554  typename traits_t< T >::floating_t base,
555  typename traits_t< T >::unsigned_t idx
556 ) {
557  /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
558  least for ICL 8.1, long double arithmetic may not really have
559  long double precision, even with /Qlong_double. Currently, we
560  workaround that in the caller code, by manipulating the FPCW for
561  Windows* OS on IA-32 architecture. The lack of precision is not
562  expected to be a correctness issue, though.
563  */
564  typedef typename traits_t< T >::unsigned_t UT;
565 
566  long double x = tc * __kmp_pow< UT >(base, idx);
567  UT r = (UT) x;
568  if ( x == r )
569  return r;
570  return r + 1;
571 }
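// Illustrative example (not part of the original source), assuming tc = 100,
// nproc = 4 so base = 1 - 0.5/nproc = 0.875, and idx = 2 chunks already handed out:
//
//     tc * base^idx = 100 * 0.765625 = 76.5625
//
// which is not an integer, so the function returns 77, i.e. the ceiling of
// tc * base^idx unassigned iterations.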
572 
573 // Parameters of the guided-iterative algorithm:
574 // p2 = n * nproc * ( chunk + 1 ) // point of switching to dynamic
575 // p3 = 1 / ( n * nproc ) // remaining iterations multiplier
576 // By default n = 2. For example, with n = 3 the chunk distribution will be flatter.
577 // With n = 1 the first chunk is the same as for the static schedule, i.e. trip / nproc.
578 static int guided_int_param = 2;
579 static double guided_flt_param = 0.5;// = 1.0 / guided_int_param;
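// Illustrative example (not part of the original source): with nproc = 4 and
// chunk = 7, the guided-iterative case below computes
//
//     parm2 = guided_int_param * nproc * ( chunk + 1 ) = 2 * 4 * 8 = 64
//     parm3 = guided_flt_param / nproc                 = 0.5 / 4   = 0.125
//
// so the loop switches to dynamic-style chunks once fewer than 64 iterations
// remain, and until then each grab takes roughly 12.5% of the remaining iterations.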
580 
581 // UT - unsigned flavor of T, ST - signed flavor of T,
582 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
583 template< typename T >
584 static void
585 __kmp_dispatch_init(
586  ident_t * loc,
587  int gtid,
588  enum sched_type schedule,
589  T lb,
590  T ub,
591  typename traits_t< T >::signed_t st,
592  typename traits_t< T >::signed_t chunk,
593  int push_ws
594 ) {
595  typedef typename traits_t< T >::unsigned_t UT;
596  typedef typename traits_t< T >::signed_t ST;
597  typedef typename traits_t< T >::floating_t DBL;
598  static const int ___kmp_size_type = sizeof( UT );
599 
600  int active;
601  T tc;
602  kmp_info_t * th;
603  kmp_team_t * team;
604  kmp_uint32 my_buffer_index;
605  dispatch_private_info_template< T > * pr;
606  dispatch_shared_info_template< UT > volatile * sh;
607 
608  KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) );
609  KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) );
610 
611  if ( ! TCR_4( __kmp_init_parallel ) )
612  __kmp_parallel_initialize();
613 
614 #if INCLUDE_SSC_MARKS
615  SSC_MARK_DISPATCH_INIT();
616 #endif
617  #ifdef KMP_DEBUG
618  {
619  const char * buff;
620  // create format specifiers before the debug output
621  buff = __kmp_str_format(
622  "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
623  traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
624  KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) );
625  __kmp_str_free( &buff );
626  }
627  #endif
628  /* setup data */
629  th = __kmp_threads[ gtid ];
630  team = th -> th.th_team;
631  active = ! team -> t.t_serialized;
632  th->th.th_ident = loc;
633 
634 #if USE_ITT_BUILD
635  kmp_uint64 cur_chunk = chunk;
636  int itt_need_metadata_reporting = __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
637  KMP_MASTER_GTID(gtid) &&
638 #if OMP_40_ENABLED
639  th->th.th_teams_microtask == NULL &&
640 #endif
641  team->t.t_active_level == 1;
642 #endif
643  if ( ! active ) {
644  pr = reinterpret_cast< dispatch_private_info_template< T >* >
645  ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
646  } else {
647  KMP_DEBUG_ASSERT( th->th.th_dispatch ==
648  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
649 
650  my_buffer_index = th->th.th_dispatch->th_disp_index ++;
651 
652  /* What happens when the number of threads changes? Do we need to resize the buffer? */
653  pr = reinterpret_cast< dispatch_private_info_template< T > * >
654  ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
655  sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * >
656  ( &team -> t.t_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
657  }
658 
659  /* Pick up the nomerge/ordered bits from the scheduling type */
660  if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) {
661  pr->nomerge = TRUE;
662  schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
663  } else {
664  pr->nomerge = FALSE;
665  }
666  pr->type_size = ___kmp_size_type; // remember the size of variables
667  if ( kmp_ord_lower & schedule ) {
668  pr->ordered = TRUE;
669  schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
670  } else {
671  pr->ordered = FALSE;
672  }
673  if ( schedule == kmp_sch_static ) {
674  schedule = __kmp_static;
675  } else {
676  if ( schedule == kmp_sch_runtime ) {
677  // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified)
678  schedule = team -> t.t_sched.r_sched_type;
679  // Detail the schedule if needed (global controls are differentiated appropriately)
680  if ( schedule == kmp_sch_guided_chunked ) {
681  schedule = __kmp_guided;
682  } else if ( schedule == kmp_sch_static ) {
683  schedule = __kmp_static;
684  }
685  // Use the chunk size specified by OMP_SCHEDULE (or default if not specified)
686  chunk = team -> t.t_sched.chunk;
687 
688  #ifdef KMP_DEBUG
689  {
690  const char * buff;
691  // create format specifiers before the debug output
692  buff = __kmp_str_format(
693  "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
694  traits_t< ST >::spec );
695  KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
696  __kmp_str_free( &buff );
697  }
698  #endif
699  } else {
700  if ( schedule == kmp_sch_guided_chunked ) {
701  schedule = __kmp_guided;
702  }
703  if ( chunk <= 0 ) {
704  chunk = KMP_DEFAULT_CHUNK;
705  }
706  }
707 
708  if ( schedule == kmp_sch_auto ) {
709  // mapping and differentiation are done in __kmp_do_serial_initialize()
710  schedule = __kmp_auto;
711  #ifdef KMP_DEBUG
712  {
713  const char * buff;
714  // create format specifiers before the debug output
715  buff = __kmp_str_format(
716  "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n",
717  traits_t< ST >::spec );
718  KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
719  __kmp_str_free( &buff );
720  }
721  #endif
722  }
723 
724  /* guided analytical not safe for too many threads */
725  if ( team->t.t_nproc > 1<<20 && schedule == kmp_sch_guided_analytical_chunked ) {
726  schedule = kmp_sch_guided_iterative_chunked;
727  KMP_WARNING( DispatchManyThreads );
728  }
729  pr->u.p.parm1 = chunk;
730  }
731  KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper),
732  "unknown scheduling type" );
733 
734  pr->u.p.count = 0;
735 
736  if ( __kmp_env_consistency_check ) {
737  if ( st == 0 ) {
738  __kmp_error_construct(
739  kmp_i18n_msg_CnsLoopIncrZeroProhibited,
740  ( pr->ordered ? ct_pdo_ordered : ct_pdo ), loc
741  );
742  }
743  }
744 
745  tc = ( ub - lb + st );
746  if ( st != 1 ) {
747  if ( st < 0 ) {
748  if ( lb < ub ) {
749  tc = 0; // zero-trip
750  } else { // lb >= ub
751  tc = (ST)tc / st; // convert to signed division
752  }
753  } else { // st > 0
754  if ( ub < lb ) {
755  tc = 0; // zero-trip
756  } else { // ub >= lb
757  tc /= st;
758  }
759  }
760  } else if ( ub < lb ) { // st == 1
761  tc = 0; // zero-trip
762  }
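 // Illustrative example (not part of the original source): for lb = 0, ub = 9,
 // st = 2 the code above yields tc = (9 - 0 + 2) / 2 = 5, i.e. the iterations
 // 0, 2, 4, 6, 8; with st = 1 it reduces to tc = ub - lb + 1.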
763 
764  pr->u.p.lb = lb;
765  pr->u.p.ub = ub;
766  pr->u.p.st = st;
767  pr->u.p.tc = tc;
768 
769  #if KMP_OS_WINDOWS
770  pr->u.p.last_upper = ub + st;
771  #endif /* KMP_OS_WINDOWS */
772 
773  /* NOTE: only the active parallel region(s) have active ordered sections */
774 
775  if ( active ) {
776  if ( pr->ordered == 0 ) {
777  th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error;
778  th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error;
779  } else {
780  pr->ordered_bumped = 0;
781 
782  pr->u.p.ordered_lower = 1;
783  pr->u.p.ordered_upper = 0;
784 
785  th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >;
786  th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >;
787  }
788  }
789 
790  if ( __kmp_env_consistency_check ) {
791  enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
792  if ( push_ws ) {
793  __kmp_push_workshare( gtid, ws, loc );
794  pr->pushed_ws = ws;
795  } else {
796  __kmp_check_workshare( gtid, ws, loc );
797  pr->pushed_ws = ct_none;
798  }
799  }
800 
801  switch ( schedule ) {
802  #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
803  case kmp_sch_static_steal:
804  {
805  T nproc = team->t.t_nproc;
806  T ntc, init;
807 
808  KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) );
809 
810  ntc = (tc % chunk ? 1 : 0) + tc / chunk;
811  if ( nproc > 1 && ntc >= nproc ) {
812  T id = __kmp_tid_from_gtid(gtid);
813  T small_chunk, extras;
814 
815  small_chunk = ntc / nproc;
816  extras = ntc % nproc;
817 
818  init = id * small_chunk + ( id < extras ? id : extras );
819  pr->u.p.count = init;
820  pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 );
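 // Illustrative example (not part of the original source): with ntc = 10 chunks
 // and nproc = 4, small_chunk = 2 and extras = 2, so the initial ownership is
 //     id 0: chunks 0-2,  id 1: chunks 3-5,  id 2: chunks 6-7,  id 3: chunks 8-9
 // (count is the next chunk this thread executes; ub bounds the chunks it
 // currently owns).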
821 
822  pr->u.p.parm2 = lb;
823  //pr->pfields.parm3 = 0; // it's not used in static_steal
824  pr->u.p.parm4 = id;
825  pr->u.p.st = st;
826  break;
827  } else {
828  KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n",
829  gtid ) );
830  schedule = kmp_sch_static_balanced;
831  /* too few iterations: fall-through to kmp_sch_static_balanced */
832  } // if
833  /* FALL-THROUGH to static balanced */
834  } // case
835  #endif
836  case kmp_sch_static_balanced:
837  {
838  T nproc = team->t.t_nproc;
839  T init, limit;
840 
841  KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
842  gtid ) );
843 
844  if ( nproc > 1 ) {
845  T id = __kmp_tid_from_gtid(gtid);
846 
847  if ( tc < nproc ) {
848  if ( id < tc ) {
849  init = id;
850  limit = id;
851  pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
852  } else {
853  pr->u.p.count = 1; /* means no more chunks to execute */
854  pr->u.p.parm1 = FALSE;
855  break;
856  }
857  } else {
858  T small_chunk = tc / nproc;
859  T extras = tc % nproc;
860  init = id * small_chunk + (id < extras ? id : extras);
861  limit = init + small_chunk - (id < extras ? 0 : 1);
862  pr->u.p.parm1 = (id == nproc - 1);
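 // Illustrative example (not part of the original source): with tc = 10 and
 // nproc = 4, small_chunk = 2 and extras = 2, giving
 //     id 0: iterations 0-2,  id 1: 3-5,  id 2: 6-7,  id 3: 8-9
 // with parm1 (the lastprivate flag) set only for id 3.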
863  }
864  } else {
865  if ( tc > 0 ) {
866  init = 0;
867  limit = tc - 1;
868  pr->u.p.parm1 = TRUE;
869  } else {
870  // zero trip count
871  pr->u.p.count = 1; /* means no more chunks to execute */
872  pr->u.p.parm1 = FALSE;
873  break;
874  }
875  }
876 #if USE_ITT_BUILD
877  // Calculate chunk for metadata report
878  if ( itt_need_metadata_reporting )
879  cur_chunk = limit - init + 1;
880 #endif
881  if ( st == 1 ) {
882  pr->u.p.lb = lb + init;
883  pr->u.p.ub = lb + limit;
884  } else {
885  T ub_tmp = lb + limit * st; // calculated upper bound, "ub" is user-defined upper bound
886  pr->u.p.lb = lb + init * st;
887  // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly
888  if ( st > 0 ) {
889  pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp );
890  } else {
891  pr->u.p.ub = ( ub_tmp + st < ub ? ub : ub_tmp );
892  }
893  }
894  if ( pr->ordered ) {
895  pr->u.p.ordered_lower = init;
896  pr->u.p.ordered_upper = limit;
897  }
898  break;
899  } // case
900  case kmp_sch_guided_iterative_chunked :
901  {
902  T nproc = team->t.t_nproc;
903  KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid));
904 
905  if ( nproc > 1 ) {
906  if ( (2L * chunk + 1 ) * nproc >= tc ) {
907  /* chunk size too large, switch to dynamic */
908  schedule = kmp_sch_dynamic_chunked;
909  } else {
910  // when remaining iters become less than parm2 - switch to dynamic
911  pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 );
912  *(double*)&pr->u.p.parm3 = guided_flt_param / nproc; // may occupy parm3 and parm4
913  }
914  } else {
915  KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid));
916  schedule = kmp_sch_static_greedy;
917  /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
918  KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
919  pr->u.p.parm1 = tc;
920  } // if
921  } // case
922  break;
923  case kmp_sch_guided_analytical_chunked:
924  {
925  T nproc = team->t.t_nproc;
926  KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid));
927 
928  if ( nproc > 1 ) {
929  if ( (2L * chunk + 1 ) * nproc >= tc ) {
930  /* chunk size too large, switch to dynamic */
931  schedule = kmp_sch_dynamic_chunked;
932  } else {
933  /* commonly used term: (2 nproc - 1)/(2 nproc) */
934  DBL x;
935 
936  #if KMP_OS_WINDOWS && KMP_ARCH_X86
937  /* Linux* OS already has 64-bit computation by default for
938  long double, and on Windows* OS on Intel(R) 64,
939  /Qlong_double doesn't work. On Windows* OS
940  on IA-32 architecture, we need to set precision to
941  64-bit instead of the default 53-bit. Even though long
942  double doesn't work on Windows* OS on Intel(R) 64, the
943  resulting lack of precision is not expected to impact
944  the correctness of the algorithm, but this has not been
945  mathematically proven.
946  */
947  // save original FPCW and set precision to 64-bit, as
948  // Windows* OS on IA-32 architecture defaults to 53-bit
949  unsigned int oldFpcw = _control87(0,0);
950  _control87(_PC_64,_MCW_PC); // 0,0x30000
951  #endif
952  /* value used for comparison in solver for cross-over point */
953  long double target = ((long double)chunk * 2 + 1) * nproc / tc;
954 
955  /* crossover point--chunk indexes equal to or greater than
956  this point switch to dynamic-style scheduling */
957  UT cross;
958 
959  /* commonly used term: (2 nproc - 1)/(2 nproc) */
960  x = (long double)1.0 - (long double)0.5 / nproc;
961 
962  #ifdef KMP_DEBUG
963  { // test natural alignment
964  struct _test_a {
965  char a;
966  union {
967  char b;
968  DBL d;
969  };
970  } t;
971  ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
972  //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment );
973  KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 );
974  }
975  #endif // KMP_DEBUG
976 
977  /* save the term in thread private dispatch structure */
978  *(DBL*)&pr->u.p.parm3 = x;
979 
980  /* solve for the crossover point to the nearest integer i for which C_i <= chunk */
981  {
982  UT left, right, mid;
983  long double p;
984 
985  /* estimate initial upper and lower bound */
986 
987  /* doesn't matter what value right is as long as it is positive, but
988  it affects performance of the solver
989  */
990  right = 229;
991  p = __kmp_pow< UT >(x,right);
992  if ( p > target ) {
993  do{
994  p *= p;
995  right <<= 1;
996  } while(p>target && right < (1<<27));
997  left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */
998  } else {
999  left = 0;
1000  }
1001 
1002  /* bisection root-finding method */
1003  while ( left + 1 < right ) {
1004  mid = (left + right) / 2;
1005  if ( __kmp_pow< UT >(x,mid) > target ) {
1006  left = mid;
1007  } else {
1008  right = mid;
1009  }
1010  } // while
1011  cross = right;
1012  }
1013  /* assert sanity of computed crossover point */
1014  KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target);
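 // Illustrative example (not part of the original source): with nproc = 4,
 // chunk = 1 and tc = 1000, target = (2*1 + 1) * 4 / 1000 = 0.012 and
 // x = 1 - 0.5/4 = 0.875; since 0.875^33 is about 0.0122 > target while
 // 0.875^34 is about 0.0107 <= target, the bisection above yields cross = 34.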
1015 
1016  /* save the crossover point in thread private dispatch structure */
1017  pr->u.p.parm2 = cross;
1018 
1019  // C75803
1020  #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) )
1021  #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3)
1022  #else
1023  #define GUIDED_ANALYTICAL_WORKAROUND (x)
1024  #endif
1025  /* dynamic-style scheduling offset */
1026  pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk;
1027  #if KMP_OS_WINDOWS && KMP_ARCH_X86
1028  // restore FPCW
1029  _control87(oldFpcw,_MCW_PC);
1030  #endif
1031  } // if
1032  } else {
1033  KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",
1034  gtid ) );
1035  schedule = kmp_sch_static_greedy;
1036  /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
1037  pr->u.p.parm1 = tc;
1038  } // if
1039  } // case
1040  break;
1041  case kmp_sch_static_greedy:
1042  KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
1043  pr->u.p.parm1 = ( team -> t.t_nproc > 1 ) ?
1044  ( tc + team->t.t_nproc - 1 ) / team->t.t_nproc :
1045  tc;
1046  break;
1047  case kmp_sch_static_chunked :
1048  case kmp_sch_dynamic_chunked :
1049  KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid));
1050  break;
1051  case kmp_sch_trapezoidal :
1052  {
1053  /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
1054 
1055  T parm1, parm2, parm3, parm4;
1056  KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) );
1057 
1058  parm1 = chunk;
1059 
1060  /* F : size of the first cycle */
1061  parm2 = ( tc / (2 * team->t.t_nproc) );
1062 
1063  if ( parm2 < 1 ) {
1064  parm2 = 1;
1065  }
1066 
1067  /* L : size of the last cycle. Make sure the last cycle
1068  * is not larger than the first cycle.
1069  */
1070  if ( parm1 < 1 ) {
1071  parm1 = 1;
1072  } else if ( parm1 > parm2 ) {
1073  parm1 = parm2;
1074  }
1075 
1076  /* N : number of cycles */
1077  parm3 = ( parm2 + parm1 );
1078  parm3 = ( 2 * tc + parm3 - 1) / parm3;
1079 
1080  if ( parm3 < 2 ) {
1081  parm3 = 2;
1082  }
1083 
1084  /* sigma : decreasing incr of the trapezoid */
1085  parm4 = ( parm3 - 1 );
1086  parm4 = ( parm2 - parm1 ) / parm4;
1087 
1088  // pointless check, because parm4 >= 0 always
1089  //if ( parm4 < 0 ) {
1090  // parm4 = 0;
1091  //}
1092 
1093  pr->u.p.parm1 = parm1;
1094  pr->u.p.parm2 = parm2;
1095  pr->u.p.parm3 = parm3;
1096  pr->u.p.parm4 = parm4;
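 // Illustrative example (not part of the original source): for tc = 1000,
 // nproc = 4 and chunk = 1 the code above yields parm1 = 1 (minimum chunk),
 // parm2 = 125 (first chunk), parm3 = 16 (number of chunks) and parm4 = 8
 // (each successive chunk shrinks by 8 iterations).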
1097  } // case
1098  break;
1099 
1100  default:
1101  {
1102  __kmp_msg(
1103  kmp_ms_fatal, // Severity
1104  KMP_MSG( UnknownSchedTypeDetected ), // Primary message
1105  KMP_HNT( GetNewerLibrary ), // Hint
1106  __kmp_msg_null // Variadic argument list terminator
1107  );
1108  }
1109  break;
1110  } // switch
1111  pr->schedule = schedule;
1112  if ( active ) {
1113  /* This buffer is free for this thread to use once sh->buffer_index reaches my_buffer_index */
1114 
1115  KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n",
1116  gtid, my_buffer_index, sh->buffer_index) );
1117  __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 >
1118  USE_ITT_BUILD_ARG( NULL )
1119  );
1120  // Note: KMP_WAIT_YIELD() cannot be used here: buffer_index and my_buffer_index are
1121  // *always* 32-bit integers.
1122  KMP_MB(); /* is this necessary? */
1123  KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n",
1124  gtid, my_buffer_index, sh->buffer_index) );
1125 
1126  th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr;
1127  th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*) sh;
1128 #if USE_ITT_BUILD
1129  if ( pr->ordered ) {
1130  __kmp_itt_ordered_init( gtid );
1131  }; // if
1132  // Report loop metadata
1133  if ( itt_need_metadata_reporting ) {
1134  // Only report metadata by master of active team at level 1
1135  kmp_uint64 schedtype = 0;
1136  switch ( schedule ) {
1137  case kmp_sch_static_chunked:
1138  case kmp_sch_static_balanced:// Chunk is calculated in the switch above
1139  break;
1140  case kmp_sch_static_greedy:
1141  cur_chunk = pr->u.p.parm1;
1142  break;
1143  case kmp_sch_dynamic_chunked:
1144  schedtype = 1;
1145  break;
1146  case kmp_sch_guided_iterative_chunked:
1147  case kmp_sch_guided_analytical_chunked:
1148  schedtype = 2;
1149  break;
1150  default:
1151 // Should we put this case under "static"?
1152 // case kmp_sch_static_steal:
1153  schedtype = 3;
1154  break;
1155  }
1156  __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk);
1157  }
1158 #endif /* USE_ITT_BUILD */
1159  }; // if
1160 
1161  #ifdef KMP_DEBUG
1162  {
1163  const char * buff;
1164  // create format specifiers before the debug output
1165  buff = __kmp_str_format(
1166  "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \
1167  " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \
1168  " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1169  traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec,
1170  traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec,
1171  traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec,
1172  traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec );
1173  KD_TRACE(10, ( buff,
1174  gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
1175  pr->u.p.st, pr->u.p.tc, pr->u.p.count,
1176  pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
1177  pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) );
1178  __kmp_str_free( &buff );
1179  }
1180  #endif
1181  #if ( KMP_STATIC_STEAL_ENABLED )
1182  if ( ___kmp_size_type < 8 ) {
1183  // It cannot be guaranteed that after execution of a loop with some other schedule kind
1184  // all the parm3 variables will contain the same value.
1185  // Even if all parm3 values were the same, a bad case could still exist, such as using 0 and 1
1186  // rather than a program-lifetime increment.
1187  // So a dedicated variable is required; 'static_steal_counter' is used for this.
1188  if( schedule == kmp_sch_static_steal ) {
1189  // Other threads will inspect this variable when searching for a victim.
1190  // This is a flag showing that other threads may steal from this thread since then.
1191  volatile T * p = &pr->u.p.static_steal_counter;
1192  *p = *p + 1;
1193  }
1194  }
1195  #endif // ( KMP_STATIC_STEAL_ENABLED )
1196 
1197 #if OMPT_SUPPORT && OMPT_TRACE
1198  if ((ompt_status == ompt_status_track_callback) &&
1199  ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
1200  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1201  ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
1202  ompt_callbacks.ompt_callback(ompt_event_loop_begin)(
1203  team_info->parallel_id, task_info->task_id, team_info->microtask);
1204  }
1205 #endif
1206 }
1207 
1208 /*
1209  * For ordered loops, either __kmp_dispatch_finish() should be called after
1210  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1211  * every chunk of iterations. If the ordered section(s) were not executed
1212  * for this iteration (or every iteration in this chunk), we need to set the
1213  * ordered iteration counters so that the next thread can proceed.
1214  */
1215 template< typename UT >
1216 static void
1217 __kmp_dispatch_finish( int gtid, ident_t *loc )
1218 {
1219  typedef typename traits_t< UT >::signed_t ST;
1220  kmp_info_t *th = __kmp_threads[ gtid ];
1221 
1222  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) );
1223  if ( ! th -> th.th_team -> t.t_serialized ) {
1224 
1225  dispatch_private_info_template< UT > * pr =
1226  reinterpret_cast< dispatch_private_info_template< UT >* >
1227  ( th->th.th_dispatch->th_dispatch_pr_current );
1228  dispatch_shared_info_template< UT > volatile * sh =
1229  reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1230  ( th->th.th_dispatch->th_dispatch_sh_current );
1231  KMP_DEBUG_ASSERT( pr );
1232  KMP_DEBUG_ASSERT( sh );
1233  KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1234  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1235 
1236  if ( pr->ordered_bumped ) {
1237  KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1238  gtid ) );
1239  pr->ordered_bumped = 0;
1240  } else {
1241  UT lower = pr->u.p.ordered_lower;
1242 
1243  #ifdef KMP_DEBUG
1244  {
1245  const char * buff;
1246  // create format specifiers before the debug output
1247  buff = __kmp_str_format(
1248  "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n",
1249  traits_t< UT >::spec, traits_t< UT >::spec );
1250  KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1251  __kmp_str_free( &buff );
1252  }
1253  #endif
1254 
1255  __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1256  USE_ITT_BUILD_ARG(NULL)
1257  );
1258  KMP_MB(); /* is this necessary? */
1259  #ifdef KMP_DEBUG
1260  {
1261  const char * buff;
1262  // create format specifiers before the debug output
1263  buff = __kmp_str_format(
1264  "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n",
1265  traits_t< UT >::spec, traits_t< UT >::spec );
1266  KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1267  __kmp_str_free( &buff );
1268  }
1269  #endif
1270 
1271  test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
1272  } // if
1273  } // if
1274  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) );
1275 }
1276 
1277 #ifdef KMP_GOMP_COMPAT
1278 
1279 template< typename UT >
1280 static void
1281 __kmp_dispatch_finish_chunk( int gtid, ident_t *loc )
1282 {
1283  typedef typename traits_t< UT >::signed_t ST;
1284  kmp_info_t *th = __kmp_threads[ gtid ];
1285 
1286  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) );
1287  if ( ! th -> th.th_team -> t.t_serialized ) {
1288 // int cid;
1289  dispatch_private_info_template< UT > * pr =
1290  reinterpret_cast< dispatch_private_info_template< UT >* >
1291  ( th->th.th_dispatch->th_dispatch_pr_current );
1292  dispatch_shared_info_template< UT > volatile * sh =
1293  reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1294  ( th->th.th_dispatch->th_dispatch_sh_current );
1295  KMP_DEBUG_ASSERT( pr );
1296  KMP_DEBUG_ASSERT( sh );
1297  KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1298  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1299 
1300 // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1301  UT lower = pr->u.p.ordered_lower;
1302  UT upper = pr->u.p.ordered_upper;
1303  UT inc = upper - lower + 1;
1304 
1305  if ( pr->ordered_bumped == inc ) {
1306  KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1307  gtid ) );
1308  pr->ordered_bumped = 0;
1309  } else {
1310  inc -= pr->ordered_bumped;
1311 
1312  #ifdef KMP_DEBUG
1313  {
1314  const char * buff;
1315  // create format specifiers before the debug output
1316  buff = __kmp_str_format(
1317  "__kmp_dispatch_finish_chunk: T#%%d before wait: " \
1318  "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1319  traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1320  KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) );
1321  __kmp_str_free( &buff );
1322  }
1323  #endif
1324 
1325  __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1326  USE_ITT_BUILD_ARG(NULL)
1327  );
1328 
1329  KMP_MB(); /* is this necessary? */
1330  KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n",
1331  gtid ) );
1332  pr->ordered_bumped = 0;
1334  #ifdef KMP_DEBUG
1335  {
1336  const char * buff;
1337  // create format specifiers before the debug output
1338  buff = __kmp_str_format(
1339  "__kmp_dispatch_finish_chunk: T#%%d after wait: " \
1340  "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1341  traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1342  KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) );
1343  __kmp_str_free( &buff );
1344  }
1345  #endif
1346 
1347  test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc);
1348  }
1349 // }
1350  }
1351  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) );
1352 }
1353 
1354 #endif /* KMP_GOMP_COMPAT */
1355 
1356 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0
1357  * (no more work), then tell OMPT the loop is over. In some cases
1358  * kmp_dispatch_fini() is not called. */
1359 #if OMPT_SUPPORT && OMPT_TRACE
1360 #define OMPT_LOOP_END \
1361  if (status == 0) { \
1362  if ((ompt_status == ompt_status_track_callback) && \
1363  ompt_callbacks.ompt_callback(ompt_event_loop_end)) { \
1364  ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \
1365  ompt_task_info_t *task_info = __ompt_get_taskinfo(0); \
1366  ompt_callbacks.ompt_callback(ompt_event_loop_end)( \
1367  team_info->parallel_id, task_info->task_id); \
1368  } \
1369  }
1370 #else
1371 #define OMPT_LOOP_END // no-op
1372 #endif
1373 
1374 template< typename T >
1375 static int
1376 __kmp_dispatch_next(
1377  ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st
1378 ) {
1379 
1380  typedef typename traits_t< T >::unsigned_t UT;
1381  typedef typename traits_t< T >::signed_t ST;
1382  typedef typename traits_t< T >::floating_t DBL;
1383 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1384  static const int ___kmp_size_type = sizeof( UT );
1385 #endif
1386 
1387  int status;
1388  dispatch_private_info_template< T > * pr;
1389  kmp_info_t * th = __kmp_threads[ gtid ];
1390  kmp_team_t * team = th -> th.th_team;
1391 
1392  KMP_DEBUG_ASSERT( p_lb && p_ub && p_st ); // AC: these cannot be NULL
1393  #ifdef KMP_DEBUG
1394  {
1395  const char * buff;
1396  // create format specifiers before the debug output
1397  buff = __kmp_str_format(
1398  "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n",
1399  traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1400  KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last ) );
1401  __kmp_str_free( &buff );
1402  }
1403  #endif
1404 
1405  if ( team -> t.t_serialized ) {
1406  /* NOTE: serialize this dispatch because we are not at the active level */
1407  pr = reinterpret_cast< dispatch_private_info_template< T >* >
1408  ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
1409  KMP_DEBUG_ASSERT( pr );
1410 
1411  if ( (status = (pr->u.p.tc != 0)) == 0 ) {
1412  *p_lb = 0;
1413  *p_ub = 0;
1414 // if ( p_last != NULL )
1415 // *p_last = 0;
1416  if ( p_st != NULL )
1417  *p_st = 0;
1418  if ( __kmp_env_consistency_check ) {
1419  if ( pr->pushed_ws != ct_none ) {
1420  pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1421  }
1422  }
1423  } else if ( pr->nomerge ) {
1424  kmp_int32 last;
1425  T start;
1426  UT limit, trip, init;
1427  ST incr;
1428  T chunk = pr->u.p.parm1;
1429 
1430  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) );
1431 
1432  init = chunk * pr->u.p.count++;
1433  trip = pr->u.p.tc - 1;
1434 
1435  if ( (status = (init <= trip)) == 0 ) {
1436  *p_lb = 0;
1437  *p_ub = 0;
1438 // if ( p_last != NULL )
1439 // *p_last = 0;
1440  if ( p_st != NULL )
1441  *p_st = 0;
1442  if ( __kmp_env_consistency_check ) {
1443  if ( pr->pushed_ws != ct_none ) {
1444  pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1445  }
1446  }
1447  } else {
1448  start = pr->u.p.lb;
1449  limit = chunk + init - 1;
1450  incr = pr->u.p.st;
1451 
1452  if ( (last = (limit >= trip)) != 0 ) {
1453  limit = trip;
1454  #if KMP_OS_WINDOWS
1455  pr->u.p.last_upper = pr->u.p.ub;
1456  #endif /* KMP_OS_WINDOWS */
1457  }
1458  if ( p_last != NULL )
1459  *p_last = last;
1460  if ( p_st != NULL )
1461  *p_st = incr;
1462  if ( incr == 1 ) {
1463  *p_lb = start + init;
1464  *p_ub = start + limit;
1465  } else {
1466  *p_lb = start + init * incr;
1467  *p_ub = start + limit * incr;
1468  }
1469 
1470  if ( pr->ordered ) {
1471  pr->u.p.ordered_lower = init;
1472  pr->u.p.ordered_upper = limit;
1473  #ifdef KMP_DEBUG
1474  {
1475  const char * buff;
1476  // create format specifiers before the debug output
1477  buff = __kmp_str_format(
1478  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1479  traits_t< UT >::spec, traits_t< UT >::spec );
1480  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1481  __kmp_str_free( &buff );
1482  }
1483  #endif
1484  } // if
1485  } // if
1486  } else {
1487  pr->u.p.tc = 0;
1488  *p_lb = pr->u.p.lb;
1489  *p_ub = pr->u.p.ub;
1490  #if KMP_OS_WINDOWS
1491  pr->u.p.last_upper = *p_ub;
1492  #endif /* KMP_OS_WINDOWS */
1493  if ( p_last != NULL )
1494  *p_last = TRUE;
1495  if ( p_st != NULL )
1496  *p_st = pr->u.p.st;
1497  } // if
1498  #ifdef KMP_DEBUG
1499  {
1500  const char * buff;
1501  // create format specifiers before the debug output
1502  buff = __kmp_str_format(
1503  "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \
1504  "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
1505  traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1506  KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status) );
1507  __kmp_str_free( &buff );
1508  }
1509  #endif
1510 #if INCLUDE_SSC_MARKS
1511  SSC_MARK_DISPATCH_NEXT();
1512 #endif
1513  OMPT_LOOP_END;
1514  return status;
1515  } else {
1516  kmp_int32 last = 0;
1517  dispatch_shared_info_template< UT > *sh;
1518  T start;
1519  ST incr;
1520  UT limit, trip, init;
1521 
1522  KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1523  &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1524 
1525  pr = reinterpret_cast< dispatch_private_info_template< T >* >
1526  ( th->th.th_dispatch->th_dispatch_pr_current );
1527  KMP_DEBUG_ASSERT( pr );
1528  sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
1529  ( th->th.th_dispatch->th_dispatch_sh_current );
1530  KMP_DEBUG_ASSERT( sh );
1531 
1532  if ( pr->u.p.tc == 0 ) {
1533  // zero trip count
1534  status = 0;
1535  } else {
1536  switch (pr->schedule) {
1537  #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1538  case kmp_sch_static_steal:
1539  {
1540  T chunk = pr->u.p.parm1;
1541 
1542  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) );
1543 
1544  trip = pr->u.p.tc - 1;
1545 
1546  if ( ___kmp_size_type > 4 ) {
1547  // Other threads do not look into the data of this thread,
1548  // so a volatile cast is not necessary.
1549  init = ( pr->u.p.count )++;
1550  status = ( init < (UT)pr->u.p.ub );
1551  } else {
1552  typedef union {
1553  struct {
1554  UT count;
1555  T ub;
1556  } p;
1557  kmp_int64 b;
1558  } union_i4;
1559  // All operations on 'count' or 'ub' must be combined atomically together.
1560  // Stealing is implemented only for 4-byte indexes.
1561  {
1562  union_i4 vold, vnew;
1563  vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1564  vnew = vold;
1565  vnew.p.count++;
1566  while( ! KMP_COMPARE_AND_STORE_ACQ64(
1567  ( volatile kmp_int64* )&pr->u.p.count,
1568  *VOLATILE_CAST(kmp_int64 *)&vold.b,
1569  *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1570  KMP_CPU_PAUSE();
1571  vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1572  vnew = vold;
1573  vnew.p.count++;
1574  }
1575  vnew = vold;
1576  init = vnew.p.count;
1577  status = ( init < (UT)vnew.p.ub ) ;
1578  }
1579 
1580  if( !status ) {
1581  kmp_info_t **other_threads = team->t.t_threads;
1582  int while_limit = 10;
1583  int while_index = 0;
1584 
1585  // TODO: algorithm of searching for a victim
1586  // should be cleaned up and measured
1587  while ( ( !status ) && ( while_limit != ++while_index ) ) {
1588  union_i4 vold, vnew;
1589  kmp_int32 remaining; // kmp_int32 because KMP_I4 only
1590  T victimIdx = pr->u.p.parm4;
1591  T oldVictimIdx = victimIdx;
1592  dispatch_private_info_template< T > * victim;
1593 
1594  do {
1595  if( !victimIdx ) {
1596  victimIdx = team->t.t_nproc - 1;
1597  } else {
1598  --victimIdx;
1599  }
1600  victim = reinterpret_cast< dispatch_private_info_template< T >* >
1601  ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current );
1602  } while ( (victim == NULL || victim == pr) && oldVictimIdx != victimIdx );
1603  // TODO: think about a proper place of this test
1604  if ( ( !victim ) ||
1605  ( (*( volatile T * )&victim->u.p.static_steal_counter) !=
1606  (*( volatile T * )&pr->u.p.static_steal_counter) ) ) {
1607  // TODO: delay would be nice
1608  continue;
1609  // the victim is not ready yet to participate in stealing
1610  // because the victim is still in kmp_init_dispatch
1611  }
1612  if ( oldVictimIdx == victimIdx ) {
1613  break;
1614  }
1615  pr->u.p.parm4 = victimIdx;
1616 
1617  while( 1 ) {
1618  vold.b = *( volatile kmp_int64 * )( &victim->u.p.count );
1619  vnew = vold;
1620 
1621  KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip );
1622  if ( vnew.p.count >= (UT)vnew.p.ub || (remaining = vnew.p.ub - vnew.p.count) < 4 ) {
1623  break;
1624  }
1625  vnew.p.ub -= (remaining >> 2);
1626  KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1627  #pragma warning( push )
1628  // disable warning on pointless comparison of unsigned with 0
1629  #pragma warning( disable: 186 )
1630  KMP_DEBUG_ASSERT(vnew.p.ub >= 0);
1631  #pragma warning( pop )
1632  // TODO: Should this be acquire or release?
1633  if ( KMP_COMPARE_AND_STORE_ACQ64(
1634  ( volatile kmp_int64 * )&victim->u.p.count,
1635  *VOLATILE_CAST(kmp_int64 *)&vold.b,
1636  *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1637  status = 1;
1638  while_index = 0;
1639  // now update own count and ub
1640  #if KMP_ARCH_X86
1641  // stealing executed on non-KMP_ARCH_X86 only
1642  // Atomic 64-bit write on ia32 is
1643  // unavailable, so we do this in steps.
1644  // This code is not tested.
1645  init = vold.p.count;
1646  pr->u.p.ub = 0;
1647  pr->u.p.count = init + 1;
1648  pr->u.p.ub = vnew.p.count;
1649  #else
1650  init = vnew.p.ub;
1651  vold.p.count = init + 1;
1652  // TODO: is it safe and enough?
1653  *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b;
1654  #endif // KMP_ARCH_X86
1655  break;
1656  } // if
1657  KMP_CPU_PAUSE();
1658  } // while (1)
1659  } // while
1660  } // if
1661  } // if
1662  if ( !status ) {
1663  *p_lb = 0;
1664  *p_ub = 0;
1665  if ( p_st != NULL ) *p_st = 0;
1666  } else {
1667  start = pr->u.p.parm2;
1668  init *= chunk;
1669  limit = chunk + init - 1;
1670  incr = pr->u.p.st;
1671 
1672  KMP_DEBUG_ASSERT(init <= trip);
1673  if ( (last = (limit >= trip)) != 0 )
1674  limit = trip;
1675  if ( p_st != NULL ) *p_st = incr;
1676 
1677  if ( incr == 1 ) {
1678  *p_lb = start + init;
1679  *p_ub = start + limit;
1680  } else {
1681  *p_lb = start + init * incr;
1682  *p_ub = start + limit * incr;
1683  }
1684 
1685  if ( pr->ordered ) {
1686  pr->u.p.ordered_lower = init;
1687  pr->u.p.ordered_upper = limit;
1688  #ifdef KMP_DEBUG
1689  {
1690  const char * buff;
1691  // create format specifiers before the debug output
1692  buff = __kmp_str_format(
1693  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1694  traits_t< UT >::spec, traits_t< UT >::spec );
1695  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1696  __kmp_str_free( &buff );
1697  }
1698  #endif
1699  } // if
1700  } // if
1701  break;
1702  } // case
1703  #endif // ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
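 // Sketch: the steal protocol above packs the owner's {count, ub} pair into a
 // single 64-bit word so a thief can claim roughly a quarter of the victim's
 // remaining chunks with one compare-and-swap. The stand-alone version below
 // uses C++11 atomics and an explicit low/high packing; the field placement and
 // the try_steal name are assumptions of this illustration, not the runtime's
 // union_i4 layout or API.
 #if 0
 #include <atomic>
 #include <cstdint>

 // Try to steal ~1/4 of the victim's remaining chunks; returns the number stolen.
 static std::uint32_t try_steal( std::atomic<std::uint64_t> *victim_range )
 {
     std::uint64_t vold = victim_range->load( std::memory_order_acquire );
     for ( ; ; ) {
         std::uint32_t count = (std::uint32_t)( vold );       // low half: next chunk index
         std::uint32_t ub    = (std::uint32_t)( vold >> 32 );  // high half: end of the victim's range
         if ( count >= ub || ub - count < 4 )
             return 0;                                         // too little left to be worth stealing
         std::uint32_t steal = ( ub - count ) >> 2;            // take the top quarter of the range
         std::uint64_t vnew  = ( (std::uint64_t)( ub - steal ) << 32 ) | count;
         if ( victim_range->compare_exchange_weak( vold, vnew, std::memory_order_acq_rel ) )
             return steal;                                     // chunks [ub-steal, ub) now belong to the thief
         // CAS failed: vold was refreshed with the current value, retry
     }
 }
 #endif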
1704  case kmp_sch_static_balanced:
1705  {
1706  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) );
1707  if ( (status = !pr->u.p.count) != 0 ) { /* check if thread has any iteration to do */
1708  pr->u.p.count = 1;
1709  *p_lb = pr->u.p.lb;
1710  *p_ub = pr->u.p.ub;
1711  last = pr->u.p.parm1;
1712  if ( p_st != NULL )
1713  *p_st = pr->u.p.st;
1714  } else { /* no iterations to do */
1715  pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1716  }
1717  if ( pr->ordered ) {
1718  #ifdef KMP_DEBUG
1719  {
1720  const char * buff;
1721  // create format specifiers before the debug output
1722  buff = __kmp_str_format(
1723  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1724  traits_t< UT >::spec, traits_t< UT >::spec );
1725  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1726  __kmp_str_free( &buff );
1727  }
1728  #endif
1729  } // if
1730  } // case
1731  break;
1732  case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was merged here */
1733  case kmp_sch_static_chunked:
1734  {
1735  T parm1;
1736 
1737  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[affinity|chunked] case\n",
1738  gtid ) );
1739  parm1 = pr->u.p.parm1;
1740 
1741  trip = pr->u.p.tc - 1;
1742  init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
1743 
1744  if ( (status = (init <= trip)) != 0 ) {
1745  start = pr->u.p.lb;
1746  incr = pr->u.p.st;
1747  limit = parm1 + init - 1;
1748 
1749  if ( (last = (limit >= trip)) != 0 )
1750  limit = trip;
1751 
1752  if ( p_st != NULL ) *p_st = incr;
1753 
1754  pr->u.p.count += team->t.t_nproc;
1755 
1756  if ( incr == 1 ) {
1757  *p_lb = start + init;
1758  *p_ub = start + limit;
1759  }
1760  else {
1761  *p_lb = start + init * incr;
1762  *p_ub = start + limit * incr;
1763  }
1764 
1765  if ( pr->ordered ) {
1766  pr->u.p.ordered_lower = init;
1767  pr->u.p.ordered_upper = limit;
1768  #ifdef KMP_DEBUG
1769  {
1770  const char * buff;
1771  // create format specifiers before the debug output
1772  buff = __kmp_str_format(
1773  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1774  traits_t< UT >::spec, traits_t< UT >::spec );
1775  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1776  __kmp_str_free( &buff );
1777  }
1778  #endif
1779  } // if
1780  } // if
1781  } // case
1782  break;
1783 
1784  case kmp_sch_dynamic_chunked:
1785  {
1786  T chunk = pr->u.p.parm1;
1787 
1788  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1789  gtid ) );
1790 
1791  init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1792  trip = pr->u.p.tc - 1;
1793 
1794  if ( (status = (init <= trip)) == 0 ) {
1795  *p_lb = 0;
1796  *p_ub = 0;
1797  if ( p_st != NULL ) *p_st = 0;
1798  } else {
1799  start = pr->u.p.lb;
1800  limit = chunk + init - 1;
1801  incr = pr->u.p.st;
1802 
1803  if ( (last = (limit >= trip)) != 0 )
1804  limit = trip;
1805 
1806  if ( p_st != NULL ) *p_st = incr;
1807 
1808  if ( incr == 1 ) {
1809  *p_lb = start + init;
1810  *p_ub = start + limit;
1811  } else {
1812  *p_lb = start + init * incr;
1813  *p_ub = start + limit * incr;
1814  }
1815 
1816  if ( pr->ordered ) {
1817  pr->u.p.ordered_lower = init;
1818  pr->u.p.ordered_upper = limit;
1819  #ifdef KMP_DEBUG
1820  {
1821  const char * buff;
1822  // create format specifiers before the debug output
1823  buff = __kmp_str_format(
1824  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1825  traits_t< UT >::spec, traits_t< UT >::spec );
1826  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1827  __kmp_str_free( &buff );
1828  }
1829  #endif
1830  } // if
1831  } // if
1832  } // case
1833  break;
1834 
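 // The kmp_sch_dynamic_chunked case above backs schedule(dynamic): each call
 // fetch-and-increments the shared chunk counter sh->u.s.iteration and hands the
 // caller the chunk [init, init+chunk-1], scaled by the loop stride. A user-level
 // loop that typically reaches this path looks like the sketch below (n, chunk
 // and work() are placeholders for this illustration):
 #if 0
 #include <omp.h>

 extern void work( int i );

 void dynamic_example( int n, int chunk )
 {
     #pragma omp parallel for schedule(dynamic, chunk)
     for ( int i = 0; i < n; ++i ) {
         work( i );   // each thread repeatedly grabs the next block of `chunk` iterations
     }
 }
 #endif
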
1835  case kmp_sch_guided_iterative_chunked:
1836  {
1837  T chunkspec = pr->u.p.parm1;
1838  KD_TRACE(100,
1839  ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid));
1840  trip = pr->u.p.tc;
1841  // Start atomic part of calculations
1842  while(1) {
1843  ST remaining; // signed, because can be < 0
1844  init = sh->u.s.iteration; // shared value
1845  remaining = trip - init;
1846  if ( remaining <= 0 ) { // AC: need to compare with 0 first
1847  // nothing to do, don't try atomic op
1848  status = 0;
1849  break;
1850  }
1851  if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default
1852  // use dynamic-style schedule
1853  // atomically increment iterations, get old value
1854  init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec );
1855  remaining = trip - init;
1856  if (remaining <= 0) {
1857  status = 0; // all iterations got by other threads
1858  } else {
1859  // got some iterations to work on
1860  status = 1;
1861  if ( (T)remaining > chunkspec ) {
1862  limit = init + chunkspec - 1;
1863  } else {
1864  last = 1; // the last chunk
1865  limit = init + remaining - 1;
1866  } // if
1867  } // if
1868  break;
1869  } // if
1870  limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc
1871  if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) {
1872  // CAS was successful, chunk obtained
1873  status = 1;
1874  --limit;
1875  break;
1876  } // if
1877  } // while
1878  if ( status != 0 ) {
1879  start = pr->u.p.lb;
1880  incr = pr->u.p.st;
1881  if ( p_st != NULL )
1882  *p_st = incr;
1883  *p_lb = start + init * incr;
1884  *p_ub = start + limit * incr;
1885  if ( pr->ordered ) {
1886  pr->u.p.ordered_lower = init;
1887  pr->u.p.ordered_upper = limit;
1888  #ifdef KMP_DEBUG
1889  {
1890  const char * buff;
1891  // create format specifiers before the debug output
1892  buff = __kmp_str_format(
1893  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1894  traits_t< UT >::spec, traits_t< UT >::spec );
1895  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1896  __kmp_str_free( &buff );
1897  }
1898  #endif
1899  } // if
1900  } else {
1901  *p_lb = 0;
1902  *p_ub = 0;
1903  if ( p_st != NULL )
1904  *p_st = 0;
1905  } // if
1906  } // case
1907  break;
1908 
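 // Worked example of the guided formula above (numbers are illustrative): with
 // trip = 1000, nproc = 4 and the default K = 2, parm2 holds K*nproc*(chunk+1)
 // and *(double*)&parm3 is roughly 1/(K*nproc) = 1/8, so the first grab reserves
 // about 1000/8 = 125 iterations, the next roughly (1000-125)/8 = 109, and so on,
 // with the case falling back to plain dynamic chunks of size chunkspec once
 // fewer than parm2 iterations remain.
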
1909  case kmp_sch_guided_analytical_chunked:
1910  {
1911  T chunkspec = pr->u.p.parm1;
1912  UT chunkIdx;
1913  #if KMP_OS_WINDOWS && KMP_ARCH_X86
1914  /* for storing original FPCW value for Windows* OS on
1915  IA-32 architecture 8-byte version */
1916  unsigned int oldFpcw;
1917  unsigned int fpcwSet = 0;
1918  #endif
1919  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n",
1920  gtid ) );
1921 
1922  trip = pr->u.p.tc;
1923 
1924  KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
1925  KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)team->t.t_nproc < trip);
1926 
1927  while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */
1928  chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1929  if ( chunkIdx >= (UT)pr->u.p.parm2 ) {
1930  --trip;
1931  /* use dynamic-style scheduling */
1932  init = chunkIdx * chunkspec + pr->u.p.count;
1933  /* need to verify init > 0 in case of overflow in the above calculation */
1934  if ( (status = (init > 0 && init <= trip)) != 0 ) {
1935  limit = init + chunkspec -1;
1936 
1937  if ( (last = (limit >= trip)) != 0 )
1938  limit = trip;
1939  }
1940  break;
1941  } else {
1942  /* use exponential-style scheduling */
1943  /* The following check works around the lack of long double precision on Windows* OS,
1944  which could otherwise make init != 0 for chunkIdx == 0.
1945  */
1946  #if KMP_OS_WINDOWS && KMP_ARCH_X86
1947  /* If we haven't already done so, save original
1948  FPCW and set precision to 64-bit, as Windows* OS
1949  on IA-32 architecture defaults to 53-bit */
1950  if ( !fpcwSet ) {
1951  oldFpcw = _control87(0,0);
1952  _control87(_PC_64,_MCW_PC);
1953  fpcwSet = 0x30000;
1954  }
1955  #endif
1956  if ( chunkIdx ) {
1957  init = __kmp_dispatch_guided_remaining< T >(
1958  trip, *( DBL * )&pr->u.p.parm3, chunkIdx );
1959  KMP_DEBUG_ASSERT(init);
1960  init = trip - init;
1961  } else
1962  init = 0;
1963  limit = trip - __kmp_dispatch_guided_remaining< T >(
1964  trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 );
1965  KMP_ASSERT(init <= limit);
1966  if ( init < limit ) {
1967  KMP_DEBUG_ASSERT(limit <= trip);
1968  --limit;
1969  status = 1;
1970  break;
1971  } // if
1972  } // if
1973  } // while (1)
1974  #if KMP_OS_WINDOWS && KMP_ARCH_X86
1975  /* restore FPCW if necessary
1976  AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1977  */
1978  if ( fpcwSet && ( oldFpcw & fpcwSet ) )
1979  _control87(oldFpcw,_MCW_PC);
1980  #endif
1981  if ( status != 0 ) {
1982  start = pr->u.p.lb;
1983  incr = pr->u.p.st;
1984  if ( p_st != NULL )
1985  *p_st = incr;
1986  *p_lb = start + init * incr;
1987  *p_ub = start + limit * incr;
1988  if ( pr->ordered ) {
1989  pr->u.p.ordered_lower = init;
1990  pr->u.p.ordered_upper = limit;
1991  #ifdef KMP_DEBUG
1992  {
1993  const char * buff;
1994  // create format specifiers before the debug output
1995  buff = __kmp_str_format(
1996  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1997  traits_t< UT >::spec, traits_t< UT >::spec );
1998  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1999  __kmp_str_free( &buff );
2000  }
2001  #endif
2002  }
2003  } else {
2004  *p_lb = 0;
2005  *p_ub = 0;
2006  if ( p_st != NULL )
2007  *p_st = 0;
2008  }
2009  } // case
2010  break;
2011 
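 // On Windows* OS / IA-32 the x87 control word defaults to 53-bit precision, so
 // the analytical case above temporarily switches to 64-bit precision around its
 // long double computation. A minimal stand-alone version of that save/restore
 // pattern (Windows-only; extended_precision_region is a placeholder name):
 #if 0
 #include <float.h>

 static void extended_precision_region( void )
 {
     unsigned int old_cw = _control87( 0, 0 );   // read the current control word
     _control87( _PC_64, _MCW_PC );              // request 64-bit significand precision
     /* ... long double computation that needs full precision ... */
     _control87( old_cw, _MCW_PC );              // restore the caller's precision setting
 }
 #endif
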
2012  case kmp_sch_trapezoidal:
2013  {
2014  UT index;
2015  T parm2 = pr->u.p.parm2;
2016  T parm3 = pr->u.p.parm3;
2017  T parm4 = pr->u.p.parm4;
2018  KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
2019  gtid ) );
2020 
2021  index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration );
2022 
2023  init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2;
2024  trip = pr->u.p.tc - 1;
2025 
2026  if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) {
2027  *p_lb = 0;
2028  *p_ub = 0;
2029  if ( p_st != NULL ) *p_st = 0;
2030  } else {
2031  start = pr->u.p.lb;
2032  limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1;
2033  incr = pr->u.p.st;
2034 
2035  if ( (last = (limit >= trip)) != 0 )
2036  limit = trip;
2037 
2038  if ( p_st != NULL ) *p_st = incr;
2039 
2040  if ( incr == 1 ) {
2041  *p_lb = start + init;
2042  *p_ub = start + limit;
2043  } else {
2044  *p_lb = start + init * incr;
2045  *p_ub = start + limit * incr;
2046  }
2047 
2048  if ( pr->ordered ) {
2049  pr->u.p.ordered_lower = init;
2050  pr->u.p.ordered_upper = limit;
2051  #ifdef KMP_DEBUG
2052  {
2053  const char * buff;
2054  // create format specifiers before the debug output
2055  buff = __kmp_str_format(
2056  "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2057  traits_t< UT >::spec, traits_t< UT >::spec );
2058  KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2059  __kmp_str_free( &buff );
2060  }
2061  #endif
2062  } // if
2063  } // if
2064  } // case
2065  break;
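 // The trapezoidal index arithmetic above is the closed form of an arithmetic
 // series: with parm2 the size of the first chunk and parm4 the per-chunk
 // decrement, the first `index` chunks cover
 //     sum_{j=0}^{index-1} (parm2 - j*parm4)
 //       = index*parm2 - parm4*index*(index-1)/2
 //       = ( index * (2*parm2 - (index-1)*parm4) ) / 2
 // iterations, which is exactly the expression used for init (and, with index+1
 // in place of index, for limit).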
2066  default:
2067  {
2068  status = 0; // to avoid complaints on uninitialized variable use
2069  __kmp_msg(
2070  kmp_ms_fatal, // Severity
2071  KMP_MSG( UnknownSchedTypeDetected ), // Primary message
2072  KMP_HNT( GetNewerLibrary ), // Hint
2073  __kmp_msg_null // Variadic argument list terminator
2074  );
2075  }
2076  break;
2077  } // switch
2078  } // if tc == 0;
2079 
2080  if ( status == 0 ) {
2081  UT num_done;
2082 
2083  num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done );
2084  #ifdef KMP_DEBUG
2085  {
2086  const char * buff;
2087  // create format specifiers before the debug output
2088  buff = __kmp_str_format(
2089  "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2090  traits_t< UT >::spec );
2091  KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) );
2092  __kmp_str_free( &buff );
2093  }
2094  #endif
2095 
2096  if ( (ST)num_done == team->t.t_nproc-1 ) {
2097  /* NOTE: release this buffer to be reused */
2098 
2099  KMP_MB(); /* Flush all pending memory write invalidates. */
2100 
2101  sh->u.s.num_done = 0;
2102  sh->u.s.iteration = 0;
2103 
2104  /* TODO replace with general release procedure? */
2105  if ( pr->ordered ) {
2106  sh->u.s.ordered_iteration = 0;
2107  }
2108 
2109  KMP_MB(); /* Flush all pending memory write invalidates. */
2110 
2111  sh -> buffer_index += KMP_MAX_DISP_BUF;
2112  KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2113  gtid, sh->buffer_index) );
2114 
2115  KMP_MB(); /* Flush all pending memory write invalidates. */
2116 
2117  } // if
2118  if ( __kmp_env_consistency_check ) {
2119  if ( pr->pushed_ws != ct_none ) {
2120  pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
2121  }
2122  }
2123 
2124  th -> th.th_dispatch -> th_deo_fcn = NULL;
2125  th -> th.th_dispatch -> th_dxo_fcn = NULL;
2126  th -> th.th_dispatch -> th_dispatch_sh_current = NULL;
2127  th -> th.th_dispatch -> th_dispatch_pr_current = NULL;
2128  } // if (status == 0)
2129 #if KMP_OS_WINDOWS
2130  else if ( last ) {
2131  pr->u.p.last_upper = pr->u.p.ub;
2132  }
2133 #endif /* KMP_OS_WINDOWS */
2134  if ( p_last != NULL && status != 0 )
2135  *p_last = last;
2136  } // if
2137 
2138  #ifdef KMP_DEBUG
2139  {
2140  const char * buff;
2141  // create format specifiers before the debug output
2142  buff = __kmp_str_format(
2143  "__kmp_dispatch_next: T#%%d normal case: " \
2144  "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n",
2145  traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
2146  KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status ) );
2147  __kmp_str_free( &buff );
2148  }
2149  #endif
2150 #if INCLUDE_SSC_MARKS
2151  SSC_MARK_DISPATCH_NEXT();
2152 #endif
2153  OMPT_LOOP_END;
2154  return status;
2155 }
2156 
2157 template< typename T >
2158 static void
2159 __kmp_dist_get_bounds(
2160  ident_t *loc,
2161  kmp_int32 gtid,
2162  kmp_int32 *plastiter,
2163  T *plower,
2164  T *pupper,
2165  typename traits_t< T >::signed_t incr
2166 ) {
2167  KMP_COUNT_BLOCK(OMP_DISTR_FOR_dynamic);
2168  typedef typename traits_t< T >::unsigned_t UT;
2169  typedef typename traits_t< T >::signed_t ST;
2170  register kmp_uint32 team_id;
2171  register kmp_uint32 nteams;
2172  register UT trip_count;
2173  register kmp_team_t *team;
2174  kmp_info_t * th;
2175 
2176  KMP_DEBUG_ASSERT( plastiter && plower && pupper );
2177  KE_TRACE( 10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2178  #ifdef KMP_DEBUG
2179  {
2180  const char * buff;
2181  // create format specifiers before the debug output
2182  buff = __kmp_str_format( "__kmpc_dist_get_bounds: T#%%d liter=%%d "\
2183  "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2184  traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec,
2185  traits_t< T >::spec );
2186  KD_TRACE(100, ( buff, gtid, *plastiter, *plower, *pupper, incr ) );
2187  __kmp_str_free( &buff );
2188  }
2189  #endif
2190 
2191  if( __kmp_env_consistency_check ) {
2192  if( incr == 0 ) {
2193  __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, loc );
2194  }
2195  if( incr > 0 ? (*pupper < *plower) : (*plower < *pupper) ) {
2196  // The loop is illegal.
2197  // Some zero-trip loops maintained by compiler, e.g.:
2198  // for(i=10;i<0;++i) // lower >= upper - run-time check
2199  // for(i=0;i>10;--i) // lower <= upper - run-time check
2200  // for(i=0;i>10;++i) // incr > 0 - compile-time check
2201  // for(i=10;i<0;--i) // incr < 0 - compile-time check
2202  // Compiler does not check the following illegal loops:
2203  // for(i=0;i<10;i+=incr) // where incr<0
2204  // for(i=10;i>0;i-=incr) // where incr<0
2205  __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc );
2206  }
2207  }
2208  th = __kmp_threads[gtid];
2209  KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2210  team = th->th.th_team;
2211  #if OMP_40_ENABLED
2212  nteams = th->th.th_teams_size.nteams;
2213  #endif
2214  team_id = team->t.t_master_tid;
2215  KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);
2216 
2217  // compute global trip count
2218  if( incr == 1 ) {
2219  trip_count = *pupper - *plower + 1;
2220  } else if(incr == -1) {
2221  trip_count = *plower - *pupper + 1;
2222  } else {
2223  trip_count = (ST)(*pupper - *plower) / incr + 1; // cast to signed to cover incr<0 case
2224  }
2225  if( trip_count <= nteams ) {
2226  KMP_DEBUG_ASSERT(
2227  __kmp_static == kmp_sch_static_greedy || \
2228  __kmp_static == kmp_sch_static_balanced
2229  ); // Unknown static scheduling type.
2230  // only some teams get single iteration, others get nothing
2231  if( team_id < trip_count ) {
2232  *pupper = *plower = *plower + team_id * incr;
2233  } else {
2234  *plower = *pupper + incr; // zero-trip loop
2235  }
2236  if( plastiter != NULL )
2237  *plastiter = ( team_id == trip_count - 1 );
2238  } else {
2239  if( __kmp_static == kmp_sch_static_balanced ) {
2240  register UT chunk = trip_count / nteams;
2241  register UT extras = trip_count % nteams;
2242  *plower += incr * ( team_id * chunk + ( team_id < extras ? team_id : extras ) );
2243  *pupper = *plower + chunk * incr - ( team_id < extras ? 0 : incr );
2244  if( plastiter != NULL )
2245  *plastiter = ( team_id == nteams - 1 );
2246  } else {
2247  register T chunk_inc_count =
2248  ( trip_count / nteams + ( ( trip_count % nteams ) ? 1 : 0) ) * incr;
2249  register T upper = *pupper;
2250  KMP_DEBUG_ASSERT( __kmp_static == kmp_sch_static_greedy );
2251  // Unknown static scheduling type.
2252  *plower += team_id * chunk_inc_count;
2253  *pupper = *plower + chunk_inc_count - incr;
2254  // Check/correct bounds if needed
2255  if( incr > 0 ) {
2256  if( *pupper < *plower )
2257  *pupper = i_maxmin< T >::mx;
2258  if( plastiter != NULL )
2259  *plastiter = *plower <= upper && *pupper > upper - incr;
2260  if( *pupper > upper )
2261  *pupper = upper; // tracker C73258
2262  } else {
2263  if( *pupper > *plower )
2264  *pupper = i_maxmin< T >::mn;
2265  if( plastiter != NULL )
2266  *plastiter = *plower >= upper && *pupper < upper - incr;
2267  if( *pupper < upper )
2268  *pupper = upper; // tracker C73258
2269  }
2270  }
2271  }
2272 }
2273 
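// Worked example for the balanced split above (illustrative numbers): with
// trip_count = 10, nteams = 4 and incr = 1, chunk = 10/4 = 2 and extras = 10%4 = 2,
// so teams 0 and 1 each get 3 iterations ([0,2] and [3,5]) while teams 2 and 3
// each get 2 ([6,7] and [8,9]); only team 3 (team_id == nteams-1) sees *plastiter set.
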
2274 //-----------------------------------------------------------------------------------------
2275 // Dispatch routines
2276 // Transfer call to template< type T >
2277 // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2278 // T lb, T ub, ST st, ST chunk )
2279 extern "C" {
2280 
2296 void
2297 __kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2298  kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2299 {
2300  KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2301  KMP_DEBUG_ASSERT( __kmp_init_serial );
2302  __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2303 }
2307 void
2308 __kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2309  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2310 {
2311  KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2312  KMP_DEBUG_ASSERT( __kmp_init_serial );
2313  __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2314 }
2315 
2319 void
2320 __kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2321  kmp_int64 lb, kmp_int64 ub,
2322  kmp_int64 st, kmp_int64 chunk )
2323 {
2324  KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2325  KMP_DEBUG_ASSERT( __kmp_init_serial );
2326  __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2327 }
2328 
2332 void
2333 __kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2334  kmp_uint64 lb, kmp_uint64 ub,
2335  kmp_int64 st, kmp_int64 chunk )
2336 {
2337  KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2338  KMP_DEBUG_ASSERT( __kmp_init_serial );
2339  __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2340 }
2341 
2351 void
2352 __kmpc_dist_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2353  kmp_int32 *p_last, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2354 {
2355  KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2356  KMP_DEBUG_ASSERT( __kmp_init_serial );
2357  __kmp_dist_get_bounds< kmp_int32 >( loc, gtid, p_last, &lb, &ub, st );
2358  __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2359 }
2360 
2361 void
2362 __kmpc_dist_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2363  kmp_int32 *p_last, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2364 {
2365  KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2366  KMP_DEBUG_ASSERT( __kmp_init_serial );
2367  __kmp_dist_get_bounds< kmp_uint32 >( loc, gtid, p_last, &lb, &ub, st );
2368  __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2369 }
2370 
2371 void
2372 __kmpc_dist_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2373  kmp_int32 *p_last, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk )
2374 {
2375  KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2376  KMP_DEBUG_ASSERT( __kmp_init_serial );
2377  __kmp_dist_get_bounds< kmp_int64 >( loc, gtid, p_last, &lb, &ub, st );
2378  __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2379 }
2380 
2381 void
2382 __kmpc_dist_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2383  kmp_int32 *p_last, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk )
2384 {
2385  KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2386  KMP_DEBUG_ASSERT( __kmp_init_serial );
2387  __kmp_dist_get_bounds< kmp_uint64 >( loc, gtid, p_last, &lb, &ub, st );
2388  __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2389 }
2390 
2403 int
2404 __kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2405  kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st )
2406 {
2407  return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2408 }
2409 
2413 int
2414 __kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2415  kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st )
2416 {
2417  return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2418 }
2419 
2423 int
2424 __kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2425  kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st )
2426 {
2427  return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2428 }
2429 
2433 int
2434 __kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2435  kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st )
2436 {
2437  return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2438 }
2439 
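 // The __kmpc_dispatch_init_* / __kmpc_dispatch_next_* pairs above are driven by
 // a compiler-generated loop of roughly the following shape (a sketch; loc, gtid,
 // n, chunk and body() are placeholders, and the *_fini_* call is only needed
 // when the loop carries an ordered clause):
 #if 0
 extern void body( kmp_int32 i );

 void run_dynamic_loop( ident_t *loc, kmp_int32 gtid, kmp_int32 n, kmp_int32 chunk )
 {
     kmp_int32 lb, ub, st, last;
     __kmpc_dispatch_init_4( loc, gtid, kmp_sch_dynamic_chunked, 0, n - 1, 1, chunk );
     while ( __kmpc_dispatch_next_4( loc, gtid, &last, &lb, &ub, &st ) ) {
         for ( kmp_int32 i = lb; i <= ub; i += st )
             body( i );
         // __kmpc_dispatch_fini_4( loc, gtid );  // per-chunk finalization for ordered loops
     }
 }
 #endif
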
2446 void
2447 __kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid )
2448 {
2449  __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2450 }
2451 
2455 void
2456 __kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid )
2457 {
2458  __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2459 }
2460 
2464 void
2465 __kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid )
2466 {
2467  __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2468 }
2469 
2473 void
2474 __kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid )
2475 {
2476  __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2477 }
2480 //-----------------------------------------------------------------------------------------
2481 // Non-template routines from kmp_dispatch.cpp used in other sources
2482 
2483 kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) {
2484  return value == checker;
2485 }
2486 
2487 kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) {
2488  return value != checker;
2489 }
2490 
2491 kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) {
2492  return value < checker;
2493 }
2494 
2495 kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) {
2496  return value >= checker;
2497 }
2498 
2499 kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) {
2500  return value <= checker;
2501 }
2502 kmp_uint32 __kmp_eq_8( kmp_uint64 value, kmp_uint64 checker) {
2503  return value == checker;
2504 }
2505 
2506 kmp_uint32 __kmp_neq_8( kmp_uint64 value, kmp_uint64 checker) {
2507  return value != checker;
2508 }
2509 
2510 kmp_uint32 __kmp_lt_8( kmp_uint64 value, kmp_uint64 checker) {
2511  return value < checker;
2512 }
2513 
2514 kmp_uint32 __kmp_ge_8( kmp_uint64 value, kmp_uint64 checker) {
2515  return value >= checker;
2516 }
2517 
2518 kmp_uint32 __kmp_le_8( kmp_uint64 value, kmp_uint64 checker) {
2519  return value <= checker;
2520 }
2521 
2522 kmp_uint32
2523 __kmp_wait_yield_4(volatile kmp_uint32 * spinner,
2524  kmp_uint32 checker,
2525  kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 )
2526  , void * obj // Higher-level synchronization object, or NULL.
2527  )
2528 {
2529  // note: we may not belong to a team at this point
2530  register volatile kmp_uint32 * spin = spinner;
2531  register kmp_uint32 check = checker;
2532  register kmp_uint32 spins;
2533  register kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred;
2534  register kmp_uint32 r;
2535 
2536  KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2537  KMP_INIT_YIELD( spins );
2538  // main wait spin loop
2539  while(!f(r = TCR_4(*spin), check)) {
2540  KMP_FSYNC_SPIN_PREPARE( obj );
2541  /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2542  It causes problems with infinite recursion because of exit lock */
2543  /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2544  __kmp_abort_thread(); */
2545 
2546  /* if we have waited a bit, or are oversubscribed, yield */
2547  /* pause is in the following code */
2548  KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2549  KMP_YIELD_SPIN( spins );
2550  }
2551  KMP_FSYNC_SPIN_ACQUIRED( obj );
2552  return r;
2553 }
2554 
2555 kmp_uint64
2556 __kmp_wait_yield_8( volatile kmp_uint64 * spinner,
2557  kmp_uint64 checker,
2558  kmp_uint32 (* pred)( kmp_uint64, kmp_uint64 )
2559  , void * obj // Higher-level synchronization object, or NULL.
2560  )
2561 {
2562  // note: we may not belong to a team at this point
2563  register volatile kmp_uint64 * spin = spinner;
2564  register kmp_uint64 check = checker;
2565  register kmp_uint32 spins;
2566  register kmp_uint32 (*f) ( kmp_uint64, kmp_uint64 ) = pred;
2567  register kmp_uint64 r;
2568 
2569  KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2570  KMP_INIT_YIELD( spins );
2571  // main wait spin loop
2572  while(!f(r = *spin, check))
2573  {
2574  KMP_FSYNC_SPIN_PREPARE( obj );
2575  /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2576  It causes problems with infinite recursion because of exit lock */
2577  /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2578  __kmp_abort_thread(); */
2579 
2580  // if we are oversubscribed,
2581  // or have waited a bit (and KMP_LIBRARY=throughput), then yield
2582  // pause is in the following code
2583  KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2584  KMP_YIELD_SPIN( spins );
2585  }
2586  KMP_FSYNC_SPIN_ACQUIRED( obj );
2587  return r;
2588 }
2589 
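// A typical use of the spin-wait helpers above pairs one of the 4-byte
// predicates defined earlier in this file with a shared counter, e.g. spinning
// (and yielding when oversubscribed or after a while) until a release counter
// reaches a target value. Sketch only; done_count, target and
// wait_until_reached are placeholder names:
#if 0
static void wait_until_reached( volatile kmp_uint32 *done_count, kmp_uint32 target )
{
    // returns once *done_count >= target; NULL means no higher-level sync object
    __kmp_wait_yield_4( done_count, target, __kmp_ge_4, NULL );
}
#endif
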
2590 } // extern "C"
2591 
2592 #ifdef KMP_GOMP_COMPAT
2593 
2594 void
2595 __kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2596  kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2597  kmp_int32 chunk, int push_ws )
2598 {
2599  __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk,
2600  push_ws );
2601 }
2602 
2603 void
2604 __kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2605  kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2606  kmp_int32 chunk, int push_ws )
2607 {
2608  __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk,
2609  push_ws );
2610 }
2611 
2612 void
2613 __kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2614  kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2615  kmp_int64 chunk, int push_ws )
2616 {
2617  __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk,
2618  push_ws );
2619 }
2620 
2621 void
2622 __kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2623  kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2624  kmp_int64 chunk, int push_ws )
2625 {
2626  __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk,
2627  push_ws );
2628 }
2629 
2630 void
2631 __kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid )
2632 {
2633  __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2634 }
2635 
2636 void
2637 __kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid )
2638 {
2639  __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2640 }
2641 
2642 void
2643 __kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid )
2644 {
2645  __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2646 }
2647 
2648 void
2649 __kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid )
2650 {
2651  __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2652 }
2653 
2654 #endif /* KMP_GOMP_COMPAT */
2655 
2656 /* ------------------------------------------------------------------------ */
2657 /* ------------------------------------------------------------------------ */
2658 