LLVM OpenMP* Runtime Library
kmp_csupport.c
1 /*
2  * kmp_csupport.c -- kfront linkage support for OpenMP.
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 // The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "omp.h" /* extern "C" declarations of user-visible routines */
17 #include "kmp.h"
18 #include "kmp_i18n.h"
19 #include "kmp_itt.h"
20 #include "kmp_error.h"
21 #include "kmp_stats.h"
22 
23 #if OMPT_SUPPORT
24 #include "ompt-internal.h"
25 #include "ompt-specific.h"
26 #endif
27 
28 #define MAX_MESSAGE 512
29 
30 /* ------------------------------------------------------------------------ */
31 /* ------------------------------------------------------------------------ */
32 
33 /* flags will be used in future, e.g., to implement */
34 /* openmp_strict library restrictions */
35 
45 void
46 __kmpc_begin(ident_t *loc, kmp_int32 flags)
47 {
48  // By default __kmp_ignore_mppbeg() returns TRUE.
49  if (__kmp_ignore_mppbeg() == FALSE) {
50  __kmp_internal_begin();
51 
52  KC_TRACE( 10, ("__kmpc_begin: called\n" ) );
53  }
54 }
55 
63 void
64 __kmpc_end(ident_t *loc)
65 {
66  // By default, __kmp_ignore_mppend() returns TRUE, which makes the __kmpc_end() call a no-op.
67  // However, this can be overridden with the KMP_IGNORE_MPPEND environment variable.
68  // If KMP_IGNORE_MPPEND is 0, __kmp_ignore_mppend() returns FALSE and __kmpc_end()
69  // will unregister this root (which can cause library shutdown).
70  if (__kmp_ignore_mppend() == FALSE) {
71  KC_TRACE( 10, ("__kmpc_end: called\n" ) );
72  KA_TRACE( 30, ("__kmpc_end\n" ));
73 
74  __kmp_internal_end_thread( -1 );
75  }
76 }
77 
97 kmp_int32
98 __kmpc_global_thread_num(ident_t *loc)
99 {
100  kmp_int32 gtid = __kmp_entry_gtid();
101 
102  KC_TRACE( 10, ("__kmpc_global_thread_num: T#%d\n", gtid ) );
103 
104  return gtid;
105 }
106 
120 kmp_int32
121 __kmpc_global_num_threads(ident_t *loc)
122 {
123  KC_TRACE( 10, ("__kmpc_global_num_threads: num_threads = %d\n", __kmp_nth ) );
124 
125  return TCR_4(__kmp_nth);
126 }
127 
134 kmp_int32
135 __kmpc_bound_thread_num(ident_t *loc)
136 {
137  KC_TRACE( 10, ("__kmpc_bound_thread_num: called\n" ) );
138  return __kmp_tid_from_gtid( __kmp_entry_gtid() );
139 }
140 
146 kmp_int32
147 __kmpc_bound_num_threads(ident_t *loc)
148 {
149  KC_TRACE( 10, ("__kmpc_bound_num_threads: called\n" ) );
150 
151  return __kmp_entry_thread() -> th.th_team -> t.t_nproc;
152 }
153 
160 kmp_int32
161 __kmpc_ok_to_fork(ident_t *loc)
162 {
163 #ifndef KMP_DEBUG
164 
165  return TRUE;
166 
167 #else
168 
169  const char *semi2;
170  const char *semi3;
171  int line_no;
172 
173  if (__kmp_par_range == 0) {
174  return TRUE;
175  }
176  semi2 = loc->psource;
177  if (semi2 == NULL) {
178  return TRUE;
179  }
180  semi2 = strchr(semi2, ';');
181  if (semi2 == NULL) {
182  return TRUE;
183  }
184  semi2 = strchr(semi2 + 1, ';');
185  if (semi2 == NULL) {
186  return TRUE;
187  }
188  if (__kmp_par_range_filename[0]) {
189  const char *name = semi2 - 1;
190  while ((name > loc->psource) && (*name != '/') && (*name != ';')) {
191  name--;
192  }
193  if ((*name == '/') || (*name == ';')) {
194  name++;
195  }
196  if (strncmp(__kmp_par_range_filename, name, semi2 - name)) {
197  return __kmp_par_range < 0;
198  }
199  }
200  semi3 = strchr(semi2 + 1, ';');
201  if (__kmp_par_range_routine[0]) {
202  if ((semi3 != NULL) && (semi3 > semi2)
203  && (strncmp(__kmp_par_range_routine, semi2 + 1, semi3 - semi2 - 1))) {
204  return __kmp_par_range < 0;
205  }
206  }
207  if (KMP_SSCANF(semi3 + 1, "%d", &line_no) == 1) {
208  if ((line_no >= __kmp_par_range_lb) && (line_no <= __kmp_par_range_ub)) {
209  return __kmp_par_range > 0;
210  }
211  return __kmp_par_range < 0;
212  }
213  return TRUE;
214 
215 #endif /* KMP_DEBUG */
216 
217 }
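/*
 * Illustrative note (added; not part of the original source): the parsing in
 * __kmpc_ok_to_fork above assumes loc->psource uses the semicolon-separated
 * layout documented for ident_t in kmp.h. A hypothetical location string:
 *
 *     ";foo.c;bar;42;45;;"
 *       ^     ^   ^
 *       |     |   +-- first line number, compared with __kmp_par_range_lb/_ub
 *       |     +------ routine name, compared with __kmp_par_range_routine
 *       +------------ file name (basename), compared with __kmp_par_range_filename
 *
 * The filter values are configured via the KMP_PAR_RANGE environment variable
 * (see kmp_settings.c); when a field does not match, the function returns
 * __kmp_par_range < 0, i.e. TRUE only for a negative range setting.
 */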
218 
224 kmp_int32
225 __kmpc_in_parallel(ident_t *loc)
226 {
227  return __kmp_entry_thread() -> th.th_root -> r.r_active;
228 }
229 
239 void
240 __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_threads )
241 {
242  KA_TRACE( 20, ("__kmpc_push_num_threads: enter T#%d num_threads=%d\n",
243  global_tid, num_threads ) );
244 
245  __kmp_push_num_threads( loc, global_tid, num_threads );
246 }
247 
248 void
249 __kmpc_pop_num_threads(ident_t *loc, kmp_int32 global_tid )
250 {
251  KA_TRACE( 20, ("__kmpc_pop_num_threads: enter\n" ) );
252 
253  /* the num_threads are automatically popped */
254 }
255 
256 
257 #if OMP_40_ENABLED
258 
259 void
260 __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid, kmp_int32 proc_bind )
261 {
262  KA_TRACE( 20, ("__kmpc_push_proc_bind: enter T#%d proc_bind=%d\n",
263  global_tid, proc_bind ) );
264 
265  __kmp_push_proc_bind( loc, global_tid, (kmp_proc_bind_t)proc_bind );
266 }
267 
268 #endif /* OMP_40_ENABLED */
269 
270 
280 void
281 __kmpc_fork_call(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...)
282 {
283  KMP_STOP_EXPLICIT_TIMER(OMP_serial);
284  KMP_COUNT_BLOCK(OMP_PARALLEL);
285  int gtid = __kmp_entry_gtid();
286  // maybe saving thr_state is enough here
287  {
288  va_list ap;
289  va_start( ap, microtask );
290 
291 #if OMPT_SUPPORT
292  kmp_info_t *master_th = __kmp_threads[ gtid ];
293  kmp_team_t *parent_team = master_th->th.th_team;
294  int tid = __kmp_tid_from_gtid( gtid );
295  parent_team->t.t_implicit_task_taskdata[tid].
296  ompt_task_info.frame.reenter_runtime_frame = __builtin_frame_address(0);
297 #endif
298 
299 #if INCLUDE_SSC_MARKS
300  SSC_MARK_FORKING();
301 #endif
302  __kmp_fork_call( loc, gtid, fork_context_intel,
303  argc,
304 #if OMPT_SUPPORT
305  VOLATILE_CAST(void *) microtask, // "unwrapped" task
306 #endif
307  VOLATILE_CAST(microtask_t) microtask, // "wrapped" task
308  VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
309 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
310 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
311  &ap
312 #else
313  ap
314 #endif
315  );
316 #if INCLUDE_SSC_MARKS
317  SSC_MARK_JOINING();
318 #endif
319  __kmp_join_call( loc, gtid );
320 
321  va_end( ap );
322 
323 #if OMPT_SUPPORT
324  if (ompt_status & ompt_status_track) {
325  parent_team->t.t_implicit_task_taskdata[tid].
326  ompt_task_info.frame.reenter_runtime_frame = 0;
327  }
328 #endif
329  }
330  KMP_START_EXPLICIT_TIMER(OMP_serial);
331 }
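/*
 * Illustrative sketch (added; not part of the original source): roughly the
 * call sequence a compiler could emit for
 *
 *     extern void do_work( int * );
 *     #pragma omp parallel num_threads(4)
 *         do_work( &x );
 *
 * The outlined routine, the ident_t initializer, and the psource string are
 * hypothetical; real compilers generate their own names and location data.
 * Types and prototypes come from the kmp.h / omp.h headers included above.
 */
extern void do_work( int * );

static ident_t __example_loc_parallel = {
    0, KMP_IDENT_KMPC, 0, 0, ";file.c;main;10;11;;"
};

static void
__example_outlined_parallel( kmp_int32 *gtid, kmp_int32 *bound_tid, int *x )
{
    do_work( x );   // parallel region body, executed by every thread in the team
}

void
__example_parallel_region( int *x )
{
    kmp_int32 gtid = __kmpc_global_thread_num( &__example_loc_parallel );
    __kmpc_push_num_threads( &__example_loc_parallel, gtid, 4 );   // num_threads(4) clause
    __kmpc_fork_call( &__example_loc_parallel, 1,                  // one shared argument
                      (kmpc_micro)__example_outlined_parallel, x );
}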
332 
333 #if OMP_40_ENABLED
334 
345 void
346 __kmpc_push_num_teams(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_teams, kmp_int32 num_threads )
347 {
348  KA_TRACE( 20, ("__kmpc_push_num_teams: enter T#%d num_teams=%d num_threads=%d\n",
349  global_tid, num_teams, num_threads ) );
350 
351  __kmp_push_num_teams( loc, global_tid, num_teams, num_threads );
352 }
353 
363 void
364 __kmpc_fork_teams(ident_t *loc, kmp_int32 argc, kmpc_micro microtask, ...)
365 {
366  int gtid = __kmp_entry_gtid();
367  kmp_info_t *this_thr = __kmp_threads[ gtid ];
368  va_list ap;
369  va_start( ap, microtask );
370 
371  // remember teams entry point and nesting level
372  this_thr->th.th_teams_microtask = microtask;
373  this_thr->th.th_teams_level = this_thr->th.th_team->t.t_level; // AC: can be >0 on host
374 
375  // check if __kmpc_push_num_teams called, set default number of teams otherwise
376  if ( this_thr->th.th_teams_size.nteams == 0 ) {
377  __kmp_push_num_teams( loc, gtid, 0, 0 );
378  }
379  KMP_DEBUG_ASSERT(this_thr->th.th_set_nproc >= 1);
380  KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nteams >= 1);
381  KMP_DEBUG_ASSERT(this_thr->th.th_teams_size.nth >= 1);
382 
383  __kmp_fork_call( loc, gtid, fork_context_intel,
384  argc,
385 #if OMPT_SUPPORT
386  VOLATILE_CAST(void *) microtask, // "unwrapped" task
387 #endif
388  VOLATILE_CAST(microtask_t) __kmp_teams_master, // "wrapped" task
389  VOLATILE_CAST(launch_t) __kmp_invoke_teams_master,
390 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
391  &ap
392 #else
393  ap
394 #endif
395  );
396  __kmp_join_call( loc, gtid );
397  this_thr->th.th_teams_microtask = NULL;
398  this_thr->th.th_teams_level = 0;
399  *(kmp_int64*)(&this_thr->th.th_teams_size) = 0L;
400  va_end( ap );
401 }
402 #endif /* OMP_40_ENABLED */
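#if OMP_40_ENABLED
/*
 * Illustrative sketch (added; not part of the original source): roughly what a
 * compiler could emit for a host-side
 *
 *     #pragma omp teams num_teams(2) thread_limit(8)
 *         do_teams_work( &x );
 *
 * The outlined routine and the location argument are hypothetical; the key
 * point is the pairing of __kmpc_push_num_teams() with __kmpc_fork_teams()
 * shown above.
 */
extern void do_teams_work( int * );

static void
__example_outlined_teams( kmp_int32 *gtid, kmp_int32 *bound_tid, int *x )
{
    do_teams_work( x );   // executed by the master thread of each team
}

void
__example_teams_region( ident_t *loc, int *x )
{
    kmp_int32 gtid = __kmpc_global_thread_num( loc );
    __kmpc_push_num_teams( loc, gtid, 2, 8 );              // num_teams(2) thread_limit(8)
    __kmpc_fork_teams( loc, 1, (kmpc_micro)__example_outlined_teams, x );
}
#endif /* OMP_40_ENABLED */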
403 
404 
405 //
406 // I don't think this function should ever have been exported.
407 // The __kmpc_ prefix was misapplied. I'm fairly certain that no generated
408 // openmp code ever called it, but it's been exported from the RTL for so
409 // long that I'm afraid to remove the definition.
410 //
411 int
412 __kmpc_invoke_task_func( int gtid )
413 {
414  return __kmp_invoke_task_func( gtid );
415 }
416 
429 void
430 __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid)
431 {
432  __kmp_serialized_parallel(loc, global_tid); /* The implementation is now in kmp_runtime.c so that it can share static functions with
433  * kmp_fork_call since the tasks to be done are similar in each case.
434  */
435 }
436 
444 void
445 __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid)
446 {
447  kmp_internal_control_t *top;
448  kmp_info_t *this_thr;
449  kmp_team_t *serial_team;
450 
451  KC_TRACE( 10, ("__kmpc_end_serialized_parallel: called by T#%d\n", global_tid ) );
452 
453  /* skip all this code for autopar serialized loops since it results in
454  unacceptable overhead */
455  if( loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR ) )
456  return;
457 
458  // Not autopar code
459  if( ! TCR_4( __kmp_init_parallel ) )
460  __kmp_parallel_initialize();
461 
462  this_thr = __kmp_threads[ global_tid ];
463  serial_team = this_thr->th.th_serial_team;
464 
465  #if OMP_41_ENABLED
466  kmp_task_team_t * task_team = this_thr->th.th_task_team;
467 
468  // we need to wait for the proxy tasks before finishing the thread
469  if ( task_team != NULL && task_team->tt.tt_found_proxy_tasks )
470  __kmp_task_team_wait(this_thr, serial_team, NULL ); // is an ITT object needed here?
471  #endif
472 
473  KMP_MB();
474  KMP_DEBUG_ASSERT( serial_team );
475  KMP_ASSERT( serial_team -> t.t_serialized );
476  KMP_DEBUG_ASSERT( this_thr -> th.th_team == serial_team );
477  KMP_DEBUG_ASSERT( serial_team != this_thr->th.th_root->r.r_root_team );
478  KMP_DEBUG_ASSERT( serial_team -> t.t_threads );
479  KMP_DEBUG_ASSERT( serial_team -> t.t_threads[0] == this_thr );
480 
481  /* If necessary, pop the internal control stack values and replace the team values */
482  top = serial_team -> t.t_control_stack_top;
483  if ( top && top -> serial_nesting_level == serial_team -> t.t_serialized ) {
484  copy_icvs( &serial_team -> t.t_threads[0] -> th.th_current_task -> td_icvs, top );
485  serial_team -> t.t_control_stack_top = top -> next;
486  __kmp_free(top);
487  }
488 
489  //if( serial_team -> t.t_serialized > 1 )
490  serial_team -> t.t_level--;
491 
492  /* pop dispatch buffers stack */
493  KMP_DEBUG_ASSERT(serial_team->t.t_dispatch->th_disp_buffer);
494  {
495  dispatch_private_info_t * disp_buffer = serial_team->t.t_dispatch->th_disp_buffer;
496  serial_team->t.t_dispatch->th_disp_buffer =
497  serial_team->t.t_dispatch->th_disp_buffer->next;
498  __kmp_free( disp_buffer );
499  }
500 
501  -- serial_team -> t.t_serialized;
502  if ( serial_team -> t.t_serialized == 0 ) {
503 
504  /* return to the parallel section */
505 
506 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
507  if ( __kmp_inherit_fp_control && serial_team->t.t_fp_control_saved ) {
508  __kmp_clear_x87_fpu_status_word();
509  __kmp_load_x87_fpu_control_word( &serial_team->t.t_x87_fpu_control_word );
510  __kmp_load_mxcsr( &serial_team->t.t_mxcsr );
511  }
512 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
513 
514  this_thr -> th.th_team = serial_team -> t.t_parent;
515  this_thr -> th.th_info.ds.ds_tid = serial_team -> t.t_master_tid;
516 
517  /* restore values cached in the thread */
518  this_thr -> th.th_team_nproc = serial_team -> t.t_parent -> t.t_nproc; /* JPH */
519  this_thr -> th.th_team_master = serial_team -> t.t_parent -> t.t_threads[0]; /* JPH */
520  this_thr -> th.th_team_serialized = this_thr -> th.th_team -> t.t_serialized;
521 
522  /* TODO the below shouldn't need to be adjusted for serialized teams */
523  this_thr -> th.th_dispatch = & this_thr -> th.th_team ->
524  t.t_dispatch[ serial_team -> t.t_master_tid ];
525 
526  __kmp_pop_current_task_from_thread( this_thr );
527 
528  KMP_ASSERT( this_thr -> th.th_current_task -> td_flags.executing == 0 );
529  this_thr -> th.th_current_task -> td_flags.executing = 1;
530 
531  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
532  // Copy the task team from the new child / old parent team to the thread.
533  this_thr->th.th_task_team = this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state];
534  KA_TRACE( 20, ( "__kmpc_end_serialized_parallel: T#%d restoring task_team %p / team %p\n",
535  global_tid, this_thr -> th.th_task_team, this_thr -> th.th_team ) );
536  }
537  } else {
538  if ( __kmp_tasking_mode != tskm_immediate_exec ) {
539  KA_TRACE( 20, ( "__kmpc_end_serialized_parallel: T#%d decreasing nesting depth of serial team %p to %d\n",
540  global_tid, serial_team, serial_team -> t.t_serialized ) );
541  }
542  }
543 
544 #if USE_ITT_BUILD
545  kmp_uint64 cur_time = 0;
546 #if USE_ITT_NOTIFY
547  if ( __itt_get_timestamp_ptr ) {
548  cur_time = __itt_get_timestamp();
549  }
550 #endif /* USE_ITT_NOTIFY */
551  if ( this_thr->th.th_team->t.t_level == 0
552 #if OMP_40_ENABLED
553  && this_thr->th.th_teams_microtask == NULL
554 #endif
555  ) {
556  // Report the barrier
557  this_thr->th.th_ident = loc;
558  if ( ( __itt_frame_submit_v3_ptr || KMP_ITT_DEBUG ) &&
559  ( __kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 1 ) )
560  {
561  __kmp_itt_frame_submit( global_tid, this_thr->th.th_frame_time_serialized,
562  cur_time, 0, loc, this_thr->th.th_team_nproc, 0 );
563  if ( __kmp_forkjoin_frames_mode == 3 )
564  // Since the barrier frame for a serialized region coincides with the region itself, we use the same begin timestamp as for the barrier.
565  __kmp_itt_frame_submit( global_tid, serial_team->t.t_region_time,
566  cur_time, 0, loc, this_thr->th.th_team_nproc, 2 );
567  } else if ( ( __itt_frame_end_v3_ptr || KMP_ITT_DEBUG ) &&
568  ! __kmp_forkjoin_frames_mode && __kmp_forkjoin_frames )
569  // Mark the end of the "parallel" region for VTune. Only one frame notification scheme is used at the moment.
570  __kmp_itt_region_joined( global_tid, 1 );
571  }
572 #endif /* USE_ITT_BUILD */
573 
574  if ( __kmp_env_consistency_check )
575  __kmp_pop_parallel( global_tid, NULL );
576 }
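/*
 * Illustrative sketch (added; not part of the original source): when a
 * parallel region is known not to need a team (for example
 * "#pragma omp parallel if(0)"), a compiler can bypass __kmpc_fork_call and
 * call the serialized entry points directly. The outlined routine name is
 * hypothetical.
 */
extern void __example_outlined_body( kmp_int32 *gtid, kmp_int32 *bound_tid );

void
__example_serialized_parallel( ident_t *loc )
{
    kmp_int32 gtid = __kmpc_global_thread_num( loc );
    kmp_int32 zero_bound_tid = 0;

    __kmpc_serialized_parallel( loc, gtid );
    __example_outlined_body( &gtid, &zero_bound_tid );   // region body runs on the calling thread
    __kmpc_end_serialized_parallel( loc, gtid );
}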
577 
586 void
587 __kmpc_flush(ident_t *loc)
588 {
589  KC_TRACE( 10, ("__kmpc_flush: called\n" ) );
590 
591  /* need explicit __mf() here since use volatile instead in library */
592  KMP_MB(); /* Flush all pending memory write invalidates. */
593 
594  #if ( KMP_ARCH_X86 || KMP_ARCH_X86_64 )
595  #if KMP_MIC
596  // fence-style instructions do not exist, but lock; xaddl $0,(%rsp) can be used.
597  // We shouldn't need it, though, since the ABI rules require that
598  // * If the compiler generates NGO stores it also generates the fence
599  // * If users hand-code NGO stores they should insert the fence
600  // therefore no incomplete unordered stores should be visible.
601  #else
602  // C74404
603  // This is to address non-temporal store instructions (sfence needed).
604  // The clflush instruction is also addressed (mfence needed).
605  // Probably the non-temporal load movntdqa instruction should also be addressed.
606  // mfence is an SSE2 instruction. Do not execute it if the CPU is not SSE2.
607  if ( ! __kmp_cpuinfo.initialized ) {
608  __kmp_query_cpuid( & __kmp_cpuinfo );
609  }; // if
610  if ( ! __kmp_cpuinfo.sse2 ) {
611  // CPU cannot execute SSE2 instructions.
612  } else {
613  #if KMP_COMPILER_ICC || KMP_COMPILER_MSVC
614  _mm_mfence();
615  #else
616  __sync_synchronize();
617  #endif // KMP_COMPILER_ICC
618  }; // if
619  #endif // KMP_MIC
620  #elif (KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || KMP_ARCH_MIPS64)
621  // Nothing to see here; move along.
622  #elif KMP_ARCH_PPC64
623  // Nothing needed here (we have a real MB above).
624  #if KMP_OS_CNK
625  // The flushing thread needs to yield here; this prevents a
626  // busy-waiting thread from saturating the pipeline. flush is
627  // often used in loops like this:
628  // while (!flag) {
629  // #pragma omp flush(flag)
630  // }
631  // and adding the yield here is good for at least a 10x speedup
632  // when running >2 threads per core (on the NAS LU benchmark).
633  __kmp_yield(TRUE);
634  #endif
635  #else
636  #error Unknown or unsupported architecture
637  #endif
638 
639 }
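/*
 * Illustrative sketch (added; not part of the original source): a compiler
 * typically lowers "#pragma omp flush" to a single __kmpc_flush() call, e.g.
 * for the spin loop mentioned in the PPC64 comment above:
 */
void
__example_spin_on_flag( ident_t *loc, volatile int *flag )
{
    // while (!flag) { #pragma omp flush(flag) }
    while ( ! *flag ) {
        __kmpc_flush( loc );
    }
}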
640 
641 /* -------------------------------------------------------------------------- */
642 
643 /* -------------------------------------------------------------------------- */
644 
652 void
653 __kmpc_barrier(ident_t *loc, kmp_int32 global_tid)
654 {
655  KMP_COUNT_BLOCK(OMP_BARRIER);
656  KMP_TIME_BLOCK(OMP_barrier);
657  KC_TRACE( 10, ("__kmpc_barrier: called T#%d\n", global_tid ) );
658 
659  if (! TCR_4(__kmp_init_parallel))
660  __kmp_parallel_initialize();
661 
662  if ( __kmp_env_consistency_check ) {
663  if ( loc == 0 ) {
664  KMP_WARNING( ConstructIdentInvalid ); // ??? What does it mean for the user?
665  }; // if
666 
667  __kmp_check_barrier( global_tid, ct_barrier, loc );
668  }
669 
670  __kmp_threads[ global_tid ]->th.th_ident = loc;
671  // TODO: explicit barrier_wait_id:
672  // this function is called when 'barrier' directive is present or
673  // implicit barrier at the end of a worksharing construct.
674  // 1) better to add a per-thread barrier counter to a thread data structure
675  // 2) set to 0 when a new team is created
676  // 3) no sync is required
677 
678  __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL );
679 }
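/*
 * Illustrative sketch (added; not part of the original source): "#pragma omp
 * barrier" inside a parallel region is typically lowered to a single call:
 */
void
__example_explicit_barrier( ident_t *loc )
{
    kmp_int32 gtid = __kmpc_global_thread_num( loc );
    __kmpc_barrier( loc, gtid );
}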
680 
681 /* The BARRIER for a MASTER section is always explicit */
688 kmp_int32
689 __kmpc_master(ident_t *loc, kmp_int32 global_tid)
690 {
691  KMP_COUNT_BLOCK(OMP_MASTER);
692  int status = 0;
693 
694  KC_TRACE( 10, ("__kmpc_master: called T#%d\n", global_tid ) );
695 
696  if( ! TCR_4( __kmp_init_parallel ) )
697  __kmp_parallel_initialize();
698 
699  if( KMP_MASTER_GTID( global_tid ))
700  status = 1;
701 
702 #if OMPT_SUPPORT && OMPT_TRACE
703  if (status) {
704  if ((ompt_status == ompt_status_track_callback) &&
705  ompt_callbacks.ompt_callback(ompt_event_master_begin)) {
706  kmp_info_t *this_thr = __kmp_threads[ global_tid ];
707  kmp_team_t *team = this_thr -> th.th_team;
708 
709  int tid = __kmp_tid_from_gtid( global_tid );
710  ompt_callbacks.ompt_callback(ompt_event_master_begin)(
711  team->t.ompt_team_info.parallel_id,
712  team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
713  }
714  }
715 #endif
716 
717  if ( __kmp_env_consistency_check ) {
718 #if KMP_USE_DYNAMIC_LOCK
719  if (status)
720  __kmp_push_sync( global_tid, ct_master, loc, NULL, 0 );
721  else
722  __kmp_check_sync( global_tid, ct_master, loc, NULL, 0 );
723 #else
724  if (status)
725  __kmp_push_sync( global_tid, ct_master, loc, NULL );
726  else
727  __kmp_check_sync( global_tid, ct_master, loc, NULL );
728 #endif
729  }
730 
731  return status;
732 }
733 
742 void
743 __kmpc_end_master(ident_t *loc, kmp_int32 global_tid)
744 {
745  KC_TRACE( 10, ("__kmpc_end_master: called T#%d\n", global_tid ) );
746 
747  KMP_DEBUG_ASSERT( KMP_MASTER_GTID( global_tid ));
748 
749 #if OMPT_SUPPORT && OMPT_TRACE
750  kmp_info_t *this_thr = __kmp_threads[ global_tid ];
751  kmp_team_t *team = this_thr -> th.th_team;
752  if ((ompt_status == ompt_status_track_callback) &&
753  ompt_callbacks.ompt_callback(ompt_event_master_end)) {
754  int tid = __kmp_tid_from_gtid( global_tid );
755  ompt_callbacks.ompt_callback(ompt_event_master_end)(
756  team->t.ompt_team_info.parallel_id,
757  team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
758  }
759 #endif
760 
761  if ( __kmp_env_consistency_check ) {
762  if( global_tid < 0 )
763  KMP_WARNING( ThreadIdentInvalid );
764 
765  if( KMP_MASTER_GTID( global_tid ))
766  __kmp_pop_sync( global_tid, ct_master, loc );
767  }
768 }
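/*
 * Illustrative sketch (added; not part of the original source): typical
 * lowering of "#pragma omp master". Only the thread for which
 * __kmpc_master() returns 1 executes the body and the matching
 * __kmpc_end_master() call.
 */
extern void do_master_work( void );

void
__example_master_construct( ident_t *loc, kmp_int32 gtid )
{
    if ( __kmpc_master( loc, gtid ) ) {
        do_master_work();               // master-only body
        __kmpc_end_master( loc, gtid );
    }
    // no implied barrier at the end of a master construct
}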
769 
777 void
778 __kmpc_ordered( ident_t * loc, kmp_int32 gtid )
779 {
780  int cid = 0;
781  kmp_info_t *th;
782  KMP_DEBUG_ASSERT( __kmp_init_serial );
783 
784  KC_TRACE( 10, ("__kmpc_ordered: called T#%d\n", gtid ));
785 
786  if (! TCR_4(__kmp_init_parallel))
787  __kmp_parallel_initialize();
788 
789 #if USE_ITT_BUILD
790  __kmp_itt_ordered_prep( gtid );
791  // TODO: ordered_wait_id
792 #endif /* USE_ITT_BUILD */
793 
794  th = __kmp_threads[ gtid ];
795 
796 #if OMPT_SUPPORT && OMPT_TRACE
797  if (ompt_status & ompt_status_track) {
798  /* OMPT state update */
799  th->th.ompt_thread_info.wait_id = (uint64_t) loc;
800  th->th.ompt_thread_info.state = ompt_state_wait_ordered;
801 
802  /* OMPT event callback */
803  if ((ompt_status == ompt_status_track_callback) &&
804  ompt_callbacks.ompt_callback(ompt_event_wait_ordered)) {
805  ompt_callbacks.ompt_callback(ompt_event_wait_ordered)(
806  th->th.ompt_thread_info.wait_id);
807  }
808  }
809 #endif
810 
811  if ( th -> th.th_dispatch -> th_deo_fcn != 0 )
812  (*th->th.th_dispatch->th_deo_fcn)( & gtid, & cid, loc );
813  else
814  __kmp_parallel_deo( & gtid, & cid, loc );
815 
816 #if OMPT_SUPPORT && OMPT_TRACE
817  if (ompt_status & ompt_status_track) {
818  /* OMPT state update */
819  th->th.ompt_thread_info.state = ompt_state_work_parallel;
820  th->th.ompt_thread_info.wait_id = 0;
821 
822  /* OMPT event callback */
823  if ((ompt_status == ompt_status_track_callback) &&
824  ompt_callbacks.ompt_callback(ompt_event_acquired_ordered)) {
825  ompt_callbacks.ompt_callback(ompt_event_acquired_ordered)(
826  th->th.ompt_thread_info.wait_id);
827  }
828  }
829 #endif
830 
831 #if USE_ITT_BUILD
832  __kmp_itt_ordered_start( gtid );
833 #endif /* USE_ITT_BUILD */
834 }
835 
843 void
844 __kmpc_end_ordered( ident_t * loc, kmp_int32 gtid )
845 {
846  int cid = 0;
847  kmp_info_t *th;
848 
849  KC_TRACE( 10, ("__kmpc_end_ordered: called T#%d\n", gtid ) );
850 
851 #if USE_ITT_BUILD
852  __kmp_itt_ordered_end( gtid );
853  // TODO: ordered_wait_id
854 #endif /* USE_ITT_BUILD */
855 
856  th = __kmp_threads[ gtid ];
857 
858  if ( th -> th.th_dispatch -> th_dxo_fcn != 0 )
859  (*th->th.th_dispatch->th_dxo_fcn)( & gtid, & cid, loc );
860  else
861  __kmp_parallel_dxo( & gtid, & cid, loc );
862 
863 #if OMPT_SUPPORT && OMPT_BLAME
864  if ((ompt_status == ompt_status_track_callback) &&
865  ompt_callbacks.ompt_callback(ompt_event_release_ordered)) {
866  ompt_callbacks.ompt_callback(ompt_event_release_ordered)(
867  th->th.ompt_thread_info.wait_id);
868  }
869 #endif
870 }
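/*
 * Illustrative sketch (added; not part of the original source): inside a
 * worksharing loop with an ordered clause, each "#pragma omp ordered" block
 * is bracketed by these two calls, which enforce execution in iteration
 * order. The loop bookkeeping around it is omitted here.
 */
extern void do_ordered_work( int i );

void
__example_ordered_block( ident_t *loc, kmp_int32 gtid, int i )
{
    __kmpc_ordered( loc, gtid );
    do_ordered_work( i );             // body executed in iteration order
    __kmpc_end_ordered( loc, gtid );
}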
871 
872 #if KMP_USE_DYNAMIC_LOCK
873 
874 static __forceinline kmp_indirect_lock_t *
875 __kmp_get_indirect_csptr(kmp_critical_name * crit, ident_t const * loc, kmp_int32 gtid, kmp_dyna_lockseq_t seq)
876 {
877  // Code from __kmp_get_critical_section_ptr
878  // This function returns an indirect lock object instead of a user lock.
879  kmp_indirect_lock_t **lck, *ret;
880  lck = (kmp_indirect_lock_t **)crit;
881  ret = (kmp_indirect_lock_t *)TCR_PTR(*lck);
882  if (ret == NULL) {
883  void *idx;
884  kmp_indirect_locktag_t tag = DYNA_GET_I_TAG(seq);
885  kmp_indirect_lock_t *ilk = __kmp_allocate_indirect_lock(&idx, gtid, tag);
886  ret = ilk;
887  DYNA_I_LOCK_FUNC(ilk, init)(ilk->lock);
888  DYNA_SET_I_LOCK_LOCATION(ilk, loc);
889  DYNA_SET_I_LOCK_FLAGS(ilk, kmp_lf_critical_section);
890  KA_TRACE(20, ("__kmp_get_indirect_csptr: initialized indirect lock #%d\n", tag));
891 #if USE_ITT_BUILD
892  __kmp_itt_critical_creating(ilk->lock, loc);
893 #endif
894  int status = KMP_COMPARE_AND_STORE_PTR(lck, 0, ilk);
895  if (status == 0) {
896 #if USE_ITT_BUILD
897  __kmp_itt_critical_destroyed(ilk->lock);
898 #endif
899  // Postponing destroy, to avoid costly dispatch here.
900  //DYNA_D_LOCK_FUNC(&idx, destroy)((kmp_dyna_lock_t *)&idx);
901  ret = (kmp_indirect_lock_t *)TCR_PTR(*lck);
902  KMP_DEBUG_ASSERT(ret != NULL);
903  }
904  }
905  return ret;
906 }
907 
908 // Fast-path acquire tas lock
909 #define DYNA_ACQUIRE_TAS_LOCK(lock, gtid) { \
910  kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock; \
911  if (l->lk.poll != DYNA_LOCK_FREE(tas) || \
912  ! KMP_COMPARE_AND_STORE_ACQ32(&(l->lk.poll), DYNA_LOCK_FREE(tas), DYNA_LOCK_BUSY(gtid+1, tas))) { \
913  kmp_uint32 spins; \
914  KMP_FSYNC_PREPARE(l); \
915  KMP_INIT_YIELD(spins); \
916  if (TCR_4(__kmp_nth) > (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) { \
917  KMP_YIELD(TRUE); \
918  } else { \
919  KMP_YIELD_SPIN(spins); \
920  } \
921  while (l->lk.poll != DYNA_LOCK_FREE(tas) || \
922  ! KMP_COMPARE_AND_STORE_ACQ32(&(l->lk.poll), DYNA_LOCK_FREE(tas), DYNA_LOCK_BUSY(gtid+1, tas))) { \
923  if (TCR_4(__kmp_nth) > (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)) { \
924  KMP_YIELD(TRUE); \
925  } else { \
926  KMP_YIELD_SPIN(spins); \
927  } \
928  } \
929  } \
930  KMP_FSYNC_ACQUIRED(l); \
931 }
932 
933 // Fast-path test tas lock
934 #define DYNA_TEST_TAS_LOCK(lock, gtid, rc) { \
935  kmp_tas_lock_t *l = (kmp_tas_lock_t *)lock; \
936  rc = l->lk.poll == DYNA_LOCK_FREE(tas) && \
937  KMP_COMPARE_AND_STORE_ACQ32(&(l->lk.poll), DYNA_LOCK_FREE(tas), DYNA_LOCK_BUSY(gtid+1, tas)); \
938 }
939 
940 // Fast-path release tas lock
941 #define DYNA_RELEASE_TAS_LOCK(lock, gtid) { \
942  TCW_4(((kmp_tas_lock_t *)lock)->lk.poll, DYNA_LOCK_FREE(tas)); \
943  KMP_MB(); \
944 }
945 
946 #if DYNA_HAS_FUTEX
947 
948 # include <unistd.h>
949 # include <sys/syscall.h>
950 # ifndef FUTEX_WAIT
951 # define FUTEX_WAIT 0
952 # endif
953 # ifndef FUTEX_WAKE
954 # define FUTEX_WAKE 1
955 # endif
956 
957 // Fast-path acquire futex lock
958 #define DYNA_ACQUIRE_FUTEX_LOCK(lock, gtid) { \
959  kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \
960  kmp_int32 gtid_code = (gtid+1) << 1; \
961  KMP_MB(); \
962  KMP_FSYNC_PREPARE(ftx); \
963  kmp_int32 poll_val; \
964  while ((poll_val = KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), DYNA_LOCK_FREE(futex), \
965  DYNA_LOCK_BUSY(gtid_code, futex))) != DYNA_LOCK_FREE(futex)) { \
966  kmp_int32 cond = DYNA_LOCK_STRIP(poll_val) & 1; \
967  if (!cond) { \
968  if (!KMP_COMPARE_AND_STORE_RET32(&(ftx->lk.poll), poll_val, poll_val | DYNA_LOCK_BUSY(1, futex))) { \
969  continue; \
970  } \
971  poll_val |= DYNA_LOCK_BUSY(1, futex); \
972  } \
973  kmp_int32 rc; \
974  if ((rc = syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAIT, poll_val, NULL, NULL, 0)) != 0) { \
975  continue; \
976  } \
977  gtid_code |= 1; \
978  } \
979  KMP_FSYNC_ACQUIRED(ftx); \
980 }
981 
982 // Fast-path test futex lock
983 #define DYNA_TEST_FUTEX_LOCK(lock, gtid, rc) { \
984  kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \
985  if (KMP_COMPARE_AND_STORE_ACQ32(&(ftx->lk.poll), DYNA_LOCK_FREE(futex), DYNA_LOCK_BUSY(gtid+1, futex) << 1)) { \
986  KMP_FSYNC_ACQUIRED(ftx); \
987  rc = TRUE; \
988  } else { \
989  rc = FALSE; \
990  } \
991 }
992 
993 // Fast-path release futex lock
994 #define DYNA_RELEASE_FUTEX_LOCK(lock, gtid) { \
995  kmp_futex_lock_t *ftx = (kmp_futex_lock_t *)lock; \
996  KMP_MB(); \
997  KMP_FSYNC_RELEASING(ftx); \
998  kmp_int32 poll_val = KMP_XCHG_FIXED32(&(ftx->lk.poll), DYNA_LOCK_FREE(futex)); \
999  if (DYNA_LOCK_STRIP(poll_val) & 1) { \
1000  syscall(__NR_futex, &(ftx->lk.poll), FUTEX_WAKE, DYNA_LOCK_BUSY(1, futex), NULL, NULL, 0); \
1001  } \
1002  KMP_MB(); \
1003  KMP_YIELD(TCR_4(__kmp_nth) > (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc)); \
1004 }
1005 
1006 #endif // DYNA_HAS_FUTEX
1007 
1008 #else // KMP_USE_DYNAMIC_LOCK
1009 
1010 static kmp_user_lock_p
1011 __kmp_get_critical_section_ptr( kmp_critical_name * crit, ident_t const * loc, kmp_int32 gtid )
1012 {
1013  kmp_user_lock_p *lck_pp = (kmp_user_lock_p *)crit;
1014 
1015  //
1016  // Because of the double-check, the following load
1017  // doesn't need to be volatile.
1018  //
1019  kmp_user_lock_p lck = (kmp_user_lock_p)TCR_PTR( *lck_pp );
1020 
1021  if ( lck == NULL ) {
1022  void * idx;
1023 
1024  // Allocate & initialize the lock.
1025  // Remember allocated locks in table in order to free them in __kmp_cleanup()
1026  lck = __kmp_user_lock_allocate( &idx, gtid, kmp_lf_critical_section );
1027  __kmp_init_user_lock_with_checks( lck );
1028  __kmp_set_user_lock_location( lck, loc );
1029 #if USE_ITT_BUILD
1030  __kmp_itt_critical_creating( lck );
1031  // __kmp_itt_critical_creating() should be called *before* the first usage of the underlying
1032  // lock. It is the only place where we can guarantee it. There is a chance the lock will be
1033  // destroyed with no usage, but that is not a problem, because this is not a real event seen
1034  // by the user but rather sets a name for the object (lock). See more details in kmp_itt.h.
1035 #endif /* USE_ITT_BUILD */
1036 
1037  //
1038  // Use a cmpxchg instruction to slam the start of the critical
1039  // section with the lock pointer. If another thread beat us
1040  // to it, deallocate the lock, and use the lock that the other
1041  // thread allocated.
1042  //
1043  int status = KMP_COMPARE_AND_STORE_PTR( lck_pp, 0, lck );
1044 
1045  if ( status == 0 ) {
1046  // Deallocate the lock and reload the value.
1047 #if USE_ITT_BUILD
1048  __kmp_itt_critical_destroyed( lck );
1049  // Let ITT know the lock is destroyed and the same memory location may be reused for
1050  // another purpose.
1051 #endif /* USE_ITT_BUILD */
1052  __kmp_destroy_user_lock_with_checks( lck );
1053  __kmp_user_lock_free( &idx, gtid, lck );
1054  lck = (kmp_user_lock_p)TCR_PTR( *lck_pp );
1055  KMP_DEBUG_ASSERT( lck != NULL );
1056  }
1057  }
1058  return lck;
1059 }
1060 
1061 #endif // KMP_USE_DYNAMIC_LOCK
1062 
1073 void
1074 __kmpc_critical( ident_t * loc, kmp_int32 global_tid, kmp_critical_name * crit ) {
1075  KMP_COUNT_BLOCK(OMP_CRITICAL);
1076 
1077  kmp_user_lock_p lck;
1078 
1079  KC_TRACE( 10, ("__kmpc_critical: called T#%d\n", global_tid ) );
1080 
1081 #if KMP_USE_DYNAMIC_LOCK
1082  // Assumption: all direct locks fit in OMP_CRITICAL_SIZE.
1083  // The global sequence __kmp_user_lock_seq is used unless compiler pushes a value.
1084  if (DYNA_IS_D_LOCK(__kmp_user_lock_seq)) {
1085  lck = (kmp_user_lock_p)crit;
1086  // The thread that reaches here first needs to tag the lock word.
1087  if (*((kmp_dyna_lock_t *)lck) == 0) {
1088  KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)lck, 0, DYNA_GET_D_TAG(__kmp_user_lock_seq));
1089  }
1090  if (__kmp_env_consistency_check) {
1091  __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
1092  }
1093 # if USE_ITT_BUILD
1094  __kmp_itt_critical_acquiring(lck);
1095 # endif
1096 # if DYNA_USE_FAST_TAS
1097  if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1098  DYNA_ACQUIRE_TAS_LOCK(lck, global_tid);
1099  } else
1100 # elif DYNA_USE_FAST_FUTEX
1101  if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1102  DYNA_ACQUIRE_FUTEX_LOCK(lck, global_tid);
1103  } else
1104 # endif
1105  {
1106  DYNA_D_LOCK_FUNC(lck, set)((kmp_dyna_lock_t *)lck, global_tid);
1107  }
1108  } else {
1109  kmp_indirect_lock_t *ilk = __kmp_get_indirect_csptr(crit, loc, global_tid, __kmp_user_lock_seq);
1110  lck = ilk->lock;
1111  if (__kmp_env_consistency_check) {
1112  __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
1113  }
1114 # if USE_ITT_BUILD
1115  __kmp_itt_critical_acquiring(lck);
1116 # endif
1117  DYNA_I_LOCK_FUNC(ilk, set)(lck, global_tid);
1118  }
1119 
1120 #else // KMP_USE_DYNAMIC_LOCK
1121 
1122  //TODO: add THR_OVHD_STATE
1123 
1124  KMP_CHECK_USER_LOCK_INIT();
1125 
1126  if ( ( __kmp_user_lock_kind == lk_tas )
1127  && ( sizeof( lck->tas.lk.poll ) <= OMP_CRITICAL_SIZE ) ) {
1128  lck = (kmp_user_lock_p)crit;
1129  }
1130 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1131  else if ( ( __kmp_user_lock_kind == lk_futex )
1132  && ( sizeof( lck->futex.lk.poll ) <= OMP_CRITICAL_SIZE ) ) {
1133  lck = (kmp_user_lock_p)crit;
1134  }
1135 #endif
1136  else { // ticket, queuing or drdpa
1137  lck = __kmp_get_critical_section_ptr( crit, loc, global_tid );
1138  }
1139 
1140  if ( __kmp_env_consistency_check )
1141  __kmp_push_sync( global_tid, ct_critical, loc, lck );
1142 
1143  /* since the critical directive binds to all threads, not just
1144  * the current team, we have to check this even if we are in a
1145  * serialized team */
1146  /* also, even if we are the uber thread, we still have to acquire the lock,
1147  * as we have to contend with sibling threads */
1148 
1149 #if USE_ITT_BUILD
1150  __kmp_itt_critical_acquiring( lck );
1151 #endif /* USE_ITT_BUILD */
1152  // Value of 'crit' should be good for using as a critical_id of the critical section directive.
1153  __kmp_acquire_user_lock_with_checks( lck, global_tid );
1154 
1155 #endif // KMP_USE_DYNAMIC_LOCK
1156 
1157 #if USE_ITT_BUILD
1158  __kmp_itt_critical_acquired( lck );
1159 #endif /* USE_ITT_BUILD */
1160 
1161  KA_TRACE( 15, ("__kmpc_critical: done T#%d\n", global_tid ));
1162 } // __kmpc_critical
1163 
1173 void
1174 __kmpc_end_critical(ident_t *loc, kmp_int32 global_tid, kmp_critical_name *crit)
1175 {
1176  kmp_user_lock_p lck;
1177 
1178  KC_TRACE( 10, ("__kmpc_end_critical: called T#%d\n", global_tid ));
1179 
1180 #if KMP_USE_DYNAMIC_LOCK
1181  if (DYNA_IS_D_LOCK(__kmp_user_lock_seq)) {
1182  lck = (kmp_user_lock_p)crit;
1183  KMP_ASSERT(lck != NULL);
1184  if (__kmp_env_consistency_check) {
1185  __kmp_pop_sync(global_tid, ct_critical, loc);
1186  }
1187 # if USE_ITT_BUILD
1188  __kmp_itt_critical_releasing( lck );
1189 # endif
1190 # if DYNA_USE_FAST_TAS
1191  if (__kmp_user_lock_seq == lockseq_tas && !__kmp_env_consistency_check) {
1192  DYNA_RELEASE_TAS_LOCK(lck, global_tid);
1193  } else
1194 # elif DYNA_USE_FAST_FUTEX
1195  if (__kmp_user_lock_seq == lockseq_futex && !__kmp_env_consistency_check) {
1196  DYNA_RELEASE_FUTEX_LOCK(lck, global_tid);
1197  } else
1198 # endif
1199  {
1200  DYNA_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
1201  }
1202  } else {
1203  kmp_indirect_lock_t *ilk = (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
1204  KMP_ASSERT(ilk != NULL);
1205  lck = ilk->lock;
1206  if (__kmp_env_consistency_check) {
1207  __kmp_pop_sync(global_tid, ct_critical, loc);
1208  }
1209 # if USE_ITT_BUILD
1210  __kmp_itt_critical_releasing( lck );
1211 # endif
1212  DYNA_I_LOCK_FUNC(ilk, unset)(lck, global_tid);
1213  }
1214 
1215 #else // KMP_USE_DYNAMIC_LOCK
1216 
1217  if ( ( __kmp_user_lock_kind == lk_tas )
1218  && ( sizeof( lck->tas.lk.poll ) <= OMP_CRITICAL_SIZE ) ) {
1219  lck = (kmp_user_lock_p)crit;
1220  }
1221 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1222  else if ( ( __kmp_user_lock_kind == lk_futex )
1223  && ( sizeof( lck->futex.lk.poll ) <= OMP_CRITICAL_SIZE ) ) {
1224  lck = (kmp_user_lock_p)crit;
1225  }
1226 #endif
1227  else { // ticket, queuing or drdpa
1228  lck = (kmp_user_lock_p) TCR_PTR(*((kmp_user_lock_p *)crit));
1229  }
1230 
1231  KMP_ASSERT(lck != NULL);
1232 
1233  if ( __kmp_env_consistency_check )
1234  __kmp_pop_sync( global_tid, ct_critical, loc );
1235 
1236 #if USE_ITT_BUILD
1237  __kmp_itt_critical_releasing( lck );
1238 #endif /* USE_ITT_BUILD */
1239  // Value of 'crit' should be good for using as a critical_id of the critical section directive.
1240  __kmp_release_user_lock_with_checks( lck, global_tid );
1241 
1242 #if OMPT_SUPPORT && OMPT_BLAME
1243  if ((ompt_status == ompt_status_track_callback) &&
1244  ompt_callbacks.ompt_callback(ompt_event_release_critical)) {
1245  ompt_callbacks.ompt_callback(ompt_event_release_critical)(
1246  (uint64_t) lck);
1247  }
1248 #endif
1249 
1250 #endif // KMP_USE_DYNAMIC_LOCK
1251 
1252  KA_TRACE( 15, ("__kmpc_end_critical: done T#%d\n", global_tid ));
1253 }
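/*
 * Illustrative sketch (added; not part of the original source): typical
 * lowering of "#pragma omp critical (name)". The compiler emits one
 * zero-initialized kmp_critical_name object per critical name and passes its
 * address to both calls; the runtime lazily installs a lock in it, as shown
 * in __kmpc_critical() above.
 */
extern void do_exclusive_work( void );

static kmp_critical_name __example_crit_name;   // one per named critical section

void
__example_critical_section( ident_t *loc, kmp_int32 gtid )
{
    __kmpc_critical( loc, gtid, &__example_crit_name );
    do_exclusive_work();                          // body, one thread at a time
    __kmpc_end_critical( loc, gtid, &__example_crit_name );
}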
1254 
1263 kmp_int32
1264 __kmpc_barrier_master(ident_t *loc, kmp_int32 global_tid)
1265 {
1266  int status;
1267 
1268  KC_TRACE( 10, ("__kmpc_barrier_master: called T#%d\n", global_tid ) );
1269 
1270  if (! TCR_4(__kmp_init_parallel))
1271  __kmp_parallel_initialize();
1272 
1273  if ( __kmp_env_consistency_check )
1274  __kmp_check_barrier( global_tid, ct_barrier, loc );
1275 
1276 #if USE_ITT_NOTIFY
1277  __kmp_threads[global_tid]->th.th_ident = loc;
1278 #endif
1279  status = __kmp_barrier( bs_plain_barrier, global_tid, TRUE, 0, NULL, NULL );
1280 
1281  return (status != 0) ? 0 : 1;
1282 }
1283 
1293 void
1294 __kmpc_end_barrier_master(ident_t *loc, kmp_int32 global_tid)
1295 {
1296  KC_TRACE( 10, ("__kmpc_end_barrier_master: called T#%d\n", global_tid ));
1297 
1298  __kmp_end_split_barrier ( bs_plain_barrier, global_tid );
1299 }
1300 
1311 kmp_int32
1312 __kmpc_barrier_master_nowait( ident_t * loc, kmp_int32 global_tid )
1313 {
1314  kmp_int32 ret;
1315 
1316  KC_TRACE( 10, ("__kmpc_barrier_master_nowait: called T#%d\n", global_tid ));
1317 
1318  if (! TCR_4(__kmp_init_parallel))
1319  __kmp_parallel_initialize();
1320 
1321  if ( __kmp_env_consistency_check ) {
1322  if ( loc == 0 ) {
1323  KMP_WARNING( ConstructIdentInvalid ); // ??? What does it mean for the user?
1324  }
1325  __kmp_check_barrier( global_tid, ct_barrier, loc );
1326  }
1327 
1328 #if USE_ITT_NOTIFY
1329  __kmp_threads[global_tid]->th.th_ident = loc;
1330 #endif
1331  __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL );
1332 
1333  ret = __kmpc_master (loc, global_tid);
1334 
1335  if ( __kmp_env_consistency_check ) {
1336  /* there is no __kmpc_end_master call, so the (stats) */
1337  /* actions of __kmpc_end_master are done here */
1338 
1339  if ( global_tid < 0 ) {
1340  KMP_WARNING( ThreadIdentInvalid );
1341  }
1342  if (ret) {
1343  /* only one thread should do the pop since only */
1344  /* one did the push (see __kmpc_master()) */
1345 
1346  __kmp_pop_sync( global_tid, ct_master, loc );
1347  }
1348  }
1349 
1350  return (ret);
1351 }
1352 
1353 /* The BARRIER for a SINGLE process section is always explicit */
1365 kmp_int32
1366 __kmpc_single(ident_t *loc, kmp_int32 global_tid)
1367 {
1368  KMP_COUNT_BLOCK(OMP_SINGLE);
1369  kmp_int32 rc = __kmp_enter_single( global_tid, loc, TRUE );
1370 
1371 #if OMPT_SUPPORT && OMPT_TRACE
1372  kmp_info_t *this_thr = __kmp_threads[ global_tid ];
1373  kmp_team_t *team = this_thr -> th.th_team;
1374  int tid = __kmp_tid_from_gtid( global_tid );
1375 
1376  if ((ompt_status == ompt_status_track_callback)) {
1377  if (rc) {
1378  if (ompt_callbacks.ompt_callback(ompt_event_single_in_block_begin)) {
1379  ompt_callbacks.ompt_callback(ompt_event_single_in_block_begin)(
1380  team->t.ompt_team_info.parallel_id,
1381  team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id,
1382  team->t.ompt_team_info.microtask);
1383  }
1384  } else {
1385  if (ompt_callbacks.ompt_callback(ompt_event_single_others_begin)) {
1386  ompt_callbacks.ompt_callback(ompt_event_single_others_begin)(
1387  team->t.ompt_team_info.parallel_id,
1388  team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
1389  }
1390  this_thr->th.ompt_thread_info.state = ompt_state_wait_single;
1391  }
1392  }
1393 #endif
1394 
1395  return rc;
1396 }
1397 
1407 void
1408 __kmpc_end_single(ident_t *loc, kmp_int32 global_tid)
1409 {
1410  __kmp_exit_single( global_tid );
1411 
1412 #if OMPT_SUPPORT && OMPT_TRACE
1413  kmp_info_t *this_thr = __kmp_threads[ global_tid ];
1414  kmp_team_t *team = this_thr -> th.th_team;
1415  int tid = __kmp_tid_from_gtid( global_tid );
1416 
1417  if ((ompt_status == ompt_status_track_callback) &&
1418  ompt_callbacks.ompt_callback(ompt_event_single_in_block_end)) {
1419  ompt_callbacks.ompt_callback(ompt_event_single_in_block_end)(
1420  team->t.ompt_team_info.parallel_id,
1421  team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
1422  }
1423 #endif
1424 }
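/*
 * Illustrative sketch (added; not part of the original source): typical
 * lowering of "#pragma omp single". Exactly one thread sees a non-zero
 * return from __kmpc_single(); the trailing barrier is omitted when a
 * nowait clause is present.
 */
extern void do_single_work( void );

void
__example_single_construct( ident_t *loc, kmp_int32 gtid )
{
    if ( __kmpc_single( loc, gtid ) ) {
        do_single_work();               // executed by exactly one thread
        __kmpc_end_single( loc, gtid );
    }
    __kmpc_barrier( loc, gtid );        // implicit barrier at the end of single
}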
1425 
1433 void
1434 __kmpc_for_static_fini( ident_t *loc, kmp_int32 global_tid )
1435 {
1436  KE_TRACE( 10, ("__kmpc_for_static_fini called T#%d\n", global_tid));
1437 
1438 #if OMPT_SUPPORT && OMPT_TRACE
1439  kmp_info_t *this_thr = __kmp_threads[ global_tid ];
1440  kmp_team_t *team = this_thr -> th.th_team;
1441  int tid = __kmp_tid_from_gtid( global_tid );
1442 
1443  if ((ompt_status == ompt_status_track_callback) &&
1444  ompt_callbacks.ompt_callback(ompt_event_loop_end)) {
1445  ompt_callbacks.ompt_callback(ompt_event_loop_end)(
1446  team->t.ompt_team_info.parallel_id,
1447  team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
1448  }
1449 #endif
1450 
1451  if ( __kmp_env_consistency_check )
1452  __kmp_pop_workshare( global_tid, ct_pdo, loc );
1453 }
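/*
 * Illustrative sketch (added; not part of the original source):
 * __kmpc_for_static_fini() closes the bracket opened by one of the
 * __kmpc_for_static_init_*() entry points (implemented elsewhere in the
 * runtime). A rough lowering of "#pragma omp for schedule(static)" over
 * 0..n-1 with unit stride:
 */
extern void do_iteration( int i );

void
__example_static_for( ident_t *loc, kmp_int32 gtid, kmp_int32 n )
{
    kmp_int32 lower = 0, upper = n - 1, stride = 1, last = 0;
    kmp_int32 i;

    __kmpc_for_static_init_4( loc, gtid, kmp_sch_static,
                              &last, &lower, &upper, &stride,
                              /* incr */ 1, /* chunk */ 1 );
    for ( i = lower; i <= upper; ++i )
        do_iteration( i );              // this thread's chunk of the iteration space
    __kmpc_for_static_fini( loc, gtid );
    __kmpc_barrier( loc, gtid );        // implicit barrier unless nowait
}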
1454 
1455 /*
1456  * User routines which take C-style arguments (call by value),
1457  * unlike the Fortran equivalent routines
1458  */
1459 
1460 void
1461 ompc_set_num_threads( int arg )
1462 {
1463 // !!!!! TODO: check the per-task binding
1464  __kmp_set_num_threads( arg, __kmp_entry_gtid() );
1465 }
1466 
1467 void
1468 ompc_set_dynamic( int flag )
1469 {
1470  kmp_info_t *thread;
1471 
1472  /* For the thread-private implementation of the internal controls */
1473  thread = __kmp_entry_thread();
1474 
1475  __kmp_save_internal_controls( thread );
1476 
1477  set__dynamic( thread, flag ? TRUE : FALSE );
1478 }
1479 
1480 void
1481 ompc_set_nested( int flag )
1482 {
1483  kmp_info_t *thread;
1484 
1485  /* For the thread-private internal controls implementation */
1486  thread = __kmp_entry_thread();
1487 
1488  __kmp_save_internal_controls( thread );
1489 
1490  set__nested( thread, flag ? TRUE : FALSE );
1491 }
1492 
1493 void
1494 ompc_set_max_active_levels( int max_active_levels )
1495 {
1496  /* TO DO */
1497  /* we want per-task implementation of this internal control */
1498 
1499  /* For the per-thread internal controls implementation */
1500  __kmp_set_max_active_levels( __kmp_entry_gtid(), max_active_levels );
1501 }
1502 
1503 void
1504 ompc_set_schedule( omp_sched_t kind, int modifier )
1505 {
1506 // !!!!! TODO: check the per-task binding
1507  __kmp_set_schedule( __kmp_entry_gtid(), ( kmp_sched_t ) kind, modifier );
1508 }
1509 
1510 int
1511 ompc_get_ancestor_thread_num( int level )
1512 {
1513  return __kmp_get_ancestor_thread_num( __kmp_entry_gtid(), level );
1514 }
1515 
1516 int
1517 ompc_get_team_size( int level )
1518 {
1519  return __kmp_get_team_size( __kmp_entry_gtid(), level );
1520 }
1521 
1522 void
1523 kmpc_set_stacksize( int arg )
1524 {
1525  // __kmp_aux_set_stacksize initializes the library if needed
1526  __kmp_aux_set_stacksize( arg );
1527 }
1528 
1529 void
1530 kmpc_set_stacksize_s( size_t arg )
1531 {
1532  // __kmp_aux_set_stacksize initializes the library if needed
1533  __kmp_aux_set_stacksize( arg );
1534 }
1535 
1536 void
1537 kmpc_set_blocktime( int arg )
1538 {
1539  int gtid, tid;
1540  kmp_info_t *thread;
1541 
1542  gtid = __kmp_entry_gtid();
1543  tid = __kmp_tid_from_gtid(gtid);
1544  thread = __kmp_thread_from_gtid(gtid);
1545 
1546  __kmp_aux_set_blocktime( arg, thread, tid );
1547 }
1548 
1549 void
1550 kmpc_set_library( int arg )
1551 {
1552  // __kmp_user_set_library initializes the library if needed
1553  __kmp_user_set_library( (enum library_type)arg );
1554 }
1555 
1556 void
1557 kmpc_set_defaults( char const * str )
1558 {
1559  // __kmp_aux_set_defaults initializes the library if needed
1560  __kmp_aux_set_defaults( str, KMP_STRLEN( str ) );
1561 }
1562 
1563 int
1564 kmpc_set_affinity_mask_proc( int proc, void **mask )
1565 {
1566 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1567  return -1;
1568 #else
1569  if ( ! TCR_4(__kmp_init_middle) ) {
1570  __kmp_middle_initialize();
1571  }
1572  return __kmp_aux_set_affinity_mask_proc( proc, mask );
1573 #endif
1574 }
1575 
1576 int
1577 kmpc_unset_affinity_mask_proc( int proc, void **mask )
1578 {
1579 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1580  return -1;
1581 #else
1582  if ( ! TCR_4(__kmp_init_middle) ) {
1583  __kmp_middle_initialize();
1584  }
1585  return __kmp_aux_unset_affinity_mask_proc( proc, mask );
1586 #endif
1587 }
1588 
1589 int
1590 kmpc_get_affinity_mask_proc( int proc, void **mask )
1591 {
1592 #if defined(KMP_STUB) || !KMP_AFFINITY_SUPPORTED
1593  return -1;
1594 #else
1595  if ( ! TCR_4(__kmp_init_middle) ) {
1596  __kmp_middle_initialize();
1597  }
1598  return __kmp_aux_get_affinity_mask_proc( proc, mask );
1599 #endif
1600 }
1601 
1602 
1603 /* -------------------------------------------------------------------------- */
1644 void
1645 __kmpc_copyprivate( ident_t *loc, kmp_int32 gtid, size_t cpy_size, void *cpy_data, void(*cpy_func)(void*,void*), kmp_int32 didit )
1646 {
1647  void **data_ptr;
1648 
1649  KC_TRACE( 10, ("__kmpc_copyprivate: called T#%d\n", gtid ));
1650 
1651  KMP_MB();
1652 
1653  data_ptr = & __kmp_team_from_gtid( gtid )->t.t_copypriv_data;
1654 
1655  if ( __kmp_env_consistency_check ) {
1656  if ( loc == 0 ) {
1657  KMP_WARNING( ConstructIdentInvalid );
1658  }
1659  }
1660 
1661  /* ToDo: Optimize the following two barriers into some kind of split barrier */
1662 
1663  if (didit) *data_ptr = cpy_data;
1664 
1665  /* This barrier is not a barrier region boundary */
1666 #if USE_ITT_NOTIFY
1667  __kmp_threads[gtid]->th.th_ident = loc;
1668 #endif
1669  __kmp_barrier( bs_plain_barrier, gtid, FALSE , 0, NULL, NULL );
1670 
1671  if (! didit) (*cpy_func)( cpy_data, *data_ptr );
1672 
1673  /* Consider next barrier the user-visible barrier for barrier region boundaries */
1674  /* Nesting checks are already handled by the single construct checks */
1675 
1676 #if USE_ITT_NOTIFY
1677  __kmp_threads[gtid]->th.th_ident = loc; // TODO: check if it is needed (e.g. tasks can overwrite the location)
1678 #endif
1679  __kmp_barrier( bs_plain_barrier, gtid, FALSE , 0, NULL, NULL );
1680 }
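/*
 * Illustrative sketch (added; not part of the original source): rough
 * lowering of "#pragma omp single copyprivate(v)" for a single int. Real
 * compilers pass a list of pointers and a generated copy routine; here a
 * single variable and a hand-written copy function stand in for them.
 */
extern int compute_value( void );

static void
__example_copy_int( void *dst, void *src )
{
    *(int *)dst = *(int *)src;          // broadcast the single thread's value
}

void
__example_single_copyprivate( ident_t *loc, kmp_int32 gtid, int *v )
{
    kmp_int32 didit = __kmpc_single( loc, gtid );
    if ( didit ) {
        *v = compute_value();           // body executed by the single thread
        __kmpc_end_single( loc, gtid );
    }
    // every thread passes the address of its own copy; __kmpc_copyprivate()
    // publishes the executing thread's copy and runs the copy function on the
    // other threads (the two barriers are inside the call above).
    __kmpc_copyprivate( loc, gtid, sizeof(int), v, __example_copy_int, didit );
}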
1681 
1682 /* -------------------------------------------------------------------------- */
1683 
1684 #define INIT_LOCK __kmp_init_user_lock_with_checks
1685 #define INIT_NESTED_LOCK __kmp_init_nested_user_lock_with_checks
1686 #define ACQUIRE_LOCK __kmp_acquire_user_lock_with_checks
1687 #define ACQUIRE_LOCK_TIMED __kmp_acquire_user_lock_with_checks_timed
1688 #define ACQUIRE_NESTED_LOCK __kmp_acquire_nested_user_lock_with_checks
1689 #define ACQUIRE_NESTED_LOCK_TIMED __kmp_acquire_nested_user_lock_with_checks_timed
1690 #define RELEASE_LOCK __kmp_release_user_lock_with_checks
1691 #define RELEASE_NESTED_LOCK __kmp_release_nested_user_lock_with_checks
1692 #define TEST_LOCK __kmp_test_user_lock_with_checks
1693 #define TEST_NESTED_LOCK __kmp_test_nested_user_lock_with_checks
1694 #define DESTROY_LOCK __kmp_destroy_user_lock_with_checks
1695 #define DESTROY_NESTED_LOCK __kmp_destroy_nested_user_lock_with_checks
1696 
1697 
1698 /*
1699  * TODO: Make check abort messages use location info & pass it
1700  * into with_checks routines
1701  */
1702 
1703 /* initialize the lock */
1704 void
1705 __kmpc_init_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
1706 #if KMP_USE_DYNAMIC_LOCK
1707  KMP_DEBUG_ASSERT(__kmp_init_serial);
1708  if (__kmp_env_consistency_check && user_lock == NULL) {
1709  KMP_FATAL(LockIsUninitialized, "omp_init_lock");
1710  }
1711  if (DYNA_IS_D_LOCK(__kmp_user_lock_seq)) {
1712  DYNA_INIT_D_LOCK(user_lock, __kmp_user_lock_seq);
1713 # if USE_ITT_BUILD
1714  __kmp_itt_lock_creating((kmp_user_lock_p)user_lock, NULL);
1715 # endif
1716  } else {
1717  DYNA_INIT_I_LOCK(user_lock, __kmp_user_lock_seq);
1718  kmp_indirect_lock_t *ilk = DYNA_LOOKUP_I_LOCK(user_lock);
1719  DYNA_SET_I_LOCK_LOCATION(ilk, loc);
1720 # if USE_ITT_BUILD
1721  __kmp_itt_lock_creating(ilk->lock, loc);
1722 # endif
1723  }
1724 
1725 #else // KMP_USE_DYNAMIC_LOCK
1726 
1727  static char const * const func = "omp_init_lock";
1728  kmp_user_lock_p lck;
1729  KMP_DEBUG_ASSERT( __kmp_init_serial );
1730 
1731  if ( __kmp_env_consistency_check ) {
1732  if ( user_lock == NULL ) {
1733  KMP_FATAL( LockIsUninitialized, func );
1734  }
1735  }
1736 
1737  KMP_CHECK_USER_LOCK_INIT();
1738 
1739  if ( ( __kmp_user_lock_kind == lk_tas )
1740  && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
1741  lck = (kmp_user_lock_p)user_lock;
1742  }
1743 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1744  else if ( ( __kmp_user_lock_kind == lk_futex )
1745  && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
1746  lck = (kmp_user_lock_p)user_lock;
1747  }
1748 #endif
1749  else {
1750  lck = __kmp_user_lock_allocate( user_lock, gtid, 0 );
1751  }
1752  INIT_LOCK( lck );
1753  __kmp_set_user_lock_location( lck, loc );
1754 
1755 #if USE_ITT_BUILD
1756  __kmp_itt_lock_creating( lck );
1757 #endif /* USE_ITT_BUILD */
1758 
1759 #endif // KMP_USE_DYNAMIC_LOCK
1760 } // __kmpc_init_lock
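/*
 * Illustrative sketch (added; not part of the original source): how the lock
 * entry points in this block fit together, using __kmpc_set_lock /
 * __kmpc_unset_lock / __kmpc_destroy_lock defined later in this file.
 * Whether user-level omp_init_lock()/omp_set_lock() calls reach the runtime
 * through these __kmpc_ wrappers or through other entry layers is compiler
 * dependent; this is a hand-written equivalent sequence, not a required API
 * contract.
 */
extern void do_locked_work( void );

void
__example_lock_usage( ident_t *loc, kmp_int32 gtid )
{
    omp_lock_t lk;                                 // storage the runtime uses directly or via a lock table
    __kmpc_init_lock( loc, gtid, (void **)&lk );
    __kmpc_set_lock( loc, gtid, (void **)&lk );
    do_locked_work();                              // exclusive section
    __kmpc_unset_lock( loc, gtid, (void **)&lk );
    __kmpc_destroy_lock( loc, gtid, (void **)&lk );
}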
1761 
1762 /* initialize the lock */
1763 void
1764 __kmpc_init_nest_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
1765 #if KMP_USE_DYNAMIC_LOCK
1766 
1767  KMP_DEBUG_ASSERT(__kmp_init_serial);
1768  if (__kmp_env_consistency_check && user_lock == NULL) {
1769  KMP_FATAL(LockIsUninitialized, "omp_init_nest_lock");
1770  }
1771  // Invoke init function after converting to nested version.
1772  kmp_dyna_lockseq_t nested_seq;
1773  switch (__kmp_user_lock_seq) {
1774  case lockseq_tas: nested_seq = lockseq_nested_tas; break;
1775 #if DYNA_HAS_FUTEX
1776  case lockseq_futex: nested_seq = lockseq_nested_futex; break;
1777 #endif
1778  case lockseq_ticket: nested_seq = lockseq_nested_ticket; break;
1779  case lockseq_queuing: nested_seq = lockseq_nested_queuing; break;
1780  case lockseq_drdpa: nested_seq = lockseq_nested_drdpa; break;
1781  default: nested_seq = lockseq_nested_queuing; break;
1782  // Use nested queuing lock for lock kinds without "nested" implementation.
1783  }
1784  DYNA_INIT_I_LOCK(user_lock, nested_seq);
1785  // All nested locks are indirect locks.
1786  kmp_indirect_lock_t *ilk = DYNA_LOOKUP_I_LOCK(user_lock);
1787  DYNA_SET_I_LOCK_LOCATION(ilk, loc);
1788 # if USE_ITT_BUILD
1789  __kmp_itt_lock_creating(ilk->lock, loc);
1790 # endif
1791 
1792 #else // KMP_USE_DYNAMIC_LOCK
1793 
1794  static char const * const func = "omp_init_nest_lock";
1795  kmp_user_lock_p lck;
1796  KMP_DEBUG_ASSERT( __kmp_init_serial );
1797 
1798  if ( __kmp_env_consistency_check ) {
1799  if ( user_lock == NULL ) {
1800  KMP_FATAL( LockIsUninitialized, func );
1801  }
1802  }
1803 
1804  KMP_CHECK_USER_LOCK_INIT();
1805 
1806  if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
1807  + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
1808  lck = (kmp_user_lock_p)user_lock;
1809  }
1810 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1811  else if ( ( __kmp_user_lock_kind == lk_futex )
1812  && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
1813  <= OMP_NEST_LOCK_T_SIZE ) ) {
1814  lck = (kmp_user_lock_p)user_lock;
1815  }
1816 #endif
1817  else {
1818  lck = __kmp_user_lock_allocate( user_lock, gtid, 0 );
1819  }
1820 
1821  INIT_NESTED_LOCK( lck );
1822  __kmp_set_user_lock_location( lck, loc );
1823 
1824 #if USE_ITT_BUILD
1825  __kmp_itt_lock_creating( lck );
1826 #endif /* USE_ITT_BUILD */
1827 
1828 #endif // KMP_USE_DYNAMIC_LOCK
1829 } // __kmpc_init_nest_lock
1830 
1831 void
1832 __kmpc_destroy_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
1833 #if KMP_USE_DYNAMIC_LOCK
1834 
1835 # if USE_ITT_BUILD
1836  kmp_user_lock_p lck;
1837  if (DYNA_EXTRACT_D_TAG(user_lock) == 0) {
1838  lck = ((kmp_indirect_lock_t *)DYNA_LOOKUP_I_LOCK(user_lock))->lock;
1839  } else {
1840  lck = (kmp_user_lock_p)user_lock;
1841  }
1842  __kmp_itt_lock_destroyed(lck);
1843 # endif
1844  DYNA_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
1845 #else
1846  kmp_user_lock_p lck;
1847 
1848  if ( ( __kmp_user_lock_kind == lk_tas )
1849  && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
1850  lck = (kmp_user_lock_p)user_lock;
1851  }
1852 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1853  else if ( ( __kmp_user_lock_kind == lk_futex )
1854  && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
1855  lck = (kmp_user_lock_p)user_lock;
1856  }
1857 #endif
1858  else {
1859  lck = __kmp_lookup_user_lock( user_lock, "omp_destroy_lock" );
1860  }
1861 
1862 #if USE_ITT_BUILD
1863  __kmp_itt_lock_destroyed( lck );
1864 #endif /* USE_ITT_BUILD */
1865  DESTROY_LOCK( lck );
1866 
1867  if ( ( __kmp_user_lock_kind == lk_tas )
1868  && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
1869  ;
1870  }
1871 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1872  else if ( ( __kmp_user_lock_kind == lk_futex )
1873  && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
1874  ;
1875  }
1876 #endif
1877  else {
1878  __kmp_user_lock_free( user_lock, gtid, lck );
1879  }
1880 #endif // KMP_USE_DYNAMIC_LOCK
1881 } // __kmpc_destroy_lock
1882 
1883 /* destroy the lock */
1884 void
1885 __kmpc_destroy_nest_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
1886 #if KMP_USE_DYNAMIC_LOCK
1887 
1888 # if USE_ITT_BUILD
1889  kmp_indirect_lock_t *ilk = DYNA_LOOKUP_I_LOCK(user_lock);
1890  __kmp_itt_lock_destroyed(ilk->lock);
1891 # endif
1892  DYNA_D_LOCK_FUNC(user_lock, destroy)((kmp_dyna_lock_t *)user_lock);
1893 
1894 #else // KMP_USE_DYNAMIC_LOCK
1895 
1896  kmp_user_lock_p lck;
1897 
1898  if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
1899  + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
1900  lck = (kmp_user_lock_p)user_lock;
1901  }
1902 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1903  else if ( ( __kmp_user_lock_kind == lk_futex )
1904  && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
1905  <= OMP_NEST_LOCK_T_SIZE ) ) {
1906  lck = (kmp_user_lock_p)user_lock;
1907  }
1908 #endif
1909  else {
1910  lck = __kmp_lookup_user_lock( user_lock, "omp_destroy_nest_lock" );
1911  }
1912 
1913 #if USE_ITT_BUILD
1914  __kmp_itt_lock_destroyed( lck );
1915 #endif /* USE_ITT_BUILD */
1916 
1917  DESTROY_NESTED_LOCK( lck );
1918 
1919  if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
1920  + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
1921  ;
1922  }
1923 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1924  else if ( ( __kmp_user_lock_kind == lk_futex )
1925  && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
1926  <= OMP_NEST_LOCK_T_SIZE ) ) {
1927  ;
1928  }
1929 #endif
1930  else {
1931  __kmp_user_lock_free( user_lock, gtid, lck );
1932  }
1933 #endif // KMP_USE_DYNAMIC_LOCK
1934 } // __kmpc_destroy_nest_lock
1935 
1936 void
1937 __kmpc_set_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
1938  KMP_COUNT_BLOCK(OMP_set_lock);
1939 #if KMP_USE_DYNAMIC_LOCK
1940  int tag = DYNA_EXTRACT_D_TAG(user_lock);
1941 # if USE_ITT_BUILD
1942  __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock); // itt function will get to the right lock object.
1943 # endif
1944 # if DYNA_USE_FAST_TAS
1945  if (tag == locktag_tas && !__kmp_env_consistency_check) {
1946  DYNA_ACQUIRE_TAS_LOCK(user_lock, gtid);
1947  } else
1948 # elif DYNA_USE_FAST_FUTEX
1949  if (tag == locktag_futex && !__kmp_env_consistency_check) {
1950  DYNA_ACQUIRE_FUTEX_LOCK(user_lock, gtid);
1951  } else
1952 # endif
1953  {
1954  __kmp_direct_set_ops[tag]((kmp_dyna_lock_t *)user_lock, gtid);
1955  }
1956 # if USE_ITT_BUILD
1957  __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
1958 # endif
1959 
1960 #else // KMP_USE_DYNAMIC_LOCK
1961 
1962  kmp_user_lock_p lck;
1963 
1964  if ( ( __kmp_user_lock_kind == lk_tas )
1965  && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
1966  lck = (kmp_user_lock_p)user_lock;
1967  }
1968 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1969  else if ( ( __kmp_user_lock_kind == lk_futex )
1970  && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
1971  lck = (kmp_user_lock_p)user_lock;
1972  }
1973 #endif
1974  else {
1975  lck = __kmp_lookup_user_lock( user_lock, "omp_set_lock" );
1976  }
1977 
1978 #if USE_ITT_BUILD
1979  __kmp_itt_lock_acquiring( lck );
1980 #endif /* USE_ITT_BUILD */
1981 
1982  ACQUIRE_LOCK( lck, gtid );
1983 
1984 #if USE_ITT_BUILD
1985  __kmp_itt_lock_acquired( lck );
1986 #endif /* USE_ITT_BUILD */
1987 
1988 #endif // KMP_USE_DYNAMIC_LOCK
1989 }
1990 
1991 void
1992 __kmpc_set_nest_lock( ident_t * loc, kmp_int32 gtid, void ** user_lock ) {
1993 #if KMP_USE_DYNAMIC_LOCK
1994 
1995 # if USE_ITT_BUILD
1996  __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
1997 # endif
1998  DYNA_D_LOCK_FUNC(user_lock, set)((kmp_dyna_lock_t *)user_lock, gtid);
1999 # if USE_ITT_BUILD
2000  __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2001 #endif
2002 
2003 #else // KMP_USE_DYNAMIC_LOCK
2004  kmp_user_lock_p lck;
2005 
2006  if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
2007  + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
2008  lck = (kmp_user_lock_p)user_lock;
2009  }
2010 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2011  else if ( ( __kmp_user_lock_kind == lk_futex )
2012  && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
2013  <= OMP_NEST_LOCK_T_SIZE ) ) {
2014  lck = (kmp_user_lock_p)user_lock;
2015  }
2016 #endif
2017  else {
2018  lck = __kmp_lookup_user_lock( user_lock, "omp_set_nest_lock" );
2019  }
2020 
2021 #if USE_ITT_BUILD
2022  __kmp_itt_lock_acquiring( lck );
2023 #endif /* USE_ITT_BUILD */
2024 
2025  ACQUIRE_NESTED_LOCK( lck, gtid );
2026 
2027 #if USE_ITT_BUILD
2028  __kmp_itt_lock_acquired( lck );
2029 #endif /* USE_ITT_BUILD */
2030 #endif // KMP_USE_DYNAMIC_LOCK
2031 }
2032 
2033 void
2034 __kmpc_unset_lock( ident_t *loc, kmp_int32 gtid, void **user_lock )
2035 {
2036 #if KMP_USE_DYNAMIC_LOCK
2037 
2038  int tag = DYNA_EXTRACT_D_TAG(user_lock);
2039 # if USE_ITT_BUILD
2040  __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2041 # endif
2042 # if DYNA_USE_FAST_TAS
2043  if (tag == locktag_tas && !__kmp_env_consistency_check) {
2044  DYNA_RELEASE_TAS_LOCK(user_lock, gtid);
2045  } else
2046 # elif DYNA_USE_FAST_FUTEX
2047  if (tag == locktag_futex && !__kmp_env_consistency_check) {
2048  DYNA_RELEASE_FUTEX_LOCK(user_lock, gtid);
2049  } else
2050 # endif
2051  {
2052  __kmp_direct_unset_ops[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2053  }
2054 
2055 #else // KMP_USE_DYNAMIC_LOCK
2056 
2057  kmp_user_lock_p lck;
2058 
2059  /* Can't use serial interval since not block structured */
2060  /* release the lock */
2061 
2062  if ( ( __kmp_user_lock_kind == lk_tas )
2063  && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
2064 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2065  // "fast" path implemented to fix customer performance issue
2066 #if USE_ITT_BUILD
2067  __kmp_itt_lock_releasing( (kmp_user_lock_p)user_lock );
2068 #endif /* USE_ITT_BUILD */
2069  TCW_4(((kmp_user_lock_p)user_lock)->tas.lk.poll, 0);
2070  KMP_MB();
2071  return;
2072 #else
2073  lck = (kmp_user_lock_p)user_lock;
2074 #endif
2075  }
2076 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2077  else if ( ( __kmp_user_lock_kind == lk_futex )
2078  && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
2079  lck = (kmp_user_lock_p)user_lock;
2080  }
2081 #endif
2082  else {
2083  lck = __kmp_lookup_user_lock( user_lock, "omp_unset_lock" );
2084  }
2085 
2086 #if USE_ITT_BUILD
2087  __kmp_itt_lock_releasing( lck );
2088 #endif /* USE_ITT_BUILD */
2089 
2090  RELEASE_LOCK( lck, gtid );
2091 
2092 #if OMPT_SUPPORT && OMPT_BLAME
2093  if ((ompt_status == ompt_status_track_callback) &&
2094  ompt_callbacks.ompt_callback(ompt_event_release_lock)) {
2095  ompt_callbacks.ompt_callback(ompt_event_release_lock)((uint64_t) lck);
2096  }
2097 #endif
2098 
2099 #endif // KMP_USE_DYNAMIC_LOCK
2100 }
2101 
2102 /* release the lock */
2103 void
2104 __kmpc_unset_nest_lock( ident_t *loc, kmp_int32 gtid, void **user_lock )
2105 {
2106 #if KMP_USE_DYNAMIC_LOCK
2107 
2108 # if USE_ITT_BUILD
2109  __kmp_itt_lock_releasing((kmp_user_lock_p)user_lock);
2110 # endif
2111  DYNA_D_LOCK_FUNC(user_lock, unset)((kmp_dyna_lock_t *)user_lock, gtid);
2112 
2113 #else // KMP_USE_DYNAMIC_LOCK
2114 
2115  kmp_user_lock_p lck;
2116 
2117  /* Can't use serial interval since not block structured */
2118 
2119  if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
2120  + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
2121 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2122  // "fast" path implemented to fix customer performance issue
2123  kmp_tas_lock_t *tl = (kmp_tas_lock_t*)user_lock;
2124 #if USE_ITT_BUILD
2125  __kmp_itt_lock_releasing( (kmp_user_lock_p)user_lock );
2126 #endif /* USE_ITT_BUILD */
2127  if ( --(tl->lk.depth_locked) == 0 ) {
2128  TCW_4(tl->lk.poll, 0);
2129  }
2130  KMP_MB();
2131  return;
2132 #else
2133  lck = (kmp_user_lock_p)user_lock;
2134 #endif
2135  }
2136 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2137  else if ( ( __kmp_user_lock_kind == lk_futex )
2138  && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
2139  <= OMP_NEST_LOCK_T_SIZE ) ) {
2140  lck = (kmp_user_lock_p)user_lock;
2141  }
2142 #endif
2143  else {
2144  lck = __kmp_lookup_user_lock( user_lock, "omp_unset_nest_lock" );
2145  }
2146 
2147 #if USE_ITT_BUILD
2148  __kmp_itt_lock_releasing( lck );
2149 #endif /* USE_ITT_BUILD */
2150 
2151  int release_status;
2152  release_status = RELEASE_NESTED_LOCK( lck, gtid );
2153 #if OMPT_SUPPORT && OMPT_BLAME
2154  if (ompt_status == ompt_status_track_callback) {
2155  if (release_status == KMP_LOCK_RELEASED) {
2156  if (ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_last)) {
2157  ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_last)(
2158  (uint64_t) lck);
2159  }
2160  } else if (ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_prev)) {
2161  ompt_callbacks.ompt_callback(ompt_event_release_nest_lock_prev)(
2162  (uint64_t) lck);
2163  }
2164  }
2165 #endif
2166 
2167 #endif // KMP_USE_DYNAMIC_LOCK
2168 }
2169 
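/*
 * Illustrative sketch (editor-added): nestable-lock semantics matching the
 * fast path above -- the owning thread may re-acquire the lock, depth_locked
 * counts the nesting level, and the poll word is only cleared when the depth
 * returns to zero.  'nl' is a hypothetical lock assumed to be initialized with
 * omp_init_nest_lock(); the mapping to the __kmpc_* entry points is
 * compiler-dependent.
 */
#include <omp.h>

static omp_nest_lock_t nl;

static void inner( void )
{
    omp_set_nest_lock( &nl );    /* same owner: depth 1 -> 2, no contention    */
    /* ... */
    omp_unset_nest_lock( &nl );  /* depth 2 -> 1, lock still held              */
}

void outer( void )
{
    omp_set_nest_lock( &nl );    /* depth 0 -> 1, lock acquired                */
    inner();
    omp_unset_nest_lock( &nl );  /* depth 1 -> 0, lock released                */
}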
2170 /* try to acquire the lock */
2171 int
2172 __kmpc_test_lock( ident_t *loc, kmp_int32 gtid, void **user_lock )
2173 {
2174  KMP_COUNT_BLOCK(OMP_test_lock);
2175  KMP_TIME_BLOCK(OMP_test_lock);
2176 
2177 #if KMP_USE_DYNAMIC_LOCK
2178  int rc;
2179  int tag = DYNA_EXTRACT_D_TAG(user_lock);
2180 # if USE_ITT_BUILD
2181  __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2182 # endif
2183 # if DYNA_USE_FAST_TAS
2184  if (tag == locktag_tas && !__kmp_env_consistency_check) {
2185  DYNA_TEST_TAS_LOCK(user_lock, gtid, rc);
2186  } else
2187 # elif DYNA_USE_FAST_FUTEX
2188  if (tag == locktag_futex && !__kmp_env_consistency_check) {
2189  DYNA_TEST_FUTEX_LOCK(user_lock, gtid, rc);
2190  } else
2191 # endif
2192  {
2193  rc = __kmp_direct_test_ops[tag]((kmp_dyna_lock_t *)user_lock, gtid);
2194  }
2195  if (rc) {
2196 # if USE_ITT_BUILD
2197  __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2198 # endif
2199  return FTN_TRUE;
2200  } else {
2201 # if USE_ITT_BUILD
2202  __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
2203 # endif
2204  return FTN_FALSE;
2205  }
2206 
2207 #else // KMP_USE_DYNAMIC_LOCK
2208 
2209  kmp_user_lock_p lck;
2210  int rc;
2211 
2212  if ( ( __kmp_user_lock_kind == lk_tas )
2213  && ( sizeof( lck->tas.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
2214  lck = (kmp_user_lock_p)user_lock;
2215  }
2216 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2217  else if ( ( __kmp_user_lock_kind == lk_futex )
2218  && ( sizeof( lck->futex.lk.poll ) <= OMP_LOCK_T_SIZE ) ) {
2219  lck = (kmp_user_lock_p)user_lock;
2220  }
2221 #endif
2222  else {
2223  lck = __kmp_lookup_user_lock( user_lock, "omp_test_lock" );
2224  }
2225 
2226 #if USE_ITT_BUILD
2227  __kmp_itt_lock_acquiring( lck );
2228 #endif /* USE_ITT_BUILD */
2229 
2230  rc = TEST_LOCK( lck, gtid );
2231 #if USE_ITT_BUILD
2232  if ( rc ) {
2233  __kmp_itt_lock_acquired( lck );
2234  } else {
2235  __kmp_itt_lock_cancelled( lck );
2236  }
2237 #endif /* USE_ITT_BUILD */
2238  return ( rc ? FTN_TRUE : FTN_FALSE );
2239 
2240  /* Can't use serial interval since not block structured */
2241 
2242 #endif // KMP_USE_DYNAMIC_LOCK
2243 }
2244 
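/*
 * Illustrative sketch (editor-added): non-blocking acquisition at the user
 * level.  omp_test_lock() returns nonzero when the lock was taken (FTN_TRUE
 * above) and zero when it was busy (FTN_FALSE).  'l' is a hypothetical lock
 * assumed to be initialized elsewhere.
 */
#include <omp.h>

static omp_lock_t l;

void try_work( void )
{
    if ( omp_test_lock( &l ) ) {   /* acquired: nonzero returned              */
        /* ... do the protected work ... */
        omp_unset_lock( &l );
    } else {
        /* lock busy: zero returned, fall back to other work                  */
    }
}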
2245 /* try to acquire the lock */
2246 int
2247 __kmpc_test_nest_lock( ident_t *loc, kmp_int32 gtid, void **user_lock )
2248 {
2249 #if KMP_USE_DYNAMIC_LOCK
2250  int rc;
2251 # if USE_ITT_BUILD
2252  __kmp_itt_lock_acquiring((kmp_user_lock_p)user_lock);
2253 # endif
2254  rc = DYNA_D_LOCK_FUNC(user_lock, test)((kmp_dyna_lock_t *)user_lock, gtid);
2255 # if USE_ITT_BUILD
2256  if (rc) {
2257  __kmp_itt_lock_acquired((kmp_user_lock_p)user_lock);
2258  } else {
2259  __kmp_itt_lock_cancelled((kmp_user_lock_p)user_lock);
2260  }
2261 # endif
2262  return rc;
2263 
2264 #else // KMP_USE_DYNAMIC_LOCK
2265 
2266  kmp_user_lock_p lck;
2267  int rc;
2268 
2269  if ( ( __kmp_user_lock_kind == lk_tas ) && ( sizeof( lck->tas.lk.poll )
2270  + sizeof( lck->tas.lk.depth_locked ) <= OMP_NEST_LOCK_T_SIZE ) ) {
2271  lck = (kmp_user_lock_p)user_lock;
2272  }
2273 #if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
2274  else if ( ( __kmp_user_lock_kind == lk_futex )
2275  && ( sizeof( lck->futex.lk.poll ) + sizeof( lck->futex.lk.depth_locked )
2276  <= OMP_NEST_LOCK_T_SIZE ) ) {
2277  lck = (kmp_user_lock_p)user_lock;
2278  }
2279 #endif
2280  else {
2281  lck = __kmp_lookup_user_lock( user_lock, "omp_test_nest_lock" );
2282  }
2283 
2284 #if USE_ITT_BUILD
2285  __kmp_itt_lock_acquiring( lck );
2286 #endif /* USE_ITT_BUILD */
2287 
2288  rc = TEST_NESTED_LOCK( lck, gtid );
2289 #if USE_ITT_BUILD
2290  if ( rc ) {
2291  __kmp_itt_lock_acquired( lck );
2292  } else {
2293  __kmp_itt_lock_cancelled( lck );
2294  }
2295 #endif /* USE_ITT_BUILD */
2296  return rc;
2297 
2298  /* Can't use serial interval since not block structured */
2299 
2300 #endif // KMP_USE_DYNAMIC_LOCK
2301 }
2302 
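/*
 * Illustrative sketch (editor-added): for nestable locks the test routine
 * returns the new nesting count on success and 0 on failure, which matches the
 * rc value returned unchanged above.  'nl2' is a hypothetical lock assumed to
 * be initialized elsewhere.
 */
#include <omp.h>

static omp_nest_lock_t nl2;

int try_nested_work( void )
{
    int depth = omp_test_nest_lock( &nl2 );  /* 0 if busy, nesting count if acquired */
    if ( depth ) {
        /* ... protected work ... */
        omp_unset_nest_lock( &nl2 );
    }
    return depth;
}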
2303 
2304 /*--------------------------------------------------------------------------------------------------------------------*/
2305 
2306 /*
2307  * Interface to fast scalable reduce methods routines
2308  */
2309 
2310 // keep the selected method in a thread-local structure for cross-function usage: it will be used in the __kmpc_end_reduce* functions;
2311 // an alternative would be to re-determine the method once more in the __kmpc_end_reduce* functions (a new prototype would be required then)
2312 // AT: which solution is better?
2313 #define __KMP_SET_REDUCTION_METHOD(gtid,rmethod) \
2314  ( ( __kmp_threads[ ( gtid ) ] -> th.th_local.packed_reduction_method ) = ( rmethod ) )
2315 
2316 #define __KMP_GET_REDUCTION_METHOD(gtid) \
2317  ( __kmp_threads[ ( gtid ) ] -> th.th_local.packed_reduction_method )
2318 
2319 // description of the packed_reduction_method variable: look at the macros in kmp.h
2320 
2321 
2322 // used in a critical section reduce block
2323 static __forceinline void
2324 __kmp_enter_critical_section_reduce_block( ident_t * loc, kmp_int32 global_tid, kmp_critical_name * crit ) {
2325 
2326  // this lock was visible to a customer and to the thread profiler as a serial overhead span
2327  // (although it's used for an internal purpose only)
2328  // why was it visible in the previous implementation?
2329  // should we keep it visible in the new reduce block?
2330  kmp_user_lock_p lck;
2331 
2332 #if KMP_USE_DYNAMIC_LOCK
2333 
2334  if (DYNA_IS_D_LOCK(__kmp_user_lock_seq)) {
2335  lck = (kmp_user_lock_p)crit;
2336  if (*((kmp_dyna_lock_t *)lck) == 0) {
2337  KMP_COMPARE_AND_STORE_ACQ32((volatile kmp_int32 *)lck, 0, DYNA_GET_D_TAG(__kmp_user_lock_seq));
2338  }
2339  KMP_DEBUG_ASSERT(lck != NULL);
2340  if (__kmp_env_consistency_check) {
2341  __kmp_push_sync(global_tid, ct_critical, loc, lck, __kmp_user_lock_seq);
2342  }
2343  DYNA_D_LOCK_FUNC(lck, set)((kmp_dyna_lock_t *)lck, global_tid);
2344  } else {
2345  kmp_indirect_lock_t *ilk = __kmp_get_indirect_csptr(crit, loc, global_tid, __kmp_user_lock_seq);
2346  KMP_DEBUG_ASSERT(ilk != NULL);
2347  if (__kmp_env_consistency_check) {
2348  __kmp_push_sync(global_tid, ct_critical, loc, ilk->lock, __kmp_user_lock_seq);
2349  }
2350  DYNA_I_LOCK_FUNC(ilk, set)(ilk->lock, global_tid);
2351  }
2352 
2353 #else // KMP_USE_DYNAMIC_LOCK
2354 
2355  // We know that the fast reduction code is only emitted by Intel compilers
2356  // with 32 byte critical sections. If there isn't enough space, then we
2357  // have to use a pointer.
2358  if ( __kmp_base_user_lock_size <= INTEL_CRITICAL_SIZE ) {
2359  lck = (kmp_user_lock_p)crit;
2360  }
2361  else {
2362  lck = __kmp_get_critical_section_ptr( crit, loc, global_tid );
2363  }
2364  KMP_DEBUG_ASSERT( lck != NULL );
2365 
2366  if ( __kmp_env_consistency_check )
2367  __kmp_push_sync( global_tid, ct_critical, loc, lck );
2368 
2369  __kmp_acquire_user_lock_with_checks( lck, global_tid );
2370 
2371 #endif // KMP_USE_DYNAMIC_LOCK
2372 }
2373 
2374 // used in a critical section reduce block
2375 static __forceinline void
2376 __kmp_end_critical_section_reduce_block( ident_t * loc, kmp_int32 global_tid, kmp_critical_name * crit ) {
2377 
2378  kmp_user_lock_p lck;
2379 
2380 #if KMP_USE_DYNAMIC_LOCK
2381 
2382  if (DYNA_IS_D_LOCK(__kmp_user_lock_seq)) {
2383  lck = (kmp_user_lock_p)crit;
2384  if (__kmp_env_consistency_check)
2385  __kmp_pop_sync(global_tid, ct_critical, loc);
2386  DYNA_D_LOCK_FUNC(lck, unset)((kmp_dyna_lock_t *)lck, global_tid);
2387  } else {
2388  kmp_indirect_lock_t *ilk = (kmp_indirect_lock_t *)TCR_PTR(*((kmp_indirect_lock_t **)crit));
2389  if (__kmp_env_consistency_check)
2390  __kmp_pop_sync(global_tid, ct_critical, loc);
2391  DYNA_I_LOCK_FUNC(ilk, unset)(ilk->lock, global_tid);
2392  }
2393 
2394 #else // KMP_USE_DYNAMIC_LOCK
2395 
2396  // We know that the fast reduction code is only emitted by Intel compilers with 32 byte critical
2397  // sections. If there isn't enough space, then we have to use a pointer.
2398  if ( __kmp_base_user_lock_size > 32 ) {
2399  lck = *( (kmp_user_lock_p *) crit );
2400  KMP_ASSERT( lck != NULL );
2401  } else {
2402  lck = (kmp_user_lock_p) crit;
2403  }
2404 
2405  if ( __kmp_env_consistency_check )
2406  __kmp_pop_sync( global_tid, ct_critical, loc );
2407 
2408  __kmp_release_user_lock_with_checks( lck, global_tid );
2409 
2410 #endif // KMP_USE_DYNAMIC_LOCK
2411 } // __kmp_end_critical_section_reduce_block
2412 
2413 
2414 /* 2.a.i. Reduce Block without a terminating barrier */
2428 kmp_int32
2429 __kmpc_reduce_nowait(
2430  ident_t *loc, kmp_int32 global_tid,
2431  kmp_int32 num_vars, size_t reduce_size, void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
2432  kmp_critical_name *lck ) {
2433 
2434  KMP_COUNT_BLOCK(REDUCE_nowait);
2435  int retval = 0;
2436  PACKED_REDUCTION_METHOD_T packed_reduction_method;
2437 #if OMP_40_ENABLED
2438  kmp_team_t *team;
2439  kmp_info_t *th;
2440  int teams_swapped = 0, task_state;
2441 #endif
2442  KA_TRACE( 10, ( "__kmpc_reduce_nowait() enter: called T#%d\n", global_tid ) );
2443 
2444  // why do we need this initialization here at all?
2445  // The reduction clause cannot be used as a stand-alone directive.
2446 
2447  // do not call __kmp_serial_initialize(), it will be called by __kmp_parallel_initialize() if needed
2448  // possible detection of false-positive race by the threadchecker ???
2449  if( ! TCR_4( __kmp_init_parallel ) )
2450  __kmp_parallel_initialize();
2451 
2452  // check correctness of reduce block nesting
2453 #if KMP_USE_DYNAMIC_LOCK
2454  if ( __kmp_env_consistency_check )
2455  __kmp_push_sync( global_tid, ct_reduce, loc, NULL, 0 );
2456 #else
2457  if ( __kmp_env_consistency_check )
2458  __kmp_push_sync( global_tid, ct_reduce, loc, NULL );
2459 #endif
2460 
2461 #if OMP_40_ENABLED
2462  th = __kmp_thread_from_gtid(global_tid);
2463  if( th->th.th_teams_microtask ) { // AC: check if we are inside the teams construct?
2464  team = th->th.th_team;
2465  if( team->t.t_level == th->th.th_teams_level ) {
2466  // this is reduction at teams construct
2467  KMP_DEBUG_ASSERT(!th->th.th_info.ds.ds_tid); // AC: check that tid == 0
2468  // Let's swap teams temporarily for the reduction barrier
2469  teams_swapped = 1;
2470  th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2471  th->th.th_team = team->t.t_parent;
2472  th->th.th_team_nproc = th->th.th_team->t.t_nproc;
2473  th->th.th_task_team = th->th.th_team->t.t_task_team[0];
2474  task_state = th->th.th_task_state;
2475  th->th.th_task_state = 0;
2476  }
2477  }
2478 #endif // OMP_40_ENABLED
2479 
2480  // packed_reduction_method value will be reused by __kmp_end_reduce* function, the value should be kept in a variable
2481  // the variable should be either a construct-specific or thread-specific property, not a team specific property
2482  // (a thread can reach the next reduce block on the next construct, reduce method may differ on the next construct)
2483  // an ident_t "loc" parameter could be used as a construct-specific property (what if loc == 0?)
2484  // (if both construct-specific and team-specific variables were shared, then unnecessary extra syncs would be needed)
2485  // a thread-specific variable is better regarding two issues above (next construct and extra syncs)
2486  // a thread-specific "th_local.reduction_method" variable is used currently
2487  // each thread executes the 'determine' and 'set' lines (no need to restrict this to one thread, which avoids unnecessary extra syncs)
2488 
2489  packed_reduction_method = __kmp_determine_reduction_method( loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck );
2490  __KMP_SET_REDUCTION_METHOD( global_tid, packed_reduction_method );
2491 
2492  if( packed_reduction_method == critical_reduce_block ) {
2493 
2494  __kmp_enter_critical_section_reduce_block( loc, global_tid, lck );
2495  retval = 1;
2496 
2497  } else if( packed_reduction_method == empty_reduce_block ) {
2498 
2499  // usage: if team size == 1, no synchronization is required ( Intel platforms only )
2500  retval = 1;
2501 
2502  } else if( packed_reduction_method == atomic_reduce_block ) {
2503 
2504  retval = 2;
2505 
2506  // all threads should do this pop here (because __kmpc_end_reduce_nowait() won't be called by the code gen)
2507  // (this is not ideal, because the checking block has been closed by this 'pop',
2508  // but the atomic operation has not been executed yet; it will execute slightly later, literally on the next instruction)
2509  if ( __kmp_env_consistency_check )
2510  __kmp_pop_sync( global_tid, ct_reduce, loc );
2511 
2512  } else if( TEST_REDUCTION_METHOD( packed_reduction_method, tree_reduce_block ) ) {
2513 
2514  //AT: performance issue: a real barrier here
2515  //AT: (if master goes slow, other threads are blocked here waiting for the master to come and release them)
2516  //AT: (this is not what a customer might expect when specifying the NOWAIT clause)
2517  //AT: (specifying NOWAIT won't improve performance here, which may be confusing to a customer)
2518  //AT: another implementation of *barrier_gather*nowait() (or some other design) might be faster
2519  // and more in line with the intent of NOWAIT
2520  //AT: TO DO: run the EPCC benchmark and compare times
2521 
2522  // this barrier should be invisible to a customer and to the thread profiler
2523  // (it's neither a terminating barrier nor part of the customer's code; it's used for an internal purpose)
2524 #if USE_ITT_NOTIFY
2525  __kmp_threads[global_tid]->th.th_ident = loc;
2526 #endif
2527  retval = __kmp_barrier( UNPACK_REDUCTION_BARRIER( packed_reduction_method ), global_tid, FALSE, reduce_size, reduce_data, reduce_func );
2528  retval = ( retval != 0 ) ? ( 0 ) : ( 1 );
2529 
2530  // all workers except the master should do this pop here
2531  // ( none of the other workers will get to __kmpc_end_reduce_nowait() )
2532  if ( __kmp_env_consistency_check ) {
2533  if( retval == 0 ) {
2534  __kmp_pop_sync( global_tid, ct_reduce, loc );
2535  }
2536  }
2537 
2538  } else {
2539 
2540  // should never reach this block
2541  KMP_ASSERT( 0 ); // "unexpected method"
2542 
2543  }
2544 #if OMP_40_ENABLED
2545  if( teams_swapped ) {
2546  // Restore thread structure
2547  th->th.th_info.ds.ds_tid = 0;
2548  th->th.th_team = team;
2549  th->th.th_team_nproc = team->t.t_nproc;
2550  th->th.th_task_team = team->t.t_task_team[task_state];
2551  th->th.th_task_state = task_state;
2552  }
2553 #endif
2554  KA_TRACE( 10, ( "__kmpc_reduce_nowait() exit: called T#%d: method %08x, returns %08x\n", global_tid, packed_reduction_method, retval ) );
2555 
2556  return retval;
2557 }
2558 
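/*
 * Illustrative sketch (editor-added): a user-level reduction and the call
 * protocol the compiler-generated code is expected to follow around
 * __kmpc_reduce_nowait(), based on the return values produced above
 * (1: this thread must combine the partial results and call
 * __kmpc_end_reduce_nowait(); 2: this thread must combine atomically;
 * 0: nothing left to do).  The exact lowering is compiler-specific.
 */
#include <omp.h>

double sum_array( const double *a, int n )
{
    double sum = 0.0;
    #pragma omp parallel
    {
        /* For this worksharing loop the compiler emits a call to
           __kmpc_reduce_nowait() after the loop body, switches on the returned
           method to fold the thread-private partial sums into 'sum', and --
           because of 'nowait' -- emits no trailing barrier.                   */
        #pragma omp for reduction(+:sum) nowait
        for ( int i = 0; i < n; ++i ) {
            sum += a[i];
        }
    }
    return sum;
}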
2567 void
2568 __kmpc_end_reduce_nowait( ident_t *loc, kmp_int32 global_tid, kmp_critical_name *lck ) {
2569 
2570  PACKED_REDUCTION_METHOD_T packed_reduction_method;
2571 
2572  KA_TRACE( 10, ( "__kmpc_end_reduce_nowait() enter: called T#%d\n", global_tid ) );
2573 
2574  packed_reduction_method = __KMP_GET_REDUCTION_METHOD( global_tid );
2575 
2576  if( packed_reduction_method == critical_reduce_block ) {
2577 
2578  __kmp_end_critical_section_reduce_block( loc, global_tid, lck );
2579 
2580  } else if( packed_reduction_method == empty_reduce_block ) {
2581 
2582  // usage: if team size == 1, no synchronization is required ( on Intel platforms only )
2583 
2584  } else if( packed_reduction_method == atomic_reduce_block ) {
2585 
2586  // neither master nor other workers should get here
2587  // (code gen does not generate this call in case 2: atomic reduce block)
2588  // actually it would be better to remove this 'else if' entirely;
2589  // after removal this value would be checked by the 'else' branch and would assert
2590 
2591  } else if( TEST_REDUCTION_METHOD( packed_reduction_method, tree_reduce_block ) ) {
2592 
2593  // only master gets here
2594 
2595  } else {
2596 
2597  // should never reach this block
2598  KMP_ASSERT( 0 ); // "unexpected method"
2599 
2600  }
2601 
2602  if ( __kmp_env_consistency_check )
2603  __kmp_pop_sync( global_tid, ct_reduce, loc );
2604 
2605  KA_TRACE( 10, ( "__kmpc_end_reduce_nowait() exit: called T#%d: method %08x\n", global_tid, packed_reduction_method ) );
2606 
2607  return;
2608 }
2609 
2610 /* 2.a.ii. Reduce Block with a terminating barrier */
2611 
2625 kmp_int32
2626 __kmpc_reduce(
2627  ident_t *loc, kmp_int32 global_tid,
2628  kmp_int32 num_vars, size_t reduce_size, void *reduce_data,
2629  void (*reduce_func)(void *lhs_data, void *rhs_data),
2630  kmp_critical_name *lck )
2631 {
2632  KMP_COUNT_BLOCK(REDUCE_wait);
2633  int retval = 0;
2634  PACKED_REDUCTION_METHOD_T packed_reduction_method;
2635 
2636  KA_TRACE( 10, ( "__kmpc_reduce() enter: called T#%d\n", global_tid ) );
2637 
2638  // why do we need this initialization here at all?
2639  // The reduction clause cannot be used as a stand-alone directive.
2640 
2641  // do not call __kmp_serial_initialize(), it will be called by __kmp_parallel_initialize() if needed
2642  // possible detection of false-positive race by the threadchecker ???
2643  if( ! TCR_4( __kmp_init_parallel ) )
2644  __kmp_parallel_initialize();
2645 
2646  // check correctness of reduce block nesting
2647 #if KMP_USE_DYNAMIC_LOCK
2648  if ( __kmp_env_consistency_check )
2649  __kmp_push_sync( global_tid, ct_reduce, loc, NULL, 0 );
2650 #else
2651  if ( __kmp_env_consistency_check )
2652  __kmp_push_sync( global_tid, ct_reduce, loc, NULL );
2653 #endif
2654 
2655  packed_reduction_method = __kmp_determine_reduction_method( loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck );
2656  __KMP_SET_REDUCTION_METHOD( global_tid, packed_reduction_method );
2657 
2658  if( packed_reduction_method == critical_reduce_block ) {
2659 
2660  __kmp_enter_critical_section_reduce_block( loc, global_tid, lck );
2661  retval = 1;
2662 
2663  } else if( packed_reduction_method == empty_reduce_block ) {
2664 
2665  // usage: if team size == 1, no synchronization is required ( Intel platforms only )
2666  retval = 1;
2667 
2668  } else if( packed_reduction_method == atomic_reduce_block ) {
2669 
2670  retval = 2;
2671 
2672  } else if( TEST_REDUCTION_METHOD( packed_reduction_method, tree_reduce_block ) ) {
2673 
2674  //case tree_reduce_block:
2675  // this barrier should be visible to a customer and to the thread profiler
2676  // (it's a terminating barrier on constructs if NOWAIT not specified)
2677 #if USE_ITT_NOTIFY
2678  __kmp_threads[global_tid]->th.th_ident = loc; // needed for correct notification of frames
2679 #endif
2680  retval = __kmp_barrier( UNPACK_REDUCTION_BARRIER( packed_reduction_method ), global_tid, TRUE, reduce_size, reduce_data, reduce_func );
2681  retval = ( retval != 0 ) ? ( 0 ) : ( 1 );
2682 
2683  // all workers except the master should do this pop here
2684  // ( none of the workers except the master will enter __kmpc_end_reduce() )
2685  if ( __kmp_env_consistency_check ) {
2686  if( retval == 0 ) { // 0: all other workers; 1: master
2687  __kmp_pop_sync( global_tid, ct_reduce, loc );
2688  }
2689  }
2690 
2691  } else {
2692 
2693  // should never reach this block
2694  KMP_ASSERT( 0 ); // "unexpected method"
2695 
2696  }
2697 
2698  KA_TRACE( 10, ( "__kmpc_reduce() exit: called T#%d: method %08x, returns %08x\n", global_tid, packed_reduction_method, retval ) );
2699 
2700  return retval;
2701 }
2702 
2712 void
2713 __kmpc_end_reduce( ident_t *loc, kmp_int32 global_tid, kmp_critical_name *lck ) {
2714 
2715  PACKED_REDUCTION_METHOD_T packed_reduction_method;
2716 
2717  KA_TRACE( 10, ( "__kmpc_end_reduce() enter: called T#%d\n", global_tid ) );
2718 
2719  packed_reduction_method = __KMP_GET_REDUCTION_METHOD( global_tid );
2720 
2721  // this barrier should be visible to a customer and to the thread profiler
2722  // (it's a terminating barrier on constructs if NOWAIT not specified)
2723 
2724  if( packed_reduction_method == critical_reduce_block ) {
2725 
2726  __kmp_end_critical_section_reduce_block( loc, global_tid, lck );
2727 
2728  // TODO: implicit barrier: should be exposed
2729 #if USE_ITT_NOTIFY
2730  __kmp_threads[global_tid]->th.th_ident = loc;
2731 #endif
2732  __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL );
2733 
2734  } else if( packed_reduction_method == empty_reduce_block ) {
2735 
2736  // usage: if team size == 1, no synchronization is required ( Intel platforms only )
2737 
2738  // TODO: implicit barrier: should be exposed
2739 #if USE_ITT_NOTIFY
2740  __kmp_threads[global_tid]->th.th_ident = loc;
2741 #endif
2742  __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL );
2743 
2744  } else if( packed_reduction_method == atomic_reduce_block ) {
2745 
2746  // TODO: implicit barrier: should be exposed
2747 #if USE_ITT_NOTIFY
2748  __kmp_threads[global_tid]->th.th_ident = loc;
2749 #endif
2750  __kmp_barrier( bs_plain_barrier, global_tid, FALSE, 0, NULL, NULL );
2751 
2752  } else if( TEST_REDUCTION_METHOD( packed_reduction_method, tree_reduce_block ) ) {
2753 
2754  // only master executes here (master releases all other workers)
2755  __kmp_end_split_barrier( UNPACK_REDUCTION_BARRIER( packed_reduction_method ), global_tid );
2756 
2757  } else {
2758 
2759  // should never reach this block
2760  KMP_ASSERT( 0 ); // "unexpected method"
2761 
2762  }
2763 
2764  if ( __kmp_env_consistency_check )
2765  __kmp_pop_sync( global_tid, ct_reduce, loc );
2766 
2767  KA_TRACE( 10, ( "__kmpc_end_reduce() exit: called T#%d: method %08x\n", global_tid, packed_reduction_method ) );
2768 
2769  return;
2770 }
2771 
2772 #undef __KMP_GET_REDUCTION_METHOD
2773 #undef __KMP_SET_REDUCTION_METHOD
2774 
2775 /*-- end of interface to fast scalable reduce routines ---------------------------------------------------------------*/
2776 
2777 kmp_uint64
2778 __kmpc_get_taskid() {
2779 
2780  kmp_int32 gtid;
2781  kmp_info_t * thread;
2782 
2783  gtid = __kmp_get_gtid();
2784  if ( gtid < 0 ) {
2785  return 0;
2786  }; // if
2787  thread = __kmp_thread_from_gtid( gtid );
2788  return thread->th.th_current_task->td_task_id;
2789 
2790 } // __kmpc_get_taskid
2791 
2792 
2793 kmp_uint64
2794 __kmpc_get_parent_taskid() {
2795 
2796  kmp_int32 gtid;
2797  kmp_info_t * thread;
2798  kmp_taskdata_t * parent_task;
2799 
2800  gtid = __kmp_get_gtid();
2801  if ( gtid < 0 ) {
2802  return 0;
2803  }; // if
2804  thread = __kmp_thread_from_gtid( gtid );
2805  parent_task = thread->th.th_current_task->td_parent;
2806  return ( parent_task == NULL ? 0 : parent_task->td_task_id );
2807 
2808 } // __kmpc_get_parent_taskid
2809 
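/*
 * Illustrative sketch (editor-added): the two queries above are plain reads of
 * the current task descriptor and can be called from instrumented code for
 * tracing.  The extern declarations below are assumptions that mirror the
 * definitions above (kmp_uint64 is shown here as unsigned long long).
 */
#include <stdio.h>

extern unsigned long long __kmpc_get_taskid( void );
extern unsigned long long __kmpc_get_parent_taskid( void );

void trace_current_task( void )
{
    printf( "task id %llu, parent task id %llu\n",
            __kmpc_get_taskid(), __kmpc_get_parent_taskid() );
}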
2810 void __kmpc_place_threads(int nC, int nT, int nO)
2811 {
2812  if ( ! __kmp_init_serial ) {
2813  __kmp_serial_initialize();
2814  }
2815  __kmp_place_num_cores = nC;
2816  __kmp_place_num_threads_per_core = nT;
2817  __kmp_place_core_offset = nO;
2818 }
2819 
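/*
 * Illustrative sketch (editor-added): __kmpc_place_threads() ensures the serial
 * part of the library is initialized and then records the requested machine
 * subset (number of cores, threads per core, core offset).  In practice the
 * values normally come from parsing the KMP_PLACE_THREADS environment variable
 * elsewhere in the library; the direct call below is hypothetical and only
 * shows what the three parameters mean.
 */
extern void __kmpc_place_threads( int nC, int nT, int nO );

void request_topology_subset( void )
{
    /* use 4 cores, 2 threads per core, starting at core offset 1 */
    __kmpc_place_threads( 4, 2, 1 );
}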
2820 // end of file //
2821 