Halide 13.0.2
Func.h
1 #ifndef HALIDE_FUNC_H
2 #define HALIDE_FUNC_H
3 
4 /** \file
5  *
6  * Defines Func - the front-end handle on a halide function, and related classes.
7  */
8 
9 #include "Argument.h"
10 #include "Expr.h"
11 #include "JITModule.h"
12 #include "Module.h"
13 #include "Param.h"
14 #include "Pipeline.h"
15 #include "RDom.h"
16 #include "Target.h"
17 #include "Tuple.h"
18 #include "Var.h"
19 
20 #include <map>
21 #include <utility>
22 
23 namespace Halide {
24 
25 class OutputImageParam;
26 class ParamMap;
27 
28 /** A class that can represent Vars or RVars. Used for reorder calls
29  * which can accept a mix of either. */
30 struct VarOrRVar {
31  VarOrRVar(const std::string &n, bool r)
32  : var(n), rvar(n), is_rvar(r) {
33  }
34  VarOrRVar(const Var &v)
35  : var(v), is_rvar(false) {
36  }
37  VarOrRVar(const RVar &r)
38  : rvar(r), is_rvar(true) {
39  }
40  VarOrRVar(const RDom &r)
41  : rvar(RVar(r)), is_rvar(true) {
42  }
43  template<int N>
44  VarOrRVar(const ImplicitVar<N> &u)
45  : var(u), is_rvar(false) {
46  }
47 
48  const std::string &name() const {
49  if (is_rvar) {
50  return rvar.name();
51  } else {
52  return var.name();
53  }
54  }
55
56  Var var;
57  RVar rvar;
58  bool is_rvar;
59 };
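/* Illustrative sketch (not part of the original header): VarOrRVar converts
 * implicitly from Var, RVar, and RDom, so scheduling calls such as reorder()
 * can mix pure and reduction variables in one argument list:
\code
Func f;
Var x, y;
RDom r(0, 100);
f(x, y) = 0;
f(x, y) += r;
f.update().reorder(x, r.x, y);  // Var and RVar arguments pass through VarOrRVar
\endcode
*/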
60 
61 class ImageParam;
62 
63 namespace Internal {
64 class Function;
65 struct Split;
66 struct StorageDim;
67 } // namespace Internal
68 
69 /** A single definition of a Func. May be a pure or update definition. */
70 class Stage {
71  /** Reference to the Function this stage (or definition) belongs to. */
72  Internal::Function function;
73  Internal::Definition definition;
74  /** Indicate which stage the definition belongs to (0 for initial
75  * definition, 1 for first update, etc.). */
76  size_t stage_index;
77  /** Pure Vars of the Function (from the init definition). */
78  std::vector<Var> dim_vars;
79 
80  void set_dim_type(const VarOrRVar &var, Internal::ForType t);
81  void set_dim_device_api(const VarOrRVar &var, DeviceAPI device_api);
82  void split(const std::string &old, const std::string &outer, const std::string &inner,
83  const Expr &factor, bool exact, TailStrategy tail);
84  void remove(const std::string &var);
85  Stage &purify(const VarOrRVar &old_name, const VarOrRVar &new_name);
86 
87  const std::vector<Internal::StorageDim> &storage_dims() const {
88  return function.schedule().storage_dims();
89  }
90 
91  Stage &compute_with(LoopLevel loop_level, const std::map<std::string, LoopAlignStrategy> &align);
92 
93 public:
94  Stage(Internal::Function f, Internal::Definition d, size_t stage_index)
95  : function(std::move(f)), definition(std::move(d)), stage_index(stage_index) {
96  internal_assert(definition.defined());
97  definition.schedule().touched() = true;
98 
99  dim_vars.reserve(function.args().size());
100  for (const auto &arg : function.args()) {
101  dim_vars.emplace_back(arg);
102  }
103  internal_assert(definition.args().size() == dim_vars.size());
104  }
105 
106  /** Return the current StageSchedule associated with this Stage. For
107  * introspection only: to modify schedule, use the Func interface. */
108  const Internal::StageSchedule &get_schedule() const {
109  return definition.schedule();
110  }
111 
112  /** Return a string describing the current var list taking into
113  * account all the splits, reorders, and tiles. */
114  std::string dump_argument_list() const;
115 
116  /** Return the name of this stage, e.g. "f.update(2)" */
117  std::string name() const;
118 
119  /** Calling rfactor() on an associative update definition of a Func will split
120  * the update into an intermediate which computes the partial results and
121  * replaces the current update definition with a new definition which merges
122  * the partial results. If called on an init/pure definition, this will
123  * throw an error. rfactor() will automatically infer the associative reduction
124  * operator and identity of the operator. If it can't prove the operation
125  * is associative or if it cannot find an identity for that operator, this
126  * will throw an error. In addition, commutativity of the operator is required
127  * if rfactor() is called on an inner dimension but not on the outer
128  * dimensions.
129  *
130  * rfactor() takes as input 'preserved', which is a list of <RVar, Var> pairs.
131  * The rvars not listed in 'preserved' are removed from the original Func and
132  * are lifted to the intermediate Func. The remaining rvars (the ones in
133  * 'preserved') are made pure in the intermediate Func. The intermediate Func's
134  * update definition inherits all scheduling directives (e.g. split, fuse, etc.)
135  * applied to the original Func's update definition. The loop order of the
136  * intermediate Func's update definition is the same as the original, although
137  * the RVars in 'preserved' are replaced by the new pure Vars. The loop order of the
138  * intermediate Func's init definition from innermost to outermost is the args'
139  * order of the original Func's init definition followed by the new pure Vars.
140  *
141  * The intermediate Func also inherits storage order from the original Func
142  * with the new pure Vars added to the outermost.
143  *
144  * For example, f.update(0).rfactor({{r.y, u}}) would rewrite a pipeline like this:
145  \code
146  f(x, y) = 0;
147  f(x, y) += g(r.x, r.y);
148  \endcode
149  * into a pipeline like this:
150  \code
151  f_intm(x, y, u) = 0;
152  f_intm(x, y, u) += g(r.x, u);
153 
154  f(x, y) = 0;
155  f(x, y) += f_intm(x, y, r.y);
156  \endcode
157  *
158  * This has a variety of uses. You can use it to split computation of an associative reduction:
159  \code
160  f(x, y) = 10;
161  RDom r(0, 96);
162  f(x, y) = max(f(x, y), g(x, y, r.x));
163  f.update(0).split(r.x, rxo, rxi, 8).reorder(y, x).parallel(x);
164  f.update(0).rfactor({{rxo, u}}).compute_root().parallel(u).update(0).parallel(u);
165  \endcode
166  *
167  *, which is equivalent to:
168  \code
169  parallel for u = 0 to 11:
170  for y:
171  for x:
172  f_intm(x, y, u) = -inf
173  parallel for x:
174  for y:
175  parallel for u = 0 to 11:
176  for rxi = 0 to 7:
177  f_intm(x, y, u) = max(f_intm(x, y, u), g(x, y, 8*u + rxi))
178  for y:
179  for x:
180  f(x, y) = 10
181  parallel for x:
182  for y:
183  for rxo = 0 to 11:
184  f(x, y) = max(f(x, y), f_intm(x, y, rxo))
185  \endcode
186  *
187  */
188  // @{
189  Func rfactor(std::vector<std::pair<RVar, Var>> preserved);
190  Func rfactor(const RVar &r, const Var &v);
191  // @}
192 
193  /** Schedule the iteration over this stage to be fused with another
194  * stage 's' from outermost loop to a given LoopLevel. 'this' stage will
195  * be computed AFTER 's' in the innermost fused dimension. There should not
196  * be any dependencies between those two fused stages. If either of the
197  * stages being fused is a stage of an extern Func, this will throw an error.
198  *
199  * Note that the two stages that are fused together should have the same
200  * exact schedule from the outermost to the innermost fused dimension, and
201  * the stage we are calling compute_with on should not have specializations,
202  * e.g. f2.compute_with(f1, x) is allowed only if f2 has no specializations.
203  *
204  * Also, if a producer is desired to be computed at the fused loop level,
205  * the function passed to the compute_at() needs to be the "parent". Consider
206  * the following code:
207  \code
208  input(x, y) = x + y;
209  f(x, y) = input(x, y);
210  f(x, y) += 5;
211  g(x, y) = x - y;
212  g(x, y) += 10;
213  f.compute_with(g, y);
214  f.update().compute_with(g.update(), y);
215  \endcode
216  *
217  * To compute 'input' at the fused loop level at dimension y, we specify
218  * input.compute_at(g, y) instead of input.compute_at(f, y) since 'g' is
219  * the "parent" for this fused loop (i.e. 'g' is computed first before 'f'
220  * is computed). On the other hand, to compute 'input' at the innermost
221  * dimension of 'f', we specify input.compute_at(f, x) instead of
222  * input.compute_at(g, x) since the x dimension of 'f' is not fused
223  * (only the y dimension is).
224  *
225  * Given the constraints, this has a variety of uses. Consider the
226  * following code:
227  \code
228  f(x, y) = x + y;
229  g(x, y) = x - y;
230  h(x, y) = f(x, y) + g(x, y);
231  f.compute_root();
232  g.compute_root();
233  f.split(x, xo, xi, 8);
234  g.split(x, xo, xi, 8);
235  g.compute_with(f, xo);
236  \endcode
237  *
238  * This is equivalent to:
239  \code
240  for y:
241  for xo:
242  for xi:
243  f(8*xo + xi) = (8*xo + xi) + y
244  for xi:
245  g(8*xo + xi) = (8*xo + xi) - y
246  for y:
247  for x:
248  h(x, y) = f(x, y) + g(x, y)
249  \endcode
250  *
251  * The sizes of the dimensions of the stages computed with each other do not
252  * have to match. Consider the following code where 'g' is half the size of 'f':
253  \code
254  Image<int> f_im(size, size), g_im(size/2, size/2);
255  input(x, y) = x + y;
256  f(x, y) = input(x, y);
257  g(x, y) = input(2*x, 2*y);
258  g.compute_with(f, y);
259  input.compute_at(f, y);
260  Pipeline({f, g}).realize({f_im, g_im});
261  \endcode
262  *
263  * This is equivalent to:
264  \code
265  for y = 0 to size-1:
266  for x = 0 to size-1:
267  input(x, y) = x + y;
268  for x = 0 to size-1:
269  f(x, y) = input(x, y)
270  for x = 0 to size/2-1:
271  if (y < size/2-1):
272  g(x, y) = input(2*x, 2*y)
273  \endcode
274  *
275  * 'align' specifies how the loop iteration of each dimension of the
276  * two stages being fused should be aligned in the fused loop nests
277  * (see LoopAlignStrategy for options). Consider the following loop nests:
278  \code
279  for z = f_min_z to f_max_z:
280  for y = f_min_y to f_max_y:
281  for x = f_min_x to f_max_x:
282  f(x, y, z) = x + y + z
283  for z = g_min_z to g_max_z:
284  for y = g_min_y to g_max_y:
285  for x = g_min_x to g_max_x:
286  g(x, y, z) = x - y - z
287  \endcode
288  *
289  * If no alignment strategy is specified, the following loop nest will be
290  * generated:
291  \code
292  for z = min(f_min_z, g_min_z) to max(f_max_z, g_max_z):
293  for y = min(f_min_y, g_min_y) to max(f_max_y, g_max_y):
294  for x = f_min_x to f_max_x:
295  if (f_min_z <= z <= f_max_z):
296  if (f_min_y <= y <= f_max_y):
297  f(x, y, z) = x + y + z
298  for x = g_min_x to g_max_x:
299  if (g_min_z <= z <= g_max_z):
300  if (g_min_y <= y <= g_max_y):
301  g(x, y, z) = x - y - z
302  \endcode
303  *
304  * Instead, these alignment strategies:
305  \code
306  g.compute_with(f, y, {{z, LoopAlignStrategy::AlignStart}, {y, LoopAlignStrategy::AlignEnd}});
307  \endcode
308  * will produce the following loop nest:
309  \code
310  f_loop_min_z = f_min_z
311  f_loop_max_z = max(f_max_z, (f_min_z - g_min_z) + g_max_z)
312  for z = f_min_z to f_loop_max_z:
313  f_loop_min_y = min(f_min_y, (f_max_y - g_max_y) + g_min_y)
314  f_loop_max_y = f_max_y
315  for y = f_loop_min_y to f_loop_max_y:
316  for x = f_min_x to f_max_x:
317  if (f_loop_min_z <= z <= f_loop_max_z):
318  if (f_loop_min_y <= y <= f_loop_max_y):
319  f(x, y, z) = x + y + z
320  for x = g_min_x to g_max_x:
321  g_shift_z = g_min_z - f_loop_min_z
322  g_shift_y = g_max_y - f_loop_max_y
323  if (g_min_z <= (z + g_shift_z) <= g_max_z):
324  if (g_min_y <= (y + g_shift_y) <= g_max_y):
325  g(x, y + g_shift_y, z + g_shift_z) = x - (y + g_shift_y) - (z + g_shift_z)
326  \endcode
327  *
328  * LoopAlignStrategy::AlignStart on dimension z will shift the loop iteration
329  * of 'g' at dimension z so that its starting value matches that of 'f'.
330  * Likewise, LoopAlignStrategy::AlignEnd on dimension y will shift the loop
331  * iteration of 'g' at dimension y so that its end value matches that of 'f'.
332  */
333  // @{
334  Stage &compute_with(LoopLevel loop_level, const std::vector<std::pair<VarOrRVar, LoopAlignStrategy>> &align);
335  Stage &compute_with(LoopLevel loop_level, LoopAlignStrategy align = LoopAlignStrategy::Auto);
336  Stage &compute_with(const Stage &s, const VarOrRVar &var, const std::vector<std::pair<VarOrRVar, LoopAlignStrategy>> &align);
337  Stage &compute_with(const Stage &s, const VarOrRVar &var, LoopAlignStrategy align = LoopAlignStrategy::Auto);
338  // @}
339 
340  /** Scheduling calls that control how the domain of this stage is
341  * traversed. See the documentation for Func for the meanings. */
342  // @{
343 
344  Stage &split(const VarOrRVar &old, const VarOrRVar &outer, const VarOrRVar &inner, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
345  Stage &fuse(const VarOrRVar &inner, const VarOrRVar &outer, const VarOrRVar &fused);
346  Stage &serial(const VarOrRVar &var);
347  Stage &parallel(const VarOrRVar &var);
348  Stage &vectorize(const VarOrRVar &var);
349  Stage &unroll(const VarOrRVar &var);
350  Stage &parallel(const VarOrRVar &var, const Expr &task_size, TailStrategy tail = TailStrategy::Auto);
351  Stage &vectorize(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
352  Stage &unroll(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
353  Stage &tile(const VarOrRVar &x, const VarOrRVar &y,
354  const VarOrRVar &xo, const VarOrRVar &yo,
355  const VarOrRVar &xi, const VarOrRVar &yi, const Expr &xfactor, const Expr &yfactor,
356  TailStrategy tail = TailStrategy::Auto);
357  Stage &tile(const VarOrRVar &x, const VarOrRVar &y,
358  const VarOrRVar &xi, const VarOrRVar &yi,
359  const Expr &xfactor, const Expr &yfactor,
360  TailStrategy tail = TailStrategy::Auto);
361  Stage &tile(const std::vector<VarOrRVar> &previous,
362  const std::vector<VarOrRVar> &outers,
363  const std::vector<VarOrRVar> &inners,
364  const std::vector<Expr> &factors,
365  const std::vector<TailStrategy> &tails);
366  Stage &tile(const std::vector<VarOrRVar> &previous,
367  const std::vector<VarOrRVar> &outers,
368  const std::vector<VarOrRVar> &inners,
369  const std::vector<Expr> &factors,
370  TailStrategy tail = TailStrategy::Auto);
371  Stage &tile(const std::vector<VarOrRVar> &previous,
372  const std::vector<VarOrRVar> &inners,
373  const std::vector<Expr> &factors,
374  TailStrategy tail = TailStrategy::Auto);
375  Stage &reorder(const std::vector<VarOrRVar> &vars);
376 
377  template<typename... Args>
378  HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<VarOrRVar, Args...>::value, Stage &>::type
379  reorder(const VarOrRVar &x, const VarOrRVar &y, Args &&...args) {
380  std::vector<VarOrRVar> collected_args{x, y, std::forward<Args>(args)...};
381  return reorder(collected_args);
382  }
383 
384  Stage &rename(const VarOrRVar &old_name, const VarOrRVar &new_name);
385  Stage specialize(const Expr &condition);
386  void specialize_fail(const std::string &message);
387 
388  Stage &gpu_threads(const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
389  Stage &gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
390  Stage &gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
391 
392  Stage &gpu_lanes(const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
393 
394  Stage &gpu_single_thread(DeviceAPI device_api = DeviceAPI::Default_GPU);
395
396  Stage &gpu_blocks(const VarOrRVar &block_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
397  Stage &gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
398  Stage &gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
399 
400  Stage &gpu(const VarOrRVar &block_x, const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
401  Stage &gpu(const VarOrRVar &block_x, const VarOrRVar &block_y,
402  const VarOrRVar &thread_x, const VarOrRVar &thread_y,
403  DeviceAPI device_api = DeviceAPI::Default_GPU);
404  Stage &gpu(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z,
405  const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z,
406  DeviceAPI device_api = DeviceAPI::Default_GPU);
407 
408  Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &bx, const VarOrRVar &tx, const Expr &x_size,
409  TailStrategy tail = TailStrategy::Auto,
410  DeviceAPI device_api = DeviceAPI::Default_GPU);
411 
412  Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &tx, const Expr &x_size,
413  TailStrategy tail = TailStrategy::Auto,
414  DeviceAPI device_api = DeviceAPI::Default_GPU);
415  Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &y,
416  const VarOrRVar &bx, const VarOrRVar &by,
417  const VarOrRVar &tx, const VarOrRVar &ty,
418  const Expr &x_size, const Expr &y_size,
419  TailStrategy tail = TailStrategy::Auto,
420  DeviceAPI device_api = DeviceAPI::Default_GPU);
421 
422  Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &y,
423  const VarOrRVar &tx, const VarOrRVar &ty,
424  const Expr &x_size, const Expr &y_size,
425  TailStrategy tail = TailStrategy::Auto,
426  DeviceAPI device_api = DeviceAPI::Default_GPU);
427 
428  Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z,
429  const VarOrRVar &bx, const VarOrRVar &by, const VarOrRVar &bz,
430  const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz,
431  const Expr &x_size, const Expr &y_size, const Expr &z_size,
432  TailStrategy tail = TailStrategy::Auto,
433  DeviceAPI device_api = DeviceAPI::Default_GPU);
434  Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z,
435  const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz,
436  const Expr &x_size, const Expr &y_size, const Expr &z_size,
437  TailStrategy tail = TailStrategy::Auto,
438  DeviceAPI device_api = DeviceAPI::Default_GPU);
439
440  Stage &allow_race_conditions();
441  Stage &atomic(bool override_associativity_test = false);
442
443  Stage &hexagon(const VarOrRVar &x = Var::outermost());
444
445  HALIDE_ATTRIBUTE_DEPRECATED("Call prefetch() with the two-var form instead.")
446  Stage &prefetch(const Func &f, const VarOrRVar &var, int offset = 1,
447  PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf) {
448  return prefetch(f, var, var, offset, strategy);
449  }
450  HALIDE_ATTRIBUTE_DEPRECATED("Call prefetch() with the two-var form instead.")
451  Stage &prefetch(const Internal::Parameter &param, const VarOrRVar &var, int offset = 1,
452  PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf) {
453  return prefetch(param, var, var, offset, strategy);
454  }
455  template<typename T>
456  HALIDE_ATTRIBUTE_DEPRECATED("Call prefetch() with the two-var form instead.")
457  Stage &prefetch(const T &image, VarOrRVar var, int offset = 1,
458  PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf) {
459  return prefetch(image.parameter(), var, var, offset, strategy);
460  }
461  Stage &prefetch(const Func &f, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
462  PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf);
463  Stage &prefetch(const Internal::Parameter &param, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
464  PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf);
465  template<typename T>
466  Stage &prefetch(const T &image, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
467  PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf) {
468  return prefetch(image.parameter(), at, from, std::move(offset), strategy);
469  }
470  // @}
471 
472  /** Attempt to get the source file and line where this stage was
473  * defined by parsing the process's own debug symbols. Returns an
474  * empty string if no debug symbols were found or the debug
475  * symbols were not understood. Works on OS X and Linux only. */
476  std::string source_location() const;
477 };
478 
479 // For backwards compatibility, keep the ScheduleHandle name.
480 typedef Stage ScheduleHandle;
481
482 class FuncTupleElementRef;
483 
484 /** A fragment of front-end syntax of the form f(x, y, z), where x, y,
485  * z are Vars or Exprs. It could be the left hand side of a definition or
486  * an update definition, or it could be a call to a function. We don't know
487  * until we see how this object gets used.
488  */
489 class FuncRef {
490  Internal::Function func;
491  int implicit_placeholder_pos;
492  int implicit_count;
493  std::vector<Expr> args;
494  std::vector<Expr> args_with_implicit_vars(const std::vector<Expr> &e) const;
495 
496  /** Helper for function update by Tuple. If the function does not
497  * already have a pure definition, init_val will be used as RHS of
498  * each tuple element in the initial function definition. */
499  template<typename BinaryOp>
500  Stage func_ref_update(const Tuple &e, int init_val);
501 
502  /** Helper for function update by Expr. If the function does not
503  * already have a pure definition, init_val will be used as RHS in
504  * the initial function definition. */
505  template<typename BinaryOp>
506  Stage func_ref_update(Expr e, int init_val);
507 
508 public:
509  FuncRef(const Internal::Function &, const std::vector<Expr> &,
510  int placeholder_pos = -1, int count = 0);
511  FuncRef(Internal::Function, const std::vector<Var> &,
512  int placeholder_pos = -1, int count = 0);
513 
514  /** Use this as the left-hand-side of a definition or an update definition
515  * (see \ref RDom).
516  */
517  Stage operator=(const Expr &);
518 
519  /** Use this as the left-hand-side of a definition or an update definition
520  * for a Func with multiple outputs. */
521  Stage operator=(const Tuple &);
522
523  /** Define a stage that adds the given expression to this Func. If the
524  * expression refers to some RDom, this performs a sum reduction of the
525  * expression over the domain. If the function does not already have a
526  * pure definition, this sets it to zero.
527  */
528  // @{
529  Stage operator+=(Expr);
530  Stage operator+=(const Tuple &);
531  Stage operator+=(const FuncRef &);
532  // @}
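 /* Illustrative sketch (not part of the original header): using += with an
  * RDom performs a sum reduction and supplies the implicit zero init:
 \code
 Func f, g;
 Var x;
 RDom r(0, 10);
 g(x) = x;
 f(x) += g(x + r);  // pure definition of f is implicitly set to 0, then summed over r
 \endcode
 */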
533 
534  /** Define a stage that adds the negative of the given expression to this
535  * Func. If the expression refers to some RDom, this performs a sum reduction
536  * of the negative of the expression over the domain. If the function does
537  * not already have a pure definition, this sets it to zero.
538  */
539  // @{
540  Stage operator-=(Expr);
541  Stage operator-=(const Tuple &);
542  Stage operator-=(const FuncRef &);
543  // @}
544 
545  /** Define a stage that multiplies this Func by the given expression. If the
546  * expression refers to some RDom, this performs a product reduction of the
547  * expression over the domain. If the function does not already have a pure
548  * definition, this sets it to 1.
549  */
550  // @{
551  Stage operator*=(Expr);
552  Stage operator*=(const Tuple &);
553  Stage operator*=(const FuncRef &);
554  // @}
555 
556  /** Define a stage that divides this Func by the given expression.
557  * If the expression refers to some RDom, this performs a product
558  * reduction of the inverse of the expression over the domain. If the
559  * function does not already have a pure definition, this sets it to 1.
560  */
561  // @{
562  Stage operator/=(Expr);
563  Stage operator/=(const Tuple &);
564  Stage operator/=(const FuncRef &);
565  // @}
566 
567  /* Override the usual assignment operator, so that
568  * f(x, y) = g(x, y) defines f.
569  */
570  Stage operator=(const FuncRef &);
571
572  /** Use this as a call to the function, and not the left-hand-side
573  * of a definition. Only works for single-output Funcs. */
574  operator Expr() const;
575 
576  /** When a FuncRef refers to a function that provides multiple
577  * outputs, you can access each output as an Expr using
578  * operator[].
579  */
580  FuncTupleElementRef operator[](int) const;
581
582  /** How many outputs does the function this refers to produce. */
583  size_t size() const;
584 
585  /** What function is this calling? */
586  Internal::Function function() const {
587  return func;
588  }
589 };
590 
591 /** Explicit overloads of min and max for FuncRef. These exist to
592  * disambiguate calls to min on FuncRefs when a user has pulled both
593  * Halide::min and std::min into their namespace. */
594 // @{
595 inline Expr min(const FuncRef &a, const FuncRef &b) {
596  return min(Expr(a), Expr(b));
597 }
598 inline Expr max(const FuncRef &a, const FuncRef &b) {
599  return max(Expr(a), Expr(b));
600 }
601 // @}
602 
603 /** A fragment of front-end syntax of the form f(x, y, z)[index], where x, y,
604  * z are Vars or Exprs. It could be the left hand side of an update
605  * definition, or it could be a call to a function. We don't know
606  * until we see how this object gets used.
607  */
608 class FuncTupleElementRef {
609  FuncRef func_ref;
610  std::vector<Expr> args; // args to the function
611  int idx; // Index to function outputs
612 
613  /** Helper function that generates a Tuple where element at 'idx' is set
614  * to 'e' and the rest are undef. */
615  Tuple values_with_undefs(const Expr &e) const;
616 
617 public:
618  FuncTupleElementRef(const FuncRef &ref, const std::vector<Expr> &args, int idx);
619 
620  /** Use this as the left-hand-side of an update definition of Tuple
621  * component 'idx' of a Func (see \ref RDom). The function must
622  * already have an initial definition.
623  */
624  Stage operator=(const Expr &e);
625 
626  /** Define a stage that adds the given expression to Tuple component 'idx'
627  * of this Func. The other Tuple components are unchanged. If the expression
628  * refers to some RDom, this performs a sum reduction of the expression over
629  * the domain. The function must already have an initial definition.
630  */
631  Stage operator+=(const Expr &e);
632 
633  /** Define a stage that adds the negative of the given expression to Tuple
634  * component 'idx' of this Func. The other Tuple components are unchanged.
635  * If the expression refers to some RDom, this performs a sum reduction of
636  * the negative of the expression over the domain. The function must already
637  * have an initial definition.
638  */
639  Stage operator-=(const Expr &e);
640 
641  /** Define a stage that multiplies Tuple component 'idx' of this Func by
642  * the given expression. The other Tuple components are unchanged. If the
643  * expression refers to some RDom, this performs a product reduction of
644  * the expression over the domain. The function must already have an
645  * initial definition.
646  */
647  Stage operator*=(const Expr &e);
648 
649  /** Define a stage that divides Tuple component 'idx' of this Func by
650  * the given expression. The other Tuple components are unchanged.
651  * If the expression refers to some RDom, this performs a product
652  * reduction of the inverse of the expression over the domain. The function
653  * must already have an initial definition.
654  */
655  Stage operator/=(const Expr &e);
656 
657  /* Override the usual assignment operator, so that
658  * f(x, y)[index] = g(x, y) defines f.
659  */
660  Stage operator=(const FuncRef &e);
661
662  /** Use this as a call to Tuple component 'idx' of a Func, and not the
663  * left-hand-side of a definition. */
664  operator Expr() const;
665 
666  /** What function is this calling? */
667  Internal::Function function() const {
668  return func_ref.function();
669  }
670 
671  /** Return index to the function outputs. */
672  int index() const {
673  return idx;
674  }
675 };
676 
677 namespace Internal {
678 class IRMutator;
679 } // namespace Internal
680 
681 /** Helper class for identifying the purpose of an Expr passed to memoize.
682  */
683 class EvictionKey {
684 protected:
685  Expr key;
686  friend class Func;
687 
688 public:
689  explicit EvictionKey(const Expr &expr = Expr())
690  : key(expr) {
691  }
692 };
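/* Illustrative sketch (not part of the original header): an EvictionKey can be
 * passed to Func::memoize() so cached realizations can later be evicted as a
 * group via halide_memoization_cache_evict():
\code
Func f;
Var x;
Param<int> generation;
f(x) = x;
f.compute_root().memoize(EvictionKey(generation));
\endcode
*/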
693 
694 /** A halide function. This class represents one stage in a Halide
695  * pipeline, and is the unit by which we schedule things. By default
696  * they are aggressively inlined, so you are encouraged to make lots
697  * of little functions, rather than storing things in Exprs. */
698 class Func {
699 
700  /** A handle on the internal halide function that this
701  * represents */
702  Internal::Function func;
703 
704  /** When you make a reference to this function with fewer
705  * arguments than it has dimensions, the argument list is bulked
706  * up with 'implicit' vars with canonical names. This lets you
707  * pass around partially applied Halide functions. */
708  // @{
709  std::pair<int, int> add_implicit_vars(std::vector<Var> &) const;
710  std::pair<int, int> add_implicit_vars(std::vector<Expr> &) const;
711  // @}
712 
713  /** The imaging pipeline that outputs this Func alone. */
714  Pipeline pipeline_;
715 
716  /** Get the imaging pipeline that outputs this Func alone,
717  * creating it (and freezing the Func) if necessary. */
718  Pipeline pipeline();
719 
720  // Helper function for recursive reordering support
721  Func &reorder_storage(const std::vector<Var> &dims, size_t start);
722 
723  void invalidate_cache();
724 
725 public:
726  /** Declare a new undefined function with the given name */
727  explicit Func(const std::string &name);
728 
729  /** Declare a new undefined function with an
730  * automatically-generated unique name */
731  Func();
732 
733  /** Declare a new function with an automatically-generated unique
734  * name, and define it to return the given expression (which may
735  * not contain free variables). */
736  explicit Func(const Expr &e);
737 
738  /** Construct a new Func to wrap an existing, already-defined
739  * Function object. */
740  explicit Func(Internal::Function f);
741
742  /** Construct a new Func to wrap a Buffer. */
743  template<typename T>
744  HALIDE_NO_USER_CODE_INLINE explicit Func(Buffer<T> &im)
745  : Func() {
746  (*this)(_) = im(_);
747  }
748 
749  /** Evaluate this function over some rectangular domain and return
750  * the resulting buffer or buffers. Performs compilation if the
751  * Func has not previously been realized and compile_jit has not
752  * been called. If the final stage of the pipeline is on the GPU,
753  * data is copied back to the host before being returned. The
754  * returned Realization should probably be instantly converted to
755  * a Buffer class of the appropriate type. That is, do this:
756  *
757  \code
758  f(x) = sin(x);
759  Buffer<float> im = f.realize(...);
760  \endcode
761  *
762  * If your Func has multiple values, because you defined it using
763  * a Tuple, then casting the result of a realize call to a buffer
764  * or image will produce a run-time error. Instead you should do the
765  * following:
766  *
767  \code
768  f(x) = Tuple(x, sin(x));
769  Realization r = f.realize(...);
770  Buffer<int> im0 = r[0];
771  Buffer<float> im1 = r[1];
772  \endcode
773  *
774  * In Halide formal arguments of a computation are specified using
775  * Param<T> and ImageParam objects in the expressions defining the
776  * computation. The param_map argument to realize allows
777  * specifying a set of per-call parameters to be used for a
778  * specific computation. This method is thread-safe where the
779  * globals used by Param<T> and ImageParam are not. Any parameters
780  * that are not in the param_map are taken from the global values,
781  * so those can continue to be used if they are not changing
782  * per-thread.
783  *
784  * One can explicitly construct a ParamMap and
785  * use its set method to insert Parameter to scalar or Buffer
786  * value mappings:
787  *
788  \code
789  Param<int32_t> p(42);
790  ImageParam img(Int(32), 1);
791  f(x) = img(x) + p;
792 
793  Buffer<int32_t> arg_img(10, 10);
794  <fill in arg_img...>
795  ParamMap params;
796  params.set(p, 17);
797  params.set(img, arg_img);
798 
799  Target t = get_jit_target_from_environment();
800  Buffer<int32_t> result = f.realize({10, 10}, t, params);
801  \endcode
802  *
803  * Alternatively, an initializer list can be used
804  * directly in the realize call to pass this information:
805  *
806  \code
807  Param<int32_t> p(42);
808  ImageParam img(Int(32), 1);
809  f(x) = img(x) + p;
810 
811  Buffer<int32_t> arg_img(10, 10);
812  <fill in arg_img...>
813 
814  Target t = get_jit_target_from_environment();
815  Buffer<int32_t> result = f.realize({10, 10}, t, { { p, 17 }, { img, arg_img } });
816  \endcode
817  *
818  * If the Func cannot be realized into a buffer of the given size
819  * due to scheduling constraints on scattering update definitions,
820  * it will be realized into a larger buffer of the minimum size
821  * possible, and a cropped view at the requested size will be
822  * returned. It is thus not safe to assume the returned buffers
823  * are contiguous in memory. This behavior can be disabled with
824  * the NoBoundsQuery target flag, in which case an error about
825  * writing out of bounds on the output buffer will trigger
826  * instead.
827  *
828  */
829  Realization realize(std::vector<int32_t> sizes = {}, const Target &target = Target(),
830  const ParamMap &param_map = ParamMap::empty_map());
831 
832  /** Evaluate this function into an existing allocated buffer or
833  * buffers. If the buffer is also one of the arguments to the
834  * function, strange things may happen, as the pipeline isn't
835  * necessarily safe to run in-place. If you pass multiple buffers,
836  * they must have matching sizes. This form of realize does *not*
837  * automatically copy data back from the GPU. */
838  void realize(Pipeline::RealizationArg outputs, const Target &target = Target(),
839  const ParamMap &param_map = ParamMap::empty_map());
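 /* Illustrative sketch (not part of the original header): realizing into a
  * preallocated buffer instead of returning a new Realization:
 \code
 Func f;
 Var x, y;
 f(x, y) = x + y;
 Buffer<int> out(800, 600);
 f.realize(out);  // fills 'out' in place; no copy back from the GPU is implied
 \endcode
 */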
840 
841  /** For a given size of output, or a given output buffer,
842  * determine the bounds required of all unbound ImageParams
843  * referenced. Communicates the result by allocating new buffers
844  * of the appropriate size and binding them to the unbound
845  * ImageParams.
846  *
847  * See the documentation for Func::realize regarding the
848  * ParamMap. There is one difference in that input Buffer<>
849  * arguments that are being inferred are specified as a pointer to
850  * the Buffer<> in the ParamMap. E.g.
851  *
852  \code
853  Param<int32_t> p(42);
854  ImageParam img(Int(32), 1);
855  f(x) = img(x) + p;
856 
857  Target t = get_jit_target_from_environment();
858  Buffer<> in;
859  f.infer_input_bounds({10, 10}, t, { { img, &in } });
860  \endcode
861  * On return, in will be an allocated buffer of the correct size
862  * to evaluate f over a 10x10 region.
863  */
864  // @{
865  void infer_input_bounds(const std::vector<int32_t> &sizes,
866  const Target &target = get_jit_target_from_environment(),
867  const ParamMap &param_map = ParamMap::empty_map());
868  void infer_input_bounds(Pipeline::RealizationArg outputs,
869  const Target &target = get_jit_target_from_environment(),
870  const ParamMap &param_map = ParamMap::empty_map());
871  // @}
872 
873  /** Statically compile this function to llvm bitcode, with the
874  * given filename (which should probably end in .bc), type
875  * signature, and C function name (which defaults to the same name
876  * as this halide function). */
877  //@{
878  void compile_to_bitcode(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name,
879  const Target &target = get_target_from_environment());
880  void compile_to_bitcode(const std::string &filename, const std::vector<Argument> &,
881  const Target &target = get_target_from_environment());
882  // @}
883 
884  /** Statically compile this function to llvm assembly, with the
885  * given filename (which should probably end in .ll), type
886  * signature, and C function name (which defaults to the same name
887  * as this halide function). */
888  //@{
889  void compile_to_llvm_assembly(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name,
890  const Target &target = get_target_from_environment());
891  void compile_to_llvm_assembly(const std::string &filename, const std::vector<Argument> &,
892  const Target &target = get_target_from_environment());
893  // @}
894 
895  /** Statically compile this function to an object file, with the
896  * given filename (which should probably end in .o or .obj), type
897  * signature, and C function name (which defaults to the same name
898  * as this halide function). You probably don't want to use this
899  * directly; call compile_to_static_library or compile_to_file instead. */
900  //@{
901  void compile_to_object(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name,
902  const Target &target = get_target_from_environment());
903  void compile_to_object(const std::string &filename, const std::vector<Argument> &,
904  const Target &target = get_target_from_environment());
905  // @}
906 
907  /** Emit a header file with the given filename for this
908  * function. The header will define a function with the type
909  * signature given by the second argument, and a name given by the
910  * third. The name defaults to the same name as this halide
911  * function. You don't actually have to have defined this function
912  * yet to call this. You probably don't want to use this directly;
913  * call compile_to_static_library or compile_to_file instead. */
914  void compile_to_header(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name = "",
915  const Target &target = get_target_from_environment());
916 
917  /** Statically compile this function to text assembly equivalent
918  * to the object file generated by compile_to_object. This is
919  * useful for checking what Halide is producing without having to
920  * disassemble anything, or if you need to feed the assembly into
921  * some custom toolchain to produce an object file (e.g. iOS) */
922  //@{
923  void compile_to_assembly(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name,
924  const Target &target = get_target_from_environment());
925  void compile_to_assembly(const std::string &filename, const std::vector<Argument> &,
926  const Target &target = get_target_from_environment());
927  // @}
928 
929  /** Statically compile this function to C source code. This is
930  * useful for providing fallback code paths that will compile on
931  * many platforms. Vectorization will fail, and parallelization
932  * will produce serial code. */
933  void compile_to_c(const std::string &filename,
934  const std::vector<Argument> &,
935  const std::string &fn_name = "",
936  const Target &target = get_target_from_environment());
937 
938  /** Write out an internal representation of lowered code. Useful
939  * for analyzing and debugging scheduling. Can emit html or plain
940  * text. */
941  void compile_to_lowered_stmt(const std::string &filename,
942  const std::vector<Argument> &args,
943  StmtOutputFormat fmt = Text,
944  const Target &target = get_target_from_environment());
945 
946  /** Write out the loop nests specified by the schedule for this
947  * Function. Helpful for understanding what a schedule is
948  * doing. */
949  void print_loop_nest();
950
951  /** Compile to object file and header pair, with the given
952  * arguments. The name defaults to the same name as this halide
953  * function.
954  */
955  void compile_to_file(const std::string &filename_prefix, const std::vector<Argument> &args,
956  const std::string &fn_name = "",
957  const Target &target = get_target_from_environment());
958 
959  /** Compile to static-library file and header pair, with the given
960  * arguments. The name defaults to the same name as this halide
961  * function.
962  */
963  void compile_to_static_library(const std::string &filename_prefix, const std::vector<Argument> &args,
964  const std::string &fn_name = "",
965  const Target &target = get_target_from_environment());
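 /* Illustrative sketch (not part of the original header): ahead-of-time
  * compiling a pipeline with one image input and one scalar parameter into
  * brighten.a / brighten.h (the names here are hypothetical):
 \code
 ImageParam input(UInt(8), 2);
 Param<uint8_t> offset;
 Func brighten;
 Var x, y;
 brighten(x, y) = input(x, y) + offset;
 brighten.compile_to_static_library("brighten", {input, offset}, "brighten");
 \endcode
 */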
966 
967  /** Compile to static-library file and header pair once for each target;
968  * each resulting function will be considered (in order) via halide_can_use_target_features()
969  * at runtime, with the first appropriate match being selected for subsequent use.
970  * This is typically useful for specializations that may vary unpredictably by machine
971  * (e.g., SSE4.1/AVX/AVX2 on x86 desktop machines).
972  * All targets must have identical arch-os-bits.
973  */
974  void compile_to_multitarget_static_library(const std::string &filename_prefix,
975  const std::vector<Argument> &args,
976  const std::vector<Target> &targets);
977 
978  /** Like compile_to_multitarget_static_library(), except that the object files
979  * are all output as object files (rather than bundled into a static library).
980  *
981  * `suffixes` is an optional list of strings to use as the suffix for each object
982  * file. If nonempty, it must be the same length as `targets`. (If empty, Target::to_string()
983  * will be used for each suffix.)
984  *
985  * Note that if `targets.size()` > 1, the wrapper code (to select the subtarget)
986  * will be generated with the filename `${filename_prefix}_wrapper.o`
987  *
988  * Note that if `targets.size()` > 1 and `no_runtime` is not specified, the runtime
989  * will be generated with the filename `${filename_prefix}_runtime.o`
990  */
991  void compile_to_multitarget_object_files(const std::string &filename_prefix,
992  const std::vector<Argument> &args,
993  const std::vector<Target> &targets,
994  const std::vector<std::string> &suffixes);
995 
996  /** Store an internal representation of lowered code as a self
997  * contained Module suitable for further compilation. */
998  Module compile_to_module(const std::vector<Argument> &args, const std::string &fn_name = "",
999  const Target &target = get_target_from_environment());
1000 
1001  /** Compile and generate multiple target files with a single call.
1002  * Deduces target files based on filenames specified in the
1003  * output_files map.
1004  */
1005  void compile_to(const std::map<Output, std::string> &output_files,
1006  const std::vector<Argument> &args,
1007  const std::string &fn_name,
1008  const Target &target = get_target_from_environment());
1009 
1010  /** Eagerly jit compile the function to machine code. This
1011  * normally happens on the first call to realize. If you're
1012  * running your halide pipeline inside time-sensitive code and
1013  * wish to avoid including the time taken to compile a pipeline,
1014  * then you can call this ahead of time. Default is to use the Target
1015  * returned from Halide::get_jit_target_from_environment()
1016  */
1017  void compile_jit(const Target &target = get_jit_target_from_environment());
1018
1019  /** Set the error handler function that will be called in the case of
1020  * runtime errors during halide pipelines. If you are compiling
1021  * statically, you can also just define your own function with
1022  * signature
1023  \code
1024  extern "C" void halide_error(void *user_context, const char *);
1025  \endcode
1026  * This will clobber Halide's version.
1027  */
1028  void set_error_handler(void (*handler)(void *, const char *));
1029 
1030  /** Set a custom malloc and free for halide to use. Malloc should
1031  * return 32-byte aligned chunks of memory, and it should be safe
1032  * for Halide to read slightly out of bounds (up to 8 bytes before
1033  * the start or beyond the end). If compiling statically, routines
1034  * with appropriate signatures can be provided directly
1035  \code
1036  extern "C" void *halide_malloc(void *, size_t)
1037  extern "C" void halide_free(void *, void *)
1038  \endcode
1039  * These will clobber Halide's versions. See HalideRuntime.h
1040  * for declarations.
1041  */
1042  void set_custom_allocator(void *(*malloc)(void *, size_t),
1043  void (*free)(void *, void *));
1044 
1045  /** Set a custom task handler to be called by the parallel for
1046  * loop. It is useful to set this if you want to do some
1047  * additional bookkeeping at the granularity of parallel
1048  * tasks. The default implementation does this:
1049  \code
1050  extern "C" int halide_do_task(void *user_context,
1051  int (*f)(void *, int, uint8_t *),
1052  int idx, uint8_t *state) {
1053  return f(user_context, idx, state);
1054  }
1055  \endcode
1056  * If you are statically compiling, you can also just define your
1057  * own version of the above function, and it will clobber Halide's
1058  * version.
1059  *
1060  * If you're trying to use a custom parallel runtime, you probably
1061  * don't want to call this. See instead \ref Func::set_custom_do_par_for .
1062  */
1063  void set_custom_do_task(
1064  int (*custom_do_task)(void *, int (*)(void *, int, uint8_t *),
1065  int, uint8_t *));
1066 
1067  /** Set a custom parallel for loop launcher. Useful if your app
1068  * already manages a thread pool. The default implementation is
1069  * equivalent to this:
1070  \code
1071  extern "C" int halide_do_par_for(void *user_context,
1072  int (*f)(void *, int, uint8_t *),
1073  int min, int extent, uint8_t *state) {
1074  int exit_status = 0;
1075  parallel for (int idx = min; idx < min+extent; idx++) {
1076  int job_status = halide_do_task(user_context, f, idx, state);
1077  if (job_status) exit_status = job_status;
1078  }
1079  return exit_status;
1080  }
1081  \endcode
1082  *
1083  * However, notwithstanding the above example code, if one task
1084  * fails, we may skip over other tasks, and if two tasks return
1085  * different error codes, we may select one arbitrarily to return.
1086  *
1087  * If you are statically compiling, you can also just define your
1088  * own version of the above function, and it will clobber Halide's
1089  * version.
1090  */
1091  void set_custom_do_par_for(
1092  int (*custom_do_par_for)(void *, int (*)(void *, int, uint8_t *), int,
1093  int, uint8_t *));
1094 
1095  /** Set custom routines to call when tracing is enabled. Call this
1096  * on the output Func of your pipeline. This then sets custom
1097  * routines for the entire pipeline, not just calls to this
1098  * Func.
1099  *
1100  * If you are statically compiling, you can also just define your
1101  * own versions of the tracing functions (see HalideRuntime.h),
1102  * and they will clobber Halide's versions. */
1103  void set_custom_trace(int (*trace_fn)(void *, const halide_trace_event_t *));
1104 
1105  /** Set the function called to print messages from the runtime.
1106  * If you are compiling statically, you can also just define your
1107  * own function with signature
1108  \code
1109  extern "C" void halide_print(void *user_context, const char *);
1110  \endcode
1111  * This will clobber Halide's version.
1112  */
1113  void set_custom_print(void (*handler)(void *, const char *));
1114 
1115  /** Get a struct containing the currently set custom functions
1116  * used by JIT. */
1117  const Internal::JITHandlers &jit_handlers();
1118
1119  /** Add a custom pass to be used during lowering. It is run after
1120  * all other lowering passes. Can be used to verify properties of
1121  * the lowered Stmt, instrument it with extra code, or otherwise
1122  * modify it. The Func takes ownership of the pass, and will call
1123  * delete on it when the Func goes out of scope. So don't pass a
1124  * stack object, or share pass instances between multiple
1125  * Funcs. */
1126  template<typename T>
1127  void add_custom_lowering_pass(T *pass) {
1128  // Template instantiate a custom deleter for this type, then
1129  // wrap in a lambda. The custom deleter lives in user code, so
1130  // that deletion is on the same heap as construction (I hate Windows).
1131  add_custom_lowering_pass(pass, [pass]() { delete_lowering_pass<T>(pass); });
1132  }
1133 
1134  /** Add a custom pass to be used during lowering, with the
1135  * function that will be called to delete it also passed in. Set
1136  * it to nullptr if you wish to retain ownership of the object. */
1137  void add_custom_lowering_pass(Internal::IRMutator *pass, std::function<void()> deleter);
1138 
1139  /** Remove all previously-set custom lowering passes */
1140  void clear_custom_lowering_passes();
1141
1142  /** Get the custom lowering passes. */
1143  const std::vector<CustomLoweringPass> &custom_lowering_passes();
1144 
1145  /** When this function is compiled, include code that dumps its
1146  * values to a file after it is realized, for the purpose of
1147  * debugging.
1148  *
1149  * If filename ends in ".tif" or ".tiff" (case insensitive) the file
1150  * is in TIFF format and can be read by standard tools. Otherwise, the
1151  * file format is as follows:
1152  *
1153  * All data is in the byte-order of the target platform. First, a
1154  * 20-byte header containing four 32-bit ints, giving the extents
1155  * of the first four dimensions. Dimensions beyond four are
1156  * folded into the fourth. Then, a fifth 32-bit int giving the
1157  * data type of the function. The typecodes are given by: float =
1158  * 0, double = 1, uint8_t = 2, int8_t = 3, uint16_t = 4, int16_t =
1159  * 5, uint32_t = 6, int32_t = 7, uint64_t = 8, int64_t = 9. The
1160  * data follows the header, as a densely packed array of the given
1161  * size and the given type. If given the extension .tmp, this file
1162  * format can be natively read by the program ImageStack. */
1163  void debug_to_file(const std::string &filename);
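 /* Illustrative sketch (not part of the original header): dumping a Func's
  * realized values for inspection while debugging a schedule:
 \code
 Func f;
 Var x, y;
 f(x, y) = x * y;
 f.compute_root().debug_to_file("f.tmp");  // .tmp can be read natively by ImageStack
 \endcode
 */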
1164 
1165  /** The name of this function, either given during construction,
1166  * or automatically generated. */
1167  const std::string &name() const;
1168 
1169  /** Get the pure arguments. */
1170  std::vector<Var> args() const;
1171 
1172  /** The right-hand-side value of the pure definition of this
1173  * function. Causes an error if there's no pure definition, or if
1174  * the function is defined to return multiple values. */
1175  Expr value() const;
1176 
1177  /** The values returned by this function. An error if the function
1178  * has not been defined. Returns a Tuple with one element for
1179  * functions defined to return a single value. */
1180  Tuple values() const;
1181 
1182  /** Does this function have at least a pure definition. */
1183  bool defined() const;
1184 
1185  /** Get the left-hand-side of the update definition. An empty
1186  * vector if there's no update definition. If there are
1187  * multiple update definitions for this function, use the
1188  * argument to select which one you want. */
1189  const std::vector<Expr> &update_args(int idx = 0) const;
1190 
1191  /** Get the right-hand-side of an update definition. An error if
1192  * there's no update definition. If there are multiple
1193  * update definitions for this function, use the argument to
1194  * select which one you want. */
1195  Expr update_value(int idx = 0) const;
1196 
1197  /** Get the right-hand-side of an update definition for
1198  * functions that returns multiple values. An error if there's no
1199  * update definition. Returns a Tuple with one element for
1200  * functions that return a single value. */
1201  Tuple update_values(int idx = 0) const;
1202 
1203  /** Get the RVars of the reduction domain for an update definition, if there is
1204  * one. */
1205  std::vector<RVar> rvars(int idx = 0) const;
1206 
1207  /** Does this function have at least one update definition? */
1208  bool has_update_definition() const;
1209
1210  /** How many update definitions does this function have? */
1211  int num_update_definitions() const;
1212
1213  /** Is this function an external stage? That is, was it defined
1214  * using define_extern? */
1215  bool is_extern() const;
1216 
1217  /** Add an extern definition for this Func. This lets you define a
1218  * Func that represents an external pipeline stage. You can, for
1219  * example, use it to wrap a call to an extern library such as
1220  * fftw. */
1221  // @{
1222  void define_extern(const std::string &function_name,
1223  const std::vector<ExternFuncArgument> &params, Type t,
1224  int dimensionality,
1225  NameMangling mangling = NameMangling::Default,
1226  DeviceAPI device_api = DeviceAPI::Host) {
1227  define_extern(function_name, params, t,
1228  Internal::make_argument_list(dimensionality), mangling,
1229  device_api);
1230  }
1231 
1232  void define_extern(const std::string &function_name,
1233  const std::vector<ExternFuncArgument> &params,
1234  const std::vector<Type> &types, int dimensionality,
1235  NameMangling mangling) {
1236  define_extern(function_name, params, types,
1237  Internal::make_argument_list(dimensionality), mangling);
1238  }
1239 
1240  void define_extern(const std::string &function_name,
1241  const std::vector<ExternFuncArgument> &params,
1242  const std::vector<Type> &types, int dimensionality,
1243  NameMangling mangling = NameMangling::Default,
1244  DeviceAPI device_api = DeviceAPI::Host) {
1245  define_extern(function_name, params, types,
1246  Internal::make_argument_list(dimensionality), mangling,
1247  device_api);
1248  }
1249 
1250  void define_extern(const std::string &function_name,
1251  const std::vector<ExternFuncArgument> &params, Type t,
1252  const std::vector<Var> &arguments,
1253  NameMangling mangling = NameMangling::Default,
1254  DeviceAPI device_api = DeviceAPI::Host) {
1255  define_extern(function_name, params, std::vector<Type>{t}, arguments,
1256  mangling, device_api);
1257  }
1258 
1259  void define_extern(const std::string &function_name,
1260  const std::vector<ExternFuncArgument> &params,
1261  const std::vector<Type> &types,
1262  const std::vector<Var> &arguments,
1263  NameMangling mangling = NameMangling::Default,
1264  DeviceAPI device_api = DeviceAPI::Host);
1265  // @}
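 /* Illustrative sketch (not part of the original header): wrapping an extern C
  * stage named "make_data" (a hypothetical function following the extern stage
  * calling convention) that takes no Func inputs and produces Float(32):
 \code
 Func source("source"), g;
 Var x, y;
 source.define_extern("make_data", {}, Float(32), 2);
 g(x, y) = source(x, y) * 2.0f;
 source.compute_root();  // extern stages cannot be inlined
 \endcode
 */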
1266 
1267  /** Get the types of the outputs of this Func. */
1268  const std::vector<Type> &output_types() const;
1269 
1270  /** Get the number of outputs of this Func. Corresponds to the
1271  * size of the Tuple this Func was defined to return. */
1272  int outputs() const;
1273 
1274  /** Get the name of the extern function called for an extern
1275  * definition. */
1276  const std::string &extern_function_name() const;
1277 
1278  /** The dimensionality (number of arguments) of this
1279  * function. Zero if the function is not yet defined. */
1280  int dimensions() const;
1281 
1282  /** Construct either the left-hand-side of a definition, or a call
1283  * to a function that happens to only contain vars as
1284  * arguments. If the function has already been defined, and fewer
1285  * arguments are given than the function has dimensions, then
1286  * enough implicit vars are added to the end of the argument list
1287  * to make up the difference (see \ref Var::implicit) */
1288  // @{
1289  FuncRef operator()(std::vector<Var>) const;
1290 
1291  template<typename... Args>
1292  HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<Var, Args...>::value, FuncRef>::type
1293  operator()(Args &&...args) const {
1294  std::vector<Var> collected_args{std::forward<Args>(args)...};
1295  return this->operator()(collected_args);
1296  }
1297  // @}
1298 
1299  /** Either calls to the function, or the left-hand-side of
1300  * an update definition (see \ref RDom). If the function has
1301  * already been defined, and fewer arguments are given than the
1302  * function has dimensions, then enough implicit vars are added to
1303  * the end of the argument list to make up the difference. (see
1304  * \ref Var::implicit)*/
1305  // @{
1306  FuncRef operator()(std::vector<Expr>) const;
1307 
1308  template<typename... Args>
1309  HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<Expr, Args...>::value, FuncRef>::type
1310  operator()(const Expr &x, Args &&...args) const {
1311  std::vector<Expr> collected_args{x, std::forward<Args>(args)...};
1312  return (*this)(collected_args);
1313  }
1314  // @}
1315 
1316  /** Creates and returns a new identity Func that wraps this Func. During
1317  * compilation, Halide replaces all calls to this Func done by 'f'
1318  * with calls to the wrapper. If this Func is already wrapped for
1319  * use in 'f', will return the existing wrapper.
1320  *
1321  * For example, g.in(f) would rewrite a pipeline like this:
1322  \code
1323  g(x, y) = ...
1324  f(x, y) = ... g(x, y) ...
1325  \endcode
1326  * into a pipeline like this:
1327  \code
1328  g(x, y) = ...
1329  g_wrap(x, y) = g(x, y)
1330  f(x, y) = ... g_wrap(x, y)
1331  \endcode
1332  *
1333  * This has a variety of uses. You can use it to schedule this
1334  * Func differently in the different places it is used:
1335  \code
1336  g(x, y) = ...
1337  f1(x, y) = ... g(x, y) ...
1338  f2(x, y) = ... g(x, y) ...
1339  g.in(f1).compute_at(f1, y).vectorize(x, 8);
1340  g.in(f2).compute_at(f2, x).unroll(x);
1341  \endcode
1342  *
1343  * You can also use it to stage loads from this Func via some
1344  * intermediate buffer (perhaps on the stack as in
1345  * test/performance/block_transpose.cpp, or in shared GPU memory
1346  * as in test/performance/wrap.cpp). In this case we compute the
1347  * wrapper at tiles of the consuming Funcs like so:
1348  \code
1349  g.compute_root()...
1350  g.in(f).compute_at(f, tiles)...
1351  \endcode
1352  *
1353  * Func::in() can also be used to compute pieces of a Func into a
1354  * smaller scratch buffer (perhaps on the GPU) and then copy them
1355  * into a larger output buffer one tile at a time. See
1356  * apps/interpolate/interpolate.cpp for an example of this. In
1357  * this case we compute the Func at tiles of its own wrapper:
1358  \code
1359  f.in(g).compute_root().gpu_tile(...)...
1360  f.compute_at(f.in(g), tiles)...
1361  \endcode
1362  *
1363  * A similar use of Func::in() is to wrap Funcs with multiple update
1364  * stages in a pure wrapper. The following code:
1365  \code
1366  f(x, y) = x + y;
1367  f(x, y) += 5;
1368  g(x, y) = f(x, y);
1369  f.compute_root();
1370  \endcode
1371  *
1372  * Is equivalent to:
1373  \code
1374  for y:
1375  for x:
1376  f(x, y) = x + y;
1377  for y:
1378  for x:
1379  f(x, y) += 5
1380  for y:
1381  for x:
1382  g(x, y) = f(x, y)
1383  \endcode
1384  * Using Func::in(), we can write:
1385  \code
1386  f(x, y) = x + y;
1387  f(x, y) += 5;
1388  g(x, y) = f(x, y);
1389  f.in(g).compute_root();
1390  \endcode
1391  * which instead produces:
1392  \code
1393  for y:
1394  for x:
1395  f(x, y) = x + y;
1396  f(x, y) += 5
1397  f_wrap(x, y) = f(x, y)
1398  for y:
1399  for x:
1400  g(x, y) = f_wrap(x, y)
1401  \endcode
1402  */
1403  Func in(const Func &f);
1404 
1405  /** Create and return an identity wrapper shared by all the Funcs in
1406  * 'fs'. If any of the Funcs in 'fs' already have a custom wrapper,
1407  * this will throw an error. */
1408  Func in(const std::vector<Func> &fs);
1409 
1410  /** Create and return a global identity wrapper, which wraps all calls to
1411  * this Func by any other Func. If a global wrapper already exists,
1412  * returns it. The global identity wrapper is only used by callers for
1413  * which no custom wrapper has been specified.
1414  */
1415  Func in();
1416
1417  /** Similar to \ref Func::in; however, instead of replacing the call to
1418  * this Func with an identity Func that refers to it, this replaces the
1419  * call with a clone of this Func.
1420  *
1421  * For example, f.clone_in(g) would rewrite a pipeline like this:
1422  \code
1423  f(x, y) = x + y;
1424  g(x, y) = f(x, y) + 2;
1425  h(x, y) = f(x, y) - 3;
1426  \endcode
1427  * into a pipeline like this:
1428  \code
1429  f(x, y) = x + y;
1430  f_clone(x, y) = x + y;
1431  g(x, y) = f_clone(x, y) + 2;
1432  h(x, y) = f(x, y) - 3;
1433  \endcode
1434  *
1435  */
1436  //@{
1437  Func clone_in(const Func &f);
1438  Func clone_in(const std::vector<Func> &fs);
1439  //@}
1440 
1441  /** Declare that this function should be implemented by a call to
1442  * halide_buffer_copy with the given target device API. Asserts
1443  * that the Func has a pure definition which is a simple call to a
1444  * single input, and no update definitions. The wrapper Funcs
1445  * returned by in() are suitable candidates. Consumes all pure
1446  * variables, and rewrites the Func to have an extern definition
1447  * that calls halide_buffer_copy. */
1448  Func copy_to_device(DeviceAPI d = DeviceAPI::Default_GPU);
1449 
1450  /** Declare that this function should be implemented by a call to
1451  * halide_buffer_copy with a NULL target device API. Equivalent to
1452  * copy_to_device(DeviceAPI::Host). Asserts that the Func has a
1453  * pure definition which is a simple call to a single input, and
1454  * no update definitions. The wrapper Funcs returned by in() are
1455  * suitable candidates. Consumes all pure variables, and rewrites
1456  * the Func to have an extern definition that calls
1457  * halide_buffer_copy.
1458  *
1459  * Note that if the source Func is already valid in host memory,
1460  * this compiles to code that does the minimum number of calls to
1461  * memcpy.
1462  */
1463  Func copy_to_host();
1464 
1465  /** Split a dimension into inner and outer subdimensions with the
1466  * given names, where the inner dimension iterates from 0 to
1467  * factor-1. The inner and outer subdimensions can then be dealt
1468  * with using the other scheduling calls. It's ok to reuse the old
1469  * variable name as either the inner or outer variable. The final
1470  * argument specifies how the tail should be handled if the split
1471  * factor does not provably divide the extent. */
1472  Func &split(const VarOrRVar &old, const VarOrRVar &outer, const VarOrRVar &inner, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
1473 
1474  /** Join two dimensions into a single fused dimension. The fused
1475  * dimension covers the product of the extents of the inner and
1476  * outer dimensions given. */
1477  Func &fuse(const VarOrRVar &inner, const VarOrRVar &outer, const VarOrRVar &fused);
1478 
1479  /** Mark a dimension to be traversed serially. This is the default. */
1480  Func &serial(const VarOrRVar &var);
1481 
1482  /** Mark a dimension to be traversed in parallel */
1483  Func &parallel(const VarOrRVar &var);
1484 
1485  /** Split a dimension by the given task_size, and then parallelize the
1486  * outer dimension. This creates parallel tasks that have size
1487  * task_size. After this call, var refers to the outer dimension of
1488  * the split. The inner dimension has a new anonymous name. If you
1489  * wish to mutate it, or schedule with respect to it, do the split
1490  * manually. */
1491  Func &parallel(const VarOrRVar &var, const Expr &task_size, TailStrategy tail = TailStrategy::Auto);
1492 
1493  /** Mark a dimension to be computed all-at-once as a single
1494  * vector. The dimension should have constant extent -
1495  * e.g. because it is the inner dimension following a split by a
1496  * constant factor. For most uses of vectorize you want the two
1497  * argument form. The variable to be vectorized should be the
1498  * innermost one. */
1499  Func &vectorize(const VarOrRVar &var);
1500 
1501  /** Mark a dimension to be completely unrolled. The dimension
1502  * should have constant extent - e.g. because it is the inner
1503  * dimension following a split by a constant factor. For most uses
1504  * of unroll you want the two-argument form. */
1505  Func &unroll(const VarOrRVar &var);
1506 
1507  /** Split a dimension by the given factor, then vectorize the
1508  * inner dimension. This is how you vectorize a loop of unknown
1509  * size. The variable to be vectorized should be the innermost
1510  * one. After this call, var refers to the outer dimension of the
1511  * split. 'factor' must be an integer. */
1512  Func &vectorize(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
1513 
1514  /** Split a dimension by the given factor, then unroll the inner
1515  * dimension. This is how you unroll a loop of unknown size by
1516  * some constant factor. After this call, var refers to the outer
1517  * dimension of the split. 'factor' must be an integer. */
1518  Func &unroll(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
1519 
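 /** A minimal usage sketch combining split, vectorize and parallel (the
  * Func and Var names here are hypothetical):
  \code
  Func blur("blur");
  Var x("x"), y("y"), yi("yi");
  blur(x, y) = x + y;              // stand-in for real work
  blur.vectorize(x, 8)             // split x by 8 and vectorize the inner lanes
      .split(y, y, yi, 16)         // reuse y as the outer loop name
      .parallel(y);                // run 16-row strips in parallel
  \endcode
  */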
1520  /** Statically declare that the range over which a function should
1521  * be evaluated is given by the second and third arguments. This
1522  * can let Halide perform some optimizations. E.g. if you know
1523  * there are going to be 4 color channels, you can completely
1524  * vectorize the color channel dimension without the overhead of
1525  * splitting it up. If bounds inference decides that it requires
1526  * more of this function than the bounds you have stated, a
1527  * runtime error will occur when you try to run your pipeline. */
1528  Func &bound(const Var &var, Expr min, Expr extent);
1529 
1530  /** Statically declare the range over which the function will be
1531  * evaluated in the general case. This provides a basis for the auto
1532  * scheduler to make trade-offs and scheduling decisions. The auto
1533  * generated schedules might break when the sizes of the dimensions are
1534  * very different from the estimates specified. These estimates are used
1535  * only by the auto scheduler if the function is a pipeline output. */
1536  Func &set_estimate(const Var &var, const Expr &min, const Expr &extent);
1537 
1538  /** Set (min, extent) estimates for all dimensions in the Func
1539  * at once; this is equivalent to calling `set_estimate(args()[n], min, extent)`
1540  * repeatedly, but slightly terser. The size of the estimates vector
1541  * must match the dimensionality of the Func. */
1542  Func &set_estimates(const Region &estimates);
1543 
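 /** A minimal sketch of bound() and set_estimates() together (the names
  * and sizes here are hypothetical):
  \code
  Func f("f");
  Var x("x"), y("y"), c("c");
  f(x, y, c) = x + y + c;
  f.bound(c, 0, 3).unroll(c);                       // exactly three channels
  f.set_estimates({{0, 1920}, {0, 1080}, {0, 3}});  // hints for the auto-scheduler
  \endcode
  */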
1544  /** Expand the region computed so that the min coordinate is
1545  * congruent to 'remainder' modulo 'modulus', and the extent is a
1546  * multiple of 'modulus'. For example, f.align_bounds(x, 2) forces
1547  * the min and extent realized to be even, and calling
1548  * f.align_bounds(x, 2, 1) forces the min to be odd and the extent
1549  * to be even. The region computed always contains the region that
1550  * would have been computed without this directive, so no
1551  * assertions are injected.
1552  */
1553  Func &align_bounds(const Var &var, Expr modulus, Expr remainder = 0);
1554 
1555  /** Expand the region computed so that the extent is a
1556  * multiple of 'modulus'. For example, f.align_extent(x, 2) forces
1557  * the extent realized to be even. The region computed always contains the
1558  * region that would have been computed without this directive, so no
1559  * assertions are injected. (This is essentially equivalent to align_bounds(),
1560  * but always leaving the min untouched.)
1561  */
1562  Func &align_extent(const Var &var, Expr modulus);
1563 
1564  /** Bound the extent of a Func's realization, but not its
1565  * min. This means the dimension can be unrolled or vectorized
1566  * even when its min is not fixed (for example because it is
1567  * compute_at tiles of another Func). This can also be useful for
1568  * forcing a function's allocation to be a fixed size, which often
1569  * means it can go on the stack. */
1570  Func &bound_extent(const Var &var, Expr extent);
1571 
1572  /** Split two dimensions at once by the given factors, and then
1573  * reorder the resulting dimensions to be xi, yi, xo, yo from
1574  * innermost outwards. This gives a tiled traversal. */
1575  Func &tile(const VarOrRVar &x, const VarOrRVar &y,
1576  const VarOrRVar &xo, const VarOrRVar &yo,
1577  const VarOrRVar &xi, const VarOrRVar &yi,
1578  const Expr &xfactor, const Expr &yfactor,
1579  TailStrategy tail = TailStrategy::Auto);
1580 
1581  /** A shorter form of tile, which reuses the old variable names as
1582  * the new outer dimensions */
1583  Func &tile(const VarOrRVar &x, const VarOrRVar &y,
1584  const VarOrRVar &xi, const VarOrRVar &yi,
1585  const Expr &xfactor, const Expr &yfactor,
1586  TailStrategy tail = TailStrategy::Auto);
1587 
1588  /** A more general form of tile, which defines tiles of any dimensionality. */
1589  Func &tile(const std::vector<VarOrRVar> &previous,
1590  const std::vector<VarOrRVar> &outers,
1591  const std::vector<VarOrRVar> &inners,
1592  const std::vector<Expr> &factors,
1593  const std::vector<TailStrategy> &tails);
1594 
1595  /** The generalized tile, with a single tail strategy to apply to all vars. */
1596  Func &tile(const std::vector<VarOrRVar> &previous,
1597  const std::vector<VarOrRVar> &outers,
1598  const std::vector<VarOrRVar> &inners,
1599  const std::vector<Expr> &factors,
1600  TailStrategy tail = TailStrategy::Auto);
1601 
1602  /** Generalized tiling, reusing the previous names as the outer names. */
1603  Func &tile(const std::vector<VarOrRVar> &previous,
1604  const std::vector<VarOrRVar> &inners,
1605  const std::vector<Expr> &factors,
1606  TailStrategy tail = TailStrategy::Auto);
1607 
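 /** A minimal tiled-traversal sketch (tile sizes and names are hypothetical):
  \code
  Func f("f");
  Var x("x"), y("y"), xo("xo"), yo("yo"), xi("xi"), yi("yi");
  f(x, y) = x * y;
  f.tile(x, y, xo, yo, xi, yi, 64, 64)  // 64x64 tiles
   .vectorize(xi, 8)
   .parallel(yo);
  \endcode
  */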
1608  /** Reorder variables to have the given nesting order, from
1609  * innermost out */
1610  Func &reorder(const std::vector<VarOrRVar> &vars);
1611 
1612  template<typename... Args>
1613  HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<VarOrRVar, Args...>::value, Func &>::type
1614  reorder(const VarOrRVar &x, const VarOrRVar &y, Args &&...args) {
1615  std::vector<VarOrRVar> collected_args{x, y, std::forward<Args>(args)...};
1616  return reorder(collected_args);
1617  }
1618 
1619  /** Rename a dimension. Equivalent to split with an inner size of one. */
1620  Func &rename(const VarOrRVar &old_name, const VarOrRVar &new_name);
1621 
1622  /** Specify that race conditions are permitted for this Func,
1623  * which enables parallelizing over RVars even when Halide cannot
1624  * prove that it is safe to do so. Use this with great caution,
1625  * and only if you can prove to yourself that this is safe, as it
1626  * may result in a non-deterministic routine that returns
1627  * different values at different times or on different machines. */
1628  Func &allow_race_conditions();
1629 
1630  /** Issue atomic updates for this Func. This allows parallelization
1631  * on associative RVars. The function throws a compile error when
1632  * Halide fails to prove associativity. Use override_associativity_test
1633  * to disable the associativity test if you believe the function is
1634  * associative or the order of reduction variable execution does not
1635  * matter.
1636  * Halide compiles this into hardware atomic operations whenever possible,
1637  * and falls back to a mutex lock per storage element if it is impossible
1638  * to atomically update.
1639  * There are three possible outcomes of the compiled code:
1640  * atomic add, compare-and-swap loop, and mutex lock.
1641  * For example:
1642  *
1643  * hist(x) = 0;
1644  * hist(im(r)) += 1;
1645  * hist.compute_root();
1646  * hist.update().atomic().parallel();
1647  *
1648  * will be compiled to atomic add operations.
1649  *
1650  * hist(x) = 0;
1651  * hist(im(r)) = min(hist(im(r)) + 1, 100);
1652  * hist.compute_root();
1653  * hist.update().atomic().parallel();
1654  *
1655  * will be compiled to compare-and-swap loops.
1656  *
1657  * arg_max() = {0, im(0)};
1658  * Expr old_index = arg_max()[0];
1659  * Expr old_max = arg_max()[1];
1660  * Expr new_index = select(old_max < im(r), r, old_index);
1661  * Expr new_max = max(im(r), old_max);
1662  * arg_max() = {new_index, new_max};
1663  * arg_max.compute_root();
1664  * arg_max.update().atomic().parallel();
1665  *
1666  * will be compiled to updates guarded by a mutex lock,
1667  * since it is impossible to atomically update two different locations.
1668  *
1669  * Currently the atomic operation is supported by x86, CUDA, and OpenCL backends.
1670  * Compiling to other backends results in a compile error.
1671  * If an operation is compiled into a mutex lock, and is vectorized or is
1672  * compiled to CUDA or OpenCL, it also results in a compile error,
1673  * since per-element mutex lock on vectorized operation leads to a
1674  * deadlock.
1675  * Vectorization of predicated RVars (through rdom.where()) is not yet
1676  * supported on the CPU (see https://github.com/halide/Halide/issues/4298).
1677  * 8-bit and 16-bit atomics on GPU are also not supported. */
1678  Func &atomic(bool override_associativity_test = false);
1679 
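 /** A compilable sketch of the histogram case described above (the input
  * and its size are hypothetical; the cast keeps the call index an Int(32)):
  \code
  ImageParam im(UInt(8), 1);
  Func hist("hist");
  Var x("x");
  RDom r(0, 1024);
  hist(x) = 0;
  hist(cast<int>(im(r))) += 1;
  hist.compute_root();
  hist.update().atomic().parallel(r);   // compiles to atomic adds on supported backends
  \endcode
  */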
1680  /** Specialize a Func. This creates a special-case version of the
1681  * Func where the given condition is true. The most effective
1682  * conditions are those of the form param == value, and boolean
1683  * Params. Consider a simple example:
1684  \code
1685  f(x) = x + select(cond, 0, 1);
1686  f.compute_root();
1687  \endcode
1688  * This is equivalent to:
1689  \code
1690  for (int x = 0; x < width; x++) {
1691  f[x] = x + (cond ? 0 : 1);
1692  }
1693  \endcode
1694  * Adding the scheduling directive:
1695  \code
1696  f.specialize(cond)
1697  \endcode
1698  * makes it equivalent to:
1699  \code
1700  if (cond) {
1701  for (int x = 0; x < width; x++) {
1702  f[x] = x;
1703  }
1704  } else {
1705  for (int x = 0; x < width; x++) {
1706  f[x] = x + 1;
1707  }
1708  }
1709  \endcode
1710  * Note that the inner loops have been simplified. In the first
1711  * path Halide knows that cond is true, and in the second path
1712  * Halide knows that it is false.
1713  *
1714  * The specialized version gets its own schedule, which inherits
1715  * every directive made about the parent Func's schedule so far
1716  * except for its specializations. This method returns a handle to
1717  * the new schedule. If you wish to retrieve the specialized
1718  * sub-schedule again later, you can call this method with the
1719  * same condition. Consider the following example of scheduling
1720  * the specialized version:
1721  *
1722  \code
1723  f(x) = x;
1724  f.compute_root();
1725  f.specialize(width > 1).unroll(x, 2);
1726  \endcode
1727  * Assuming for simplicity that width is even, this is equivalent to:
1728  \code
1729  if (width > 1) {
1730  for (int x = 0; x < width/2; x++) {
1731  f[2*x] = 2*x;
1732  f[2*x + 1] = 2*x + 1;
1733  }
1734  } else {
1735  for (int x = 0; x < width/2; x++) {
1736  f[x] = x;
1737  }
1738  }
1739  \endcode
1740  * For this case, it may be better to schedule the un-specialized
1741  * case instead:
1742  \code
1743  f(x) = x;
1744  f.compute_root();
1745  f.specialize(width == 1); // Creates a copy of the schedule so far.
1746  f.unroll(x, 2); // Only applies to the unspecialized case.
1747  \endcode
1748  * This is equivalent to:
1749  \code
1750  if (width == 1) {
1751  f[0] = 0;
1752  } else {
1753  for (int x = 0; x < width/2; x++) {
1754  f[2*x] = 2*x;
1755  f[2*x + 1] = 2*x + 1;
1756  }
1757  }
1758  \endcode
1759  * This can be a good way to write a pipeline that splits,
1760  * vectorizes, or tiles, but can still handle small inputs.
1761  *
1762  * If a Func has several specializations, the first matching one
1763  * will be used, so the order in which you define specializations
1764  * is significant. For example:
1765  *
1766  \code
1767  f(x) = x + select(cond1, a, b) - select(cond2, c, d);
1768  f.specialize(cond1);
1769  f.specialize(cond2);
1770  \endcode
1771  * is equivalent to:
1772  \code
1773  if (cond1) {
1774  for (int x = 0; x < width; x++) {
1775  f[x] = x + a - (cond2 ? c : d);
1776  }
1777  } else if (cond2) {
1778  for (int x = 0; x < width; x++) {
1779  f[x] = x + b - c;
1780  }
1781  } else {
1782  for (int x = 0; x < width; x++) {
1783  f[x] = x + b - d;
1784  }
1785  }
1786  \endcode
1787  *
1788  * Specializations may in turn be specialized, which creates a
1789  * nested if statement in the generated code.
1790  *
1791  \code
1792  f(x) = x + select(cond1, a, b) - select(cond2, c, d);
1793  f.specialize(cond1).specialize(cond2);
1794  \endcode
1795  * This is equivalent to:
1796  \code
1797  if (cond1) {
1798  if (cond2) {
1799  for (int x = 0; x < width; x++) {
1800  f[x] = x + a - c;
1801  }
1802  } else {
1803  for (int x = 0; x < width; x++) {
1804  f[x] = x + a - d;
1805  }
1806  }
1807  } else {
1808  for (int x = 0; x < width; x++) {
1809  f[x] = x + b - (cond2 ? c : d);
1810  }
1811  }
1812  \endcode
1813  * To create a 4-way if statement that simplifies away all of the
1814  * ternary operators above, you could say:
1815  \code
1816  f.specialize(cond1).specialize(cond2);
1817  f.specialize(cond2);
1818  \endcode
1819  * or
1820  \code
1821  f.specialize(cond1 && cond2);
1822  f.specialize(cond1);
1823  f.specialize(cond2);
1824  \endcode
1825  *
1826  * Any prior Func which is compute_at some variable of this Func
1827  * gets separately included in all paths of the generated if
1828  * statement. The Var in the compute_at call must exist in all
1829  * paths, but it may have been generated via a different path of
1830  * splits, fuses, and renames. This can be used somewhat
1831  * creatively. Consider the following code:
1832  \code
1833  g(x, y) = 8*x;
1834  f(x, y) = g(x, y) + 1;
1835  f.compute_root().specialize(cond);
1836  Var g_loop;
1837  f.specialize(cond).rename(y, g_loop);
1838  f.rename(x, g_loop);
1839  g.compute_at(f, g_loop);
1840  \endcode
1841  * When cond is true, this is equivalent to g.compute_at(f,y).
1842  * When it is false, this is equivalent to g.compute_at(f,x).
1843  */
1844  Stage specialize(const Expr &condition);
1845 
1846  /** Add a specialization to a Func that always terminates execution
1847  * with a call to halide_error(). By itself, this is of limited use,
1848  * but can be useful to terminate chains of specialize() calls where
1849  * no "default" case is expected (thus avoiding unnecessary code generation).
1850  *
1851  * For instance, say we want to optimize a pipeline to process images
1852  * in planar and interleaved format; we might typically do something like:
1853  \code
1854  ImageParam im(UInt(8), 3);
1855  Func f = do_something_with(im);
1856  f.specialize(im.dim(0).stride() == 1).vectorize(x, 8); // planar
1857  f.specialize(im.dim(2).stride() == 1).reorder(c, x, y).vectorize(c); // interleaved
1858  \endcode
1859  * This code will vectorize along rows for the planar case, and across pixel
1860  * components for the interleaved case... but there is an implicit "else"
1861  * for the unhandled cases, which generates unoptimized code. If we never
1862  * anticipate passing any other sort of images to this, we can streamline
1863  * our code by adding specialize_fail():
1864  \code
1865  ImageParam im(UInt(8), 3);
1866  Func f = do_something(im);
1867  f.specialize(im.dim(0).stride() == 1).vectorize(x, 8); // planar
1868  f.specialize(im.dim(2).stride() == 1).reorder(c, x, y).vectorize(c); // interleaved
1869  f.specialize_fail("Unhandled image format");
1870  \endcode
1871  * Conceptually, this produces code like:
1872  \code
1873  if (im.dim(0).stride() == 1) {
1874  do_something_planar();
1875  } else if (im.dim(2).stride() == 1) {
1876  do_something_interleaved();
1877  } else {
1878  halide_error("Unhandled image format");
1879  }
1880  \endcode
1881  *
1882  * Note that calling specialize_fail() terminates the specialization chain
1883  * for a given Func; you cannot create new specializations for the Func
1884  * afterwards (though you can retrieve handles to previous specializations).
1885  */
1886  void specialize_fail(const std::string &message);
1887 
1888  /** Tell Halide that the following dimensions correspond to GPU
1889  * thread indices. This is useful if you compute a producer
1890  * function within the block indices of a consumer function, and
1891  * want to control how that function's dimensions map to GPU
1892  * threads. If the selected target is not an appropriate GPU, this
1893  * just marks those dimensions as parallel. */
1894  // @{
1895  Func &gpu_threads(const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
1896  Func &gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
1897  Func &gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
1898  // @}
1899 
1900  /** The given dimension corresponds to the lanes in a GPU
1901  * warp. GPU warp lanes are distinguished from GPU threads by the
1902  * fact that all warp lanes run together in lockstep, which
1903  * permits lightweight communication of data from one lane to
1904  * another. */
1905  Func &gpu_lanes(const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
1906 
1907  /** Tell Halide to run this stage using a single gpu thread and
1908  * block. This is not an efficient use of your GPU, but it can be
1909  * useful to avoid copy-back for intermediate update stages that
1910  * touch a very small part of your Func. */
1911  Func &gpu_single_thread(DeviceAPI device_api = DeviceAPI::Default_GPU);
1912 
1913  /** Tell Halide that the following dimensions correspond to GPU
1914  * block indices. This is useful for scheduling stages that will
1915  * run serially within each GPU block. If the selected target is
1916  * not ptx, this just marks those dimensions as parallel. */
1917  // @{
1918  Func &gpu_blocks(const VarOrRVar &block_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
1919  Func &gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
1920  Func &gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
1921  // @}
1922 
1923  /** Tell Halide that the following dimensions correspond to GPU
1924  * block indices and thread indices. If the selected target is not
1925  * ptx, these just mark the given dimensions as parallel. The
1926  * dimensions are consumed by this call, so do all other
1927  * unrolling, reordering, etc first. */
1928  // @{
1929  Func &gpu(const VarOrRVar &block_x, const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
1930  Func &gpu(const VarOrRVar &block_x, const VarOrRVar &block_y,
1931  const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
1932  Func &gpu(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z,
1933  const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
1934  // @}
1935 
1936  /** Short-hand for tiling a domain and mapping the tile indices
1937  * to GPU block indices and the coordinates within each tile to
1938  * GPU thread indices. Consumes the variables given, so do all
1939  * other scheduling first. */
1940  // @{
1941  Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &bx, const VarOrRVar &tx, const Expr &x_size,
1942  TailStrategy tail = TailStrategy::Auto,
1943  DeviceAPI device_api = DeviceAPI::Default_GPU);
1944 
1945  Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &tx, const Expr &x_size,
1946  TailStrategy tail = TailStrategy::Auto,
1947  DeviceAPI device_api = DeviceAPI::Default_GPU);
1948  Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &y,
1949  const VarOrRVar &bx, const VarOrRVar &by,
1950  const VarOrRVar &tx, const VarOrRVar &ty,
1951  const Expr &x_size, const Expr &y_size,
1952  TailStrategy tail = TailStrategy::Auto,
1953  DeviceAPI device_api = DeviceAPI::Default_GPU);
1954 
1955  Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &y,
1956  const VarOrRVar &tx, const VarOrRVar &ty,
1957  const Expr &x_size, const Expr &y_size,
1958  TailStrategy tail = TailStrategy::Auto,
1959  DeviceAPI device_api = DeviceAPI::Default_GPU);
1960 
1961  Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z,
1962  const VarOrRVar &bx, const VarOrRVar &by, const VarOrRVar &bz,
1963  const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz,
1964  const Expr &x_size, const Expr &y_size, const Expr &z_size,
1965  TailStrategy tail = TailStrategy::Auto,
1966  DeviceAPI device_api = DeviceAPI::Default_GPU);
1967  Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z,
1968  const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz,
1969  const Expr &x_size, const Expr &y_size, const Expr &z_size,
1970  TailStrategy tail = TailStrategy::Auto,
1971  DeviceAPI device_api = DeviceAPI::Default_GPU);
1972  // @}
1973 
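 /** A minimal GPU schedule sketch using gpu_tile() (names and tile sizes are
  * hypothetical; requires a GPU-enabled Target):
  \code
  Func f("f");
  Var x("x"), y("y"), bx("bx"), by("by"), tx("tx"), ty("ty");
  f(x, y) = x + y;
  f.gpu_tile(x, y, bx, by, tx, ty, 16, 16);  // 16x16 threads per block
  \endcode
  */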
1974  /** Schedule for execution on Hexagon. When a loop is marked with
1975  * Hexagon, that loop is executed on a Hexagon DSP. */
1976  Func &hexagon(const VarOrRVar &x = Var::outermost());
1977 
1978  /** Prefetch data written to or read from a Func or an ImageParam by a
1979  * subsequent loop iteration, at an optionally specified iteration offset.
1980  * 'var' specifies at which loop level the prefetch calls should be inserted.
1981  * The final argument specifies how prefetch of region outside bounds
1982  * should be handled.
1983  *
1984  * For example, consider this pipeline:
1985  \code
1986  Func f, g;
1987  Var x, y;
1988  f(x, y) = x + y;
1989  g(x, y) = 2 * f(x, y);
1990  \endcode
1991  *
1992  * The following schedule:
1993  \code
1994  f.compute_root();
1995  g.prefetch(f, x, 2, PrefetchBoundStrategy::NonFaulting);
1996  \endcode
1997  *
1998  * will inject prefetch call at the innermost loop of 'g' and generate
1999  * the following loop nest:
2000  * for y = ...
2001  * for x = ...
2002  * f(x, y) = x + y
2003  * for y = ..
2004  * for x = ...
2005  * prefetch(&f[x + 2, y], 1, 16);
2006  * g(x, y) = 2 * f(x, y)
2007  */
2008  // @{
2009  HALIDE_ATTRIBUTE_DEPRECATED("Call prefetch() with the two-var form instead.")
2010  Func &prefetch(const Func &f, const VarOrRVar &var, int offset = 1,
2011  PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf) {
2012  return prefetch(f, var, var, offset, strategy);
2013  }
2014  HALIDE_ATTRIBUTE_DEPRECATED("Call prefetch() with the two-var form instead.")
2015  Func &prefetch(const Internal::Parameter &param, const VarOrRVar &var, int offset = 1,
2016  PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf) {
2017  return prefetch(param, var, var, offset, strategy);
2018  }
2019  template<typename T>
2020  HALIDE_ATTRIBUTE_DEPRECATED("Call prefetch() with the two-var form instead.")
2021  Func &prefetch(const T &image, VarOrRVar var, int offset = 1,
2022  PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf) {
2023  return prefetch<T>(image, var, var, offset, strategy);
2024  }
2025  // @}
2026 
2027  /** This form of prefetch() is more fine-grained: it allows specifying
2028  * different vars for the location of the prefetch() instruction vs. the
2029  * location that is being prefetched:
2030  *
2031  * - the first var specified, 'at', indicates the loop in which the prefetch will be placed
2032  * - the second var specified, 'from', determines the var used to find the bounds to prefetch
2033  * (in conjunction with 'offset')
2034  *
2035  * If 'at' and 'from' are distinct vars, then 'from' must be at a nesting level outside 'at'.
2036  * Note that the value for 'offset' applies only to 'from', not 'at'.
2037  *
2038  * For example, consider this pipeline:
2039  \code
2040  Func f, g;
2041  Var x, y, z;
2042  f(x, y) = x + y;
2043  g(x, y) = 2 * f(x, y);
2044  h(x, y) = 3 * f(x, y);
2045  \endcode
2046  *
2047  * The following schedule:
2048  \code
2049  f.compute_root();
2050  g.prefetch(f, x, x, 2, PrefetchBoundStrategy::NonFaulting);
2051  h.prefetch(f, x, y, 2, PrefetchBoundStrategy::NonFaulting);
2052  \endcode
2053  *
2054  * will inject prefetch call at the innermost loop of 'g' and 'h' and generate
2055  * the following loop nest:
2056  \code
2057  for y = ...
2058  for x = ...
2059  f(x, y) = x + y
2060  for y = ..
2061  for x = ...
2062  prefetch(&f[x + 2, y], 1, 16);
2063  g(x, y) = 2 * f(x, y)
2064  for y = ..
2065  for x = ...
2066  prefetch(&f[x, y + 2], 1, 16);
2067  h(x, y) = 3 * f(x, y)
2068  \endcode
2069  *
2070  * Note that the 'from' nesting level need not be adjacent to 'at':
2071  \code
2072  Func f, g;
2073  Var x, y, z, w;
2074  f(x, y, z, w) = x + y + z + w;
2075  g(x, y, z, w) = 2 * f(x, y, z, w);
2076  \endcode
2077  *
2078  * The following schedule:
2079  \code
2080  f.compute_root();
2081  g.prefetch(f, y, w, 2, PrefetchBoundStrategy::NonFaulting);
2082  \endcode
2083  *
2084  * will produce code that prefetches a tile of data:
2085  \code
2086  for w = ...
2087  for z = ...
2088  for y = ...
2089  for x = ...
2090  f(x, y, z, w) = x + y + z + w
2091  for w = ...
2092  for z = ...
2093  for y = ...
2094  for x0 = ...
2095  prefetch(&f[x0, y, z, w + 2], 1, 16);
2096  for x = ...
2097  g(x, y, z, w) = 2 * f(x, y, z, w)
2098  \endcode
2099  *
2100  * Note that calling prefetch() with the same var for both 'at' and 'from'
2101  * is equivalent to calling prefetch() with that var.
2102  */
2103  // @{
2104  Func &prefetch(const Func &f, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
2105  PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf);
2106  Func &prefetch(const Internal::Parameter &param, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
2107  PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf);
2108  template<typename T>
2109  Func &prefetch(const T &image, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
2110  PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf) {
2111  return prefetch(image.parameter(), at, from, std::move(offset), strategy);
2112  }
2113  // @}
2114 
2115  /** Specify how the storage for the function is laid out. These
2116  * calls let you specify the nesting order of the dimensions. For
2117  * example, foo.reorder_storage(y, x) tells Halide to use
2118  * column-major storage for any realizations of foo, without
2119  * changing how you refer to foo in the code. You may want to do
2120  * this if you intend to vectorize across y. When representing
2121  * color images, foo.reorder_storage(c, x, y) specifies packed
2122  * storage (red, green, and blue values adjacent in memory), and
2123  * foo.reorder_storage(x, y, c) specifies planar storage (entire
2124  * red, green, and blue images one after the other in memory).
2125  *
2126  * If you leave out some dimensions, those remain in the same
2127  * positions in the nesting order while the specified variables
2128  * are reordered around them. */
2129  // @{
2130  Func &reorder_storage(const std::vector<Var> &dims);
2131 
2132  Func &reorder_storage(const Var &x, const Var &y);
2133  template<typename... Args>
2134  HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<Var, Args...>::value, Func &>::type
2135  reorder_storage(const Var &x, const Var &y, Args &&...args) {
2136  std::vector<Var> collected_args{x, y, std::forward<Args>(args)...};
2137  return reorder_storage(collected_args);
2138  }
2139  // @}
2140 
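 /** A minimal sketch requesting packed (interleaved) storage for a
  * hypothetical three-channel Func:
  \code
  Func rgb("rgb");
  Var x("x"), y("y"), c("c");
  rgb(x, y, c) = x + y + c;
  rgb.reorder_storage(c, x, y);  // red, green, blue adjacent in memory
  \endcode
  */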
2141  /** Pad the storage extent of a particular dimension of
2142  * realizations of this function up to be a multiple of the
2143  * specified alignment. This guarantees that the strides for the
2144  * dimensions stored outside of dim will be multiples of the
2145  * specified alignment, where the strides and alignment are
2146  * measured in numbers of elements.
2147  *
2148  * For example, to guarantee that a function foo(x, y, c)
2149  * representing an image has scanlines starting on offsets
2150  * aligned to multiples of 16, use foo.align_storage(x, 16). */
2151  Func &align_storage(const Var &dim, const Expr &alignment);
2152 
2153  /** Store realizations of this function in a circular buffer of a
2154  * given extent. This is more efficient when the extent of the
2155  * circular buffer is a power of 2. If the fold factor is too
2156  * small, or the dimension is not accessed monotonically, the
2157  * pipeline will generate an error at runtime.
2158  *
2159  * The fold_forward option indicates that the new values of the
2160  * producer are accessed by the consumer in a monotonically
2161  * increasing order. Folding storage of producers is also
2162  * supported if the new values are accessed in a monotonically
2163  * decreasing order by setting fold_forward to false.
2164  *
2165  * For example, consider the pipeline:
2166  \code
2167  Func f, g;
2168  Var x, y;
2169  g(x, y) = x*y;
2170  f(x, y) = g(x, y) + g(x, y+1);
2171  \endcode
2172  *
2173  * If we schedule f like so:
2174  *
2175  \code
2176  g.compute_at(f, y).store_root().fold_storage(y, 2);
2177  \endcode
2178  *
2179  * Then g will be computed at each row of f and stored in a buffer
2180  * with an extent in y of 2, alternately storing each computed row
2181  * of g in row y=0 or y=1.
2182  */
2183  Func &fold_storage(const Var &dim, const Expr &extent, bool fold_forward = true);
2184 
2185  /** Compute this function as needed for each unique value of the
2186  * given var for the given calling function f.
2187  *
2188  * For example, consider the simple pipeline:
2189  \code
2190  Func f, g;
2191  Var x, y;
2192  g(x, y) = x*y;
2193  f(x, y) = g(x, y) + g(x, y+1) + g(x+1, y) + g(x+1, y+1);
2194  \endcode
2195  *
2196  * If we schedule f like so:
2197  *
2198  \code
2199  g.compute_at(f, x);
2200  \endcode
2201  *
2202  * Then the C code equivalent to this pipeline will look like this
2203  *
2204  \code
2205 
2206  int f[height][width];
2207  for (int y = 0; y < height; y++) {
2208  for (int x = 0; x < width; x++) {
2209  int g[2][2];
2210  g[0][0] = x*y;
2211  g[0][1] = (x+1)*y;
2212  g[1][0] = x*(y+1);
2213  g[1][1] = (x+1)*(y+1);
2214  f[y][x] = g[0][0] + g[1][0] + g[0][1] + g[1][1];
2215  }
2216  }
2217 
2218  \endcode
2219  *
2220  * The allocation and computation of g is within f's loop over x,
2221  * and enough of g is computed to satisfy all that f will need for
2222  * that iteration. This has excellent locality - values of g are
2223  * used as soon as they are computed, but it does redundant
2224  * work. Each value of g ends up getting computed four times. If
2225  * we instead schedule f like so:
2226  *
2227  \code
2228  g.compute_at(f, y);
2229  \endcode
2230  *
2231  * The equivalent C code is:
2232  *
2233  \code
2234  int f[height][width];
2235  for (int y = 0; y < height; y++) {
2236  int g[2][width+1];
2237  for (int x = 0; x < width; x++) {
2238  g[0][x] = x*y;
2239  g[1][x] = x*(y+1);
2240  }
2241  for (int x = 0; x < width; x++) {
2242  f[y][x] = g[0][x] + g[1][x] + g[0][x+1] + g[1][x+1];
2243  }
2244  }
2245  \endcode
2246  *
2247  * The allocation and computation of g is within f's loop over y,
2248  * and enough of g is computed to satisfy all that f will need for
2249  * that iteration. This does less redundant work (each point in g
2250  * ends up being evaluated twice), but the locality is not quite
2251  * as good, and we have to allocate more temporary memory to store
2252  * g.
2253  */
2254  Func &compute_at(const Func &f, const Var &var);
2255 
2256  /** Schedule a function to be computed within the iteration over
2257  * some dimension of an update domain. Produces equivalent code
2258  * to the version of compute_at that takes a Var. */
2259  Func &compute_at(const Func &f, const RVar &var);
2260 
2261  /** Schedule a function to be computed within the iteration over
2262  * a given LoopLevel. */
2263  Func &compute_at(LoopLevel loop_level);
2264 
2265  /** Schedule the iteration over the initial definition of this function
2266  * to be fused with another stage 's' from outermost loop to a
2267  * given LoopLevel. */
2268  // @{
2269  Func &compute_with(const Stage &s, const VarOrRVar &var, const std::vector<std::pair<VarOrRVar, LoopAlignStrategy>> &align);
2270  Func &compute_with(const Stage &s, const VarOrRVar &var, LoopAlignStrategy align = LoopAlignStrategy::Auto);
2271  Func &compute_with(LoopLevel loop_level, const std::vector<std::pair<VarOrRVar, LoopAlignStrategy>> &align);
2272  Func &compute_with(LoopLevel loop_level, LoopAlignStrategy align = LoopAlignStrategy::Auto);
2273  // @}
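 /** A minimal compute_with() sketch fusing the loop nests of two independent
  * root-level producers (names are hypothetical; the Func passed as the first
  * argument is implicitly converted to its pure Stage):
  \code
  Func f("f"), g("g"), h("h");
  Var x("x"), y("y");
  f(x, y) = x + y;
  g(x, y) = x - y;
  h(x, y) = f(x, y) + g(x, y);
  f.compute_root();
  g.compute_root();
  f.compute_with(g, y);  // f and g share one loop nest over y
  \endcode
  */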
2274  /** Compute all of this function once ahead of time. Reusing
2275  * the example in \ref Func::compute_at :
2276  *
2277  \code
2278  Func f, g;
2279  Var x, y;
2280  g(x, y) = x*y;
2281  f(x, y) = g(x, y) + g(x, y+1) + g(x+1, y) + g(x+1, y+1);
2282 
2283  g.compute_root();
2284  \endcode
2285  *
2286  * is equivalent to
2287  *
2288  \code
2289  int f[height][width];
2290  int g[height+1][width+1];
2291  for (int y = 0; y < height+1; y++) {
2292  for (int x = 0; x < width+1; x++) {
2293  g[y][x] = x*y;
2294  }
2295  }
2296  for (int y = 0; y < height; y++) {
2297  for (int x = 0; x < width; x++) {
2298  f[y][x] = g[y][x] + g[y+1][x] + g[y][x+1] + g[y+1][x+1];
2299  }
2300  }
2301  \endcode
2302  *
2303  * g is computed once ahead of time, and enough is computed to
2304  * satisfy all uses of it. This does no redundant work (each point
2305  * in g is evaluated once), but has poor locality (values of g are
2306  * probably not still in cache when they are used by f), and
2307  * allocates lots of temporary memory to store g.
2308  */
2309  Func &compute_root();
2310 
2311  /** Use the halide_memoization_cache_... interface to store a
2312  * computed version of this function across invocations of the
2313  * Func.
2314  *
2315  * If an eviction_key is provided, it must be constructed with
2316  * Expr of integer or handle type. The key Expr will be promoted
2317  * to a uint64_t and can be used with halide_memoization_cache_evict
2318  * to remove memoized entries using this eviction key from the
2319  * cache. Memoized computations that do not provide an eviction
2320  * key will never be evicted by this mechanism.
2321  */
2322  Func &memoize(const EvictionKey &eviction_key = EvictionKey());
2323 
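 /** A minimal memoize() sketch with an eviction key (the Param and Func
  * names are hypothetical):
  \code
  Param<int> generation("generation");
  Func expensive("expensive");
  Var x("x"), y("y");
  expensive(x, y) = x * y;  // stand-in for costly work
  expensive.compute_root().memoize(EvictionKey(generation));
  \endcode
  */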
2324  /** Produce this Func asynchronously in a separate
2325  * thread. Consumers will be run by the task system when the
2326  * production is complete. If this Func's store level is different
2327  * to its compute level, consumers will be run concurrently,
2328  * blocking as necessary to prevent reading ahead of what the
2329  * producer has computed. If storage is folded, then the producer
2330  * will additionally not be permitted to run too far ahead of the
2331  * consumer, to avoid clobbering data that has not yet been
2332  * used.
2333  *
2334  * Take special care when combining this with custom thread pool
2335  * implementations, as avoiding deadlock with producer-consumer
2336  * parallelism requires a much more sophisticated parallel runtime
2337  * than with data parallelism alone. It is strongly recommended
2338  * you just use Halide's default thread pool, which guarantees no
2339  * deadlock and a bound on the number of threads launched.
2340  */
2341  Func &async();
2342 
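 /** A minimal async() sketch: the producer is computed per row of the
  * consumer but stored at root, so the two run concurrently (names are
  * hypothetical):
  \code
  Func producer("producer"), consumer("consumer");
  Var x("x"), y("y");
  producer(x, y) = x + y;
  consumer(x, y) = producer(x, y) + producer(x, y + 1);
  producer.store_root().compute_at(consumer, y).async();
  \endcode
  */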
2343  /** Allocate storage for this function within f's loop over
2344  * var. Scheduling storage is optional, and can be used to
2345  * separate the loop level at which storage occurs from the loop
2346  * level at which computation occurs to trade off between locality
2347  * and redundant work. This can open the door for two types of
2348  * optimization.
2349  *
2350  * Consider again the pipeline from \ref Func::compute_at :
2351  \code
2352  Func f, g;
2353  Var x, y;
2354  g(x, y) = x*y;
2355  f(x, y) = g(x, y) + g(x+1, y) + g(x, y+1) + g(x+1, y+1);
2356  \endcode
2357  *
2358  * If we schedule it like so:
2359  *
2360  \code
2361  g.compute_at(f, x).store_at(f, y);
2362  \endcode
2363  *
2364  * Then the computation of g takes place within the loop over x,
2365  * but the storage takes place within the loop over y:
2366  *
2367  \code
2368  int f[height][width];
2369  for (int y = 0; y < height; y++) {
2370  int g[2][width+1];
2371  for (int x = 0; x < width; x++) {
2372  g[0][x] = x*y;
2373  g[0][x+1] = (x+1)*y;
2374  g[1][x] = x*(y+1);
2375  g[1][x+1] = (x+1)*(y+1);
2376  f[y][x] = g[0][x] + g[1][x] + g[0][x+1] + g[1][x+1];
2377  }
2378  }
2379  \endcode
2380  *
2381  * Provided the for loop over x is serial, Halide then
2382  * automatically performs the following sliding window
2383  * optimization:
2384  *
2385  \code
2386  int f[height][width];
2387  for (int y = 0; y < height; y++) {
2388  int g[2][width+1];
2389  for (int x = 0; x < width; x++) {
2390  if (x == 0) {
2391  g[0][x] = x*y;
2392  g[1][x] = x*(y+1);
2393  }
2394  g[0][x+1] = (x+1)*y;
2395  g[1][x+1] = (x+1)*(y+1);
2396  f[y][x] = g[0][x] + g[1][x] + g[0][x+1] + g[1][x+1];
2397  }
2398  }
2399  \endcode
2400  *
2401  * Two of the assignments to g only need to be done when x is
2402  * zero. The rest of the time, those sites have already been
2403  * filled in by a previous iteration. This version has the
2404  * locality of compute_at(f, x), but allocates more memory and
2405  * does much less redundant work.
2406  *
2407  * Halide then further optimizes this pipeline like so:
2408  *
2409  \code
2410  int f[height][width];
2411  for (int y = 0; y < height; y++) {
2412  int g[2][2];
2413  for (int x = 0; x < width; x++) {
2414  if (x == 0) {
2415  g[0][0] = x*y;
2416  g[1][0] = x*(y+1);
2417  }
2418  g[0][(x+1)%2] = (x+1)*y;
2419  g[1][(x+1)%2] = (x+1)*(y+1);
2420  f[y][x] = g[0][x%2] + g[1][x%2] + g[0][(x+1)%2] + g[1][(x+1)%2];
2421  }
2422  }
2423  \endcode
2424  *
2425  * Halide has detected that it's possible to use a circular buffer
2426  * to represent g, and has reduced all accesses to g modulo 2 in
2427  * the x dimension. This optimization only triggers if the for
2428  * loop over x is serial, and if Halide can statically determine
2429  * some power of two large enough to cover the range needed. For
2430  * powers of two, the modulo operator compiles to more efficient
2431  * bit-masking. This optimization reduces memory usage, and also
2432  * improves locality by reusing recently-accessed memory instead
2433  * of pulling new memory into cache.
2434  *
2435  */
2436  Func &store_at(const Func &f, const Var &var);
2437 
2438  /** Equivalent to the version of store_at that takes a Var, but
2439  * schedules storage within the loop over a dimension of a
2440  * reduction domain */
2441  Func &store_at(const Func &f, const RVar &var);
2442 
2443  /** Equivalent to the version of store_at that takes a Var, but
2444  * schedules storage at a given LoopLevel. */
2445  Func &store_at(LoopLevel loop_level);
2446 
2447  /** Equivalent to \ref Func::store_at, but schedules storage
2448  * outside the outermost loop. */
2449  Func &store_root();
2450 
2451  /** Aggressively inline all uses of this function. This is the
2452  * default schedule, so you're unlikely to need to call this. For
2453  * a Func with an update definition, that means it gets computed
2454  * as close to the innermost loop as possible.
2455  *
2456  * Consider once more the pipeline from \ref Func::compute_at :
2457  *
2458  \code
2459  Func f, g;
2460  Var x, y;
2461  g(x, y) = x*y;
2462  f(x, y) = g(x, y) + g(x+1, y) + g(x, y+1) + g(x+1, y+1);
2463  \endcode
2464  *
2465  * Leaving g as inline, this compiles to code equivalent to the following C:
2466  *
2467  \code
2468  int f[height][width];
2469  for (int y = 0; y < height; y++) {
2470  for (int x = 0; x < width; x++) {
2471  f[y][x] = x*y + x*(y+1) + (x+1)*y + (x+1)*(y+1);
2472  }
2473  }
2474  \endcode
2475  */
2476  Func &compute_inline();
2477 
2478  /** Get a handle on an update step for the purposes of scheduling
2479  * it. */
2480  Stage update(int idx = 0);
2481 
2482  /** Set the type of memory this Func should be stored in. Controls
2483  * whether allocations go on the stack or the heap on the CPU, and
2484  * in global vs shared vs local on the GPU. See the documentation
2485  * on MemoryType for more detail. */
2486  Func &store_in(MemoryType memory_type);
2487 
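 /** A minimal store_in() sketch forcing a small intermediate onto the stack
  * (names are hypothetical; the footprint must be a compile-time constant):
  \code
  Func f("f"), g("g");
  Var x("x");
  f(x) = x;
  g(x) = f(x) + f(x + 1);
  f.compute_at(g, x).store_in(MemoryType::Stack);
  \endcode
  */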
2488  /** Trace all loads from this Func by emitting calls to
2489  * halide_trace. If the Func is inlined, this has no
2490  * effect. */
2491  Func &trace_loads();
2492 
2493  /** Trace all stores to the buffer backing this Func by emitting
2494  * calls to halide_trace. If the Func is inlined, this call
2495  * has no effect. */
2496  Func &trace_stores();
2497 
2498  /** Trace all realizations of this Func by emitting calls to
2499  * halide_trace. */
2500  Func &trace_realizations();
2501 
2502  /** Add a string of arbitrary text that will be passed through to trace
2503  * inspection code if the Func is realized in trace mode. (Funcs that are
2504  * inlined won't have their tags emitted.) Ignored entirely if
2505  * tracing is not enabled for the Func (or globally).
2506  */
2507  Func &add_trace_tag(const std::string &trace_tag);
2508 
2509  /** Get a handle on the internal halide function that this Func
2510  * represents. Useful if you want to do introspection on Halide
2511  * functions */
2512  Internal::Function function() const {
2513  return func;
2514  }
2515 
2516  /** You can cast a Func to its pure stage for the purposes of
2517  * scheduling it. */
2518  operator Stage() const;
2519 
2520  /** Get a handle on the output buffer for this Func. Only relevant
2521  * if this is the output Func in a pipeline. Useful for making
2522  * static promises about strides, mins, and extents. */
2523  // @{
2524  OutputImageParam output_buffer() const;
2525  std::vector<OutputImageParam> output_buffers() const;
2526  // @}
2527 
2528  /** Use a Func as an argument to an external stage. */
2529  operator ExternFuncArgument() const;
2530 
2531  /** Infer the arguments to the Func, sorted into a canonical order:
2532  * all buffers (sorted alphabetically by name), followed by all non-buffers
2533  * (sorted alphabetically by name).
2534  * This lets you write things like:
2535  \code
2536  func.compile_to_assembly("/dev/stdout", func.infer_arguments());
2537  \endcode
2538  */
2539  std::vector<Argument> infer_arguments() const;
2540 
2541  /** Get the source location of the pure definition of this
2542  * Func. See Stage::source_location() */
2543  std::string source_location() const;
2544 
2545  /** Return the current StageSchedule associated with this initial
2546  * Stage of this Func. For introspection only: to modify schedule,
2547  * use the Func interface. */
2548  const Internal::StageSchedule &get_schedule() const {
2549  return Stage(*this).get_schedule();
2550  }
2551 };
2552 
2553 namespace Internal {
2554 
2555 template<typename Last>
2556 inline void check_types(const Tuple &t, int idx) {
2557  using T = typename std::remove_pointer<typename std::remove_reference<Last>::type>::type;
2558  user_assert(t[idx].type() == type_of<T>())
2559  << "Can't evaluate expression "
2560  << t[idx] << " of type " << t[idx].type()
2561  << " as a scalar of type " << type_of<T>() << "\n";
2562 }
2563 
2564 template<typename First, typename Second, typename... Rest>
2565 inline void check_types(const Tuple &t, int idx) {
2566  check_types<First>(t, idx);
2567  check_types<Second, Rest...>(t, idx + 1);
2568 }
2569 
2570 template<typename Last>
2571 inline void assign_results(Realization &r, int idx, Last last) {
2572  using T = typename std::remove_pointer<typename std::remove_reference<Last>::type>::type;
2573  *last = Buffer<T>(r[idx])();
2574 }
2575 
2576 template<typename First, typename Second, typename... Rest>
2577 inline void assign_results(Realization &r, int idx, First first, Second second, Rest &&...rest) {
2578  assign_results<First>(r, idx, first);
2579  assign_results<Second, Rest...>(r, idx + 1, second, rest...);
2580 }
2581 
2582 } // namespace Internal
2583 
2584 /** JIT-Compile and run enough code to evaluate a Halide
2585  * expression. This can be thought of as a scalar version of
2586  * \ref Func::realize */
2587 template<typename T>
2588 HALIDE_NO_USER_CODE_INLINE T evaluate(const Expr &e) {
2589  user_assert(e.type() == type_of<T>())
2590  << "Can't evaluate expression "
2591  << e << " of type " << e.type()
2592  << " as a scalar of type " << type_of<T>() << "\n";
2593  Func f;
2594  f() = e;
2595  Buffer<T> im = f.realize();
2596  return im();
2597 }
2598 
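 /** A minimal usage sketch:
  \code
  int seven = evaluate<int>(Expr(3) + Expr(4));
  \endcode
  */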
2599 /** JIT-compile and run enough code to evaluate a Halide Tuple. */
2600 template<typename First, typename... Rest>
2601 HALIDE_NO_USER_CODE_INLINE void evaluate(Tuple t, First first, Rest &&...rest) {
2602  Internal::check_types<First, Rest...>(t, 0);
2603 
2604  Func f;
2605  f() = t;
2606  Realization r = f.realize();
2607  Internal::assign_results(r, 0, first, rest...);
2608 }
2609 
2610 namespace Internal {
2611 
2612 inline void schedule_scalar(Func f) {
2613  Target t = get_jit_target_from_environment();
2614  if (t.has_gpu_feature()) {
2615  f.gpu_single_thread();
2616  }
2617  if (t.has_feature(Target::HVX)) {
2618  f.hexagon();
2619  }
2620 }
2621 
2622 } // namespace Internal
2623 
2624 /** JIT-Compile and run enough code to evaluate a Halide
2625  * expression. This can be thought of as a scalar version of
2626  * \ref Func::realize. Can use GPU if jit target from environment
2627  * specifies one.
2628  */
2629 template<typename T>
2630 HALIDE_NO_USER_CODE_INLINE T evaluate_may_gpu(const Expr &e) {
2631  user_assert(e.type() == type_of<T>())
2632  << "Can't evaluate expression "
2633  << e << " of type " << e.type()
2634  << " as a scalar of type " << type_of<T>() << "\n";
2635  Func f;
2636  f() = e;
2637  Internal::schedule_scalar(f);
2638  Buffer<T> im = f.realize();
2639  return im();
2640 }
2641 
2642 /** JIT-compile and run enough code to evaluate a Halide Tuple. Can
2643  * use GPU if jit target from environment specifies one. */
2644 // @{
2645 template<typename First, typename... Rest>
2646 HALIDE_NO_USER_CODE_INLINE void evaluate_may_gpu(Tuple t, First first, Rest &&...rest) {
2647  Internal::check_types<First, Rest...>(t, 0);
2648 
2649  Func f;
2650  f() = t;
2651  Internal::schedule_scalar(f);
2652  Realization r = f.realize();
2653  Internal::assign_results(r, 0, first, rest...);
2654 }
2655 // @}
2656 
2657 } // namespace Halide
2658 
2659 #endif
Defines a type used for expressing the type signature of a generated halide pipeline.
#define internal_assert(c)
Definition: Errors.h:19
#define user_assert(c)
Definition: Errors.h:15
Base classes for Halide expressions (Halide::Expr) and statements (Halide::Internal::Stmt)
#define HALIDE_ATTRIBUTE_DEPRECATED(x)
Defines the struct representing lifetime and dependencies of a JIT compiled halide pipeline.
Defines Module, an IR container that fully describes a Halide program.
Classes for declaring scalar parameters to halide pipelines.
Defines the front-end class representing an entire Halide imaging pipeline.
Defines the front-end syntax for reduction domains and reduction variables.
Defines the structure that describes a Halide target.
Defines Tuple - the front-end handle on small arrays of expressions.
#define HALIDE_NO_USER_CODE_INLINE
Definition: Util.h:45
Defines the Var - the front-end variable.
A Halide::Buffer is a named shared reference to a Halide::Runtime::Buffer.
Definition: Buffer.h:115
Helper class for identifying purpose of an Expr passed to memoize.
Definition: Func.h:683
EvictionKey(const Expr &expr=Expr())
Definition: Func.h:689
A halide function.
Definition: Func.h:698
void print_loop_nest()
Write out the loop nests specified by the schedule for this Function.
Func & unroll(const VarOrRVar &var)
Mark a dimension to be completely unrolled.
bool is_extern() const
Is this function an external stage? That is, was it defined using define_extern?
FuncRef operator()(std::vector< Expr >) const
Either calls to the function, or the left-hand-side of an update definition (see RDom).
Func & hexagon(const VarOrRVar &x=Var::outermost())
Schedule for execution on Hexagon.
Func(const std::string &name)
Declare a new undefined function with the given name.
void compile_to_multitarget_object_files(const std::string &filename_prefix, const std::vector< Argument > &args, const std::vector< Target > &targets, const std::vector< std::string > &suffixes)
Like compile_to_multitarget_static_library(), except that the object files are all output as object f...
Func & align_extent(const Var &var, Expr modulus)
Expand the region computed so that the extent is a multiple of 'modulus'.
HALIDE_NO_USER_CODE_INLINE std::enable_if< Internal::all_are_convertible< Var, Args... >::value, FuncRef >::type operator()(Args &&...args) const
Definition: Func.h:1293
Func & tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &xo, const VarOrRVar &yo, const VarOrRVar &xi, const VarOrRVar &yi, const Expr &xfactor, const Expr &yfactor, TailStrategy tail=TailStrategy::Auto)
Split two dimensions at once by the given factors, and then reorder the resulting dimensions to be xi...
void specialize_fail(const std::string &message)
Add a specialization to a Func that always terminates execution with a call to halide_error().
Func & memoize(const EvictionKey &eviction_key=EvictionKey())
Use the halide_memoization_cache_...
void compile_to_assembly(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name, const Target &target=get_target_from_environment())
Statically compile this function to text assembly equivalent to the object file generated by compile_...
Func & gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z, const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz, const Expr &x_size, const Expr &y_size, const Expr &z_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Func & allow_race_conditions()
Specify that race conditions are permitted for this Func, which enables parallelizing over RVars even...
bool has_update_definition() const
Does this function have at least one update definition?
void compile_jit(const Target &target=get_jit_target_from_environment())
Eagerly jit compile the function to machine code.
Func()
Declare a new undefined function with an automatically-generated unique name.
Func & async()
Produce this Func asynchronously in a separate thread.
void compile_to_bitcode(const std::string &filename, const std::vector< Argument > &, const Target &target=get_target_from_environment())
void realize(Pipeline::RealizationArg outputs, const Target &target=Target(), const ParamMap &param_map=ParamMap::empty_map())
Evaluate this function into an existing allocated buffer or buffers.
void set_custom_trace(int(*trace_fn)(void *, const halide_trace_event_t *))
Set custom routines to call when tracing is enabled.
Func & gpu(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z, const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api=DeviceAPI::Default_GPU)
Func & gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api=DeviceAPI::Default_GPU)
Func & compute_root()
Compute all of this function once ahead of time.
Func & tile(const std::vector< VarOrRVar > &previous, const std::vector< VarOrRVar > &inners, const std::vector< Expr > &factors, TailStrategy tail=TailStrategy::Auto)
Generalized tiling, reusing the previous names as the outer names.
Func & gpu(const VarOrRVar &block_x, const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Tell Halide that the following dimensions correspond to GPU block indices and thread indices.
Func & compute_with(const Stage &s, const VarOrRVar &var, const std::vector< std::pair< VarOrRVar, LoopAlignStrategy >> &align)
Schedule the iteration over the initial definition of this function to be fused with another stage 's...
void compile_to_lowered_stmt(const std::string &filename, const std::vector< Argument > &args, StmtOutputFormat fmt=Text, const Target &target=get_target_from_environment())
Write out an internal representation of lowered code.
void compile_to_c(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name="", const Target &target=get_target_from_environment())
Statically compile this function to C source code.
Func & fuse(const VarOrRVar &inner, const VarOrRVar &outer, const VarOrRVar &fused)
Join two dimensions into a single fused dimenion.
Func & fold_storage(const Var &dim, const Expr &extent, bool fold_forward=true)
Store realizations of this function in a circular buffer of a given extent.
Func & store_at(LoopLevel loop_level)
Equivalent to the version of store_at that takes a Var, but schedules storage at a given LoopLevel.
Stage update(int idx=0)
Get a handle on an update step for the purposes of scheduling it.
Func & reorder_storage(const Var &x, const Var &y)
HALIDE_NO_USER_CODE_INLINE std::enable_if< Internal::all_are_convertible< Expr, Args... >::value, FuncRef >::type operator()(const Expr &x, Args &&...args) const
Definition: Func.h:1310
bool defined() const
Does this function have at least a pure definition.
Func & compute_at(LoopLevel loop_level)
Schedule a function to be computed within the iteration over a given LoopLevel.
const Internal::StageSchedule & get_schedule() const
Return the current StageSchedule associated with this initial Stage of this Func.
Definition: Func.h:2548
Func & gpu_blocks(const VarOrRVar &block_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Tell Halide that the following dimensions correspond to GPU block indices.
Func & store_at(const Func &f, const Var &var)
Allocate storage for this function within f's loop over var.
Func copy_to_host()
Declare that this function should be implemented by a call to halide_buffer_copy with a NULL target d...
Func & split(const VarOrRVar &old, const VarOrRVar &outer, const VarOrRVar &inner, const Expr &factor, TailStrategy tail=TailStrategy::Auto)
Split a dimension into inner and outer subdimensions with the given names, where the inner dimension ...
Func & prefetch(const Func &f, const VarOrRVar &var, int offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
Prefetch data written to or read from a Func or an ImageParam by a subsequent loop iteration,...
Definition: Func.h:2010
Func & compute_with(LoopLevel loop_level, const std::vector< std::pair< VarOrRVar, LoopAlignStrategy >> &align)
std::vector< Argument > infer_arguments() const
Infer the arguments to the Func, sorted into a canonical order: all buffers (sorted alphabetically by...
void compile_to_header(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name="", const Target &target=get_target_from_environment())
Emit a header file with the given filename for this function.
Func & prefetch(const T &image, const VarOrRVar &at, const VarOrRVar &from, Expr offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
Definition: Func.h:2109
std::vector< Var > args() const
Get the pure arguments.
Func(const Expr &e)
Declare a new function with an automatically-generated unique name, and define it to return the given...
Func & add_trace_tag(const std::string &trace_tag)
Add a string of arbitrary text that will be passed through to trace inspection code if the Func is reali...
int dimensions() const
The dimensionality (number of arguments) of this function.
HALIDE_NO_USER_CODE_INLINE std::enable_if< Internal::all_are_convertible< Var, Args... >::value, Func & >::type reorder_storage(const Var &x, const Var &y, Args &&...args)
Definition: Func.h:2135
void set_custom_do_par_for(int(*custom_do_par_for)(void *, int(*)(void *, int, uint8_t *), int, int, uint8_t *))
Set a custom parallel for loop launcher.
Func & align_bounds(const Var &var, Expr modulus, Expr remainder=0)
Expand the region computed so that the min coordinate is congruent to 'remainder' modulo 'modulus',...
std::string source_location() const
Get the source location of the pure definition of this Func.
Func & compute_with(LoopLevel loop_level, LoopAlignStrategy align=LoopAlignStrategy::Auto)
HALIDE_NO_USER_CODE_INLINE std::enable_if< Internal::all_are_convertible< VarOrRVar, Args... >::value, Func & >::type reorder(const VarOrRVar &x, const VarOrRVar &y, Args &&...args)
Definition: Func.h:1614
void infer_input_bounds(const std::vector< int32_t > &sizes, const Target &target=get_jit_target_from_environment(), const ParamMap &param_map=ParamMap::empty_map())
For a given size of output, or a given output buffer, determine the bounds required of all unbound Im...
Func & store_root()
Equivalent to Func::store_at, but schedules storage outside the outermost loop.
int outputs() const
Get the number of outputs of this Func.
void set_custom_allocator(void *(*malloc)(void *, size_t), void(*free)(void *, void *))
Set a custom malloc and free for halide to use.
Tuple update_values(int idx=0) const
Get the right-hand-side of an update definition for functions that return multiple values.
void compile_to_bitcode(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name, const Target &target=get_target_from_environment())
Statically compile this function to llvm bitcode, with the given filename (which should probably end ...
int num_update_definitions() const
How many update definitions does this function have?
Func & rename(const VarOrRVar &old_name, const VarOrRVar &new_name)
Rename a dimension.
Func & vectorize(const VarOrRVar &var)
Mark a dimension to be computed all-at-once as a single vector.
Func & tile(const std::vector< VarOrRVar > &previous, const std::vector< VarOrRVar > &outers, const std::vector< VarOrRVar > &inners, const std::vector< Expr > &factors, const std::vector< TailStrategy > &tails)
A more general form of tile, which defines tiles of any dimensionality.
Func & prefetch(const Internal::Parameter &param, const VarOrRVar &at, const VarOrRVar &from, Expr offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
Func & bound_extent(const Var &var, Expr extent)
Bound the extent of a Func's realization, but not its min.
Func & trace_stores()
Trace all stores to the buffer backing this Func by emitting calls to halide_trace.
Func & set_estimates(const Region &estimates)
Set (min, extent) estimates for all dimensions in the Func at once; this is equivalent to calling set...
Stage specialize(const Expr &condition)
Specialize a Func.
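A sketch of specialization on a runtime parameter (the Param name and the vector width of 8 are assumptions for illustration):

    #include "Halide.h"
    using namespace Halide;

    int main() {
        ImageParam input(Int(32), 2);
        Param<int> width("width");
        Func f("f");
        Var x("x"), y("y");

        f(x, y) = input(x, y) + 1;

        // Generate a dedicated vectorized code path for the case where the
        // output is at least 8 wide; the unspecialized scalar path remains
        // as the fallback.
        f.specialize(width >= 8).vectorize(x, 8);
        return 0;
    }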
Func & compute_at(const Func &f, const Var &var)
Compute this function as needed for each unique value of the given var for the given calling function...
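The classic producer/consumer use of compute_at, sketched with illustrative names (a separable-style blur where the horizontal pass is produced per row of the vertical pass):

    #include "Halide.h"
    using namespace Halide;

    int main() {
        Func in("in"), blur_x("blur_x"), blur_y("blur_y");
        Var x("x"), y("y");

        in(x, y) = x + y;
        blur_x(x, y) = (in(x - 1, y) + in(x, y) + in(x + 1, y)) / 3;
        blur_y(x, y) = (blur_x(x, y - 1) + blur_x(x, y) + blur_x(x, y + 1)) / 3;

        // Compute the rows of blur_x needed for each row of blur_y just
        // before that row is produced, trading some recomputation for
        // locality; 'in' is computed once ahead of time.
        in.compute_root();
        blur_x.compute_at(blur_y, y);

        blur_y.realize({64, 64});
        return 0;
    }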
void set_custom_do_task(int(*custom_do_task)(void *, int(*)(void *, int, uint8_t *), int, uint8_t *))
Set a custom task handler to be called by the parallel for loop.
Func & tile(const std::vector< VarOrRVar > &previous, const std::vector< VarOrRVar > &outers, const std::vector< VarOrRVar > &inners, const std::vector< Expr > &factors, TailStrategy tail=TailStrategy::Auto)
The generalized tile, with a single tail strategy to apply to all vars.
Func & reorder_storage(const std::vector< Var > &dims)
Specify how the storage for the function is laid out.
Func & compute_at(const Func &f, const RVar &var)
Schedule a function to be computed within the iteration over some dimension of an update domain.
Func & gpu_tile(const VarOrRVar &x, const VarOrRVar &bx, const VarOrRVar &tx, const Expr &x_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Short-hand for tiling a domain and mapping the tile indices to GPU block indices and the coordinates ...
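A sketch of the shorthand (the block/thread Var names and the 16x16 tile size are illustrative):

    #include "Halide.h"
    using namespace Halide;

    int main() {
        Func f("f");
        Var x("x"), y("y"), bx("bx"), by("by"), tx("tx"), ty("ty");
        f(x, y) = x * y;

        Target t = get_jit_target_from_environment();
        if (t.has_gpu_feature()) {
            // Equivalent to tiling 16x16 and then mapping the tile indices
            // to gpu_blocks and the within-tile coordinates to gpu_threads.
            f.gpu_tile(x, y, bx, by, tx, ty, 16, 16);
            f.realize({256, 256}, t);
        }
        return 0;
    }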
const std::vector< Expr > & update_args(int idx=0) const
Get the left-hand-side of the update definition.
Func & gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, DeviceAPI device_api=DeviceAPI::Default_GPU)
Func & store_at(const Func &f, const RVar &var)
Equivalent to the version of store_at that takes a Var, but schedules storage within the loop over a ...
Realization realize(std::vector< int32_t > sizes={}, const Target &target=Target(), const ParamMap &param_map=ParamMap::empty_map())
Evaluate this function over some rectangular domain and return the resulting buffer or buffers.
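The simplest JIT round trip, as a sketch:

    #include "Halide.h"
    #include <cstdio>
    using namespace Halide;

    int main() {
        Func gradient("gradient");
        Var x("x"), y("y");
        gradient(x, y) = x + y;

        // JIT-compile and evaluate over a 4x4 domain; the Realization
        // converts to a host-readable Buffer.
        Buffer<int> out = gradient.realize({4, 4});
        printf("out(3, 3) = %d\n", out(3, 3));
        return 0;
    }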
void define_extern(const std::string &function_name, const std::vector< ExternFuncArgument > &params, const std::vector< Type > &types, const std::vector< Var > &arguments, NameMangling mangling=NameMangling::Default, DeviceAPI device_api=DeviceAPI::Host)
Func & parallel(const VarOrRVar &var, const Expr &task_size, TailStrategy tail=TailStrategy::Auto)
Split a dimension by the given task_size, and then parallelize the outer dimension.
Expr value() const
The right-hand-side value of the pure definition of this function.
Func & tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &xi, const VarOrRVar &yi, const Expr &xfactor, const Expr &yfactor, TailStrategy tail=TailStrategy::Auto)
A shorter form of tile, which reuses the old variable names as the new outer dimensions.
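A sketch of this form, where x and y become the tile indices (names and factors are illustrative):

    #include "Halide.h"
    using namespace Halide;

    int main() {
        Func f("f");
        Var x("x"), y("y"), xi("xi"), yi("yi");
        f(x, y) = x + y;

        // After the call, x and y index 8x8 tiles and xi/yi index the
        // coordinates within each tile.
        f.tile(x, y, xi, yi, 8, 8)
         .vectorize(xi)
         .parallel(y);

        f.realize({128, 128});
        return 0;
    }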
void set_error_handler(void(*handler)(void *, const char *))
Set the error handler function that will be called in the case of runtime errors during halide pipelines.
Func & gpu_tile(const VarOrRVar &x, const VarOrRVar &tx, const Expr &x_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Func clone_in(const std::vector< Func > &fs)
Module compile_to_module(const std::vector< Argument > &args, const std::string &fn_name="", const Target &target=get_target_from_environment())
Store an internal representation of lowered code as a self-contained Module suitable for further comp...
void define_extern(const std::string &function_name, const std::vector< ExternFuncArgument > &params, const std::vector< Type > &types, int dimensionality, NameMangling mangling=NameMangling::Default, DeviceAPI device_api=DeviceAPI::Host)
Definition: Func.h:1240
void set_custom_print(void(*handler)(void *, const char *))
Set the function called to print messages from the runtime.
Func in()
Create and return a global identity wrapper, which wraps all calls to this Func by any other Func.
Func & vectorize(const VarOrRVar &var, const Expr &factor, TailStrategy tail=TailStrategy::Auto)
Split a dimension by the given factor, then vectorize the inner dimension.
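A sketch of the factored form (roughly: split x by 8, then vectorize the inner dimension):

    #include "Halide.h"
    using namespace Halide;

    int main() {
        Func f("f");
        Var x("x");
        f(x) = sqrt(cast<float>(x));

        // 8 lanes per vector; GuardWithIf copes with extents that are not
        // multiples of 8.
        f.vectorize(x, 8, TailStrategy::GuardWithIf);

        f.realize({1000});
        return 0;
    }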
OutputImageParam output_buffer() const
Get a handle on the output buffer for this Func.
Expr update_value(int idx=0) const
Get the right-hand-side of an update definition.
Func & bound(const Var &var, Expr min, Expr extent)
Statically declare that the range over which a function should be evaluated is given by the second an...
void compile_to_llvm_assembly(const std::string &filename, const std::vector< Argument > &, const Target &target=get_target_from_environment())
Func & gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api=DeviceAPI::Default_GPU)
void add_custom_lowering_pass(T *pass)
Add a custom pass to be used during lowering.
Definition: Func.h:1127
Func in(const std::vector< Func > &fs)
Create and return an identity wrapper shared by all the Funcs in 'fs'.
void compile_to(const std::map< Output, std::string > &output_files, const std::vector< Argument > &args, const std::string &fn_name, const Target &target=get_target_from_environment())
Compile and generate multiple target files with single call.
Func & gpu(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api=DeviceAPI::Default_GPU)
Func & parallel(const VarOrRVar &var)
Mark a dimension to be traversed in parallel.
Func & serial(const VarOrRVar &var)
Mark a dimension to be traversed serially.
Func & prefetch(const Func &f, const VarOrRVar &at, const VarOrRVar &from, Expr offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
prefetch() is a more fine-grained version of prefetch(), which allows specification of different vars...
const Internal::JITHandlers & jit_handlers()
Get a struct containing the currently set custom functions used by JIT.
const std::string & name() const
The name of this function, either given during construction, or automatically generated.
void define_extern(const std::string &function_name, const std::vector< ExternFuncArgument > &params, Type t, int dimensionality, NameMangling mangling=NameMangling::Default, DeviceAPI device_api=DeviceAPI::Host)
Add an extern definition for this Func.
Definition: Func.h:1222
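A sketch of an extern source stage. The name "my_extern_source" is hypothetical: a C function with that name, following the Halide extern-stage calling convention, would have to be implemented and linked separately before this pipeline could run.

    #include "Halide.h"
    using namespace Halide;

    int main() {
        // A 2D float source produced entirely by external C code
        // (hypothetical extern function, linked separately).
        Func source("source");
        source.define_extern("my_extern_source",
                             {},          // no Halide-side inputs
                             Float(32),   // output type
                             2);          // output dimensionality

        Var x("x"), y("y");
        Func result("result");
        result(x, y) = source(x, y) * 2.0f;

        // Extern stages cannot be inlined; give the stage an explicit
        // compute granularity.
        source.compute_root();
        return 0;
    }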
Func & align_storage(const Var &dim, const Expr &alignment)
Pad the storage extent of a particular dimension of realizations of this function up to be a multiple...
void compile_to_file(const std::string &filename_prefix, const std::vector< Argument > &args, const std::string &fn_name="", const Target &target=get_target_from_environment())
Compile to object file and header pair, with the given arguments.
Func & gpu_threads(const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Tell Halide that the following dimensions correspond to GPU thread indices.
void add_custom_lowering_pass(Internal::IRMutator *pass, std::function< void()> deleter)
Add a custom pass to be used during lowering, with the function that will be called to delete it also...
void clear_custom_lowering_passes()
Remove all previously-set custom lowering passes.
void compile_to_llvm_assembly(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name, const Target &target=get_target_from_environment())
Statically compile this function to llvm assembly, with the given filename (which should probably end...
void compile_to_multitarget_static_library(const std::string &filename_prefix, const std::vector< Argument > &args, const std::vector< Target > &targets)
Compile to static-library file and header pair once for each target; each resulting function will be ...
Func & gpu_lanes(const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
The given dimension corresponds to the lanes in a GPU warp.
std::vector< OutputImageParam > output_buffers() const
Func & gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z, const VarOrRVar &bx, const VarOrRVar &by, const VarOrRVar &bz, const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz, const Expr &x_size, const Expr &y_size, const Expr &z_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Func & store_in(MemoryType memory_type)
Set the type of memory this Func should be stored in.
HALIDE_NO_USER_CODE_INLINE Func(Buffer< T > &im)
Construct a new Func to wrap a Buffer.
Definition: Func.h:744
void compile_to_assembly(const std::string &filename, const std::vector< Argument > &, const Target &target=get_target_from_environment())
Func clone_in(const Func &f)
Similar to Func::in; however, instead of replacing the call to this Func with an identity Func that r...
std::vector< RVar > rvars(int idx=0) const
Get the RVars of the reduction domain for an update definition, if there is one.
Func & gpu_single_thread(DeviceAPI device_api=DeviceAPI::Default_GPU)
Tell Halide to run this stage using a single gpu thread and block.
Func(Internal::Function f)
Construct a new Func to wrap an existing, already-defined Function object.
void compile_to_object(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name, const Target &target=get_target_from_environment())
Statically compile this function to an object file, with the given filename (which should probably en...
const std::string & extern_function_name() const
Get the name of the extern function called for an extern definition.
Func & compute_with(const Stage &s, const VarOrRVar &var, LoopAlignStrategy align=LoopAlignStrategy::Auto)
Func & trace_realizations()
Trace all realizations of this Func by emitting calls to halide_trace.
Tuple values() const
The values returned by this function.
Func & gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &bx, const VarOrRVar &by, const VarOrRVar &tx, const VarOrRVar &ty, const Expr &x_size, const Expr &y_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Func & compute_inline()
Aggressively inline all uses of this function.
const std::vector< Type > & output_types() const
Get the types of the outputs of this Func.
Func copy_to_device(DeviceAPI d=DeviceAPI::Default_GPU)
Declare that this function should be implemented by a call to halide_buffer_copy with the given targe...
Func & gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &tx, const VarOrRVar &ty, const Expr &x_size, const Expr &y_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
void compile_to_object(const std::string &filename, const std::vector< Argument > &, const Target &target=get_target_from_environment())
void define_extern(const std::string &function_name, const std::vector< ExternFuncArgument > &params, Type t, const std::vector< Var > &arguments, NameMangling mangling=NameMangling::Default, DeviceAPI device_api=DeviceAPI::Host)
Definition: Func.h:1250
Func & reorder(const std::vector< VarOrRVar > &vars)
Reorder variables to have the given nesting order, from innermost out.
Func & atomic(bool override_associativity_test=false)
Issue atomic updates for this Func.
const std::vector< CustomLoweringPass > & custom_lowering_passes()
Get the custom lowering passes.
void debug_to_file(const std::string &filename)
When this function is compiled, include code that dumps its values to a file after it is realized,...
Func & gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z, DeviceAPI device_api=DeviceAPI::Default_GPU)
Func in(const Func &f)
Creates and returns a new identity Func that wraps this Func.
void compile_to_static_library(const std::string &filename_prefix, const std::vector< Argument > &args, const std::string &fn_name="", const Target &target=get_target_from_environment())
Compile to static-library file and header pair, with the given arguments.
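An ahead-of-time sketch in the style of the Halide tutorials (the file prefix, function name, and parameters are illustrative):

    #include "Halide.h"
    using namespace Halide;

    int main() {
        ImageParam input(UInt(8), 2, "input");
        Param<uint8_t> offset("offset");
        Func brighten("brighten");
        Var x("x"), y("y");

        brighten(x, y) = input(x, y) + offset;

        // Emits brighten.a and brighten.h; the generated C-callable
        // function "brighten" takes the input buffer, the scalar offset,
        // and the output buffer.
        brighten.compile_to_static_library("brighten", {input, offset}, "brighten");
        return 0;
    }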
Func & set_estimate(const Var &var, const Expr &min, const Expr &extent)
Statically declare the range over which the function will be evaluated in the general case.
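A sketch of providing estimates for the auto-schedulers (the 1920x1080 sizes are just an assumed common case, not hard bounds):

    #include "Halide.h"
    using namespace Halide;

    int main() {
        ImageParam input(Float(32), 2);
        Func f("f");
        Var x("x"), y("y");
        f(x, y) = input(x, y) * 2.0f;

        // Estimates guide the auto-schedulers; they do not constrain the
        // sizes the pipeline can actually be run with.
        input.set_estimates({{0, 1920}, {0, 1080}});
        f.set_estimate(x, 0, 1920)
         .set_estimate(y, 0, 1080);
        return 0;
    }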
Func & unroll(const VarOrRVar &var, const Expr &factor, TailStrategy tail=TailStrategy::Auto)
Split a dimension by the given factor, then unroll the inner dimension.
void infer_input_bounds(Pipeline::RealizationArg outputs, const Target &target=get_jit_target_from_environment(), const ParamMap &param_map=ParamMap::empty_map())
Func & trace_loads()
Trace all loads from this Func by emitting calls to halide_trace.
FuncRef operator()(std::vector< Var >) const
Construct either the left-hand-side of a definition, or a call to a function that happens to only co...
void define_extern(const std::string &function_name, const std::vector< ExternFuncArgument > &params, const std::vector< Type > &types, int dimensionality, NameMangling mangling)
Definition: Func.h:1232
A fragment of front-end syntax of the form f(x, y, z), where x, y, z are Vars or Exprs.
Definition: Func.h:489
Stage operator*=(const FuncRef &)
FuncTupleElementRef operator[](int) const
When a FuncRef refers to a function that provides multiple outputs, you can access each output as an ...
Stage operator-=(const FuncRef &)
size_t size() const
How many outputs does the function this refers to produce?
Internal::Function function() const
What function is this calling?
Definition: Func.h:586
Stage operator+=(Expr)
Define a stage that adds the given expression to this Func.
Stage operator-=(Expr)
Define a stage that adds the negative of the given expression to this Func.
Stage operator*=(Expr)
Define a stage that multiplies this Func by the given expression.
Stage operator-=(const Tuple &)
Stage operator/=(Expr)
Define a stage that divides this Func by the given expression.
Stage operator+=(const FuncRef &)
Stage operator=(const Expr &)
Use this as the left-hand-side of a definition or an update definition (see RDom).
Stage operator=(const FuncRef &)
FuncRef(Internal::Function, const std::vector< Var > &, int placeholder_pos=-1, int count=0)
Stage operator+=(const Tuple &)
FuncRef(const Internal::Function &, const std::vector< Expr > &, int placeholder_pos=-1, int count=0)
Stage operator/=(const FuncRef &)
Stage operator*=(const Tuple &)
Stage operator/=(const Tuple &)
Stage operator=(const Tuple &)
Use this as the left-hand-side of a definition or an update definition for a Func with multiple outpu...
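These operators are what make update definitions such as the classic histogram read naturally; a sketch (the input and sizes are illustrative):

    #include "Halide.h"
    using namespace Halide;

    int main() {
        Func input("input"), hist("hist");
        Var x("x"), y("y"), i("i");

        input(x, y) = (x * 7 + y * 3) % 8;

        // The pure definition initializes the bins; the update definition,
        // driven by an RDom, increments them. hist(...) += 1 is shorthand
        // for hist(...) = hist(...) + 1.
        RDom r(0, 32, 0, 32);
        hist(i) = 0;
        hist(input(r.x, r.y)) += 1;

        Buffer<int> counts = hist.realize({8});
        return 0;
    }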
A fragment of front-end syntax of the form f(x, y, z)[index], where x, y, z are Vars or Exprs.
Definition: Func.h:608
int index() const
Return index to the function outputs.
Definition: Func.h:672
Stage operator+=(const Expr &e)
Define a stage that adds the given expression to Tuple component 'idx' of this Func.
Stage operator*=(const Expr &e)
Define a stage that multiplies Tuple component 'idx' of this Func by the given expression.
Stage operator/=(const Expr &e)
Define a stage that divides Tuple component 'idx' of this Func by the given expression.
Stage operator=(const Expr &e)
Use this as the left-hand-side of an update definition of Tuple component 'idx' of a Func (see RDom).
Stage operator=(const FuncRef &e)
Stage operator-=(const Expr &e)
Define a stage that adds the negative of the given expression to Tuple component 'idx' of this Func.
FuncTupleElementRef(const FuncRef &ref, const std::vector< Expr > &args, int idx)
An Image parameter to a halide pipeline.
Definition: ImageParam.h:23
A Function definition which can either represent an init or an update definition.
Definition: Definition.h:38
const StageSchedule & schedule() const
Get the default (no-specialization) stage-specific schedule associated with this definition.
const std::vector< Expr > & args() const
Get the default (no-specialization) arguments (left-hand-side) of the definition.
bool defined() const
Definition objects are nullable.
A reference-counted handle to Halide's internal representation of a function.
Definition: Function.h:38
A base class for passes over the IR which modify it (e.g.
Definition: IRMutator.h:26
A reference-counted handle to a parameter to a halide pipeline.
Definition: Parameter.h:29
A schedule for a single stage of a Halide pipeline.
Definition: Schedule.h:643
bool & touched()
This flag is set to true if the dims list has been manipulated by the user (or if a ScheduleHandle wa...
A reference to a site in a Halide statement at the top of the body of a particular for loop.
Definition: Schedule.h:176
A halide module.
Definition: Module.h:135
A handle on the output buffer of a pipeline.
static const ParamMap & empty_map()
A const ref to an empty ParamMap.
Definition: ParamMap.h:104
A class representing a Halide pipeline.
Definition: Pipeline.h:99
A multi-dimensional domain over which to iterate.
Definition: RDom.h:193
A reduction variable represents a single dimension of a reduction domain (RDom).
Definition: RDom.h:29
const std::string & name() const
The name of this reduction variable.
A Realization is a vector of references to existing Buffer objects.
Definition: Realization.h:21
A single definition of a Func.
Definition: Func.h:70
Stage & prefetch(const T &image, const VarOrRVar &at, const VarOrRVar &from, Expr offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
Definition: Func.h:466
std::string name() const
Return the name of this stage, e.g.
Stage & rename(const VarOrRVar &old_name, const VarOrRVar &new_name)
Stage & gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & gpu(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z, const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & gpu(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & prefetch(const Func &f, const VarOrRVar &at, const VarOrRVar &from, Expr offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
Stage & tile(const std::vector< VarOrRVar > &previous, const std::vector< VarOrRVar > &outers, const std::vector< VarOrRVar > &inners, const std::vector< Expr > &factors, TailStrategy tail=TailStrategy::Auto)
Stage & gpu_tile(const VarOrRVar &x, const VarOrRVar &tx, const Expr &x_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
HALIDE_NO_USER_CODE_INLINE std::enable_if< Internal::all_are_convertible< VarOrRVar, Args... >::value, Stage & >::type reorder(const VarOrRVar &x, const VarOrRVar &y, Args &&...args)
Definition: Func.h:379
Stage & gpu(const VarOrRVar &block_x, const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &bx, const VarOrRVar &by, const VarOrRVar &tx, const VarOrRVar &ty, const Expr &x_size, const Expr &y_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & hexagon(const VarOrRVar &x=Var::outermost())
Func rfactor(const RVar &r, const Var &v)
Stage & compute_with(const Stage &s, const VarOrRVar &var, LoopAlignStrategy align=LoopAlignStrategy::Auto)
Stage & vectorize(const VarOrRVar &var)
Stage & gpu_single_thread(DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & prefetch(const Func &f, const VarOrRVar &var, int offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
Definition: Func.h:446
Stage & compute_with(LoopLevel loop_level, LoopAlignStrategy align=LoopAlignStrategy::Auto)
Stage & prefetch(const Internal::Parameter &param, const VarOrRVar &at, const VarOrRVar &from, Expr offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
Stage & unroll(const VarOrRVar &var)
Stage & parallel(const VarOrRVar &var)
Stage & allow_race_conditions()
Stage & serial(const VarOrRVar &var)
Stage & gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z, const VarOrRVar &bx, const VarOrRVar &by, const VarOrRVar &bz, const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz, const Expr &x_size, const Expr &y_size, const Expr &z_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & gpu_tile(const VarOrRVar &x, const VarOrRVar &bx, const VarOrRVar &tx, const Expr &x_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & tile(const std::vector< VarOrRVar > &previous, const std::vector< VarOrRVar > &outers, const std::vector< VarOrRVar > &inners, const std::vector< Expr > &factors, const std::vector< TailStrategy > &tails)
Stage specialize(const Expr &condition)
Stage & compute_with(LoopLevel loop_level, const std::vector< std::pair< VarOrRVar, LoopAlignStrategy >> &align)
Schedule the iteration over this stage to be fused with another stage 's' from outermost loop to a gi...
Stage & tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &xo, const VarOrRVar &yo, const VarOrRVar &xi, const VarOrRVar &yi, const Expr &xfactor, const Expr &yfactor, TailStrategy tail=TailStrategy::Auto)
Stage & split(const VarOrRVar &old, const VarOrRVar &outer, const VarOrRVar &inner, const Expr &factor, TailStrategy tail=TailStrategy::Auto)
Scheduling calls that control how the domain of this stage is traversed.
Stage & fuse(const VarOrRVar &inner, const VarOrRVar &outer, const VarOrRVar &fused)
Stage(Internal::Function f, Internal::Definition d, size_t stage_index)
Definition: Func.h:94
Stage & vectorize(const VarOrRVar &var, const Expr &factor, TailStrategy tail=TailStrategy::Auto)
Func rfactor(std::vector< std::pair< RVar, Var >> preserved)
Calling rfactor() on an associative update definition of a Func will split the update into an intermedia...
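A sketch of the pattern, following the shape of a whole-image sum (the Func and Var names are illustrative):

    #include "Halide.h"
    using namespace Halide;

    int main() {
        Func input("input"), total("total");
        Var x("x"), y("y"), u("u");

        input(x, y) = x + y;

        // Associative zero-dimensional reduction over the whole image.
        RDom r(0, 256, 0, 256);
        total() = 0;
        total() += input(r.x, r.y);

        // rfactor turns r.y into the pure var u of an intermediate that
        // accumulates one partial sum per row, which can be parallelized;
        // total's update is rewritten to combine the partials.
        Func partial = total.update(0).rfactor(r.y, u);
        partial.compute_root().update().parallel(u);

        total.realize();
        return 0;
    }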
Stage & parallel(const VarOrRVar &var, const Expr &task_size, TailStrategy tail=TailStrategy::Auto)
Stage & gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z, DeviceAPI device_api=DeviceAPI::Default_GPU)
const Internal::StageSchedule & get_schedule() const
Return the current StageSchedule associated with this Stage.
Definition: Func.h:108
Stage & gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z, const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz, const Expr &x_size, const Expr &y_size, const Expr &z_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & reorder(const std::vector< VarOrRVar > &vars)
Stage & gpu_blocks(const VarOrRVar &block_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & tile(const std::vector< VarOrRVar > &previous, const std::vector< VarOrRVar > &inners, const std::vector< Expr > &factors, TailStrategy tail=TailStrategy::Auto)
void specialize_fail(const std::string &message)
Stage & gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &xi, const VarOrRVar &yi, const Expr &xfactor, const Expr &yfactor, TailStrategy tail=TailStrategy::Auto)
Stage & compute_with(const Stage &s, const VarOrRVar &var, const std::vector< std::pair< VarOrRVar, LoopAlignStrategy >> &align)
Stage & unroll(const VarOrRVar &var, const Expr &factor, TailStrategy tail=TailStrategy::Auto)
Stage & atomic(bool override_associativity_test=false)
std::string source_location() const
Attempt to get the source file and line where this stage was defined by parsing the process's own deb...
Stage & gpu_threads(const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & gpu_lanes(const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &tx, const VarOrRVar &ty, const Expr &x_size, const Expr &y_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
std::string dump_argument_list() const
Return a string describing the current var list taking into account all the splits,...
Create a small array of Exprs for defining and calling functions with multiple outputs.
Definition: Tuple.h:18
A Halide variable, to be used when defining functions.
Definition: Var.h:19
const std::string & name() const
Get the name of a Var.
static Var outermost()
A Var that represents the location outside the outermost loop.
Definition: Var.h:163
void schedule_scalar(Func f)
Definition: Func.h:2612
void assign_results(Realization &r, int idx, Last last)
Definition: Func.h:2571
void check_types(const Tuple &t, int idx)
Definition: Func.h:2556
ForType
An enum describing a type of loop traversal.
Definition: Expr.h:395
std::vector< Var > make_argument_list(int dimensionality)
Make a list of unique arguments for definitions with unnamed arguments.
WEAK halide_do_task_t custom_do_task
WEAK halide_do_par_for_t custom_do_par_for
This file defines the class FunctionDAG, which is our representation of a Halide pipeline,...
@ Internal
Not visible externally, similar to 'static' linkage in C.
HALIDE_NO_USER_CODE_INLINE T evaluate(const Expr &e)
JIT-Compile and run enough code to evaluate a Halide expression.
Definition: Func.h:2588
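A sketch of both helpers on one-off scalar expressions:

    #include "Halide.h"
    #include <cstdio>
    using namespace Halide;

    int main() {
        // JIT-compile and run a single scalar expression on the host.
        float s = evaluate<float>(sqrt(Expr(2.0f)));

        // Same idea, but permitted to use a GPU schedule if the JIT
        // target has a GPU feature enabled.
        float c = evaluate_may_gpu<float>(cos(Expr(0.5f)));

        printf("sqrt(2) = %f, cos(0.5) = %f\n", s, c);
        return 0;
    }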
PrefetchBoundStrategy
Different ways to handle accesses outside the original extents in a prefetch.
@ GuardWithIf
Guard the prefetch with if-guards that ignores the prefetch if any of the prefetched region ever goes...
HALIDE_NO_USER_CODE_INLINE T evaluate_may_gpu(const Expr &e)
JIT-Compile and run enough code to evaluate a Halide expression.
Definition: Func.h:2630
TailStrategy
Different ways to handle a tail case in a split when the factor does not provably divide the extent.
Definition: Schedule.h:32
@ Auto
For pure definitions use ShiftInwards.
LoopAlignStrategy
Different ways to handle the case when the start/end of the loops of stages computed with (fused) are...
Definition: Schedule.h:110
@ Auto
By default, LoopAlignStrategy is set to NoAlign.
Expr min(const FuncRef &a, const FuncRef &b)
Explicit overloads of min and max for FuncRef.
Definition: Func.h:595
NameMangling
An enum to specify calling convention for extern stages.
Definition: Function.h:24
@ Default
Match whatever is specified in the Target.
Target get_jit_target_from_environment()
Return the target that Halide will use for jit-compilation.
DeviceAPI
An enum describing a type of device API.
Definition: DeviceAPI.h:15
@ Host
Used to denote for loops that run on the same device as the containing code.
Target get_target_from_environment()
Return the target that Halide will use.
StmtOutputFormat
Used to determine if the output printed to file should be as a normal string or as an HTML file which...
Definition: Pipeline.h:63
@ Text
Definition: Pipeline.h:64
Stage ScheduleHandle
Definition: Func.h:480
std::vector< Range > Region
A multi-dimensional box.
Definition: Expr.h:343
Expr max(const FuncRef &a, const FuncRef &b)
Definition: Func.h:598
MemoryType
An enum describing different address spaces to be used with Func::store_in.
Definition: Expr.h:346
void * malloc(size_t)
unsigned __INT8_TYPE__ uint8_t
void free(void *)
A fragment of Halide syntax.
Definition: Expr.h:256
HALIDE_ALWAYS_INLINE Type type() const
Get the type of this expression node.
Definition: Expr.h:320
An argument to an extern-defined Func.
A struct representing a target machine and os to generate code for.
Definition: Target.h:19
bool has_gpu_feature() const
Is a fully featured GPU compute runtime enabled? I.e.
bool has_feature(Feature f) const
Types in the halide type system.
Definition: Type.h:265
A class that can represent Vars or RVars.
Definition: Func.h:30
bool is_rvar
Definition: Func.h:58
VarOrRVar(const Var &v)
Definition: Func.h:34
VarOrRVar(const RVar &r)
Definition: Func.h:37
const std::string & name() const
Definition: Func.h:48
VarOrRVar(const std::string &n, bool r)
Definition: Func.h:31
VarOrRVar(const ImplicitVar< N > &u)
Definition: Func.h:44
VarOrRVar(const RDom &r)
Definition: Func.h:40