Author

Topic: fastest way in C to generate numbers (Read 355 times)

legendary
Activity: 1512
Merit: 7340
Farewell, Leo
February 13, 2024, 03:00:26 PM
#22
I just tested out the min/max values on my system:
[...]

Result:
Quote
Min long double: 3.3621031431e-4932
Max long double: 1.1897314954e+4932
That's correct. Notice how astronomically small the minimum long double is. If you increment it by epsilon (the smallest possible increment in floating point numbers), it goes from e-4932 to e-19. That's lack of accuracy; tradeoff when working with 64 bits and real numbers.  Wink

Another fun fact. The maximum long double number is 1.1897314954e+4932, as rightly said. This means that long doubles are 2^128 in total, in a range of [~0, ~10^4932]. The more you increase the wanted number, the more inaccurate it becomes.
member
Activity: 503
Merit: 38
February 13, 2024, 08:00:26 AM
#21
test.cpp
Code:
#include
#include
#include
#include
#include
#include
#include

gmp_randstate_t state;

void *wrapper_gcry_alloc(size_t size);
void *wrapper_gcry_realloc(void *ptr, size_t old_size, size_t new_size);
void wrapper_gcry_free(void *ptr, size_t cur_size);

int main() {
    int n;

    printf("Enter the value of n: ");
    scanf("%d", &n);

    mpz_t key;
    mpz_init(key);

    clock_t start_random = clock();

    size_t bytes = (n + 7) / 8; // bytes needed for n bits
    unsigned char *buffer_key = (unsigned char *)malloc(bytes);

    if (buffer_key == NULL) {
        printf("Memory allocation failed.");
        return -1;
    }

    getrandom(buffer_key, bytes, 0); // No flags needed

    mpz_import(key, bytes, 1, 1, 0, 0, buffer_key);

    free(buffer_key);
    mpz_clear(key);

    clock_t end_random = clock();

    double time_random = ((double)(end_random - start_random)) / CLOCKS_PER_SEC;

    printf("Time taken for random method: %f seconds\n", time_random);

    return 0;
}

void *wrapper_gcry_alloc(size_t size) {
    return malloc(size); // Changed to malloc
}

void *wrapper_gcry_realloc(void *ptr, size_t old_size, size_t new_size) {
    return realloc(ptr, new_size); // Changed to realloc
}

void wrapper_gcry_free(void *ptr, size_t cur_size) {
    free(ptr); // Changed to free
}


Code:
g++ -m64 -march=native -mtune=native  -O3 test.cpp -o test $(libgcrypt-config --cflags --libs) -lgcrypt -lgmp

./test
Enter the value of n: 40
Time taken for random method: 0.000012 seconds

This is a very demanding one.

This code will loop through all numbers from 1 to 2^n - 1   and increment by 1 (optionally print each number)

test.cpp
Code:
#include
#include
#include
#include
#include
#include
#include


gmp_randstate_t state;

void *wrapper_gcry_alloc(size_t size);
void *wrapper_gcry_realloc(void *ptr, size_t old_size, size_t new_size);
void wrapper_gcry_free(void *ptr, size_t cur_size);

int main() {
    int n;

    printf("Enter the value of n: ");
    scanf("%d", &n);

    mpz_t key;
    mpz_init(key);

    clock_t start_random = clock();

    size_t bytes = (n + 7) / 8;
    unsigned char *buffer_key = (unsigned char *)malloc(bytes);

    if (buffer_key == NULL) {
        printf("Memory allocation failed.");
        return -1;
    }

    getrandom(buffer_key, bytes, 0);

    mpz_import(key, bytes, 1, 1, 0, 0, buffer_key);

    free(buffer_key);

    // Loop through all numbers from 1 to 2^n - 1 and add 1 to each number
    mpz_t num;
    mpz_init(num);
    mpz_set_ui(num, 1);

    mpz_t max_num;
    mpz_init(max_num);
    mpz_ui_pow_ui(max_num, 2, n);
    mpz_sub_ui(max_num, max_num, 1);

    while (mpz_cmp(num, max_num) <= 0) {
        // Print the current number (num)
        // gmp_printf("%Zd\n", num);
        // Increment num by 1
        mpz_add_ui(num, num, 1);
    }

    mpz_clear(num);
    mpz_clear(max_num);
    mpz_clear(key);

    clock_t end_random = clock();

    double time_random = ((double)(end_random - start_random)) / CLOCKS_PER_SEC;

    printf("Time taken for random method: %f seconds\n", time_random);

    return 0;
}

void *wrapper_gcry_alloc(size_t size) {
    return malloc(size);
}

void *wrapper_gcry_realloc(void *ptr, size_t old_size, size_t new_size) {
    return realloc(ptr, new_size);
}

void wrapper_gcry_free(void *ptr, size_t cur_size) {
    free(ptr);
}

# ./test
Enter the value of n: 30
Time taken for random method: 6.051514 seconds
member
Activity: 239
Merit: 59
a young loner on a crusade
February 13, 2024, 06:01:28 AM
#20
Try seq. 30 bit, or 1-1073741824, takes 13.7 seconds. I can't compile your code to compare processing time.
Try for yourself and compare:
Code:
https://github.com/MaiZure/coreutils-8.3/blob/master/src/seq.c
/* seq - print sequence of numbers to standard output.
   Copyright (C) 1994-2018 Free Software Foundation, Inc.

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see .  */

/* Written by Ulrich Drepper.  */

#include
#include
#include
#include

#include "system.h"
#include "die.h"
#include "c-strtod.h"
#include "error.h"
#include "quote.h"
#include "xstrtod.h"

/* Roll our own isfinite/isnan rather than using , so that we don't
   have to worry about linking -lm just for isfinite.  */
#ifndef isfinite
# define isfinite(x) ((x) * 0 == 0)
#endif
#ifndef isnan
# define isnan(x) ((x) != (x))
#endif

/* The official name of this program (e.g., no 'g' prefix).  */
#define PROGRAM_NAME "seq"

#define AUTHORS proper_name ("Ulrich Drepper")

/* If true print all number with equal width.  */
static bool equal_width;

/* The string used to separate two numbers.  */
static char const *separator;

/* The string output after all numbers have been output.
   Usually "\n" or "\0".  */
static char const terminator[] = "\n";

static struct option const long_options[] =
{
  { "equal-width", no_argument, NULL, 'w'},
  { "format", required_argument, NULL, 'f'},
  { "separator", required_argument, NULL, 's'},
  {GETOPT_HELP_OPTION_DECL},
  {GETOPT_VERSION_OPTION_DECL},
  { NULL, 0, NULL, 0}
};

void
usage (int status)
{
  if (status != EXIT_SUCCESS)
    emit_try_help ();
  else
    {
      printf (_("\
Usage: %s [OPTION]... LAST\n\
  or:  %s [OPTION]... FIRST LAST\n\
  or:  %s [OPTION]... FIRST INCREMENT LAST\n\
"), program_name, program_name, program_name);
      fputs (_("\
Print numbers from FIRST to LAST, in steps of INCREMENT.\n\
"), stdout);

      emit_mandatory_arg_note ();

      fputs (_("\
  -f, --format=FORMAT      use printf style floating-point FORMAT\n\
  -s, --separator=STRING   use STRING to separate numbers (default: \\n)\n\
  -w, --equal-width        equalize width by padding with leading zeroes\n\
"), stdout);
      fputs (HELP_OPTION_DESCRIPTION, stdout);
      fputs (VERSION_OPTION_DESCRIPTION, stdout);
      fputs (_("\
\n\
If FIRST or INCREMENT is omitted, it defaults to 1.  That is, an\n\
omitted INCREMENT defaults to 1 even when LAST is smaller than FIRST.\n\
The sequence of numbers ends when the sum of the current number and\n\
INCREMENT would become greater than LAST.\n\
FIRST, INCREMENT, and LAST are interpreted as floating point values.\n\
INCREMENT is usually positive if FIRST is smaller than LAST, and\n\
INCREMENT is usually negative if FIRST is greater than LAST.\n\
INCREMENT must not be 0; none of FIRST, INCREMENT and LAST may be NaN.\n\
"), stdout);
      fputs (_("\
FORMAT must be suitable for printing one argument of type 'double';\n\
it defaults to %.PRECf if FIRST, INCREMENT, and LAST are all fixed point\n\
decimal numbers with maximum precision PREC, and to %g otherwise.\n\
"), stdout);
      emit_ancillary_info (PROGRAM_NAME);
    }
  exit (status);
}

/* A command-line operand.  */
struct operand
{
  /* Its value, converted to 'long double'.  */
  long double value;

  /* Its print width, if it were printed out in a form similar to its
     input form.  An input like "-.1" is treated like "-0.1", and an
     input like "1." is treated like "1", but otherwise widths are
     left alone.  */
  size_t width;

  /* Number of digits after the decimal point, or INT_MAX if the
     number can't easily be expressed as a fixed-point number.  */
  int precision;
};
typedef struct operand operand;

/* Description of what a number-generating format will generate.  */
struct layout
{
  /* Number of bytes before and after the number.  */
  size_t prefix_len;
  size_t suffix_len;
};

/* Read a long double value from the command line.
   Return if the string is correct else signal error.  */

static operand
scan_arg (const char *arg)
{
  operand ret;

  if (! xstrtold (arg, NULL, &ret.value, c_strtold))
    {
      error (0, 0, _("invalid floating point argument: %s"), quote (arg));
      usage (EXIT_FAILURE);
    }

  if (isnan (ret.value))
    {
      error (0, 0, _("invalid %s argument: %s"), quote_n (0, "not-a-number"),
             quote_n (1, arg));
      usage (EXIT_FAILURE);
    }

  /* We don't output spaces or '+' so don't include in width */
  while (isspace (to_uchar (*arg)) || *arg == '+')
    arg++;

  /* Default to auto width and precision.  */
  ret.width = 0;
  ret.precision = INT_MAX;

  /* Use no precision (and possibly fast generation) for integers.  */
  char const *decimal_point = strchr (arg, '.');
  if (! decimal_point && ! strchr (arg, 'p') /* not a hex float */)
    ret.precision = 0;

  /* auto set width and precision for decimal inputs.  */
  if (! arg[strcspn (arg, "xX")] && isfinite (ret.value))
    {
      size_t fraction_len = 0;
      ret.width = strlen (arg);

      if (decimal_point)
        {
          fraction_len = strcspn (decimal_point + 1, "eE");
          if (fraction_len <= INT_MAX)
            ret.precision = fraction_len;
          ret.width += (fraction_len == 0                      /* #.  -> #   */
                        ? -1
                        : (decimal_point == arg                /* .#  -> 0.# */
                           || ! ISDIGIT (decimal_point[-1]))); /* -.# -> 0.# */
        }
      char const *e = strchr (arg, 'e');
      if (! e)
        e = strchr (arg, 'E');
      if (e)
        {
          long exponent = strtol (e + 1, NULL, 10);
          ret.precision += exponent < 0 ? -exponent
                                        : - MIN (ret.precision, exponent);
          /* Don't account for e.... in the width since this is not output.  */
          ret.width -= strlen (arg) - (e - arg);
          /* Adjust the width as per the exponent.  */
          if (exponent < 0)
            {
              if (decimal_point)
                {
                  if (e == decimal_point + 1) /* undo #. -> # above  */
                    ret.width++;
                }
              else
                ret.width++;
              exponent = -exponent;
            }
          else
            {
              if (decimal_point && ret.precision == 0 && fraction_len)
                ret.width--; /* discount space for '.'  */
              exponent -= MIN (fraction_len, exponent);
            }
          ret.width += exponent;
        }
    }

  return ret;
}

/* If FORMAT is a valid printf format for a double argument, return
   its long double equivalent, allocated from dynamic storage, and
   store into *LAYOUT a description of the output layout; otherwise,
   report an error and exit.  */

static char const *
long_double_format (char const *fmt, struct layout *layout)
{
  size_t i;
  size_t prefix_len = 0;
  size_t suffix_len = 0;
  size_t length_modifier_offset;
  bool has_L;

  for (i = 0; ! (fmt[i] == '%' && fmt[i + 1] != '%'); i += (fmt[i] == '%') + 1)
    {
      if (!fmt[i])
        die (EXIT_FAILURE, 0,
             _("format %s has no %% directive"), quote (fmt));
      prefix_len++;
    }

  i++;
  i += strspn (fmt + i, "-+#0 '");
  i += strspn (fmt + i, "0123456789");
  if (fmt[i] == '.')
    {
      i++;
      i += strspn (fmt + i, "0123456789");
    }

  length_modifier_offset = i;
  has_L = (fmt[i] == 'L');
  i += has_L;
  if (fmt[i] == '\0')
    die (EXIT_FAILURE, 0, _("format %s ends in %%"), quote (fmt));
  if (! strchr ("efgaEFGA", fmt[i]))
    die (EXIT_FAILURE, 0,
         _("format %s has unknown %%%c directive"), quote (fmt), fmt[i]);

  for (i++; ; i += (fmt[i] == '%') + 1)
    if (fmt[i] == '%' && fmt[i + 1] != '%')
      die (EXIT_FAILURE, 0, _("format %s has too many %% directives"),
           quote (fmt));
    else if (fmt[i])
      suffix_len++;
    else
      {
        size_t format_size = i + 1;
        char *ldfmt = xmalloc (format_size + 1);
        memcpy (ldfmt, fmt, length_modifier_offset);
        ldfmt[length_modifier_offset] = 'L';
        strcpy (ldfmt + length_modifier_offset + 1,
                fmt + length_modifier_offset + has_L);
        layout->prefix_len = prefix_len;
        layout->suffix_len = suffix_len;
        return ldfmt;
      }
}

static void ATTRIBUTE_NORETURN
io_error (void)
{
  /* FIXME: consider option to silently ignore errno=EPIPE */
  clearerr (stdout);
  die (EXIT_FAILURE, errno, _("write error"));
}

/* Actually print the sequence of numbers in the specified range, with the
   given or default stepping and format.  */

static void
print_numbers (char const *fmt, struct layout layout,
               long double first, long double step, long double last)
{
  bool out_of_range = (step < 0 ? first < last : last < first);

  if (! out_of_range)
    {
      long double x = first;
      long double i;

      for (i = 1; ; i++)
        {
          long double x0 = x;
          if (printf (fmt, x) < 0)
            io_error ();
          if (out_of_range)
            break;
          x = first + i * step;
          out_of_range = (step < 0 ? x < last : last < x);

          if (out_of_range)
            {
              /* If the number just past LAST prints as a value equal
                 to LAST, and prints differently from the previous
                 number, then print the number.  This avoids problems
                 with rounding.  For example, with the x86 it causes
                 "seq 0 0.000001 0.000003" to print 0.000003 instead
                 of stopping at 0.000002.  */

              bool print_extra_number = false;
              long double x_val;
              char *x_str;
              int x_strlen;
              setlocale (LC_NUMERIC, "C");
              x_strlen = asprintf (&x_str, fmt, x);
              setlocale (LC_NUMERIC, "");
              if (x_strlen < 0)
                xalloc_die ();
              x_str[x_strlen - layout.suffix_len] = '\0';

              if (xstrtold (x_str + layout.prefix_len, NULL, &x_val, c_strtold)
                  && x_val == last)
                {
                  char *x0_str = NULL;
                  if (asprintf (&x0_str, fmt, x0) < 0)
                    xalloc_die ();
                  print_extra_number = !STREQ (x0_str, x_str);
                  free (x0_str);
                }

              free (x_str);
              if (! print_extra_number)
                break;
            }

          if (fputs (separator, stdout) == EOF)
            io_error ();
        }

      if (fputs (terminator, stdout) == EOF)
        io_error ();
    }
}

/* Return the default format given FIRST, STEP, and LAST.  */
static char const *
get_default_format (operand first, operand step, operand last)
{
  static char format_buf[sizeof "%0.Lf" + 2 * INT_STRLEN_BOUND (int)];

  int prec = MAX (first.precision, step.precision);

  if (prec != INT_MAX && last.precision != INT_MAX)
    {
      if (equal_width)
        {
          /* increase first_width by any increased precision in step */
          size_t first_width = first.width + (prec - first.precision);
          /* adjust last_width to use precision from first/step */
          size_t last_width = last.width + (prec - last.precision);
          if (last.precision && prec == 0)
            last_width--;  /* don't include space for '.' */
          if (last.precision == 0 && prec)
            last_width++;  /* include space for '.' */
          if (first.precision == 0 && prec)
            first_width++;  /* include space for '.' */
          size_t width = MAX (first_width, last_width);
          if (width <= INT_MAX)
            {
              int w = width;
              sprintf (format_buf, "%%0%d.%dLf", w, prec);
              return format_buf;
            }
        }
      else
        {
          sprintf (format_buf, "%%.%dLf", prec);
          return format_buf;
        }
    }

  return "%Lg";
}

/* The NUL-terminated string S0 of length S_LEN represents a valid
   non-negative decimal integer.  Adjust the string and length so
   that the pair describe the next-larger value.  */
static void
incr (char **s0, size_t *s_len)
{
  char *s = *s0;
  char *endp = s + *s_len - 1;

  do
    {
      if ((*endp)++ < '9')
        return;
      *endp-- = '0';
    }
  while (endp >= s);
  *--(*s0) = '1';
  ++*s_len;
}

/* Compare A and B (each a NUL-terminated digit string), with lengths
   given by A_LEN and B_LEN.  Return +1 if A < B, -1 if B < A, else 0.  */
static int
cmp (char const *a, size_t a_len, char const *b, size_t b_len)
{
  if (a_len < b_len)
    return -1;
  if (b_len < a_len)
    return 1;
  return (strcmp (a, b));
}

/* Trim leading 0's from S, but if S is all 0's, leave one.
   Return a pointer to the trimmed string.  */
static char const * _GL_ATTRIBUTE_PURE
trim_leading_zeros (char const *s)
{
  char const *p = s;
  while (*s == '0')
    ++s;

  /* If there were only 0's, back up, to leave one.  */
  if (!*s && s != p)
    --s;
  return s;
}

/* Print all whole numbers from A to B, inclusive -- to stdout, each
   followed by a newline.  If B < A, return false and print nothing.
   Otherwise, return true.  */
static bool
seq_fast (char const *a, char const *b)
{
  bool inf = STREQ (b, "inf");

  /* Skip past any leading 0's.  Without this, our naive cmp
     function would declare 000 to be larger than 99.  */
  a = trim_leading_zeros (a);
  b = trim_leading_zeros (b);

  size_t p_len = strlen (a);
  size_t q_len = inf ? 0 : strlen (b);

  /* Allow for at least 31 digits without realloc.
     1 more than p_len is needed for the inf case.  */
  size_t inc_size = MAX (MAX (p_len + 1, q_len), 31);

  /* Copy input strings (incl NUL) to end of new buffers.  */
  char *p0 = xmalloc (inc_size + 1);
  char *p = memcpy (p0 + inc_size - p_len, a, p_len + 1);
  char *q;
  char *q0;
  if (! inf)
    {
      q0 = xmalloc (inc_size + 1);
      q = memcpy (q0 + inc_size - q_len, b, q_len + 1);
    }
  else
    q = q0 = NULL;

  bool ok = inf || cmp (p, p_len, q, q_len) <= 0;
  if (ok)
    {
      /* Reduce number of fwrite calls which is seen to
         give a speed-up of more than 2x over the unbuffered code
         when printing the first 10^9 integers.  */
      size_t buf_size = MAX (BUFSIZ, (inc_size + 1) * 2);
      char *buf = xmalloc (buf_size);
      char const *buf_end = buf + buf_size;

      char *bufp = buf;

      /* Write first number to buffer.  */
      bufp = mempcpy (bufp, p, p_len);

      /* Append separator then number.  */
      while (inf || cmp (p, p_len, q, q_len) < 0)
        {
          *bufp++ = *separator;
          incr (&p, &p_len);

          /* Double up the buffers when needed for the inf case.  */
          if (p_len == inc_size)
            {
              inc_size *= 2;
              p0 = xrealloc (p0, inc_size + 1);
              p = memmove (p0 + p_len, p0, p_len + 1);

              if (buf_size < (inc_size + 1) * 2)
                {
                  size_t buf_offset = bufp - buf;
                  buf_size = (inc_size + 1) * 2;
                  buf = xrealloc (buf, buf_size);
                  buf_end = buf + buf_size;
                  bufp = buf + buf_offset;
                }
            }

          bufp = mempcpy (bufp, p, p_len);
          /* If no place for another separator + number then
             output buffer so far, and reset to start of buffer.  */
          if (buf_end - (p_len + 1) < bufp)
            {
              if (fwrite (buf, bufp - buf, 1, stdout) != 1)
                io_error ();
              bufp = buf;
            }
        }

      /* Write any remaining buffered output, and the terminator.  */
      *bufp++ = *terminator;
      if (fwrite (buf, bufp - buf, 1, stdout) != 1)
        io_error ();

      IF_LINT (free (buf));
    }

  free (p0);
  free (q0);
  return ok;
}

/* Return true if S consists of at least one digit and no non-digits.  */
static bool _GL_ATTRIBUTE_PURE
all_digits_p (char const *s)
{
  size_t n = strlen (s);
  return ISDIGIT (s[0]) && n == strspn (s, "0123456789");
}

int
main (int argc, char **argv)
{
  int optc;
  operand first = { 1, 1, 0 };
  operand step = { 1, 1, 0 };
  operand last;
  struct layout layout = { 0, 0 };

  /* The printf(3) format used for output.  */
  char const *format_str = NULL;

  initialize_main (&argc, &argv);
  set_program_name (argv[0]);
  setlocale (LC_ALL, "");
  bindtextdomain (PACKAGE, LOCALEDIR);
  textdomain (PACKAGE);

  atexit (close_stdout);

  equal_width = false;
  separator = "\n";

  /* We have to handle negative numbers in the command line but this
     conflicts with the command line arguments.  So explicitly check first
     whether the next argument looks like a negative number.  */
  while (optind < argc)
    {
      if (argv[optind][0] == '-'
          && ((optc = argv[optind][1]) == '.' || ISDIGIT (optc)))
        {
          /* means negative number */
          break;
        }

      optc = getopt_long (argc, argv, "+f:s:w", long_options, NULL);
      if (optc == -1)
        break;

      switch (optc)
        {
        case 'f':
          format_str = optarg;
          break;

        case 's':
          separator = optarg;
          break;

        case 'w':
          equal_width = true;
          break;

        case_GETOPT_HELP_CHAR;

        case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);

        default:
          usage (EXIT_FAILURE);
        }
    }

  unsigned int n_args = argc - optind;
  if (n_args < 1)
    {
      error (0, 0, _("missing operand"));
      usage (EXIT_FAILURE);
    }

  if (3 < n_args)
    {
      error (0, 0, _("extra operand %s"), quote (argv[optind + 3]));
      usage (EXIT_FAILURE);
    }

  if (format_str)
    format_str = long_double_format (format_str, &layout);

  if (format_str != NULL && equal_width)
    {
      error (0, 0, _("format string may not be specified"
                     " when printing equal width strings"));
      usage (EXIT_FAILURE);
    }

  /* If the following hold:
     - no format string, [FIXME: relax this, eventually]
     - integer start (or no start)
     - integer end
     - increment == 1 or not specified [FIXME: relax this, eventually]
     then use the much more efficient integer-only code.  */
  if (all_digits_p (argv[optind])
      && (n_args == 1 || all_digits_p (argv[optind + 1]))
      && (n_args < 3 || (STREQ ("1", argv[optind + 1])
                         && all_digits_p (argv[optind + 2])))
      && !equal_width && !format_str && strlen (separator) == 1)
    {
      char const *s1 = n_args == 1 ? "1" : argv[optind];
      char const *s2 = argv[optind + (n_args - 1)];
      if (seq_fast (s1, s2))
        return EXIT_SUCCESS;

      /* Upon any failure, let the more general code deal with it.  */
    }

  last = scan_arg (argv[optind++]);

  if (optind < argc)
    {
      first = last;
      last = scan_arg (argv[optind++]);

      if (optind < argc)
        {
          step = last;
          if (step.value == 0)
            {
              error (0, 0, _("invalid Zero increment value: %s"),
                     quote (argv[optind-1]));
              usage (EXIT_FAILURE);
            }

          last = scan_arg (argv[optind++]);
        }
    }

  if ((isfinite (first.value) && first.precision == 0)
      && step.precision == 0 && last.precision == 0
      && 0 <= first.value && step.value == 1 && 0 <= last.value
      && !equal_width && !format_str && strlen (separator) == 1)
    {
      char *s1;
      char *s2;
      if (asprintf (&s1, "%0.Lf", first.value) < 0)
        xalloc_die ();
      if (! isfinite (last.value))
        s2 = xstrdup ("inf"); /* Ensure "inf" is used.  */
      else if (asprintf (&s2, "%0.Lf", last.value) < 0)
        xalloc_die ();

      if (*s1 != '-' && *s2 != '-' && seq_fast (s1, s2))
        {
          IF_LINT (free (s1));
          IF_LINT (free (s2));
          return EXIT_SUCCESS;
        }

      free (s1);
      free (s2);
      /* Upon any failure, let the more general code deal with it.  */
    }

  if (format_str == NULL)
    format_str = get_default_format (first, step, last);

  print_numbers (format_str, layout, first.value, step.value, last.value);

  return EXIT_SUCCESS;
}

For multiprocessing, start multiple instances and set lower and upper limits on each to spread the work.
legendary
Activity: 3472
Merit: 10611
February 12, 2024, 11:33:21 PM
#19
Nice, thanks for sharing. You are on the bit space 2^30 and it took 4 seconds, right ?

We are much faster in C though using the bit shifting method and -O1 optimization flag:
That's right, 4 seconds. The code was just a self-challenge to explore some ideas using SIMD and C# spans.

It could be faster in C, although to be sure the same exact code in two languages (correctly populating a list of 1,073,741,824‬ ints with sequential values) has to run on the same exact hardware and then compared. Unfortunately I can't compile C (don't want to add more dev tools to my Visual Studio and take up more space) to do the comparison.
hero member
Activity: 630
Merit: 731
Bitcoin g33k
February 11, 2024, 11:52:36 AM
#18
Nonetheless, it is good practice to not make operations in expressions.
makes sense, I'll take care of it. Thanks

Yes, but double is... not integer. A double can take up to 2128 values (hereby, 128-bit), but it cannot represent all the numbers between 0 and 2128, if that was your goal. Read about double-precision in here: https://en.wikipedia.org/wiki/Double-precision_floating-point_format.

I just tested out the min/max values on my system:
Code:
#include
#include

int main() {
    printf("Min long double: %.10Le\n", LDBL_MIN);
    printf("Max long double: %.10Le\n", LDBL_MAX);
    return 0;
}

Result:
Just for fun I wrote one using C#, quick, easy and clean using SIMD

On my ancient CPU
Code:
Vector element count: 8
Populating buffer takes: 00:00:04.3862910

Nice, thanks for sharing. You are on the bit space 2^30 and it took 4 seconds, right ?

We are much faster in C though using the bit shifting method and -O1 optimization flag:
Just for fun I wrote one using C#, quick, easy and clean using SIMD

On my ancient CPU
Code:
Vector element count: 8
Populating buffer takes: 00:00:04.3862910

Nice, thanks for sharing. You are on the bit space 2^30 and it took 4 seconds, right ?

We are much faster in C though using the bit shifting method and -O1 optimization flag:
Just for fun I wrote one using C#, quick, easy and clean using SIMD

On my ancient CPU
Code:
Vector element count: 8
Populating buffer takes: 00:00:04.3862910

Nice, thanks for sharing. You are on the bit space 2^30 and it took 4 seconds, right ?

We are much faster in C though using the bit shifting method and -O1 optimization flag:
Quote
Enter the value of n: 30
Time taken for bit shifting method: 0.225257 seconds
legendary
Activity: 3472
Merit: 10611
February 11, 2024, 12:36:54 AM
#17
Just for fun I wrote one using C#, quick, easy and clean using SIMD
Code:
using System.Diagnostics;
using System.Numerics;

namespace BenchSequence
{
    internal class SeqNumberBench
    {
        public static void Run()
        {
            Stopwatch timer = Stopwatch.StartNew();

            Span buffer = new int[0x40000000]; // 2 ^30

            Vector accumulator;
            Vector toAdd;
            int vectorSize = Vector.Count;
            Console.WriteLine($"Vector element count: {vectorSize}");
            if (Vector.Count == 4)
            {
                toAdd = new Vector(new[] { 4, 4, 4, 4 });
                accumulator = new Vector(new[] { 0, 1, 2, 3, });
            }
            else if (Vector.Count == 8)
            {
                toAdd = new Vector(new[] { 8, 8, 8, 8, 8, 8, 8, 8 });
                accumulator = new Vector(new[] { 0, 1, 2, 3, 4, 5, 6, 7 });
            }
            else
            {
                throw new Exception("Invalid!");
            }

            for (int i = 0; i < buffer.Length; i += vectorSize)
            {
                accumulator.CopyTo(buffer.Slice(i, vectorSize));
                accumulator += toAdd;
            }

            TimeSpan t1 = timer.Elapsed;
            Console.WriteLine($"Populating buffer takes: {t1}");
            Console.ReadLine();
        }
    }
}

On my ancient CPU
Code:
Vector element count: 8
Populating buffer takes: 00:00:04.3862910

Here is a smaller version (0x00400000 or 2^22) that can run on the internet using sharplap

P.S. To write to disk all it takes is using BinaryWriter. To get bigger integers all it takes is to replace int with long (ulong for unsigned). To get any bigger than that a simple 256 bit struct can be defined to store the values and have an increment/add method
Code:
readonly struct UInt256
{
    ulong u0, u1, u2, u3;
}
legendary
Activity: 1512
Merit: 7340
Farewell, Leo
February 10, 2024, 02:16:08 PM
#16
that somehow contradicts the previous statement
You're doing the same mistake for pow, and that's why it appears to have tremendous difference in execution time.
Sorry if I have confused you. What I meant is that whether you keep the bitwise operation in the expression or not, it won't make a big difference in terms of time, because it is inexpensive. On the other hand, pow is more computationally expensive, and you can notice the difference if you remove it from the expression.

The -O1 option doesn't mean anything to me, I had to google it to find out what it means. Thanks for pointing out to that optimization mechanisms.
Compilers like GCC have developed overtime and come with optimization options. There are quite a lot. I don't know them all obviously, but I've used -O1, -O2 and -O3.

As you explained, the -O2 or -O3 will only have effect if using "real work" within the loop clause. Here the results I achieved for the bit shifting operations and the particular optimization levels...
I know, I'm not totally sure about what -O2 and -O3 do behind the scene, but due to them being more aggressive, I expect they ignore the empty loop.

You mean long long int, right? So it means that the technical limitation of the default data types in C would be 2^64 for the for loop, regardless of whether you use the bit shifting method or another one? What about the data type "long double", shouldn't this support up to 2^128 ?
Yes, but double is... not integer. A double can take up to 2128 values (hereby, 128-bit), but it cannot represent all the numbers between 0 and 2128, if that was your goal. Read about double-precision in here: https://en.wikipedia.org/wiki/Double-precision_floating-point_format.
hero member
Activity: 630
Merit: 731
Bitcoin g33k
February 10, 2024, 01:35:02 PM
#15
Removing the bitwise operation from the for loop expression won't have a significant difference in performance, because the bitwise operation is really computationally inexpensive.
that somehow contradicts the previous statement
You're doing the same mistake for pow, and that's why it appears to have tremendous difference in execution time.

Nonetheless, it is good practice to not make operations in expressions.
From a logical point of view, I fully agree.

Let's optimize during compiling. Append the usual command with -O1:
Code:
gcc -o foo foo.c -lm -O1
(might pop a warning about scanf, ignore it)
As I originally mentioned, I am completely new to C and therefore don't really know my way around. The -O1 option doesn't mean anything to me, I had to google it to find out what it means. Thanks for pointing out to that optimization mechanisms.

("Before" program takes a little bit more with optimization, but it isn't always true; it can take less time sometimes)
[...]
If you put something inside the for loop, then you could use -O2 or -O3 for more aggressive optimization.

Yeah, I see. The -O1 optimization flag is pretty nice Tongue As you explained, the -O2 or -O3 will only have effect if using "real work" within the loop clause. Here the results I achieved for the bit shifting operations and the particular optimization levels...

C supports 64-bit as a maximum integer
You mean long long int, right? So it means that the technical limitation of the default data types in C would be 2^64 for the for loop, regardless of whether you use the bit shifting method or another one? What about the data type "long double", shouldn't this support up to 2^128 ?

Now coming back to the roots. What mechanism do you Pro's suggest to speed things up? How to implement parallelization, multi-threading, multi-processing, concurrent work load distribution, or/and even CUDA implementation ? Let's try to challenge the performance. Currently we're on n=32 but we should go to higher bit ranges and see how performance behaves within the different domains.
legendary
Activity: 1512
Merit: 7340
Farewell, Leo
February 10, 2024, 11:28:26 AM
#14
You are so right. Thank you very much for pointing this out and correcting the performance error.
Removing the bitwise operation from the for loop expression won't have a significant difference in performance, because the bitwise operation is really computationally inexpensive. It has difference if you compare it with pow, if run for millions or billions of times. Nonetheless, it is good practice to not make operations in expressions.

As you can clearly see, the initial (BEFORE:) code I used with the expression in the for loop is faster. But I lack any justification for this.
Let's optimize during compiling. Append the usual command with -O1:
Code:
gcc -o foo foo.c -lm -O1
(might pop a warning about scanf, ignore it)

"After" program times without optimization:
Quote
Enter the value of n: 36
Time taken for bit shifting method: 130.311547 seconds

"After" program times with optimization:
Quote
Enter the value of n: 36
Time taken for bit shifting method: 16.889529 seconds
("Before" program takes a little bit more with optimization, but it isn't always true; it can take less time sometimes)

If you put something inside the for loop, then you could use -O2 or -O3 for more aggressive optimization.
hero member
Activity: 630
Merit: 731
Bitcoin g33k
February 10, 2024, 09:21:40 AM
#13
Answer: The expression i < (1LL << n) is checked in every iteration. That means that the CPU will run the bitwise operation for 2n times. Instead, to save time, you should define a constant that will be 2n, without working it out every time:
Code:
const long long int max = 1LL << n;
for (long long int i = 0; i < max; i++)

You are so right. Thank you very much for pointing this out and correcting the performance error. I have understood this and can fully comprehend it. Lesson learned

However, after trying it out myself, I am now even more confused. I have implemented this change and it seems to me that this change has worsened performance.

Before:
Code:
#include
#include
#include
#include

int main() {
    int n;

    printf("Enter the value of n: ");
    scanf("%d", &n);

    clock_t start_bitshifting = clock();
    for (long long int i = 0; i < (1LL << n); i++) {
        // Do nothing
    }
    clock_t end_bitshifting = clock();

    double time_bitshifting = ((double) (end_bitshifting - start_bitshifting)) / CLOCKS_PER_SEC;

    printf("Time taken for bit shifting method: %f seconds\n", time_bitshifting);

    return 0;
}

Quote
Enter the value of n: 36
Time taken for bit shifting method: 90.776877 seconds
Quote
Enter the value of n: 38
Time taken for bit shifting method: 362.336811 seconds

After:
Code:
#include
#include
#include
#include

int main() {
    int n;

    printf("Enter the value of n: ");
    scanf("%d", &n);

    clock_t start_bitshifting = clock();
    const long long int MAX1 = 1LL << n;
    for (long long int i = 0; i < MAX1; i++) {
        // Do nothing
    }
    clock_t end_bitshifting = clock();

    double time_bitshifting = ((double) (end_bitshifting - start_bitshifting)) / CLOCKS_PER_SEC;

    printf("Time taken for bit shifting method: %f seconds\n", time_bitshifting);

    return 0;
}

Quote
Enter the value of n: 36
Time taken for bit shifting method: 111.764399 seconds
Quote
Enter the value of n: 38
Time taken for bit shifting method: 443.352655 seconds

As you can clearly see, the initial (BEFORE:) code I used with the expression in the for loop is faster. But I lack any justification for this.
/puzzled Huh
legendary
Activity: 1512
Merit: 7340
Farewell, Leo
February 10, 2024, 09:11:16 AM
#12
First of all, I just saw your code, and you're adding an extra burden for no reason. This is how you've defined the for loop:
so I don't see any advantage in prefering the suggested complex number method in terms of speed. I see the bitwise operation being the fastest so far. Please correct me if I'm wrong.
This is what I get:
Quote
Time taken for complex number method: 1.506799 seconds

Seems like the fastest, or equally fast with the program above.
hero member
Activity: 630
Merit: 731
Bitcoin g33k
February 10, 2024, 08:45:30 AM
#11
However, for the sake of simplicity, I suggest you use complex numbers instead of this. C supports 64-bit as a maximum integer, so a complex x + y*i, with x, y ∈(0, 2^64-1) can take up to (264)2 = 2128 values. That's enough for our purposes:
Code:
#include
#define MAX __UINT64_MAX__

int main() {Enter the value of n: 32
    unsigned long long int x, y;
Enter the value of n: 30
Time taken for Bitshifting method: 1.457047 seconds
    for(x = 1; x <= MAX; x++)
        for(y = 1; y <= MAX; y++)
            printf("%llu + %llu*i\n", x, y);
        
    return 0;
}

You can define MAX to smaller values (like 210=1024) to see it finishing. If the last number printed is 1024 + 1024*i, then you can be certain it is the 220th.

I tried the method with complex numbers you suggested and compared it to the time needed when using the bit shifting method. Latter one is faster, I get better results with the bit shifting method.
I'm using MAX=32768 which represents 2^15. That means the program iterated through 2^30 numbers.
Code:
#include
#include
//#define MAX __UINT64_MAX__
#define MAX 32768

int main() {
    unsigned long long int x, y;

    clock_t start_complex = clock();
    for(x = 1; x <= MAX; x++)
        for(y = 1; y <= MAX; y++) {
            //printf("%llu + %llu*i\n", x, y);
        }

    clock_t end_complex = clock();
    double time_complex = ((double) (end_complex - start_complex)) / CLOCKS_PER_SEC;
    printf("Time taken for complex number method: %f seconds\n", time_complex);
    return 0;
}

C supports 64-bit as a maximum integer
You mean long long int, right? So it means that the technical limitation of the default data types in C would be 2^64 for the for loop, regardless of whether you use the bit shifting method or another one? What about the data type "long double", shouldn't this support up to 2^128 ?
legendary
Activity: 1512
Merit: 7340
Farewell, Leo
February 10, 2024, 05:38:01 AM
#10
The limit of 2^64 is too small, it must be compatible up to 2^256 and thus with very large numbers.
You'll have to use specialized libraries, like GMP (GNU Multiple Precision Arithmetic Library). I just wrote a small program for you to play with.

Code:
#include
#include

int main() {
    // define a 256-bit integer
    mpz_t maxIterations;
    mpz_init2(maxIterations, 256);

    // maxIterations = ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
    mpz_set_str(maxIterations, "115792089237316195423570985008687907853269984665640564039457584007913129639935", 10);

    mpz_t currentIteration;
    mpz_init(currentIteration);

    while (mpz_cmp(currentIteration, maxIterations) < 0) {
        // print the iteration
        gmp_printf("Iteration %Zx\n", currentIteration);

        // increment it
        mpz_add_ui(currentIteration, currentIteration, 1);
    }

    mpz_clear(maxIterations);
    mpz_clear(currentIteration);

    return 0;
}

If you run this, it will start printing all 256-bit numbers, from the start (0) to finish (2256-1). It will obviously never finish, because 2256-1 is astronomically huge.

However, for the sake of simplicity, I suggest you use complex numbers instead of this. C supports 64-bit as a maximum integer, so a complex x + y*i, with x, y ∈(0, 2^64-1) can take up to (264)2 = 2128 values. That's enough for our purposes:
Code:
#include
#define MAX __UINT64_MAX__

int main() {
    unsigned long long int x, y;

    for(x = 1; x <= MAX; x++)
        for(y = 1; y <= MAX; y++)
            printf("%llu + %llu*i\n", x, y);
       
    return 0;
}

You can define MAX to smaller values (like 210=1024) to see it finishing. If the last number printed is 1024 + 1024*i, then you can be certain it is the 220th.
hero member
Activity: 630
Merit: 731
Bitcoin g33k
February 10, 2024, 04:40:02 AM
#9
In order to make the program and the challenge even simpler, we can, for all I care, neglect the file output. Then we have one less link in the process chain.
So let's concentrate on the number generation. The limit of 2^64 is too small, it must be compatible up to 2^256 and thus with very large numbers.

pow returns a double, not an int. You should rather utilize bitwise operator <<.
Code:
int end_number = 2 << n;

that was a very good tip, thank you very much! the bit shifting method achieves an enormous speed advantage, as I have discovered through this comparison.

Code:
#include
#include
#include
#include

int main() {
    int n;

    printf("Enter the value of n: ");
    scanf("%d", &n);

    clock_t start_bitshifting = clock();
    for (long long int i = 0; i < (1LL << n); i++) {
    }
    clock_t end_bitshifting = clock();

    clock_t start_pow = clock();
    for (long long int i = 0; i < pow(2, n); i++) {
    }
    clock_t end_pow = clock();

    double time_bitshifting = ((double) (end_bitshifting - start_bitshifting)) / CLOCKS_PER_SEC;
    double time_pow = ((double) (end_pow - start_pow)) / CLOCKS_PER_SEC;

    printf("Time taken for bit shifting method: %f seconds\n", time_bitshifting);
    printf("Time taken for pow function method: %f seconds\n", time_pow);

    return 0;
}

Code:
$ ./bitwise_vs_pow
Quote
Enter the value of n: 30
Time taken for Bitshifting method: 1.457047 seconds
Time taken for pow function method: 11.827297 seconds
Quote
Enter the value of n: 31
Time taken for Bitshifting method: 2.836781 seconds
Time taken for pow function method: 23.645096 seconds
Quote
Enter the value of n: 32
Time taken for Bitshifting method: 5.757158 seconds
Time taken for pow function method: 47.412838 seconds
Quote
Enter the value of n: 33
Time taken for Bitshifting method: 10.960150 seconds
Time taken for pow function method: 92.934464 seconds
hero member
Activity: 686
Merit: 1341
✔️ CoinJoin Wallet
February 10, 2024, 03:20:58 AM
#8
The fastest way to generate numbers would be to use a pre-allocated array. An array is a data structure that stores a collection of elements in a contiguous block of memory. You could then use a simple for loop to access the elements in the array, which would be faster than using a random number generator function. For example, you could use the following code to generate a sequence of numbers from 0 to 100:
Code:
int array[100];
for (int i = 0; i < 100; i++) {
array[i] = i;
}

The user didn't mention random number generators.

In fact, your idea will not improve performance. In the original scenario, OP runs a for loop and prints each number to the file. In your scenario, you create an array and then you do the exact same thing, looping the array.
legendary
Activity: 1512
Merit: 7340
Farewell, Leo
February 09, 2024, 05:39:34 PM
#7
Code:
int end_number = pow(2, n);
pow returns a double, not an int. You should rather utilize bitwise operator <<.
Code:
int end_number = 2 << n;

Code:
fprintf(output_file, "%d\n", i);
Writing to the disk is going to slow this down. Why don't you just use printf? It will display the numbers in the terminal at faster rate than fprintf will write them in disk.

Also, apogio is correct about overflows. C permits you to declaring up to 64 bit integers (unsigned long long int is the largest it can get, up to 2^64 - 1).
jr. member
Activity: 37
Merit: 6
February 09, 2024, 05:11:04 PM
#6
The fastest way to generate numbers would be to use a pre-allocated array. An array is a data structure that stores a collection of elements in a contiguous block of memory. You could then use a simple for loop to access the elements in the array, which would be faster than using a random number generator function. For example, you could use the following code to generate a sequence of numbers from 0 to 100:
Code:
int array[100];
for (int i = 0; i < 100; i++) {
array[i] = i;
}
hero member
Activity: 630
Merit: 731
Bitcoin g33k
February 09, 2024, 05:02:52 PM
#5
I have provided a template so that we have a benchmark. Based on this, everyone can try to generate as many numbers as possible, write them to the output file and then compare the time needed for the operations. Goal is to achieve the fastest way. All techniques are allowed, the faster the better Smiley

This is a good challenge in fact, I will try to make it work fast and I will let you know. There are still 2 questions  though. Is it mandatory to use C ? If yes, then how will we solve the issue with the buffer overflow? We can't go higher than 30 bits using an int.

Yes, it has to be in C due to the performance requirement. I guess we'll have to stick to GMP according to many other C program and tools out there in Bitcoin community. Just think of VanitySearch or BitCrack and tools like that. They all handle big numbers as well some of them even come with CUDA capabilities.
hero member
Activity: 686
Merit: 1341
✔️ CoinJoin Wallet
February 09, 2024, 04:44:24 PM
#4
I have provided a template so that we have a benchmark. Based on this, everyone can try to generate as many numbers as possible, write them to the output file and then compare the time needed for the operations. Goal is to achieve the fastest way. All techniques are allowed, the faster the better Smiley

This is a good challenge in fact, I will try to make it work fast and I will let you know. There are still 2 questions  though. Is it mandatory to use C ? If yes, then how will we solve the issue with the buffer overflow? We can't go higher than 30 bits using an int.
hero member
Activity: 630
Merit: 731
Bitcoin g33k
February 09, 2024, 04:05:04 PM
#3
I thank you for every objective comment, no matter how helpful it is. After all, we are all learning, even the successors who will read here. And that's what it's all about, the added value for the entire community.

The approach you describe (with the batches) is what I've been using in Python so far. I am also aware that the file operations are very cost-intensive and yes, I am aware that everything runs much faster in RAM.

After all, this is just a simple program, I'm looking for a basis. The final program will look completely different anyway. I have provided a template so that we have a benchmark. Based on this, everyone can try to generate as many numbers as possible, write them to the output file and then compare the time needed for the operations. Goal is to achieve the fastest way. All techniques are allowed, the faster the better Smiley
hero member
Activity: 686
Merit: 1341
✔️ CoinJoin Wallet
February 09, 2024, 03:37:01 PM
#2
Hello, let me share my comments and perhaps we can construct something good.

First of all, I will not comment anything on CUDA because I am not very competent with it and I don't want to mislead you.

Now, my comments:

1. You basically want to generate numbers from 1 to 2^n where 1 <= n <= 100. But, I can assure you that printing them on the file requires much more time than just generating them on the RAM. If you measure how much time it takes to fprintf all the numbers on the file, versus the time it takes to fprintf only the last number (which will happen when the program terminates), the time elapsed will differ dramatically. But, I believe you want to print all the numbers, so my comment isn't very accurate.

2.Writing on the file, requires consecutive disk I/Os which are very time consuming. A work-around would be to print the numbers in batches. But the problem with C is that you cannot print the numbers without using recursion or loops. So you could add the numbers to an array in RAM (let's say the size of the array is 10,000). Then every 10,000 numbers, you would need to print to the file in batch. But, as I said, batch writing is not possible in C. You still need to loop the array and print one-by-one so you will technically add an overhead to your whole experiment which will not improve the time needed. However, the concept could be applied in other programming languages.

3. You assign numbers to an integer (int). In C, an int can take values from -2,147,483,648 to 2,147,483,647. The latter is equal to 2^31 + 1, which means that you cannot put any number larger than 30. So if you put n=100, then you will get a buffer overflow. Try it, just for fun.

Disclaimer: My last C program was written 10 years ago, so please be nice with me  Tongue
hero member
Activity: 630
Merit: 731
Bitcoin g33k
February 09, 2024, 02:29:56 PM
#1
Hi folks.

I'm trying to achieve the highest possible throughput for number generation and am making my first attempts in C. Unfortunately I can't manage to work with multithreading, all my attempts failed. The goal is to simply generate numbers from 1 to 2^n and measure the time needed to do so in order to work as efficiently as possible. My current simple program looks like this:

The user can enter an integer number from 1-100 and then the program starts and generates all numbers starting from 1 up to 2^n. It writes the result into the output.txt file. At the end of the program it displays the required time.

Code:
#include
#include
#include
#include

int main() {
    int n;
    printf("Enter the value of n (1-100): ");
    scanf("%d", &n);

    if (n < 1 || n > 100) {
        printf("Invalid input. Please enter an integer between 1 and 100.\n");
        return 1;
    }

    clock_t start_time, end_time;
    double total_time;

    start_time = clock();

    FILE *output_file = fopen("output.txt", "w");
    if (output_file == NULL) {
        printf("Error opening file.\n");
        return 1;
    }

    int end_number = pow(2, n);

    for (int i = 1; i <= end_number; i++) {
        fprintf(output_file, "%d\n", i);
    }

    fclose(output_file);

    end_time = clock();
    total_time = (double)(end_time - start_time) / CLOCKS_PER_SEC;
    printf("Total runtime: %.4f seconds\n", total_time);

    return 0;
}

Here are some benchmark results on my machine for this simple program:

Quote
25 bit --> 1.9 seconds
28 bit --> 9.87 seconds
30 bit --> 40.01 seconds

I would like to be able to use multithreading or multiprocessing so that the program runs not only on one thread/core but optionally on several or all cores available to the system. The load would have to be shared, similar to Python with concurrent feautures with which I had the best experience.

The very best would of course be the use of CUDA, but I have absolutely no idea how to write the kernel for this.A typical sequence of operations for a CUDA C program is:

Declare and allocate host and device memory.
Initialize host data.
Transfer data from the host to the device.
Execute one or more kernels.
Transfer results from the device to the host.

If anyone could assists with a simple basic structure as a template, that would help me a lot. I am very grateful for any tips.
Jump to: