haskell-te: c63966f573e68ee4961be56a5266da013f0c2466

     1: # Commands which split their input into various "buckets", e.g. based on
     2: # clustering. We don't do any exploration or reduction, we just look at the
     3: # resulting buckets.
     4: { bash, bc, cluster, format, jq, mkBin, ghc, runCommand, runWeka, stdenv,
     5:   withDeps, wrap, writeScript }:
     6: 
     7: with rec {
     8:   hashes = mkBin {
     9:     name   = "hashBucket";
    10:     vars   = { SIMPLE = "1"; };
    11:     paths  = [ bash bc ghc jq ];
    12:     script = ''
    13:       #!${bash}/bin/bash
    14:       set -e
    15:       set -o pipefail
    16: 
    17:       # Read all of stdin; a bare stream of objects is collected into one array
    18:       INPUT=$(cat)
    19:       if echo "$INPUT" | jq -r 'type' | grep 'object' > /dev/null; then
    20:         INPUT=$(echo "$INPUT" | jq -s '.')
    21:       fi
    22: 
    23:       # When a cluster size is given, the cluster count is recomputed as
    24:       # ceiling(length / size); a missing length is treated as zero
    25:       if [[ -n "$CLUSTER_SIZE" ]]; then
    26:         echo "Using cluster size of $CLUSTER_SIZE" >&2
    27:         LENGTH=$(echo "$INPUT" | jq 'length')
    28:         if [[ -z "$LENGTH" ]]; then LENGTH=0; fi
    29: 
    30:         CLUSTERS=$(echo "main = print (ceiling (($LENGTH :: Float) / $CLUSTER_SIZE) :: Int)" |
    31:                    runhaskell)
    32:         echo "Using $CLUSTERS clusters of length $CLUSTER_SIZE" >&2
    33:       fi
    34: 
    35:       # Still no count: fall back to the rounded square root of the length
    36:       if [[ -z "$CLUSTERS" ]]; then
    37:         CLUSTERS=$(echo "$INPUT" | jq 'length | sqrt | . + 0.5 | floor')
    38:         export CLUSTERS
    39: 
    40:         echo "No cluster count given; using $CLUSTERS (sqrt of sample size)" >&2
    41:       fi
    42: 
    43:       export clCount="$CLUSTERS"
    44: 
    45:       # Emit the input array with a "cluster" number added to every entry
    46:       getHashes() {
    47:         echo "Calculating SHA256 checksums of names" >&2
    48:         while read -r item
    49:         do
    50:           label=$(echo "$item" | jq -r '.name')
    51: 
    52:           # Uppercase hex SHA256 of the count, the name and the whole input
    53:           digest=$(echo "clusters-$clCount-name-$label-entropy-input-$INPUT" |
    54:                    sha256sum | cut -d ' ' -f1 | tr '[:lower:]' '[:upper:]')
    55: 
    56:           # bc reads the digest as base 16 and prints it in decimal; the
    57:           # large BC_LINE_LENGTH keeps bc from wrapping the long result
    58:           decimal=$(echo "ibase=16; $digest" | BC_LINE_LENGTH=5000 bc)
    59: 
    60:           # Remainder modulo the cluster count; jq adds 1 so numbering starts at 1
    61:           bucket=$(echo "$decimal % $CLUSTERS" | BC_LINE_LENGTH=5000 bc)
    62: 
    63:           echo "$item" | jq --argjson num "$bucket" '. + {"cluster": ($num + 1)}'
    64:         done < <(echo "$INPUT" | jq -c '.[]') | jq -s '.'
    65:       }
    66: 
    67:       # Pipe the tagged entries into the formatter
    68:       getHashes | "${format.fromStdin}"
    69:     '';
    70:   };
    71: 
    72:   hashCheck = runCommand "hash-bucket-check" { buildInputs = [ hashes jq ]; } ''
    73:     set -e
    74:     set -o pipefail
    75:     # Empty input must produce a result of length zero
    76:     echo "Testing empty input" >&2
    77:     echo | CLUSTER_SIZE=10 hashBucket 1 1 | jq -e 'length == 0'
    78: 
    79:     # A lone entry must come back as [[entry + {"cluster": 1}]]
    80:     echo "Testing single input" >&2
    81:     SAMPLE='{"name":"foo", "type": "T", "quickspecable": true}'
    82:     echo "[$SAMPLE]" | CLUSTER_SIZE=10 hashBucket 1 1 |
    83:       jq -e --argjson o "$SAMPLE" '. == [[$o + {"cluster":1}]]'
    84:     mkdir "$out"  # both checks passed: create the output directory
    85:   '';
    86: 
    87:   recurrent = mkBin {
    88:     name   = "recurrentBucket";
    89:     vars   = { SIMPLE = "1"; };
    90:     paths  = [ bash jq runWeka ];
    91:     script = ''
    92:       #!${bash}/bin/bash
    93:       set -e
    94:       set -o pipefail
    95: 
    96:       # Run the clustering command and capture its output
    97:       RESULT=$(${cluster})
    98: 
    99:       # The largest cluster number found is exported as the cluster count
   100:       clCount=$(echo "$RESULT" | jq '[.[] | .cluster] | max')
   101:       export clCount
   102:       echo "$RESULT" | "${format.fromStdin}"
   103:     '';
   104:   };
   105: };
   106: 
   107: {
   108:   hashes = withDeps [ hashCheck ] hashes;  # presumably ties hashCheck to hashes; confirm withDeps
   109:   inherit recurrent;                       # exposed unchanged
   110: }

Generated by git2html.