haskell-te: c63966f573e68ee4961be56a5266da013f0c2466
1: # Commands which split their input into various "buckets", e.g. based on
2: # clustering. We don't do any exploration or reduction, we just look at the
3: # resulting buckets.
4: { bash, bc, cluster, format, jq, mkBin, ghc, runCommand, runWeka, stdenv,
5: withDeps, wrap, writeScript }:
6:
7: with rec {
hashes = mkBin {
  name  = "hashBucket";
  # NOTE(review): the script below no longer calls runhaskell, so it does not
  # need ghc itself. ghc is kept on the PATH only in case format.fromStdin
  # relies on it -- TODO confirm and drop.
  paths = [ bash bc ghc jq ];
  vars  = { SIMPLE = "1"; };
  script = ''
    #!${bash}/bin/bash
    set -e
    set -o pipefail

    # hashBucket: read JSON entries from stdin (an array, or a stream of raw
    # objects), tag each entry with a pseudo-random "cluster" number derived
    # from a SHA256 checksum, and pipe the resulting array through the
    # formatting command.
    #
    # Environment:
    #   CLUSTER_SIZE  if non-empty, the number of clusters is
    #                 ceil(number of entries / CLUSTER_SIZE)
    #   CLUSTERS      number of clusters to use when CLUSTER_SIZE is not given;
    #                 if this is empty too, the rounded square root of the
    #                 number of entries is used
    INPUT=$(cat)

    # Wrap up raw objects into an array
    # (grep output goes to /dev/null rather than using an early-exiting grep,
    # so the jq side of the pipeline always runs to completion under pipefail)
    if echo "$INPUT" | jq -r 'type' | grep 'object' > /dev/null
    then
      INPUT=$(echo "$INPUT" | jq -s '.')
    fi

    if [[ -n "$CLUSTER_SIZE" ]]
    then
      echo "Using cluster size of $CLUSTER_SIZE" 1>&2
      LENGTH=$(echo "$INPUT" | jq 'length')

      # Empty stdin makes jq print nothing; treat that as zero entries
      [[ -n "$LENGTH" ]] || LENGTH=0

      # Ceiling division, done in jq (already a dependency) instead of
      # spinning up a Haskell interpreter for one arithmetic expression
      CLUSTERS=$(jq -n --argjson len  "$LENGTH"       \
                       --argjson size "$CLUSTER_SIZE" \
                       '($len / $size) | ceil')

      echo "Using $CLUSTERS clusters of length $CLUSTER_SIZE" 1>&2
    fi

    [[ -n "$CLUSTERS" ]] || {
      # Round sqrt(length) to the nearest integer
      CLUSTERS=$(echo "$INPUT" | jq 'length | sqrt | . + 0.5 | floor')
      export CLUSTERS

      echo "No cluster count given; using $CLUSTERS (sqrt of sample size)" 1>&2
    }

    clCount="$CLUSTERS"
    export clCount

    # Emit a JSON array of the input entries, each extended with a "cluster"
    # field in the range 1..CLUSTERS
    function getHashes() {
      echo "Calculating SHA256 checksums of names" 1>&2
      while read -r ENTRY
      do
        NAME=$(echo "$ENTRY" | jq -r '.name')

        # The whole input and the cluster count are mixed into the hashed
        # string, so the bucket chosen for a name depends on the sample it
        # appears in. bc needs upper-case hex digits, hence the tr.
        SHA=$(echo "clusters-$clCount-name-$NAME-entropy-input-$INPUT" |
              sha256sum | cut -d ' ' -f1 | tr '[:lower:]' '[:upper:]')

        # Convert hex to decimal. Use large BC_LINE_LENGTH to avoid line-
        # breaking.
        SHADEC=$(echo "ibase=16; $SHA" | BC_LINE_LENGTH=5000 bc)

        # Calculate modulo, now that both numbers are in decimal
        NUM=$(echo "$SHADEC % $CLUSTERS" | BC_LINE_LENGTH=5000 bc)

        # Cluster numbers start from 1
        echo "$ENTRY" | jq --argjson num "$NUM" '. + {"cluster": ($num + 1)}'
      done < <(echo "$INPUT" | jq -c '.[]') | jq -s '.'
    }

    getHashes | "${format.fromStdin}"
  '';
};
71:
hashCheck = runCommand "hash-bucket-check" { buildInputs = [ hashes jq ]; } ''
  set -eo pipefail

  # Smoke tests for hashBucket; the build only succeeds if every jq -e
  # assertion below holds.

  # Empty stdin must yield output of length 0
  echo "Testing empty input" 1>&2
  echo "" | CLUSTER_SIZE=10 hashBucket 1 1 | jq -e 'length == 0'

  # A single entry must come back as one bucket containing just that entry,
  # tagged with cluster 1
  echo "Testing single input" 1>&2
  SAMPLE='{"name":"foo", "type": "T", "quickspecable": true}'
  echo "[$SAMPLE]" | CLUSTER_SIZE=10 hashBucket 1 1 |
    jq -e --argjson want "$SAMPLE" '. == [[$want + {"cluster":1}]]'

  mkdir "$out"
'';
86:
recurrent = mkBin {
  name   = "recurrentBucket";
  paths  = [ bash jq runWeka ];
  vars   = { SIMPLE = "1"; };
  script = ''
    #!${bash}/bin/bash
    set -eo pipefail

    # recurrentBucket: run the clustering command on stdin, then pass its
    # output through the formatting command.
    BUCKETED=$(${cluster})

    # Largest cluster number in the result, exported so child processes can
    # see it (presumably read by the formatting command -- confirm)
    clCount=$(echo "$BUCKETED" | jq '[.[].cluster] | max')
    export clCount

    echo "$BUCKETED" | "${format.fromStdin}"
  '';
};
105: };
106:
{
  # hashBucket is only exposed via withDeps together with its check derivation
  # (presumably so the check must build before the command is usable --
  # confirm against withDeps)
  hashes = withDeps [ hashCheck ] hashes;

  # recurrentBucket is exposed unchanged
  inherit recurrent;
}
Generated by git2html.