Skip to content

Commit e5a7aec

Browse files
committed
feat: add CUDA RNG
1 parent 31e77e1 commit e5a7aec

File tree

6 files changed

+217
-26
lines changed

6 files changed

+217
-26
lines changed

README.md

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ Inference of [Stable Diffusion](https://github.com/CompVis/stable-diffusion) in
2020
- [stable-diffusion-webui](https://github.com/AUTOMATIC1111/stable-diffusion-webui) style tokenizer (not all the features, only token weighting for now)
2121
- Sampling method
2222
- `Euler A`
23+
- Cross-platform reproducibility (`--rng cuda`, consistent with the `stable-diffusion-webui GPU RNG`)
2324
- Supported platforms
2425
- Linux
2526
- Mac OS
@@ -35,8 +36,6 @@ Inference of [Stable Diffusion](https://github.com/CompVis/stable-diffusion) in
3536
- [ ] Continuing to reduce memory usage (quantizing the weights of ggml_conv_2d)
3637
- [ ] LoRA support
3738
- [ ] k-quants support
38-
- [ ] Cross-platform reproducibility (perhaps ensuring consistency with the original SD)
39-
- [ ] Adapting to more weight formats
4039

4140
## Usage
4241

examples/main.cpp

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,11 @@ int32_t get_num_physical_cores() {
6767
return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
6868
}
6969

70+
// Command-line names of the RNG back ends, indexed by RNGType value
// (STD_DEFAULT_RNG -> "std_default", CUDA_RNG -> "cuda"). The entries must
// stay in the same order as the RNGType enumerators. Both the strings and the
// pointers are read-only, so the table cannot be repointed at run time.
const char* const rng_type_to_str[] = {
    "std_default",
    "cuda",
};
74+
7075
struct Option {
7176
int n_threads = -1;
7277
std::string mode = TXT2IMG;
@@ -81,6 +86,7 @@ struct Option {
8186
SampleMethod sample_method = EULAR_A;
8287
int sample_steps = 20;
8388
float strength = 0.75f;
89+
RNGType rng_type = STD_DEFAULT_RNG;
8490
int seed = 42;
8591
bool verbose = false;
8692

@@ -99,6 +105,7 @@ struct Option {
99105
printf(" sample_method: %s\n", "eular a");
100106
printf(" sample_steps: %d\n", sample_steps);
101107
printf(" strength: %.2f\n", strength);
108+
printf(" rng: %s\n", rng_type_to_str[rng_type]);
102109
printf(" seed: %d\n", seed);
103110
}
104111
};
@@ -123,6 +130,7 @@ void print_usage(int argc, const char* argv[]) {
123130
printf(" -W, --width W image width, in pixel space (default: 512)\n");
124131
printf(" --sample-method SAMPLE_METHOD sample method (default: \"eular a\")\n");
125132
printf(" --steps STEPS number of sample steps (default: 20)\n");
133+
printf(" --rng {std_default, cuda} RNG (default: std_default)\n");
126134
printf(" -s SEED, --seed SEED RNG seed (default: 42, use random seed for < 0)\n");
127135
printf(" -v, --verbose print extra info\n");
128136
}
@@ -206,6 +214,20 @@ void parse_args(int argc, const char* argv[], Option* opt) {
206214
break;
207215
}
208216
opt->sample_steps = std::stoi(argv[i]);
217+
} else if (arg == "--rng") {
218+
if (++i >= argc) {
219+
invalid_arg = true;
220+
break;
221+
}
222+
std::string rng_type_str = argv[i];
223+
if (rng_type_str == "std_default") {
224+
opt->rng_type = STD_DEFAULT_RNG;
225+
} else if (rng_type_str == "cuda") {
226+
opt->rng_type = CUDA_RNG;
227+
} else {
228+
invalid_arg = true;
229+
break;
230+
}
209231
} else if (arg == "-s" || arg == "--seed") {
210232
if (++i >= argc) {
211233
invalid_arg = true;
@@ -328,7 +350,7 @@ int main(int argc, const char* argv[]) {
328350
init_img.assign(img_data, img_data + (opt.w * opt.h * c));
329351
}
330352

331-
StableDiffusion sd(opt.n_threads, vae_decode_only, true);
353+
StableDiffusion sd(opt.n_threads, vae_decode_only, true, opt.rng_type);
332354
if (!sd.load_from_file(opt.model_path)) {
333355
return 1;
334356
}

rng.h

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
#ifndef __RNG_H__
2+
#define __RNG_H__
3+
4+
#include <random>
5+
#include <vector>
6+
7+
// Interface for a seedable generator of random floats.
class RNG {
public:
    // Virtual so that a concrete generator is destroyed correctly when it is
    // deleted through an RNG pointer.
    virtual ~RNG() = default;

    // Re-seeds the generator with `seed`.
    virtual void manual_seed(uint32_t seed) = 0;

    // Returns `n` newly generated samples.
    virtual std::vector<float> randn(uint32_t n) = 0;
};

// RNG backed by std::default_random_engine; samples come from a
// std::normal_distribution with mean 0 and standard deviation 1.
class STDDefaultRNG : public RNG {
private:
    std::default_random_engine generator;

public:
    // Re-seeds the underlying engine.
    void manual_seed(uint32_t seed) override {
        generator.seed(seed);
    }

    // Draws `n` samples in order from the engine. The distribution object is
    // local to the call, so no distribution state carries over between calls.
    std::vector<float> randn(uint32_t n) override {
        std::vector<float> result;
        result.reserve(n);  // one allocation instead of repeated growth
        float mean   = 0.0;
        float stddev = 1.0;
        std::normal_distribution<float> distribution(mean, stddev);
        // Unsigned index: matches the type of `n`, no mixed-sign comparison.
        for (uint32_t i = 0; i < n; i++) {
            result.push_back(distribution(generator));
        }
        return result;
    }
};
34+
35+
#endif // __RNG_H__

rng_philox.h

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
#ifndef __RNG_PHILOX_H__
2+
#define __RNG_PHILOX_H__
3+
4+
#include <cmath>
5+
#include <vector>
6+
7+
#include "rng.h"
8+
9+
// RNG imitiating torch cuda randn on CPU.
10+
// Port from: https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/5ef669de080814067961f28357256e8fe27544f4/modules/rng_philox.py
11+
class PhiloxRNG : public RNG {
12+
private:
13+
uint64_t seed;
14+
uint32_t offset;
15+
16+
private:
17+
std::vector<uint32_t> philox_m = {0xD2511F53, 0xCD9E8D57};
18+
std::vector<uint32_t> philox_w = {0x9E3779B9, 0xBB67AE85};
19+
float two_pow32_inv = 2.3283064e-10;
20+
float two_pow32_inv_2pi = 2.3283064e-10 * 6.2831855;
21+
22+
std::vector<uint32_t> uint32(uint64_t x) {
23+
std::vector<uint32_t> result(2);
24+
result[0] = static_cast<uint32_t>(x & 0xFFFFFFFF);
25+
result[1] = static_cast<uint32_t>(x >> 32);
26+
return result;
27+
}
28+
29+
std::vector<std::vector<uint32_t>> uint32(const std::vector<uint64_t>& x) {
30+
int N = x.size();
31+
std::vector<std::vector<uint32_t>> result(2, std::vector<uint32_t>(N));
32+
33+
for (int i = 0; i < N; ++i) {
34+
result[0][i] = static_cast<uint32_t>(x[i] & 0xFFFFFFFF);
35+
result[1][i] = static_cast<uint32_t>(x[i] >> 32);
36+
}
37+
38+
return result;
39+
}
40+
41+
// A single round of the Philox 4x32 random number generator.
42+
void philox4_round(std::vector<std::vector<uint32_t>>& counter,
43+
const std::vector<std::vector<uint32_t>>& key) {
44+
uint32_t N = counter[0].size();
45+
for (uint32_t i = 0; i < N; i++) {
46+
std::vector<uint32_t> v1 = uint32(static_cast<uint64_t>(counter[0][i]) * static_cast<uint64_t>(philox_m[0]));
47+
std::vector<uint32_t> v2 = uint32(static_cast<uint64_t>(counter[2][i]) * static_cast<uint64_t>(philox_m[1]));
48+
49+
counter[0][i] = v2[1] ^ counter[1][i] ^ key[0][i];
50+
counter[1][i] = v2[0];
51+
counter[2][i] = v1[1] ^ counter[3][i] ^ key[1][i];
52+
counter[3][i] = v1[0];
53+
}
54+
}
55+
56+
// Generates 32-bit random numbers using the Philox 4x32 random number generator.
57+
// Parameters:
58+
// counter : A 4xN array of 32-bit integers representing the counter values (offset into generation).
59+
// key : A 2xN array of 32-bit integers representing the key values (seed).
60+
// rounds : The number of rounds to perform.
61+
// Returns:
62+
// std::vector<std::vector<uint32_t>>: A 4xN array of 32-bit integers containing the generated random numbers.
63+
std::vector<std::vector<uint32_t>> philox4_32(std::vector<std::vector<uint32_t>>& counter,
64+
std::vector<std::vector<uint32_t>>& key,
65+
int rounds = 10) {
66+
uint32_t N = counter[0].size();
67+
for (int i = 0; i < rounds - 1; ++i) {
68+
philox4_round(counter, key);
69+
70+
for (uint32_t j = 0; j < N; ++j) {
71+
key[0][j] += philox_w[0];
72+
key[1][j] += philox_w[1];
73+
}
74+
}
75+
76+
philox4_round(counter, key);
77+
return counter;
78+
}
79+
80+
float box_muller(float x, float y) {
81+
float u = x * two_pow32_inv + two_pow32_inv / 2;
82+
float v = y * two_pow32_inv_2pi + two_pow32_inv_2pi / 2;
83+
84+
float s = sqrt(-2.0 * log(u));
85+
86+
float r1 = s * sin(v);
87+
return r1;
88+
}
89+
90+
public:
91+
PhiloxRNG(uint64_t seed = 0) {
92+
this->seed = seed;
93+
this->offset = 0;
94+
}
95+
96+
void manual_seed(uint32_t seed) {
97+
this->seed = seed;
98+
this->offset = 0;
99+
}
100+
101+
std::vector<float> randn(uint32_t n) {
102+
std::vector<std::vector<uint32_t>> counter(4, std::vector<uint32_t>(n, 0));
103+
for (uint32_t i = 0; i < n; i++) {
104+
counter[0][i] = this->offset;
105+
}
106+
107+
for (uint32_t i = 0; i < n; i++) {
108+
counter[2][i] = i;
109+
}
110+
this->offset += 1;
111+
112+
std::vector<uint64_t> key(n, this->seed);
113+
std::vector<std::vector<uint32_t>> key_uint32 = uint32(key);
114+
115+
std::vector<std::vector<uint32_t>> g = philox4_32(counter, key_uint32);
116+
117+
std::vector<float> result;
118+
for (int i = 0; i < n; ++i) {
119+
result.push_back(box_muller(g[0][i], g[1][i]));
120+
}
121+
return result;
122+
}
123+
};
124+
125+
#endif // __RNG_PHILOX_H__

stable-diffusion.cpp

Lines changed: 25 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515

1616
#include "ggml/ggml.h"
1717
#include "stable-diffusion.h"
18+
#include "rng.h"
19+
#include "rng_philox.h"
1820

1921
static SDLogLevel log_level = SDLogLevel::INFO;
2022

@@ -117,19 +119,11 @@ ggml_tensor* load_tensor_from_file(ggml_context* ctx, const std::string& file_pa
117119
return tensor;
118120
}
119121

120-
static std::default_random_engine generator;
121-
122-
void set_random_seed(int seed) {
123-
generator.seed(seed);
124-
}
125-
126-
void ggml_tensor_set_f32_randn(struct ggml_tensor* tensor) {
127-
float mean = 0.0;
128-
float stddev = 1.0;
129-
std::normal_distribution<float> distribution(mean, stddev);
130-
for (int i = 0; i < ggml_nelements(tensor); i++) {
131-
float random_number = distribution(generator);
132-
ggml_set_f32_1d(tensor, i, random_number);
122+
void ggml_tensor_set_f32_randn(struct ggml_tensor* tensor, std::shared_ptr<RNG> rng) {
123+
uint32_t n = ggml_nelements(tensor);
124+
std::vector<float> random_numbers = rng->randn(n);
125+
for (int i = 0; i < n; i++) {
126+
ggml_set_f32_1d(tensor, i, random_numbers[i]);
133127
}
134128
}
135129

@@ -2747,6 +2741,8 @@ class StableDiffusionGGML {
27472741
bool dynamic = true;
27482742
bool vae_decode_only = false;
27492743
bool free_params_immediately = false;
2744+
2745+
std::shared_ptr<RNG> rng = std::make_shared<STDDefaultRNG>();
27502746
int32_t ftype = 1;
27512747
int n_threads = -1;
27522748
float scale_factor = 0.18215f;
@@ -2765,11 +2761,17 @@ class StableDiffusionGGML {
27652761

27662762
StableDiffusionGGML(int n_threads,
27672763
bool vae_decode_only,
2768-
bool free_params_immediately)
2764+
bool free_params_immediately,
2765+
RNGType rng_type)
27692766
: n_threads(n_threads),
27702767
vae_decode_only(vae_decode_only),
27712768
free_params_immediately(free_params_immediately) {
27722769
first_stage_model.decode_only = vae_decode_only;
2770+
if (rng_type == STD_DEFAULT_RNG) {
2771+
rng = std::make_shared<STDDefaultRNG>();
2772+
} else if (rng_type == CUDA_RNG) {
2773+
rng = std::make_shared<PhiloxRNG>();
2774+
}
27732775
}
27742776

27752777
~StableDiffusionGGML() {
@@ -3539,7 +3541,7 @@ class StableDiffusionGGML {
35393541

35403542
if (sigmas[i + 1] > 0) {
35413543
// x = x + noise_sampler(sigmas[i], sigmas[i + 1]) * s_noise * sigma_up
3542-
ggml_tensor_set_f32_randn(noise);
3544+
ggml_tensor_set_f32_randn(noise, rng);
35433545
// noise = load_tensor_from_file(res_ctx, "./rand" + std::to_string(i+1) + ".bin");
35443546
{
35453547
float* vec_x = (float*)x->data;
@@ -3674,7 +3676,7 @@ class StableDiffusionGGML {
36743676
ggml_tensor* latent = ggml_new_tensor_4d(res_ctx, moments->type, moments->ne[0],
36753677
moments->ne[1], moments->ne[2] / 2, moments->ne[3]);
36763678
struct ggml_tensor* noise = ggml_dup_tensor(res_ctx, latent);
3677-
ggml_tensor_set_f32_randn(noise);
3679+
ggml_tensor_set_f32_randn(noise, rng);
36783680
// noise = load_tensor_from_file(res_ctx, "noise.bin");
36793681
{
36803682
float mean = 0;
@@ -3802,10 +3804,12 @@ class StableDiffusionGGML {
38023804

38033805
// Creates the StableDiffusionGGML implementation object, forwarding every
// constructor argument unchanged, and stores it in `sd`.
StableDiffusion::StableDiffusion(int n_threads,
                                 bool vae_decode_only,
                                 bool free_params_immediately,
                                 RNGType rng_type) {
    std::shared_ptr<StableDiffusionGGML> impl =
        std::make_shared<StableDiffusionGGML>(n_threads,
                                              vae_decode_only,
                                              free_params_immediately,
                                              rng_type);
    sd = impl;
}
38103814

38113815
bool StableDiffusion::load_from_file(const std::string& file_path) {
@@ -3835,7 +3839,7 @@ std::vector<uint8_t> StableDiffusion::txt2img(const std::string& prompt,
38353839
if (seed < 0) {
38363840
seed = (int)time(NULL);
38373841
}
3838-
set_random_seed(seed);
3842+
sd->rng->manual_seed(seed);
38393843

38403844
int64_t t0 = ggml_time_ms();
38413845
ggml_tensor* c = sd->get_learned_condition(ctx, prompt);
@@ -3856,7 +3860,7 @@ std::vector<uint8_t> StableDiffusion::txt2img(const std::string& prompt,
38563860
int W = width / 8;
38573861
int H = height / 8;
38583862
struct ggml_tensor* x_t = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, W, H, C, 1);
3859-
ggml_tensor_set_f32_randn(x_t);
3863+
ggml_tensor_set_f32_randn(x_t, sd->rng);
38603864

38613865
std::vector<float> sigmas = sd->denoiser->get_sigmas(sample_steps);
38623866

@@ -3935,7 +3939,7 @@ std::vector<uint8_t> StableDiffusion::img2img(const std::vector<uint8_t>& init_i
39353939
if (seed < 0) {
39363940
seed = (int)time(NULL);
39373941
}
3938-
set_random_seed(seed);
3942+
sd->rng->manual_seed(seed);
39393943

39403944
ggml_tensor* init_img = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, width, height, 3, 1);
39413945
image_vec_to_ggml(init_img_vec, init_img);

stable-diffusion.h

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,18 @@
44
#include <memory>
55
#include <vector>
66

7-
enum class SDLogLevel {
7+
// Log severity levels. stable-diffusion.cpp keeps the active level in a
// file-static `log_level` that is initialised to SDLogLevel::INFO.
// NOTE(review): this is an unscoped enum, so DEBUG/INFO/WARN/ERROR are also
// visible unqualified in the enclosing scope - confirm that none of them
// collides with a macro on the supported platforms.
enum SDLogLevel {
88
DEBUG,
99
INFO,
1010
WARN,
1111
ERROR
1212
};
1313

14+
// Selects which RNG implementation StableDiffusion uses.
// examples/main.cpp uses the enumerator value directly as an index into its
// rng_type_to_str table, so the order here must match that table.
enum RNGType {
15+
// Selects STDDefaultRNG (also the default constructor argument).
STD_DEFAULT_RNG,
16+
// Selects PhiloxRNG; chosen on the command line with `--rng cuda`.
CUDA_RNG
17+
};
18+
1419
enum SampleMethod {
1520
EULAR_A,
1621
};
@@ -24,7 +29,8 @@ class StableDiffusion {
2429
public:
2530
StableDiffusion(int n_threads = -1,
2631
bool vae_decode_only = false,
27-
bool free_params_immediately = false);
32+
bool free_params_immediately = false,
33+
RNGType rng_type = STD_DEFAULT_RNG);
2834
bool load_from_file(const std::string& file_path);
2935
std::vector<uint8_t> txt2img(
3036
const std::string& prompt,

0 commit comments

Comments
 (0)