-
Notifications
You must be signed in to change notification settings - Fork 302
/
Copy pathindex.html
889 lines (694 loc) · 57.3 KB
/
index.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
<!DOCTYPE html>
<html lang="en">
<head>
<!-- Google Tag Manager -->
<script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':
new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],
j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src=
'https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f);
})(window,document,'script','dataLayer','GTM-T8XT4PS');</script>
<!-- End Google Tag Manager -->
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<meta http-equiv="X-UA-Compatible" content="ie=edge">
<link rel="shortcut icon" type="image/x-icon" href="/favicon.ico?">
<title>
Accelerating Generative AI with PyTorch II: GPT, Fast | PyTorch
</title>
<meta name="robots" content="index, follow" />
<meta name="description" content="This post is the second part of a multi-series blog focused on how to accelerate generative AI models with pure, native PyTorch. We are excited to share a breadth of newly released PyTorch performance features alongside practical examples to see how far we can push PyTorch native performance. In part one, we showed how to accelerate Segment Anything over 8x using only pure, native PyTorch. In this blog we’ll focus on LLM optimization.
" />
<meta property="og:image" content="https://pytorch.org/assets/images/accelerating-generative-ai-2/social-share.jpg" />
<meta name="twitter:image" content="https://pytorch.org/assets/images/accelerating-generative-ai-2/social-share.jpg" />
<meta property="og:locale" content="en_US" />
<meta property="og:type" content="website" />
<meta property="og:title" content="Accelerating Generative AI with PyTorch II: GPT, Fast" />
<meta property="og:description" content="This post is the second part of a multi-series blog focused on how to accelerate generative AI models with pure, native PyTorch. We are excited to share a breadth of newly released PyTorch performance features alongside practical examples to see how far we can push PyTorch native performance. In part one, we showed how to accelerate Segment Anything over 8x using only pure, native PyTorch. In this blog we’ll focus on LLM optimization.
" />
<meta property="og:site_name" content="PyTorch" />
<meta name="twitter:card" content="summary_large_image" />
<meta name="twitter:title" content="Accelerating Generative AI with PyTorch II: GPT, Fast" />
<meta name="twitter:description" content="This post is the second part of a multi-series blog focused on how to accelerate generative AI models with pure, native PyTorch. We are excited to share a breadth of newly released PyTorch performance features alongside practical examples to see how far we can push PyTorch native performance. In part one, we showed how to accelerate Segment Anything over 8x using only pure, native PyTorch. In this blog we’ll focus on LLM optimization.
" />
<link rel="stylesheet" href="/assets/main.css">
<script src="/assets/vendor/jquery.min.js"></script>
<script src="/assets/vendor/popper.min.js"></script>
<script src="/assets/vendor/bootstrap.min.js"></script>
<script src="/assets/vendor/anchor.min.js"></script>
<script type="text/x-mathjax-config">
MathJax.Hub.Config({
tex2jax: {
skipTags: ['script', 'noscript', 'style', 'textarea', 'pre'],
inlineMath: [['$','$']]
}
});
</script>
<script src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML" type="text/javascript"></script>
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/docsearch.js@2/dist/cdn/docsearch.min.js"></script>
<script>
!function(f,b,e,v,n,t,s)
{if(f.fbq)return;n=f.fbq=function(){n.callMethod?
n.callMethod.apply(n,arguments):n.queue.push(arguments)};
if(!f._fbq)f._fbq=n;n.push=n;n.loaded=!0;n.version='2.0';
n.queue=[];t=b.createElement(e);t.async=!0;
t.src=v;s=b.getElementsByTagName(e)[0];
s.parentNode.insertBefore(t,s)}(window,document,'script',
'https://connect.facebook.net/en_US/fbevents.js');
fbq('init', '243028289693773');
fbq('track', 'PageView');
</script>
<noscript>
<img height="1" width="1"
src="https://www.facebook.com/tr?id=243028289693773&ev=PageView
&noscript=1"/>
</noscript>
<!-- Twitter universal website tag code -->
<img height="1" width="1" style="display:none;" alt="" src="https://analytics.twitter.com/i/adsct?p_id=Twitter&p_user_id=0&txn_id=o2gi1&events=%5B%5B%22pageview%22%2Cnull%5D%5D&tw_sale_amount=0&tw_order_quantity=0 (https://urldefense.proofpoint.com/v2/url?u=https-3A__analytics.twitter.com_i_adsct-3Fp-5Fid-3DTwitter-26p-5Fuser-5Fid-3D0-26txn-5Fid-3Do2gi1-26events-3D-255B-255B-2522pageview-2522-252Cnull-255D-255D-26tw-5Fsale-5Famount-3D0-26tw-5Forder-5Fquantity-3D0&d=DwMGaQ&c=5VD0RTtNlTh3ycd41b3MUw&r=GMr8XYCDyeQQZuD3noL91A&m=dAJyokk16UvYy-vMrGn_JwYiGfp_eEgo25B9iGDCG-A&s=o6i4D0V0088WH2RnzIoqiF-vj45PL-2sTrsxQ0SNO3A&e=)" />
<img height="1" width="1" style="display:none;" alt="" src="//t.co/i/adsct?p_id=Twitter&p_user_id=0&txn_id=o2gi1&events=%5B%5B%22pageview%22%2Cnull%5D%5D&tw_sale_amount=0&tw_order_quantity=0 (https://urldefense.proofpoint.com/v2/url?u=https-3A__linkprotect.cudasvc.com_url-3Fa-3Dhttp-253a-252f-252ft.co-252fi-252fadsct-253fp-5Fid-253dTwitter-2526p-5Fuser-5Fid-253d0-2526txn-5Fid-253do2gi1-2526events-253d-25255B-25255B-252522pageview-252522-25252Cnull-25255D-25255D-2526tw-5Fsale-5Famount-253d0-2526tw-5Forder-5Fquantity-253d0-26c-3DE-2C1-2CC33dLwIhtuEcl5FhdztSnUwsioeej5k-2DWy0RYREBAq51kGji32A2Cw94YU9vQBpY5tPN3AukEw3C-5F-2DlbtndnLoR7-5FA-5FLoH0Rr7zLtP1ykptN-26typo-3D1&d=DwMGaQ&c=5VD0RTtNlTh3ycd41b3MUw&r=GMr8XYCDyeQQZuD3noL91A&m=dAJyokk16UvYy-vMrGn_JwYiGfp_eEgo25B9iGDCG-A&s=Abgc3XBkhESv8XBYtLchdDZyISGsK6v_BB6cLMJGyCw&e=)" />
<!-- End Twitter universal website tag code -->
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/docsearch.js@2/dist/cdn/docsearch.min.css" />
<link href="/feed.xml" type="application/atom+xml" rel="alternate" title="Pythorch Blog Posts" />
</head>
<body class="blog">
<!-- Google Tag Manager (noscript) -->
<noscript><iframe src="https://www.googletagmanager.com/ns.html?id=GTM-T8XT4PS"
height="0" width="0" style="display:none;visibility:hidden"></iframe></noscript>
<!-- End Google Tag Manager (noscript) -->
<div class="main-background blog-background blog-detail-background"></div>
<div class="hello-bar">
<div class="container">
Join us at PyTorch Conference in San Francisco, October 22-23. CFP open now! <a target="_blank" href="https://events.linuxfoundation.org/pytorch-conference/">Learn more</a>.
</div>
</div>
<div class="container-fluid header-holder blog-detail-header">
<div class="container">
<div class="header-container">
<a class="header-logo" href="https://pytorch.org" aria-label="PyTorch"></a>
<div class="main-menu">
<ul>
<li class="main-menu-item">
<div id="dropdownMenuButton" data-toggle="resources-dropdown" class="resources-dropdown">
<a class="with-down-arrow">
Learn
</a>
<div class="resources-dropdown-menu">
<a class="nav-dropdown-item" href="/get-started">
<span class=dropdown-title>Get Started</span>
<p>Run PyTorch locally or get started quickly with one of the supported cloud platforms</p>
</a>
<a class="nav-dropdown-item" href="https://pytorch.org/tutorials/">
<span class="dropdown-title">Tutorials</span>
<p>Whats new in PyTorch tutorials</p>
</a>
<a class="nav-dropdown-item" href="https://pytorch.org/tutorials/beginner/basics/intro.html">
<span class="dropdown-title">Learn the Basics</span>
<p>Familiarize yourself with PyTorch concepts and modules</p>
</a>
<a class="nav-dropdown-item" href="https://pytorch.org/tutorials/recipes/recipes_index.html">
<span class="dropdown-title">PyTorch Recipes</span>
<p>Bite-size, ready-to-deploy PyTorch code examples</p>
</a>
<a class="nav-dropdown-item" href="https://pytorch.org/tutorials/beginner/introyt.html">
<span class="dropdown-title">Intro to PyTorch - YouTube Series</span>
<p>Master PyTorch basics with our engaging YouTube tutorial series</p>
</a>
<a class="nav-dropdown-item" href="/new">
<span class="dropdown-title">New to PyTorch Foundation</span>
</a>
</div>
</div>
</li>
<li class="main-menu-item">
<div id="dropdownMenuButton" data-toggle="resources-dropdown" class="resources-dropdown">
<a class="with-down-arrow">
Ecosystem
</a>
<div class="resources-dropdown-menu">
<a class="nav-dropdown-item" href="https://landscape.pytorch.org/" target="_blank">
<span class="dropdown-title">Tools</span>
<p>Learn about the tools and frameworks in the PyTorch Ecosystem</p>
</a>
<a class="nav-dropdown-item" href="/join-ecosystem">
<span class="dropdown-title">Join the Ecosystem</span>
</a>
<a class="nav-dropdown-item" href="/#community-module">
<span class=dropdown-title>Community</span>
<p>Join the PyTorch developer community to contribute, learn, and get your questions answered.</p>
</a>
<a class="nav-dropdown-item" href="https://discuss.pytorch.org" target="_blank">
<span class=dropdown-title>Forums</span>
<p>A place to discuss PyTorch code, issues, install, research</p>
</a>
<a class="nav-dropdown-item" href="/resources">
<span class=dropdown-title>Developer Resources</span>
<p>Find resources and get questions answered</p>
</a>
<a class="nav-dropdown-item" href="/ecosystem/contributor-awards-2024">
<span class="dropdown-title">Contributor Awards - 2024</span>
<p>Award winners announced at this year's PyTorch Conference</p>
</a>
</div>
</div>
</li>
<li class="main-menu-item">
<div id="dropdownMenuButton" data-toggle="resources-dropdown" class="resources-dropdown">
<a class="with-down-arrow">
Edge
</a>
<div class="resources-dropdown-menu">
<a class="nav-dropdown-item" href="/edge">
<span class="dropdown-title">About PyTorch Edge</span>
<p>Build innovative and privacy-aware AI experiences for edge devices</p>
</a>
<a class="nav-dropdown-item" href="/executorch-overview">
<span class="dropdown-title">ExecuTorch</span>
<p>End-to-end solution for enabling on-device inference capabilities across mobile and edge devices</p>
</a>
<a class="nav-dropdown-item" target="_blank" href="https://pytorch.org/executorch/stable/index.html">
<span class="dropdown-title">ExecuTorch Documentation</span>
</a>
</div>
</div>
</li>
<li class="main-menu-item">
<div id="docsDropdownButton" data-toggle="resources-dropdown" class="resources-dropdown">
<a class="with-down-arrow">
Docs
</a>
<div class="resources-dropdown-menu">
<a class="nav-dropdown-item" href="https://pytorch.org/docs">
<span class="dropdown-title">PyTorch</span>
<p>Explore the documentation for comprehensive guidance on how to use PyTorch.</p>
</a>
<a class="nav-dropdown-item" href="/pytorch-domains">
<span class="dropdown-title">PyTorch Domains</span>
<p> Read the PyTorch Domains documentation to learn more about domain-specific libraries.</p>
</a>
</div>
</div>
</li>
<li class="main-menu-item">
<div id="dropdownMenuButton" data-toggle="resources-dropdown" class="resources-dropdown">
<a class="with-down-arrow">
Blog & News
</a>
<div class="resources-dropdown-menu">
<a class="nav-dropdown-item" href="/blog">
<span class="dropdown-title">PyTorch Blog</span>
<p>Catch up on the latest technical news and happenings</p>
</a>
<a class="nav-dropdown-item" href="/community-blog">
<span class="dropdown-title">Community Blog</span>
<p>Stories from the PyTorch ecosystem</p>
</a>
<a class="nav-dropdown-item" href="/videos">
<span class="dropdown-title">Videos</span>
<p>Learn about the latest PyTorch tutorials, new, and more </p>
</a>
<a class="nav-dropdown-item" href="/community-stories">
<span class="dropdown-title">Community Stories</span>
<p>Learn how our community solves real, everyday machine learning problems with PyTorch</p>
</a>
<a class="nav-dropdown-item" href="/events">
<span class=dropdown-title>Events</span>
<p>Find events, webinars, and podcasts</p>
</a>
<a class="nav-dropdown-item" href="/newsletter">
<span class=dropdown-title>Newsletter</span>
<p>Stay up-to-date with the latest updates</p>
</a>
</div>
</div>
</li>
<li class="main-menu-item">
<div id="resourcesDropdownButton" data-toggle="resources-dropdown" class="resources-dropdown">
<a class="with-down-arrow">
About
</a>
<div class="resources-dropdown-menu">
<a class="nav-dropdown-item" href="/foundation">
<span class=dropdown-title>PyTorch Foundation</span>
<p>Learn more about the PyTorch Foundation.</p>
</a>
<a class="nav-dropdown-item" href="/governing-board">
<span class=dropdown-title>Governing Board</span>
</a>
<a class="nav-dropdown-item" href="/credits">
<span class=dropdown-title>Cloud Credit Program</span>
</a>
<a class="nav-dropdown-item" href="/tac">
<span class=dropdown-title>Technical Advisory Council</span>
</a>
<a class="nav-dropdown-item" href="/staff">
<span class=dropdown-title>Staff</span>
</a>
<a class="nav-dropdown-item" href="/contact-us">
<span class=dropdown-title>Contact Us</span>
</a>
</div>
</div>
</li>
<li class="main-menu-item">
<a href="/join" data-cta="join">
Become a Member
</a>
</li>
<li class="main-menu-item" id="github-main-menu-link">
<a href="https://github.com/pytorch/pytorch" title="Go to PyTorch GitHub">
<div id="topnav-gh-icon"></div>
</a>
</li>
<li class="navSearchWrapper reactNavSearchWrapper" key="search">
<div class="search-border">
<div id="search-icon"></div>
<input
id="search-input"
type="text"
title="Search"
/>
<div id="close-search">X</div>
</div>
</li>
</ul>
</div>
<script src="/assets/main-menu-dropdown.js"></script>
<a class="main-menu-open-button" href="#" data-behavior="open-mobile-menu"></a>
</div>
</div>
</div>
<div class="jumbotron jumbotron-fluid blog-detail-jumbotron">
<div class="container blog-detail-container">
<p class="featured-post">November 30, 2023</p>
<h1>
<a class="blog-title">Accelerating Generative AI with PyTorch II: GPT, Fast</a>
</h1>
</div>
</div>
<div class="main-content-wrapper blog-detail-wrapper">
<div class="main-content blog-detail-content">
<div class="container">
<img src="/assets/images/logo-icon.svg" class="img-fluid author-icon">
<article class="pytorch-article">
<p class="author">
by
Team PyTorch
</p>
<p>This post is the second part of a multi-series blog focused on how to accelerate generative AI models with pure, native PyTorch. We are excited to share a breadth of newly released PyTorch performance features alongside practical examples to see how far we can push PyTorch native performance. In part one, we showed how to accelerate <a href="https://pytorch.org/blog/accelerating-generative-ai/">Segment Anything over 8x</a> using only pure, native PyTorch. In this blog we’ll focus on LLM optimization.</p>
<p>Over the past year, generative AI use cases have exploded in popularity. Text generation has been one particularly popular area, with lots of innovation among open-source projects such as <a href="https://github.com/ggerganov/llama.cpp">llama.cpp</a>, <a href="https://github.com/vllm-project/vllm">vLLM</a>, and <a href="https://github.com/mlc-ai/mlc-llm">MLC-LLM</a>.</p>
<p>While these projects are performant, they often come with tradeoffs in ease of use, such as requiring model conversion to specific formats or building and shipping new dependencies. This begs the question: <strong>how fast can we run transformer inference with only pure, native PyTorch?</strong></p>
<p>As announced during our recent <a href="https://www.youtube.com/watch?v=IWpM_9AsC-U">PyTorch Developer Conference</a>, the PyTorch team wrote a from-scratch LLM <strong>almost 10x faster than baseline,</strong> with no loss of accuracy, all using native PyTorch optimizations. We leverage a breadth of optimizations including:</p>
<ul>
<li><strong><a href="https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html">Torch.compile</a></strong>: A compiler for PyTorch models</li>
<li><strong><a href="https://github.com/pytorch-labs/ao/tree/main#torchao">GPU quantization</a></strong>: Accelerate models with reduced precision operations</li>
<li><strong><a href="https://github.com/pytorch-labs/gpt-fast/blob/main/generate.py#L76">Speculative Decoding</a></strong>: Accelerate LLMs using a small “draft” model to predict large “target” model’s output</li>
<li><strong><a href="https://github.com/pytorch-labs/gpt-fast/blob/main/tp.py">Tensor Parallelism</a></strong>: Accelerate models by running them across multiple devices.</li>
</ul>
<p>And, even better, we can do it in <strong>less than 1000 lines of native PyTorch code</strong>.</p>
<p>If this excites you enough to jump straight into the code, check it out at <a href="https://github.com/pytorch-labs/gpt-fast">https://github.com/pytorch-labs/gpt-fast</a>!</p>
<p><img src="/assets/images/accelerating-generative-ai-2/screen-recording.gif" alt="Screen recording" style="width:100%;" /></p>
<p><em>Note: We will be focusing on latency (i.e. batch size=1) for all of these benchmarks. Unless otherwise specified, all benchmarks are run on an A100-80GB, power limited to 330W.</em></p>
<h2 id="starting-point-255-toks">Starting Point (25.5 tok/s)</h2>
<p>Let’s start off with an extremely basic and simple implementation.</p>
<p><img src="/assets/images/accelerating-generative-ai-2/image23.png" alt="simple implementation" style="width:100%;display: block;max-width:600px; margin-left:auto; margin-right:auto;" /></p>
<p>Sadly, this does not perform very well. But why? Looking at a trace reveals the answer - it’s heavily <strong>CPU overhead bound</strong>! What this means is that our CPU is not able to tell the GPU what to do fast enough for the GPU to be fully utilized.</p>
<p><img src="/assets/images/accelerating-generative-ai-2/image14.png" alt="trace" style="width:100%;" /></p>
<p>Imagine the GPU as this super massive factory with a ridiculous amount of compute available. Then, imagine the CPU as some messenger shuttling instructions back and forth to the GPU. Remember, in large scale deep learning systems, the GPU is responsible for doing 100% of the work! In such systems, the only role of the CPU is to tell the GPU what work it should be doing.</p>
<p><img src="/assets/images/accelerating-generative-ai-2/image16.png" alt="factory" style="width:100%;display: block;max-width:500px; margin-left:auto; margin-right:auto;" /></p>
<p>So, the CPU runs over and tells the GPU to do an “add”, but by the time the CPU can give the GPU another chunk of work, the GPU has long finished the previous chunk of work.</p>
<p>Despite the fact that the GPU needs to perform thousands of computations while the CPU only needs to do orchestration work, this is surprisingly common! There’s a variety of reasons for this, ranging from the fact that the CPU is likely running some single-threaded Python to the fact that GPUs are just incredibly fast nowadays.</p>
<p>Regardless of the reason, we now find ourselves in the <strong>overhead-bound regime</strong>. So, what can we do? One, we could rewrite our implementation in C++, perhaps even eschew frameworks entirely and write raw CUDA. Or…. we could just send more work to the GPU at once.</p>
<p><img src="/assets/images/accelerating-generative-ai-2/image3.png" alt="factory" style="width:100%;display: block;max-width:500px; margin-left:auto; margin-right:auto;" /></p>
<p>By just sending a massive chunk of work at once, we can keep our GPU busy! Although during training, this may just be accomplished by increasing your batch size, how do we do this during inference?</p>
<p>Enter torch.compile.</p>
<h2 id="step-1-reducing-cpu-overhead-through-torchcompile-and-a-static-kv-cache-1070-toks">Step 1: Reducing CPU overhead through torch.compile and a static kv-cache (107.0 tok/s)</h2>
<p>Torch.compile allows us to capture a larger region into a single compiled region, and particularly when run with mode=”reduce-overhead”, is very effective at reducing CPU overhead. Here, we also specify fullgraph=True, which validates that there are no “graph breaks” in your model (i.e. portions that torch.compile cannot compile). In other words, it ensures that torch.compile is running to its fullest potential.</p>
<p>To apply it, we <a href="https://github.com/pytorch-labs/gpt-fast/blob/main/generate.py#L296">simply wrap a function (or a module) with it</a>.</p>
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>torch.compile(decode_one_token, mode="reduce-overhead", fullgraph=True)
</code></pre></div></div>
<p>However, there are a couple of nuances here that make it somewhat nontrivial for folks to get significant performance boosts from applying torch.compile to text generation.</p>
<p>The first obstacle is the kv-cache. The kv-cache is an inference-time optimization that caches the activations computed for the previous tokens (see <a href="https://www.dipkumar.dev/becoming-the-unbeatable/posts/gpt-kvcache/">here</a> for a more in-depth explanation). However, as we generate more tokens, the “logical length” of the kv-cache grows. This is problematic for two reasons. One is that reallocating (and copying!) the kv-cache every time the cache grows is simply expensive. The other one is that this dynamism makes it harder to reduce the overhead, as we are no longer able to leverage approaches like cudagraphs.</p>
<p>To resolve this, we use a<a href="https://github.com/pytorch-labs/gpt-fast/blob/0afae1ace441ce4c5d02ef11a72da28cf7ca4795/generate.py#L154"> “static” kv-cache</a>, which means that we statically allocate the maximum size of the kv-cache, and then mask out the unused values in the attention portion of the computation.</p>
<p><img src="/assets/images/accelerating-generative-ai-2/image2.png" alt="code" style="width:100%;" /></p>
<p>The second obstacle is the prefill phase. Transformer text generation is best thought of as a two phase process: 1. The prefill where the entire prompt is processed, and 2. Decoding where each token is generated autoregressively.</p>
<p>Although decoding can be made entirely static once the kv-cache is made static, the prefill stage still requires significantly more dynamism, due to having a variable prompt length. Thus, we actually need to compile the two stages with separate compilation strategies.</p>
<p><img src="/assets/images/accelerating-generative-ai-2/image9.png" alt="compile" style="width:100%;" /></p>
<p>Although these details are a bit tricky, the actual implementation is not very difficult at all (see gpt-fast)! And the performance boost is dramatic.</p>
<p><img src="/assets/images/accelerating-generative-ai-2/image28.png" alt="chart" style="width:100%;display: block;max-width:600px; margin-left:auto; margin-right:auto;" /></p>
<p>All of a sudden, our performance improves by more than 4x! Such performance gains are often common when one’s workload is overhead bound.</p>
<h2 id="sidenote-how-is-torchcompile-helping">Sidenote: How is torch.compile helping?</h2>
<p>It is worth disentangling how exactly torch.compile is improving performance. There’s 2 main factors leading to torch.compile’s performance.</p>
<p>The first factor, like mentioned above, is overhead reduction. Torch.compile is able to reduce overhead through a variety of optimizations, but one of the most effective ones is called <a href="https://pytorch.org/blog/accelerating-pytorch-with-cuda-graphs/">CUDAGraphs</a>. Although torch.compile applies this automatically for you when “reduce-overhead” is set, saving the extra work and code you need to write when doing this yourself manually without torch.compile.</p>
<p>The second factor, however, is that torch.compile simply generates faster kernels. In the decoding benchmark above, torch.compile actually generates every single kernel from scratch, including both the matrix multiplications and the attention! And even cooler, these kernels are actually faster than the built in alternatives (CuBLAS and FlashAttention2)!</p>
<p>This may sound implausible to many of you, considering how hard it is to write efficient matrix multiplication/attention kernels, and how much manpower has been put into CuBLAS and FlashAttention. The key here, however, is that transformer decoding has very unusual computational properties. In particular, because of the KV-cache, for BS=1 <em>every single matrix multiplication in a transformer is actually a matrix vector multiplication</em>.</p>
<p>This means that the computations are completely <em>memory-bandwidth bound</em>, and as such, are well within the range of compilers to automatically generate. And in fact, when we benchmark torch.compile’s matrix-vector multiplications against CuBLAS, we find that torch.compile’s kernels are actually quite a bit faster!</p>
<p><img src="/assets/images/accelerating-generative-ai-2/image24.png" alt="code" style="width:100%;display: block;max-width:600px; margin-left:auto; margin-right:auto;" /></p>
<p><img src="/assets/images/accelerating-generative-ai-2/image17.png" alt="code" style="width:100%;display: block;max-width:600px; margin-left:auto; margin-right:auto;" /></p>
<h2 id="step-2-alleviating-memory-bandwidth-bottleneck-through-int8-weight-only-quantization-1574-toks">Step 2: Alleviating memory bandwidth bottleneck through int8 weight-only quantization (157.4 tok/s)</h2>
<p>So, given that we’ve already seen massive speedups from applying torch.compile, is it possible to do even better? One way to think about this problem is to compute how close we are to the theoretical peak. In this case, the largest bottleneck is the cost of loading the weights from GPU global memory to registers. In other words, each forward pass requires us to “touch” every single parameter on the GPU. So, how fast can we theoretically “touch” every single parameter in a model?</p>
<p><img src="/assets/images/accelerating-generative-ai-2/image11.png" alt="weights" style="width:100%;display: block;max-width:500px; margin-left:auto; margin-right:auto;" /></p>
<p>To measure this, we can use <strong>Model Bandwidth Utilization (MBU).</strong> This measures what percentage of our memory bandwidth we’re able to use during inference.</p>
<p>Computing it is pretty simple. We simply take the total size of our model (# params * bytes per param) and multiply it by the number of inferences we can do per second. Then, we divide this by the peak bandwidth of the GPU to get our MBU.</p>
<p><img src="/assets/images/accelerating-generative-ai-2/image8.png" alt="MBU" style="width:100%;" /></p>
<p>For example, for our above case, we have a 7B parameter model. Each parameter is stored in fp16 (2 bytes per parameter), and we achieved 107 tokens/s. Finally, our A100-80GB has a theoretical 2 TB/s of memory bandwidth.</p>
<p><img src="/assets/images/accelerating-generative-ai-2/image25.png" alt="MBU" style="width:100%;" /></p>
<p>Putting this all together, we get **72% MBU! **This is quite good, considering that even just copying memory struggles to break 85%.</p>
<p>But… it does mean that we’re pretty close to the theoretical limit here, and that we’re clearly bottlenecked on just loading our weights from memory. It doesn’t matter what we do - without changing the problem statement in some manner, we might only be able to eek out another 10% in performance.</p>
<p>Let’s take another look at the above equation. We can’t really change the number of parameters in our model. We can’t really change the memory bandwidth of our GPU (well, without paying more money). But, we <strong>can</strong> change how many bytes each parameter is stored in!</p>
<p><img src="/assets/images/accelerating-generative-ai-2/image18.png" alt="MBU" style="width:100%;" /></p>
<p>Thus, we arrive at our next technique - int8 quantization. The idea here is simple. If loading our weights from memory is our main bottleneck, why don’t we just make the weights smaller?</p>
<p><img src="/assets/images/accelerating-generative-ai-2/image7.png" alt="MBU" style="width:100%;" /></p>
<p>Note that this is quantizing <em>only</em> the weights - the computation itself is still done in bf16. This makes this form of quantization easy to apply with very little to no accuracy degradation.</p>
<p>Moreover, torch.compile can also easily generate efficient code for int8 quantization. Let’s look again at the above benchmark, this time with int8 weight-only quantization included.</p>
<p><img src="/assets/images/accelerating-generative-ai-2/image1.png" alt="code" style="width:100%;display: block;max-width:600px; margin-left:auto; margin-right:auto;" /></p>
<p><img src="/assets/images/accelerating-generative-ai-2/image27.png" alt="code" style="width:100%;display: block;max-width:600px; margin-left:auto; margin-right:auto;" /></p>
<p>As you can see from the dark blue line (torch.compile + int8), there is a significant performance improvement when using torch.compile + int8 weight-only quantization! Moreover, the light-blue line (no torch.compile + int8) is actually much worse than even the fp16 performance! This is because in order to take advantage of the perf benefits of int8 quantization, we need the kernels to be fused. This shows one of the benefits of torch.compile - these kernels can be automatically generated for the user!</p>
<p><a href="https://github.com/pytorch-labs/gpt-fast/blob/main/quantize.py#L314">Applying int8 quantization to our model</a>, we see a nice 50% performance improvement, bringing us up to 157.4 tokens/s!</p>
<p><img src="/assets/images/accelerating-generative-ai-2/image19.png" alt="chart" style="width:100%;display: block;max-width:600px; margin-left:auto; margin-right:auto;" /></p>
<h2 id="step-3-reframing-the-problem-using-speculative-decoding">Step 3: Reframing the problem using speculative decoding</h2>
<p>Even after using techniques like quantization, we’re still faced with another problem. In order to generate 100 tokens, we must load our weights 100 times.</p>
<p><img src="/assets/images/accelerating-generative-ai-2/image5.png" alt="diagram" style="width:100%;" /></p>
<p>Even if the weights are quantized, we still must load our weights over and over, once for each token we generate! Is there any way around this?</p>
<p>At first glance, the answer might seem like no - there’s a strict serial dependency in our autoregressive generation. However, as it turns out, by utilizing <a href="https://arxiv.org/abs/2211.17192">speculative decoding</a>, we’re able to break this strict serial dependency and obtain speedups!</p>
<p><img src="/assets/images/accelerating-generative-ai-2/image21.png" alt="engineers" style="width:100%;display: block;max-width:600px; margin-left:auto; margin-right:auto;" /></p>
<p>Imagine you had a senior engineer (called Verity), who makes the right technical decisions but is rather slow at writing code. However, you also have a junior engineer (called Drake), who doesn’t always make the right technical decisions but can write code much faster (and cheaper!) than Verity. How can we take advantage of Drake (the junior engineer) to write code faster while ensuring that we are still making the right technical decisions?</p>
<p><img src="/assets/images/accelerating-generative-ai-2/image6.png" alt="engineers" style="width:100%;display: block;max-width:600px; margin-left:auto; margin-right:auto;" /></p>
<p>First, Drake goes through the labor-intensive process of writing the code, making technical decisions along the way. Next, we give the code to Verity to review.</p>
<p><img src="/assets/images/accelerating-generative-ai-2/image15.png" alt="engineers" style="width:100%;display: block;max-width:600px; margin-left:auto; margin-right:auto;" /></p>
<p>Upon reviewing the code, Verity might decide that the first 3 technical decisions Drake made are correct, but the last 2 need to be redone. So, Drake goes back, throws away his last 2 decisions, and restarts coding from there.</p>
<p>Notably, although Verity (the senior engineer) has only looked at the code once, we are able to generate 3 pieces of validated code identical to what she would have written! Thus, assuming Verity is able to review the code faster than it would have taken her to write those 3 pieces herself, this approach comes out ahead.</p>
<p>In the context of transformer inference, Verity would be played by the role of the larger model whose outputs we want for our task, called the <strong>verifier model</strong>. Similarly, Drake would be played by a smaller model that’s able to generate text much faster than the larger model, called the <strong>draft model</strong>. So, we would generate 8 tokens using the draft model, and then process all eight tokens in parallel using the verifier model, throwing out the ones that don’t match.</p>
<p>Like mentioned above, one crucial property of speculative decoding is that <strong>it does not change the quality of the output</strong>. As long as the time it takes for generating the tokens using the draft model + verifying the tokens is less than it would have taken to generate those tokens, we come out ahead.</p>
<p>One of the great things about doing this all in native PyTorch is that this technique is actually really easy to implement! Here’s the <a href="https://github.com/pytorch-labs/gpt-fast/blob/main/generate.py#L76">entirety of the implementation</a>, in about 50 lines of native PyTorch.</p>
<p><img src="/assets/images/accelerating-generative-ai-2/image10.png" alt="code" style="width:100%;" /></p>
<p>Although speculative decoding guarantees that we have mathematically identical results compared to regular generation, it does have the property that the runtime performance varies depending on the generated text, as well as how aligned the draft and verifier model are. For example, when running CodeLlama-34B + CodeLlama-7B, we’re able to obtain a 2x boost in tokens/s for generating code. On the other hand, when using Llama-7B + TinyLlama-1B, we’re only able to obtain about a 1.3x boost in tokens/s.</p>
<h2 id="sidenote-running-this-on-amd">Sidenote: Running this on AMD</h2>
<p>Like mentioned above, every single kernel in decoding is generated from scratch by torch.compile, and is converted into OpenAI Triton. As AMD has a <a href="https://pytorch.org/blog/experience-power-pytorch-2.0/">torch.compile backend</a> (and also a Triton backend), we can simply go through all of the optimizations above… but on an AMD GPU! With int8 quantization, we’re able to achieve 102.5 tokens/s with one GCD (i.e. one half) of a MI250x!</p>
<p><img src="/assets/images/accelerating-generative-ai-2/image4.png" alt="chart" style="width:100%;display: block;max-width:600px; margin-left:auto; margin-right:auto;" /></p>
<h2 id="step-4-reducing-the-size-of-the-weights-even-more-with-int4-quantization-and-gptq-2021-toks">Step 4: Reducing the size of the weights even more with int4 quantization and GPTQ (202.1 tok/s)</h2>
<p>Of course, if reducing the weights down from 16 bits to 8 bits allows for speedups by reducing the number of bytes we need to load, reducing the weights down to 4 bits would result in even larger speedups!</p>
<p>Unfortunately, when reducing weights down to 4-bits, the accuracy of the model starts to become a much larger concern. From our preliminary evals, we see that although using int8 weight-only quantization has no perceptible accuracy degradation, using int4 weight-only quantization does.</p>
<p><img src="/assets/images/accelerating-generative-ai-2/image13.png" alt="table" style="width:100%;" /></p>
<p>There are 2 main tricks we can use to limit the accuracy degradation of int4 quantization.</p>
<p>The first one is to have a more granular scaling factor. One way to think about the scaling factor is that when we have a quantized tensor representation, it is on a sliding scale between a floating point tensor (each value has a scaling factor) and an integer tensor (no values have a scaling factor). For example, with int8 quantization, we had one scaling factor per row. If we want higher accuracy, however, we can change that to “one scaling factor per 32 elements”. We choose a group size of 32 to minimize accuracy degradation, and this is also a common choice among the community.</p>
<p>The other one is to use a more advanced quantization strategy than simply rounding the weights. For example, approaches like <a href="https://arxiv.org/abs/2210.17323">GPTQ</a> leverage example data in order to calibrate the weights more accurately. In this case, we prototype an implementation of GPTQ in the repository based off of PyTorch’s recently released <a href="https://pytorch.org/tutorials/intermediate/torch_export_tutorial.html">torch.export</a>.</p>
<p>In addition, we need kernels that fuse int4 dequantize with the matrix vector multiplication. In this case, torch.compile is unfortunately not able to generate these kernels from scratch, so we leverage some handwritten CUDA kernels in PyTorch.</p>
<p>These techniques require some additional work, but putting them all together results in even better performance!</p>
<p><img src="/assets/images/accelerating-generative-ai-2/image12.png" alt="chart" style="width:100%;display: block;max-width:600px; margin-left:auto; margin-right:auto;" /></p>
<h2 id="step-5-combining-everything-together-2447-toks">Step 5: Combining everything together (244.7 tok/s)</h2>
<p>Finally, we can compose all of the techniques together to achieve even better performance!</p>
<p><img src="/assets/images/accelerating-generative-ai-2/image22.png" alt="chart" style="width:100%;display: block;max-width:600px; margin-left:auto; margin-right:auto;" /></p>
<h2 id="step-6-using-tensor-parallelism">Step 6: Using Tensor Parallelism</h2>
<p>So far, we’ve been restricting ourselves to minimizing latency while on a single GPU. In many settings, however, we have access to multiple GPUs. This allows us to improve our latency further!</p>
<p>To get an intuitive sense of why this would allow us to improve our latency, let’s take a look at the prior equation for MBU, particularly the denominator. Running on multiple GPUs gives us access to more memory bandwidth, and thus, higher potential performance.</p>
<p><img src="/assets/images/accelerating-generative-ai-2/image8.png" alt="MBU" style="width:100%;" /></p>
<p>As for which parallelism strategy to pick, note that in order to reduce our latency for one example, we need to be able to leverage our memory bandwidth across more devices simultaneously. This means that we need to split the processing of one token across multiple devices. In other words, we need to use tensor parallelism.</p>
<p>Luckily, PyTorch also provides low-level tools for tensor-parallelism that compose with torch.compile. We are also working on higher-level APIs for expressing tensor parallelism, stay tuned for those!</p>
<p>However, even without a higher-level API, it’s actually still quite easy to add tensor parallelism. Our implementation comes in at <a href="https://github.com/pytorch-labs/gpt-fast/blob/main/tp.py">150 lines of code</a>, and doesn’t require any model changes.</p>
<p><img src="/assets/images/accelerating-generative-ai-2/image20.png" alt="code" style="width:100%;" /></p>
<p>We are still able to take advantage of all the optimizations mentioned previously, which all can continue to compose with tensor parallelism. Combining these together, we’re able to serve Llama-70B at 55 tokens/s with int8 quantization!</p>
<p><img src="/assets/images/accelerating-generative-ai-2/image26.png" alt="chart" style="width:100%;display: block;max-width:600px; margin-left:auto; margin-right:auto;" /></p>
<h2 id="conclusion">Conclusion</h2>
<p>Let’s take a look at what we’re able to accomplish.</p>
<ol>
<li>Simplicity: Ignoring quantization, <a href="https://github.com/pytorch-labs/gpt-fast/blob/main/model.py">model.py</a> (244 LOC) + <a href="https://github.com/pytorch-labs/gpt-fast/blob/main/generate.py">generate.py</a> (371 LOC) + <a href="https://github.com/pytorch-labs/gpt-fast/blob/main/tp.py">tp.py</a> (151 LOC) comes out to 766 LOC to implement fast inference + speculative decoding + tensor-parallelism.</li>
<li>Performance: With Llama-7B, we’re able to use compile + int4 quant + speculative decoding to reach 241 tok/s. With llama-70B, we’re able to also throw in tensor-parallelism to reach 80 tok/s. These are both close to or surpassing SOTA performance numbers!</li>
</ol>
<p>PyTorch has always allowed for simplicity, ease of use, and flexibility. However, with torch.compile, we can throw in performance as well.</p>
<p>The code can be found here: <a href="https://github.com/pytorch-labs/gpt-fast">https://github.com/pytorch-labs/gpt-fast</a>. We hope that the community finds it useful. Our goal with this repo is not to provide another library or framework for people to import. Instead, we encourage users to copy-paste, fork, and modify the code in the repo.</p>
<h2 id="acknowledgements">Acknowledgements</h2>
<p>We would like to thank the vibrant open source community for their continual support of scaling LLMs, including:</p>
<ul>
<li>Lightning AI for supporting pytorch and work in flash attention, int8 quantization, and LoRA fine-tuning.</li>
<li>GGML for driving forward fast, on device inference of LLMs</li>
<li>Andrej Karpathy for spearheading simple, interpretable and fast LLM implementations</li>
<li>MLC-LLM for pushing 4-bit quantization performance on heterogenous hardware</li>
</ul>
</article>
</div>
</div>
</div>
<!--
-->
<div class="container-fluid docs-tutorials-resources">
<div class="container">
<div class="row">
<div class="col-md-4 text-center">
<h2>Docs</h2>
<p>Access comprehensive developer documentation for PyTorch</p>
<a class="with-right-arrow" href="/docs">View Docs</a>
</div>
<div class="col-md-4 text-center">
<h2>Tutorials</h2>
<p>Get in-depth tutorials for beginners and advanced developers</p>
<a class="with-right-arrow" href="https://pytorch.org/tutorials">View Tutorials</a>
</div>
<div class="col-md-4 text-center">
<h2>Resources</h2>
<p>Find development resources and get your questions answered</p>
<a class="with-right-arrow" href="/resources">View Resources</a>
</div>
</div>
</div>
</div>
<footer class="site-footer">
<div class="container footer-container">
<div class="newsletter" id="newsletter">
<p
class="newsletter__title is-style-max-width-800"><strong>Stay in touch</strong> for updates, event info, and the latest news</p>
<script charset="utf-8" type="text/javascript" src="//js.hsforms.net/forms/embed/v2.js"></script>
<script>
hbspt.forms.create({
region: "na1",
portalId: "8112310",
formId: "2fb2231c-000b-4ec5-88a0-1ab242549c9e"
});
</script>
<p
class="newsletter__privacy">By submitting this form, I consent to receive marketing emails from the LF and its projects regarding their events, training, research, developments, and related announcements. I understand that I can unsubscribe at any time using the links in the footers of the emails I receive. <a href="https://www.linuxfoundation.org/privacy/">Privacy Policy</a>.</p>
</div>
<div class="lf-grid">
<div class="footer-logo-wrapper">
<a href="https://pytorch.org" class="footer-logo">
<img src="/assets/images/logo-icon.svg" alt="PyTorch logo" width="40">
</a>
</div>
<ul class="social-links">
<li><a href="https://www.facebook.com/pytorch" target="_blank" title="PyTorch on Facebook">
<svg xmlns="http://www.w3.org/2000/svg" viewbox="-0.51 -0.26 26.45 26.45" aria-label="Facebook"><path fill="currentColor" d="M25.497 13.075c0-2.45-.698-4.848-2.011-6.911a12.765 12.765 0 0 0-5.398-4.73A12.671 12.671 0 0 0 11.008.38a12.705 12.705 0 0 0-6.529 2.95A12.827 12.827 0 0 0 .563 9.358a12.896 12.896 0 0 0-.07 7.201 12.831 12.831 0 0 0 3.801 6.103 12.709 12.709 0 0 0 6.471 3.078v-8.957H7.53v-3.708h3.235v-2.824c0-3.213 1.903-4.988 4.813-4.988.956.014 1.909.097 2.852.25V8.67h-1.607a1.83 1.83 0 0 0-1.518.497 1.854 1.854 0 0 0-.561 1.505v2.404h3.535l-.563 3.708h-2.97v8.957a12.725 12.725 0 0 0 7.697-4.337 12.87 12.87 0 0 0 3.054-8.328z"/></svg>
</a></li>
<li><a href="https://twitter.com/pytorch" target="_blank" title="PyTorch on X">
<svg xmlns="http://www.w3.org/2000/svg" viewbox="0 0 300 300" aria-label="X"><path fill="currentColor" d="M178.57 127.15 290.27 0h-26.46l-97.03 110.38L89.34 0H0l117.13 166.93L0 300.25h26.46l102.4-116.59 81.8 116.59h89.34M36.01 19.54H76.66l187.13 262.13h-40.66"/></svg>
</a></li>
<li><a href="https://www.youtube.com/pytorch" target="_blank" title="PyTorch on YouTube">
<svg xmlns="http://www.w3.org/2000/svg" viewbox="0.21 0.27 34.45 25.07" aria-label="YouTube"><path fill="currentColor" d="M33.729 6.084s-.327-2.33-1.317-3.356a4.691 4.691 0 0 0-3.32-1.432c-4.634-.34-11.589-.34-11.589-.34h-.014s-6.954 0-11.59.342a4.692 4.692 0 0 0-3.32 1.432c-.993 1.025-1.315 3.354-1.315 3.354a52.189 52.189 0 0 0-.331 5.473v2.566c.014 1.829.125 3.656.331 5.472 0 0 .322 2.33 1.316 3.36 1.26 1.345 2.916 1.3 3.653 1.445 2.65.26 11.263.34 11.263.34s6.96-.01 11.597-.353a4.691 4.691 0 0 0 3.32-1.432c.993-1.026 1.316-3.356 1.316-3.356.206-1.817.316-3.644.33-5.473v-2.57a52.26 52.26 0 0 0-.33-5.472zM14.076 17.232V7.729l8.951 4.768-8.95 4.735z"/></svg>
</a></li>
<li><a href="https://www.linkedin.com/company/pytorch" target="_blank" title="PyTorch on LinkedIn">
<svg xmlns="http://www.w3.org/2000/svg" viewbox="-10.23 -10.23 531.96 531.96" aria-label="LinkedIn"><rect width="512" height="512" rx="0" fill="currentColor"/><circle fill="#000" cx="142" cy="138" r="37"/><path stroke="#000" stroke-width="66" d="M244 194v198M142 194v198"/><path fill="#000" d="M276 282c0-20 13-40 36-40 24 0 33 18 33 45v105h66V279c0-61-32-89-76-89-34 0-51 19-59 32"/></svg>
</a></li>
<li><a href="https://join.slack.com/t/pytorch/shared_invite/zt-2j2la612p-miUinTTaxXczKOJw48poHA" target="_blank" title="PyTorch Slack">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0.16 -0.03 21.19 21.19" aria-label="Slack"><path fill="currentColor" d="M4.896 13.27a2.147 2.147 0 0 1-2.141 2.142A2.147 2.147 0 0 1 .613 13.27c0-1.178.963-2.141 2.142-2.141h2.141v2.141zm1.08 0c0-1.178.962-2.141 2.141-2.141s2.142.963 2.142 2.141v5.363a2.147 2.147 0 0 1-2.142 2.141 2.147 2.147 0 0 1-2.141-2.142V13.27zm2.141-8.6a2.147 2.147 0 0 1-2.141-2.14c0-1.18.962-2.142 2.141-2.142s2.142.963 2.142 2.141v2.142H8.117zm0 1.08c1.179 0 2.141.962 2.141 2.141a2.147 2.147 0 0 1-2.141 2.142H2.755A2.147 2.147 0 0 1 .613 7.89c0-1.179.963-2.141 2.142-2.141h5.362zm8.599 2.141c0-1.179.963-2.141 2.141-2.141 1.179 0 2.143.962 2.143 2.14a2.147 2.147 0 0 1-2.142 2.142h-2.141V7.89zm-1.08 0a2.147 2.147 0 0 1-2.141 2.142 2.147 2.147 0 0 1-2.141-2.142V2.53c0-1.178.962-2.141 2.141-2.141s2.142.963 2.142 2.141v5.362zm-2.141 8.6c1.179 0 2.142.962 2.142 2.14a2.147 2.147 0 0 1-2.142 2.142 2.147 2.147 0 0 1-2.141-2.141V16.49h2.141zm0-1.08a2.147 2.147 0 0 1-2.141-2.141c0-1.179.962-2.142 2.141-2.142h5.362c1.179 0 2.142.963 2.142 2.142a2.147 2.147 0 0 1-2.142 2.142h-5.362z"></path></svg>
</a></li>
<li><a href="/wechat" title="PyTorch on WeChat">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0.14 -0.17 38.02 33.02" aria-label="WeChat"><path fill="currentColor" d="M26.289 10.976a12.972 12.972 0 0 0-8.742 3.53 10.386 10.386 0 0 0-3.224 8.795c-1.326-.164-2.535-.345-3.75-.448a2.332 2.332 0 0 0-1.273.216c-1.18.666-2.311 1.418-3.652 2.255.246-1.112.405-2.087.687-3.024a1.15 1.15 0 0 0-.523-1.52C1.737 17.902.02 13.601 1.307 9.165c1.189-4.1 4.11-6.587 8.077-7.884A13.54 13.54 0 0 1 24.18 5.617a10.135 10.135 0 0 1 2.109 5.359zM10.668 9.594a1.564 1.564 0 0 0-2.095-1.472 1.52 1.52 0 0 0-.895 1.964 1.502 1.502 0 0 0 1.391.966 1.545 1.545 0 0 0 1.598-1.46v.002zm8.15-1.566a1.567 1.567 0 0 0-1.528 1.543 1.528 1.528 0 0 0 1.571 1.492 1.52 1.52 0 0 0 1.375-2.117 1.518 1.518 0 0 0-1.415-.919l-.003.001z"></path><path fill="currentColor" d="M33.914 32.137c-1.075-.478-2.062-1.196-3.11-1.306-1.049-.11-2.145.494-3.24.605a10.821 10.821 0 0 1-8.781-2.864c-4.682-4.33-4.013-10.97 1.403-14.518 4.811-3.154 11.874-2.102 15.268 2.273a8.671 8.671 0 0 1-1.002 12.095c-1.046.929-1.422 1.693-.751 2.917.102.257.174.525.213.798zM21.68 20.292a1.264 1.264 0 1 0 .01-2.528 1.264 1.264 0 0 0-.01 2.528zm7.887-2.526a1.266 1.266 0 0 0-1.256 1.21 1.247 1.247 0 1 0 1.256-1.21z"></path></svg>
</a></li>
</ul>
</div>
<div class="privacy-policy">
<div class="copyright">
<p>© Copyright The Linux Foundation. The PyTorch Foundation is a project of The Linux Foundation.
For web site terms of use, trademark policy and other policies applicable to The PyTorch Foundation please see
<a href="https://www.linuxfoundation.org/legal/policies/">Linux Foundation Policies</a>. The PyTorch Foundation supports the PyTorch open source
project, which has been established as PyTorch Project a Series of LF Projects, LLC. For policies applicable to the PyTorch Project a Series of LF Projects, LLC,
please see <a href="https://www.lfprojects.org/policies/">LF Projects, LLC Policies</a>. <a href="https://www.linuxfoundation.org/privacy">Privacy Policy</a> and <a href="https://www.linuxfoundation.org/terms">Terms of Use</a>.</p>
</div>
</div>
</div>
</footer>
<div class="mobile-main-menu">
<div class="container-fluid">
<div class="container">
<div class="mobile-main-menu-header-container">
<a class="header-logo" href="https://pytorch.org" aria-label="PyTorch"></a>
<a class="main-menu-close-button" href="#" data-behavior="close-mobile-menu"></a>
</div>
</div>
</div>
<div class="mobile-main-menu-links-container">
<div class="main-menu">
<ul>
<li class="navSearchWrapper reactNavSearchWrapper tabletSearchWrapper" key="search">
<div class="mobile-search-border">
<input
id="mobile-search-input"
type="text"
title="Search"
/>
<div id="mobile-search-icon"></div>
</div>
</li>
<li class="resources-mobile-menu-title">
<a>Learn</a>
</li>
<ul class="resources-mobile-menu-items">
<li>
<a href="/get-started">Get Started</a>
</li>
<li>
<a href="https://pytorch.org/tutorials">Tutorials</a>
</li>
<li>
<a href="https://pytorch.org/tutorials/beginner/basics/intro.html">Learn the Basics</a>
</li>
<li>
<a href="https://pytorch.org/tutorials/recipes/recipes_index.html">PyTorch Recipes</a>
</li>
<li>
<a href="https://pytorch.org/tutorials/beginner/introyt.html">Introduction to PyTorch - YouTube Series</a>
</li>
<li>
<a href="/new">New to PyTorch Foundation</a>
</li>
</ul>
<li class="resources-mobile-menu-title">
<a>Ecosystem</a>
</li>
<ul class="resources-mobile-menu-items">
<li>
<a href="https://landscape.pytorch.org/">Tools</a>
</li>
<li>
<a href="/join-ecosystem">Join the Ecosystem</a>
</li>
<li>
<a href="/#community-module">Community</a>
</li>
<li>
<a href="https://discuss.pytorch.org">Forums</a>
</li>
<li>
<a href="/resources">Developer Resources</a>
</li>
<li>
<a href="/ecosystem/contributor-awards-2024">Contributor Awards - 2024</a>
</li>
</ul>
<li class="resources-mobile-menu-title">
<a>Edge</a>
</li>
<ul class="resources-mobile-menu-items">
<li>
<a href="/edge">About PyTorch Edge</a>
</li>
<li>
<a href="/executorch-overview">ExecuTorch</a>
</li>
<li>
<a href="https://pytorch.org/executorch/stable/index.html">ExecuTorch Documentation</a>
</li>
</ul>
<li class="resources-mobile-menu-title">
<a>Docs</a>
</li>
<ul class="resources-mobile-menu-items">
<li>
<a href="https://pytorch.org/docs">PyTorch</a>
</li>
<li>
<a href="/pytorch-domains">PyTorch Domains</a>
</li>
</ul>
<li class="resources-mobile-menu-title">
<a>Blog & News</a>
</li>
<ul class="resources-mobile-menu-items">
<li>
<a href="/blog">PyTorch Blog</a>
</li>
<li>
<a href="/community-blog">Community Blog</a>
</li>
<li>
<a href="/videos">Videos</a>
</li>
<li>
<a href="/community-stories">Community Stories</a>
</li>
<li>
<a href="/events">Events</a>
</li>
<li>
<a href="/newsletter">Newsletter</a>
</li>
</ul>
<li class="resources-mobile-menu-title">
<a>About</a>
</li>
<ul class="resources-mobile-menu-items">
<li>
<a href="/foundation">PyTorch Foundation</a>
</li>
<li>
<a href="/governing-board">Governing Board</a>
</li>
<li>
<a href="/credits">Cloud Credit Program</a>
</li>
<li>
<a href="/tac">Technical Advisory Council</a>
</li>
<li>
<a href="/staff">Staff</a>
</li>
<li>
<a href="/contact-us">Contact Us</a>
</li>
</ul>
<li class="resources-mobile-menu-title">
<a href="/join">Become a Member</a>
</li>
<li class="resources-mobile-menu-title">
<a href="https://github.com/pytorch/pytorch" title="Go to PyTorch GitHub"><div id="topnav-gh-icon"></div></a>
</li>
</ul>
</div>
</div>
</div>
<script src="/assets/mobile-menu.js"></script>
<script src="/assets/scroll-to-anchor.js"></script>
<script src="/assets/external-links-new-tab.js"></script>
<script src="/assets/search-bar.js"></script>
<script src="/assets/cookie-banner.js"></script>
<script type="text/javascript">
mobileMenu.bind();
anchors.add('.pytorch-article h2, .pytorch-article h3, .pytorch-article h4, .pytorch-article h5');
// Add class to links that have code blocks, since we cannot create links in code blocks
$("a code.highlighter-rouge").each(function(e) {
$(this).closest("a").addClass("has-code");
});
scrollToAnchor.bind();
var hasStaticHeader = $(".blog-header, .blog-detail-header, .resources-header, .get-started-header, .features-header, .ecosystem-header, .hub-header, .mobile-header, .announcement-header, .comm-stories-header").length > 0;
if (!hasStaticHeader) {
$(window).on("scroll", function() {
var top = $(this).scrollTop();
var fullPosition = $(".main-background").height() - $(".header-holder").height();
if (top <= 40) {
$(".header-holder").css({"backgroundColor": "rgba(0, 0, 0, 0.165)"});
} else if (top >= fullPosition) {
$(".header-holder").css({"backgroundColor": "#000000"});
} else {
var bgColor = "rgba(0, 0, 0, " + top / fullPosition + ")";
$(".header-holder").css({"backgroundColor": bgColor});
}
});
}
</script>
<script src="/assets/track-events.js"></script>
<script>trackEvents.bind();</script>
<div class="cookie-banner-wrapper">
<div class="container">
<p class="gdpr-notice">To analyze traffic and optimize your experience, we serve cookies on this site. By clicking or navigating, you agree to allow our usage of cookies. As the current maintainers of this site, Facebook’s Cookies Policy applies. Learn more, including about available controls: <a href="https://www.facebook.com/policies/cookies/">Cookies Policy</a>.</p>
<img class="close-button" src="/assets/images/pytorch-x.svg">
</div>
</div>
</body>
</html>