diff --git a/image.darknet/R/darknet_models.R b/image.darknet/R/darknet_models.R
index 7ce0ca1..19825ec 100644
--- a/image.darknet/R/darknet_models.R
+++ b/image.darknet/R/darknet_models.R
@@ -136,7 +136,8 @@
 image_darknet_model <- function(type = c("classify", "detect"), model, weights, labels, resize=TRUE){
   if(model %in% c("tiny.cfg", "alexnet.cfg", "darknet.cfg", "vgg-16.cfg", 
                   "extraction.cfg", "darknet19.cfg", "darknet19_448.cfg",
-                  "yolo.cfg", "tiny-yolo.cfg", "yolo-voc", "tiny-yolo-voc.cfg")){
+                  "yolo.cfg", "tiny-yolo.cfg", "yolo-voc", "tiny-yolo-voc.cfg",
+                  "yolov2.cfg", "yolov2-voc.cfg", "yolov3.cfg", "yolov3-voc.cfg")){
     model <- system.file(package="image.darknet", "include", "darknet", "cfg", model)
   }
   stopifnot(file.exists(model))
diff --git a/image.darknet/R/yolo_detect.R b/image.darknet/R/yolo_detect.R
index 98ea2eb..42ce291 100644
--- a/image.darknet/R/yolo_detect.R
+++ b/image.darknet/R/yolo_detect.R
@@ -32,8 +32,8 @@
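-#' weights <- file.path(system.file(package="image.darknet", "models"), "yolo.weights")
-#' download.file(url = "http://pjreddie.com/media/files/yolo.weights", destfile = weights)
+#' weights <- file.path(system.file(package="image.darknet", "models"), "yolov3.weights")
+#' download.file(url = "http://pjreddie.com/media/files/yolov3.weights", destfile = weights)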
 #' yolo_coco <- image_darknet_model(type = 'detect', 
-#'  model = "yolo.cfg", 
-#'  weights = system.file(package="image.darknet", "models", "yolo.weights"), 
+#'  model = "yolov3.cfg", 
+#'  weights = system.file(package="image.darknet", "models", "yolov3.weights"), 
 #'  labels = system.file(package="image.darknet", "include", "darknet", "data", "coco.names"))
 #' yolo_coco
 #' 
diff --git a/image.darknet/inst/include/darknet/LICENSE.fuck b/image.darknet/inst/include/darknet/LICENSE.fuck
new file mode 100644
index 0000000..8b1a9d8
--- /dev/null
+++ b/image.darknet/inst/include/darknet/LICENSE.fuck
@@ -0,0 +1,13 @@
+           DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
+                   Version 2, December 2004
+
+Copyright (C) 2004 Sam Hocevar <sam@hocevar.net>
+
+Everyone is permitted to copy and distribute verbatim or modified
+copies of this license document, and changing it is allowed as long
+as the name is changed.
+
+           DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
+  TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+ 0. You just DO WHAT THE FUCK YOU WANT TO.
diff --git a/image.darknet/inst/include/darknet/LICENSE.gen b/image.darknet/inst/include/darknet/LICENSE.gen
new file mode 100644
index 0000000..c541132
--- /dev/null
+++ b/image.darknet/inst/include/darknet/LICENSE.gen
@@ -0,0 +1,91 @@
+RNN LICENSE Version 3, June 21 2017
+
+Copyright (c) 1990, 1989, 1999 Free87337 May 48 THIRD PARTIES OR ANY OTHER THE
+COMPLAIN OR CONSEQUENTIAL DAMAGES AND REGARDLESS OF WHETHER IN CONTRACT, TO THE
+EXTENT REPAIR OR AGENTS (NOT THE IN ANY EVENT). THE SOFTWARE WILL BE
+UNINTERRUPTED OR ERROR-FREE OR ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+OUT OF THE USE OF ALL THE WORK (GOVERNED CODE) HIM RESPONSES, OR OF FINES,
+SPECIAL, INCIDENTAL, CONSEQUENTIAL, PUNITIVE OR ANY OTHER OR OTHER HARL UNDER NO
+CIRCUMSTANCES AND UNDER NO LEGAL THEORY, WHETHER TORT (INCLUDING NEGLIGENCE),
+PATENT PERMITTED BY THE INSTAGRAM PARENT STATE OR TORT (INCLUDING NEGLIGENCE),
+PRODUCT LIABILITY OR OTHERWISE, ARISING OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR ANYTHING PROVIDED IN THIS PRODUCT, COMMIS AND SERVICES
+ARE LICENSED SOFTWARE AND ANY RESULE OR ANY OTHER THE COPYRIGHT HOLDERS BE
+LIABLE FOR ANY SPECIAL, INCIDENTAL, CASE, SUCH WARRANTIES, EXPRESS OR IMPLIED,
+INCLUDING, WITHOUT LIMITATION, WARRANTIES THAT THE COPYRIGHT HOLDERS AND/OR ANY
+PERSON FOR ANY INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES OF ANY
+EXPRESS OR DISTRIBUTE THAT ALL CLAIMS ARE SHALL CREATE DERAVE BE LIABLE TO YOU
+WILL HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
+
+6\. TERMINATION. TO THE EXTENT PERMITTED BY LAW, NO USE OF THE COVERED CODE IS
+WITH YOU. SHOULD ANY COVERED CODE PROVE DEFECTIVE IN ANY RESPECT, YOU (NOT THE
+INITIAL DEVELOPER OR ANY OTHER CONTRIBUTOR) ASSUME THE COST OF ANY NECESSARY
+SERVICING, REPAIR OR COULT OR IN ANY WAY OUT OF THE USE OF THE WEBSITES OR
+SERVICE WILL BE CONSEQUENTIAL DAMAGES OF ANY KIND HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+
+This paragraph Agreement constitutes the entire agreement between the parties
+with respect to the Work licensed here. However, if you place the name of the
+fact that the arbitration was the consultation of the parties as a "patent is".
+Subject to the terms and conditions of this License, Contributor has knowledge
+that a license under a third party may also be used to endorse or promote
+products derived from the Work, and there is no warranty on the Software and
+Science Fees. For the purposes of this Agreement, attach the following
+disclaimers (without liabilities of written notice to the Subject Software) in a
+manner that a product is under common control with you. The Free Software
+Foundation may publish revised and/or new versions of the License for the
+Modifications made by the applicable terms. The Recipient shall promptly retain
+the covered works for any reason be entered in any federal or state or login
+Restricted Laws appearing in the United States or any of its own information
+that is not disabled from a derivative work except as expressly permitted in
+this License, to the extent that they are in receiving the Software and Source
+Code or any exercise of the rights granted to You by this License or a
+Contributor made by the Licensor or are authorized to make a reasonable
+retirement by the courts of the courts located in Santa Clara County, California
+printed and related to the Work or “Company” and Apache Software Foundation. If
+the Licensor shall be entitled to reflect your rights to use the Software and
+the Software to exercise the rights granted to the recipient without a
+requirement to exercise the rights granted by the Agreement to the provision
+will begin will appear in such cases, you will use such information without such
+corporation shall be an officer with respect to any part of the Software or any
+portion thereof. Capitalized terms are included in the Initial Contributor and
+under no circumstances will license the Service at any time and for any direct,
+indirect, special, incidental, or consequential damages of or assist in
+connection with any Services or the registration purposes only to the extent
+that it includes any or all means including the processing of which you download
+any derivative work. Any of the purchases’ transmission purposes are made
+available, if any, in other circumstances, we may review the copyright notice.
+In the event that this Agreement is required to give us strict content. The
+inclusion of the other party hereunder may also notify you Intellectual Property
+Rights to any third party. This means that the Source Code exists of the Work
+will not charge a program available to you at any time. You must include a
+prominent statement that the Software is governed under a particular version of
+this Agreement. You must include a provision to the extent that there is no
+warranty for the content of others. You agree that the Recipient was appointed
+as a Contributor, (c) are effective until terminated by hereunder, then the
+registration are not disabled and not limited to, submit any Customer Data
+without the updated use of the Software and that no fee is released. You grant
+to Use Other Arbitration Rules for Diagnostic or Services may use or modify the
+Apple Software and Consolidated Apple Software or Services. The Company may have
+full risk as a product of the Compatible Source. A Contribution by the Licensor
+or by the updated Software under the following conditions we can redistribute
+any General Provision of this Agreement. If the Program is used in accordance
+with the terms of this Agreement, Customer may provide advertisements from your
+devices that clause you can your employer or a transaction or country that has
+been controlled by the arbitrator, that they will be useful of this Agreement.
+The term "Open Source Software is available in connection with the program, and
+you may not protect the combination of the Covered Code. You should like to
+select a user's rights to charge a copy of this License. I are Contributor's
+confidentiality of the exercise of the rights granted herein. Such a covered
+work is released as a consequence, the Licensor shall be eligible for a purpose
+or subcontractor of the person or entity to the user of the user, then the word
+"Application" means having the original fee for any reason; and that no patent
+license to more than fifty stated close of the license term. The terms of this
+License will the license terms and conditions set forth in Section 2.2 (OPEC)
+and You will not use the Software or any set of responsibility for any resulting
+information that the Original Code warrants that you have the right to disclose
+these information (or in the notification; or (iii) late use of the software or
+any third party to the three (50) days before such belief to the extent that it
+includes a court court obtains the rights granted by this License.
diff --git a/image.darknet/inst/include/darknet/LICENSE.gpl b/image.darknet/inst/include/darknet/LICENSE.gpl
new file mode 100644
index 0000000..9cecc1d
--- /dev/null
+++ b/image.darknet/inst/include/darknet/LICENSE.gpl
@@ -0,0 +1,674 @@
+                    GNU GENERAL PUBLIC LICENSE
+                       Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+                            Preamble
+
+  The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+
+  The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works.  By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users.  We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors.  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+  To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights.  Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received.  You must make sure that they, too, receive
+or can get the source code.  And you must show them these terms so they
+know their rights.
+
+  Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+
+  For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software.  For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+
+  Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so.  This is fundamentally incompatible with the aim of
+protecting users' freedom to change the software.  The systematic
+pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable.  Therefore, we
+have designed this version of the GPL to prohibit the practice for those
+products.  If such problems arise substantially in other domains, we
+stand ready to extend this provision to those domains in future versions
+of the GPL, as needed to protect the freedom of users.
+
+  Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of
+software on general-purpose computers, but in those that do, we wish to
+avoid the special danger that patents applied to a free program could
+make it effectively proprietary.  To prevent this, the GPL assures that
+patents cannot be used to render the program non-free.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+                       TERMS AND CONDITIONS
+
+  0. Definitions.
+
+  "This License" refers to version 3 of the GNU General Public License.
+
+  "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+  "The Program" refers to any copyrightable work licensed under this
+License.  Each licensee is addressed as "you".  "Licensees" and
+"recipients" may be individuals or organizations.
+
+  To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy.  The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+  A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+  To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy.  Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+  To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies.  Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+  An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License.  If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+  1. Source Code.
+
+  The "source code" for a work means the preferred form of the work
+for making modifications to it.  "Object code" means any non-source
+form of a work.
+
+  A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+  The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form.  A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+  The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities.  However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work.  For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+  The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+  The Corresponding Source for a work in source code form is that
+same work.
+
+  2. Basic Permissions.
+
+  All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met.  This License explicitly affirms your unlimited
+permission to run the unmodified Program.  The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work.  This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+  You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force.  You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright.  Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+  Conveying under any other circumstances is permitted solely under
+the conditions stated below.  Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+  3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+  No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+  When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+  4. Conveying Verbatim Copies.
+
+  You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+  You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+  5. Conveying Modified Source Versions.
+
+  You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+    a) The work must carry prominent notices stating that you modified
+    it, and giving a relevant date.
+
+    b) The work must carry prominent notices stating that it is
+    released under this License and any conditions added under section
+    7.  This requirement modifies the requirement in section 4 to
+    "keep intact all notices".
+
+    c) You must license the entire work, as a whole, under this
+    License to anyone who comes into possession of a copy.  This
+    License will therefore apply, along with any applicable section 7
+    additional terms, to the whole of the work, and all its parts,
+    regardless of how they are packaged.  This License gives no
+    permission to license the work in any other way, but it does not
+    invalidate such permission if you have separately received it.
+
+    d) If the work has interactive user interfaces, each must display
+    Appropriate Legal Notices; however, if the Program has interactive
+    interfaces that do not display Appropriate Legal Notices, your
+    work need not make them do so.
+
+  A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit.  Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+  6. Conveying Non-Source Forms.
+
+  You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+    a) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by the
+    Corresponding Source fixed on a durable physical medium
+    customarily used for software interchange.
+
+    b) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by a
+    written offer, valid for at least three years and valid for as
+    long as you offer spare parts or customer support for that product
+    model, to give anyone who possesses the object code either (1) a
+    copy of the Corresponding Source for all the software in the
+    product that is covered by this License, on a durable physical
+    medium customarily used for software interchange, for a price no
+    more than your reasonable cost of physically performing this
+    conveying of source, or (2) access to copy the
+    Corresponding Source from a network server at no charge.
+
+    c) Convey individual copies of the object code with a copy of the
+    written offer to provide the Corresponding Source.  This
+    alternative is allowed only occasionally and noncommercially, and
+    only if you received the object code with such an offer, in accord
+    with subsection 6b.
+
+    d) Convey the object code by offering access from a designated
+    place (gratis or for a charge), and offer equivalent access to the
+    Corresponding Source in the same way through the same place at no
+    further charge.  You need not require recipients to copy the
+    Corresponding Source along with the object code.  If the place to
+    copy the object code is a network server, the Corresponding Source
+    may be on a different server (operated by you or a third party)
+    that supports equivalent copying facilities, provided you maintain
+    clear directions next to the object code saying where to find the
+    Corresponding Source.  Regardless of what server hosts the
+    Corresponding Source, you remain obligated to ensure that it is
+    available for as long as needed to satisfy these requirements.
+
+    e) Convey the object code using peer-to-peer transmission, provided
+    you inform other peers where the object code and Corresponding
+    Source of the work are being offered to the general public at no
+    charge under subsection 6d.
+
+  A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+  A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling.  In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage.  For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product.  A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+  "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source.  The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+  If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information.  But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+  The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed.  Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+  Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+  7. Additional Terms.
+
+  "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law.  If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+  When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it.  (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.)  You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+  Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+    a) Disclaiming warranty or limiting liability differently from the
+    terms of sections 15 and 16 of this License; or
+
+    b) Requiring preservation of specified reasonable legal notices or
+    author attributions in that material or in the Appropriate Legal
+    Notices displayed by works containing it; or
+
+    c) Prohibiting misrepresentation of the origin of that material, or
+    requiring that modified versions of such material be marked in
+    reasonable ways as different from the original version; or
+
+    d) Limiting the use for publicity purposes of names of licensors or
+    authors of the material; or
+
+    e) Declining to grant rights under trademark law for use of some
+    trade names, trademarks, or service marks; or
+
+    f) Requiring indemnification of licensors and authors of that
+    material by anyone who conveys the material (or modified versions of
+    it) with contractual assumptions of liability to the recipient, for
+    any liability that these contractual assumptions directly impose on
+    those licensors and authors.
+
+  All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10.  If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term.  If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+  If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+  Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+  8. Termination.
+
+  You may not propagate or modify a covered work except as expressly
+provided under this License.  Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+  However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+  Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+  Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License.  If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+  9. Acceptance Not Required for Having Copies.
+
+  You are not required to accept this License in order to receive or
+run a copy of the Program.  Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance.  However,
+nothing other than this License grants you permission to propagate or
+modify any covered work.  These actions infringe copyright if you do
+not accept this License.  Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+  10. Automatic Licensing of Downstream Recipients.
+
+  Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License.  You are not responsible
+for enforcing compliance by third parties with this License.
+
+  An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations.  If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+  You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License.  For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+  11. Patents.
+
+  A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based.  The
+work thus licensed is called the contributor's "contributor version".
+
+  A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version.  For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+  Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+  In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement).  To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+  If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients.  "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+  If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+  A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License.  You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+  Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+  12. No Surrender of Others' Freedom.
+
+  If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all.  For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+  13. Use with the GNU Affero General Public License.
+
+  Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU Affero General Public License into a single
+combined work, and to convey the resulting work.  The terms of this
+License will continue to apply to the part which is the covered work,
+but the special requirements of the GNU Affero General Public License,
+section 13, concerning interaction through a network will apply to the
+combination as such.
+
+  14. Revised Versions of this License.
+
+  The Free Software Foundation may publish revised and/or new versions of
+the GNU General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+  Each version is given a distinguishing version number.  If the
+Program specifies that a certain numbered version of the GNU General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation.  If the Program does not specify a version number of the
+GNU General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+  If the Program specifies that a proxy can decide which future
+versions of the GNU General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+  Later license versions may give you additional or different
+permissions.  However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+  15. Disclaimer of Warranty.
+
+  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+  16. Limitation of Liability.
+
+  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+  17. Interpretation of Sections 15 and 16.
+
+  If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+                     END OF TERMS AND CONDITIONS
+
+            How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    {one line to give the program's name and a brief idea of what it does.}
+    Copyright (C) {year}  {name of author}
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+  If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+    {project}  Copyright (C) {year}  {fullname}
+    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+  You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<http://www.gnu.org/licenses/>.
+
+  The GNU General Public License does not permit incorporating your program
+into proprietary programs.  If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.  But first, please read
+<http://www.gnu.org/philosophy/why-not-lgpl.html>.
diff --git a/image.darknet/inst/include/darknet/LICENSE.meta b/image.darknet/inst/include/darknet/LICENSE.meta
new file mode 100644
index 0000000..6728bd2
--- /dev/null
+++ b/image.darknet/inst/include/darknet/LICENSE.meta
@@ -0,0 +1,8 @@
+                          META-LICENSE
+                    Version 1, June 21 2017
+
+Any and all licenses may be applied to the software either individually
+or in concert. Any issues, ambiguities, paradoxes, or metaphysical quandaries
+arising from this combination should be discussed with a local faith leader,
+hermit, or guru. The Oxford comma shall be used.
+
diff --git a/image.darknet/inst/include/darknet/LICENSE.mit b/image.darknet/inst/include/darknet/LICENSE.mit
new file mode 100644
index 0000000..5bd806c
--- /dev/null
+++ b/image.darknet/inst/include/darknet/LICENSE.mit
@@ -0,0 +1,22 @@
+MIT License
+
+Copyright (c) 2017 Joseph Redmon
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
diff --git a/image.darknet/inst/include/darknet/LICENSE.v1 b/image.darknet/inst/include/darknet/LICENSE.v1
new file mode 100644
index 0000000..5b8709a
--- /dev/null
+++ b/image.darknet/inst/include/darknet/LICENSE.v1
@@ -0,0 +1,13 @@
+                                  YOLO LICENSE
+                             Version 1, July 10 2015
+
+THIS SOFTWARE LICENSE IS PROVIDED "ALL CAPS" SO THAT YOU KNOW IT IS SUPER
+SERIOUS AND YOU DON'T MESS AROUND WITH COPYRIGHT LAW BECAUSE YOU WILL GET IN
+TROUBLE HERE ARE SOME OTHER BUZZWORDS COMMONLY IN THESE THINGS WARRANTIES
+LIABILITY CONTRACT TORT LIABLE CLAIMS RESTRICTION MERCHANTABILITY SUBJECT TO
+THE FOLLOWING CONDITIONS:
+
+1. #yolo
+2. #swag
+3. #blazeit
+
diff --git a/image.darknet/inst/include/darknet/Makefile b/image.darknet/inst/include/darknet/Makefile
index 3d3d5e4..63e15e6 100644
--- a/image.darknet/inst/include/darknet/Makefile
+++ b/image.darknet/inst/include/darknet/Makefile
@@ -1,27 +1,37 @@
 GPU=0
 CUDNN=0
 OPENCV=0
+OPENMP=0
 DEBUG=0
 
-ARCH= -gencode arch=compute_20,code=[sm_20,sm_21] \
-      -gencode arch=compute_30,code=sm_30 \
+ARCH= -gencode arch=compute_30,code=sm_30 \
       -gencode arch=compute_35,code=sm_35 \
       -gencode arch=compute_50,code=[sm_50,compute_50] \
       -gencode arch=compute_52,code=[sm_52,compute_52]
+#      -gencode arch=compute_20,code=[sm_20,sm_21] \ Disabled: compute_20 (Fermi) was deprecated in CUDA 8 and removed in CUDA 9.
 
 # This is what I use, uncomment if you know your arch and want to specify
-# ARCH=  -gencode arch=compute_52,code=compute_52
+# ARCH= -gencode arch=compute_52,code=compute_52
 
-VPATH=./src/
+VPATH=./src/:./examples
+SLIB=libdarknet.so
+ALIB=libdarknet.a
 EXEC=darknet
 OBJDIR=./obj/
 
 CC=gcc
+CPP=g++
 NVCC=nvcc 
+AR=ar
+ARFLAGS=rcs
 OPTS=-Ofast
 LDFLAGS= -lm -pthread 
-COMMON= 
-CFLAGS=-Wall -Wfatal-errors 
+COMMON= -Iinclude/ -Isrc/
+CFLAGS=-Wall -Wno-unused-result -Wno-unknown-pragmas -Wfatal-errors -fPIC
+
+ifeq ($(OPENMP), 1) 
+CFLAGS+= -fopenmp
+endif
 
 ifeq ($(DEBUG), 1) 
 OPTS=-O0 -g
@@ -32,7 +42,7 @@ CFLAGS+=$(OPTS)
 ifeq ($(OPENCV), 1) 
 COMMON+= -DOPENCV
 CFLAGS+= -DOPENCV
-LDFLAGS+= `pkg-config --libs opencv` 
+LDFLAGS+= `pkg-config --libs opencv` -lstdc++
 COMMON+= `pkg-config --cflags opencv` 
 endif
 
@@ -48,19 +58,32 @@ CFLAGS+= -DCUDNN
 LDFLAGS+= -lcudnn
 endif
 
-OBJ=gemm.o utils.o cuda.o convolutional_layer.o list.o image.o activations.o im2col.o col2im.o blas.o crop_layer.o dropout_layer.o maxpool_layer.o softmax_layer.o data.o matrix.o network.o connected_layer.o cost_layer.o parser.o option_list.o darknet.o detection_layer.o captcha.o route_layer.o writing.o box.o nightmare.o normalization_layer.o avgpool_layer.o coco.o dice.o yolo.o detector.o layer.o compare.o classifier.o local_layer.o swag.o shortcut_layer.o activation_layer.o rnn_layer.o gru_layer.o rnn.o rnn_vid.o crnn_layer.o demo.o tag.o cifar.o go.o batchnorm_layer.o art.o region_layer.o reorg_layer.o super.o voxel.o tree.o
+OBJ=gemm.o utils.o cuda.o deconvolutional_layer.o convolutional_layer.o list.o image.o activations.o im2col.o col2im.o blas.o crop_layer.o dropout_layer.o maxpool_layer.o softmax_layer.o data.o matrix.o network.o connected_layer.o cost_layer.o parser.o option_list.o detection_layer.o route_layer.o upsample_layer.o box.o normalization_layer.o avgpool_layer.o layer.o local_layer.o shortcut_layer.o logistic_layer.o activation_layer.o rnn_layer.o gru_layer.o crnn_layer.o demo.o batchnorm_layer.o region_layer.o reorg_layer.o tree.o  lstm_layer.o l2norm_layer.o yolo_layer.o iseg_layer.o image_opencv.o
+EXECOBJA=captcha.o lsd.o super.o art.o tag.o cifar.o go.o rnn.o segmenter.o regressor.o classifier.o coco.o yolo.o detector.o nightmare.o instance-segmenter.o darknet.o
 ifeq ($(GPU), 1) 
 LDFLAGS+= -lstdc++ 
-OBJ+=convolutional_kernels.o activation_kernels.o im2col_kernels.o col2im_kernels.o blas_kernels.o crop_layer_kernels.o dropout_layer_kernels.o maxpool_layer_kernels.o network_kernels.o avgpool_layer_kernels.o
+OBJ+=convolutional_kernels.o deconvolutional_kernels.o activation_kernels.o im2col_kernels.o col2im_kernels.o blas_kernels.o crop_layer_kernels.o dropout_layer_kernels.o maxpool_layer_kernels.o avgpool_layer_kernels.o
 endif
 
+EXECOBJ = $(addprefix $(OBJDIR), $(EXECOBJA))
 OBJS = $(addprefix $(OBJDIR), $(OBJ))
-DEPS = $(wildcard src/*.h) Makefile
+DEPS = $(wildcard src/*.h) Makefile include/darknet.h
+
+all: obj backup results $(SLIB) $(ALIB) $(EXEC)
+#all: obj  results $(SLIB) $(ALIB) $(EXEC)
+
+
+$(EXEC): $(EXECOBJ) $(ALIB)
+	$(CC) $(COMMON) $(CFLAGS) $^ -o $@ $(LDFLAGS) $(ALIB)
+
+$(ALIB): $(OBJS)
+	$(AR) $(ARFLAGS) $@ $^
 
-all: obj backup results $(EXEC)
+$(SLIB): $(OBJS)
+	$(CC) $(CFLAGS) -shared $^ -o $@ $(LDFLAGS)
 
-$(EXEC): $(OBJS)
-	$(CC) $(COMMON) $(CFLAGS) $^ -o $@ $(LDFLAGS)
+$(OBJDIR)%.o: %.cpp $(DEPS)
+	$(CPP) $(COMMON) $(CFLAGS) -c $< -o $@
 
 $(OBJDIR)%.o: %.c $(DEPS)
 	$(CC) $(COMMON) $(CFLAGS) -c $< -o $@
@@ -78,5 +101,5 @@ results:
 .PHONY: clean
 
 clean:
-	rm -rf $(OBJS) $(EXEC)
+	rm -rf $(OBJS) $(SLIB) $(ALIB) $(EXEC) $(EXECOBJ) $(OBJDIR)/*
 
diff --git a/image.darknet/inst/include/darknet/README.md b/image.darknet/inst/include/darknet/README.md
index d255dab..09fdeee 100644
--- a/image.darknet/inst/include/darknet/README.md
+++ b/image.darknet/inst/include/darknet/README.md
@@ -1,6 +1,6 @@
 ![Darknet Logo](http://pjreddie.com/media/files/darknet-black-small.png)
 
-#Darknet#
+# Darknet #
 Darknet is an open source neural network framework written in C and CUDA. It is fast, easy to install, and supports CPU and GPU computation.
 
 For more information see the [Darknet project website](http://pjreddie.com/darknet).
diff --git a/image.darknet/inst/include/darknet/cfg/alexnet.cfg b/image.darknet/inst/include/darknet/cfg/alexnet.cfg
index 7e5a9b2..e2ed4bb 100644
--- a/image.darknet/inst/include/darknet/cfg/alexnet.cfg
+++ b/image.darknet/inst/include/darknet/cfg/alexnet.cfg
@@ -1,5 +1,9 @@
 [net]
-batch=128
+# Training
+# batch=128
+# subdivisions=1
+# Testing
+batch=1
 subdivisions=1
 height=227
 width=227
@@ -90,6 +94,3 @@ activation=linear
 [softmax]
 groups=1
 
-[cost]
-type=sse
-
diff --git a/image.darknet/inst/include/darknet/cfg/cifar.cfg b/image.darknet/inst/include/darknet/cfg/cifar.cfg
index f2c801a..b2f69f5 100644
--- a/image.darknet/inst/include/darknet/cfg/cifar.cfg
+++ b/image.darknet/inst/include/darknet/cfg/cifar.cfg
@@ -1,25 +1,23 @@
 [net]
 batch=128
 subdivisions=1
-height=32
-width=32
+height=28
+width=28
 channels=3
-momentum=0.9
-decay=0.0005
+max_crop=32
+min_crop=32
+
+hue=.1
+saturation=.75
+exposure=.75
 
 learning_rate=0.4
 policy=poly
 power=4
-max_batches = 50000
+max_batches = 5000
+momentum=0.9
+decay=0.0005
 
-[crop]
-crop_width=28
-crop_height=28
-flip=1
-angle=0
-saturation = 1
-exposure = 1
-noadjust=1
 
 [convolutional]
 batch_normalize=1
@@ -121,6 +119,3 @@ activation=leaky
 
 [softmax]
 groups=1
-
-[cost]
-
diff --git a/image.darknet/inst/include/darknet/cfg/cifar.test.cfg b/image.darknet/inst/include/darknet/cfg/cifar.test.cfg
index d3afcdd..18b6c54 100644
--- a/image.darknet/inst/include/darknet/cfg/cifar.test.cfg
+++ b/image.darknet/inst/include/darknet/cfg/cifar.test.cfg
@@ -115,5 +115,3 @@ activation=leaky
 groups=1
 temperature=3
 
-[cost]
-
diff --git a/image.darknet/inst/include/darknet/cfg/coco.data b/image.darknet/inst/include/darknet/cfg/coco.data
index 610151d..3003841 100644
--- a/image.darknet/inst/include/darknet/cfg/coco.data
+++ b/image.darknet/inst/include/darknet/cfg/coco.data
@@ -1,7 +1,7 @@
 classes= 80
 train  = /home/pjreddie/data/coco/trainvalno5k.txt
-#valid  = coco_testdev
-valid = data/coco_val_5k.list
+valid  = coco_testdev
+#valid = data/coco_val_5k.list
 names = data/coco.names
 backup = /home/pjreddie/backup/
 eval=coco
diff --git a/image.darknet/inst/include/darknet/cfg/darknet.cfg b/image.darknet/inst/include/darknet/cfg/darknet.cfg
index 60b939a..375107f 100644
--- a/image.darknet/inst/include/darknet/cfg/darknet.cfg
+++ b/image.darknet/inst/include/darknet/cfg/darknet.cfg
@@ -1,17 +1,30 @@
 [net]
-batch=128
+# Training
+# batch=128
+# subdivisions=1
+# Testing
+batch=1
 subdivisions=1
-height=224
-width=224
+height=256
+width=256
+min_crop=128
+max_crop=448
 channels=3
 momentum=0.9
 decay=0.0005
-max_crop=320
 
+burn_in=1000
 learning_rate=0.1
 policy=poly
 power=4
-max_batches=1600000
+max_batches=800000
+
+angle=7
+hue=.1
+saturation=.75
+exposure=.75
+aspect=.75
+
 
 [convolutional]
 batch_normalize=1
@@ -84,7 +97,6 @@ activation=leaky
 [maxpool]
 size=2
 stride=2
-padding=1
 
 [convolutional]
 batch_normalize=1
@@ -94,18 +106,15 @@ stride=1
 pad=1
 activation=leaky
 
+[avgpool]
+
 [convolutional]
 filters=1000
 size=1
 stride=1
 pad=1
-activation=leaky
-
-[avgpool]
+activation=linear
 
 [softmax]
 groups=1
 
-[cost]
-type=sse
-
diff --git a/image.darknet/inst/include/darknet/cfg/darknet19.cfg b/image.darknet/inst/include/darknet/cfg/darknet19.cfg
index bf73fb7..28ac966 100644
--- a/image.darknet/inst/include/darknet/cfg/darknet19.cfg
+++ b/image.darknet/inst/include/darknet/cfg/darknet19.cfg
@@ -1,17 +1,31 @@
 [net]
-batch=128
-subdivisions=1
-height=224
-width=224
+# Training
+#batch=128
+#subdivisions=2
+
+# Testing
+ batch=1
+ subdivisions=1
+
+height=256
+width=256
+min_crop=128
+max_crop=448
 channels=3
 momentum=0.9
 decay=0.0005
-max_crop=448
 
+burn_in=1000
 learning_rate=0.1
 policy=poly
 power=4
-max_batches=1600000
+max_batches=800000
+
+angle=7
+hue=.1
+saturation=.75
+exposure=.75
+aspect=.75
 
 [convolutional]
 batch_normalize=1
@@ -189,6 +203,3 @@ activation=linear
 [softmax]
 groups=1
 
-[cost]
-type=sse
-
diff --git a/image.darknet/inst/include/darknet/cfg/darknet19_448.cfg b/image.darknet/inst/include/darknet/cfg/darknet19_448.cfg
index 133c688..c6df730 100644
--- a/image.darknet/inst/include/darknet/cfg/darknet19_448.cfg
+++ b/image.darknet/inst/include/darknet/cfg/darknet19_448.cfg
@@ -195,6 +195,3 @@ activation=linear
 [softmax]
 groups=1
 
-[cost]
-type=sse
-
diff --git a/image.darknet/inst/include/darknet/cfg/darknet53.cfg b/image.darknet/inst/include/darknet/cfg/darknet53.cfg
new file mode 100644
index 0000000..7b6d576
--- /dev/null
+++ b/image.darknet/inst/include/darknet/cfg/darknet53.cfg
@@ -0,0 +1,566 @@
+[net]
+# Training
+# batch=128
+# subdivisions=4
+
+# Testing
+batch=1
+subdivisions=1
+
+height=256
+width=256
+channels=3
+min_crop=128
+max_crop=448
+
+burn_in=1000
+learning_rate=0.1
+policy=poly
+power=4
+max_batches=800000
+momentum=0.9
+decay=0.0005
+
+angle=7
+hue=.1
+saturation=.75
+exposure=.75
+aspect=.75
+
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+# Downsample
+
+[convolutional]
+batch_normalize=1
+filters=64
+size=3
+stride=2
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=64
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+# Downsample
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=3
+stride=2
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=64
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=64
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+# Downsample
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=2
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+# Downsample
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=2
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+# Downsample
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=3
+stride=2
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[avgpool]
+
+[convolutional]
+filters=1000
+size=1
+stride=1
+pad=1
+activation=linear
+
+[softmax]
+groups=1
+
diff --git a/image.darknet/inst/include/darknet/cfg/msr_50.cfg b/image.darknet/inst/include/darknet/cfg/darknet53_448.cfg
similarity index 85%
rename from image.darknet/inst/include/darknet/cfg/msr_50.cfg
rename to image.darknet/inst/include/darknet/cfg/darknet53_448.cfg
index 2edd21c..dedab1b 100644
--- a/image.darknet/inst/include/darknet/cfg/msr_50.cfg
+++ b/image.darknet/inst/include/darknet/cfg/darknet53_448.cfg
@@ -1,48 +1,47 @@
 [net]
-batch=128
-subdivisions=8
-height=256
-width=256
+# Training - start training with darknet53.weights
+# batch=128
+# subdivisions=8
+
+# Testing
+batch=1
+subdivisions=1
+
+height=448
+width=448
 channels=3
-momentum=0.9
-decay=0.0001
+min_crop=448
+max_crop=512
 
-learning_rate=0.05
+learning_rate=0.001
 policy=poly
 power=4
-max_batches=500000
-
+max_batches=100000
+momentum=0.9
+decay=0.0005
 
 
-[crop]
-crop_height=224
-crop_width=224
-flip=1
-saturation=1
-exposure=1
-angle=0
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
 
-##### Conv 1 #####
+# Downsample
 
 [convolutional]
 batch_normalize=1
 filters=64
-size=7
+size=3
 stride=2
 pad=1
 activation=leaky
 
-[maxpool]
-size=3
-stride=2
-
-
-##### Conv 2_x #####
-
-
 [convolutional]
 batch_normalize=1
-filters=64
+filters=32
 size=1
 stride=1
 pad=1
@@ -56,27 +55,18 @@ stride=1
 pad=1
 activation=leaky
 
-[convolutional]
-batch_normalize=1
-filters=256
-size=1
-stride=1
-pad=1
+[shortcut]
+from=-3
 activation=linear
 
-[route]
-layers=-4
+# Downsample
 
 [convolutional]
 batch_normalize=1
-size=1
-stride=1
+filters=128
+size=3
+stride=2
 pad=1
-activation=linear
-filters=256
-
-[shortcut]
-from = -3
 activation=leaky
 
 [convolutional]
@@ -89,54 +79,65 @@ activation=leaky
 
 [convolutional]
 batch_normalize=1
-filters=64
+filters=128
 size=3
 stride=1
 pad=1
 activation=leaky
 
+[shortcut]
+from=-3
+activation=linear
+
 [convolutional]
 batch_normalize=1
-filters=256
+filters=64
 size=1
 stride=1
 pad=1
-activation=linear
-
-[shortcut]
-from = -4
 activation=leaky
 
 [convolutional]
 batch_normalize=1
-filters=64
-size=1
+filters=128
+size=3
 stride=1
 pad=1
 activation=leaky
 
+[shortcut]
+from=-3
+activation=linear
+
+# Downsample
+
 [convolutional]
 batch_normalize=1
-filters=64
+filters=256
 size=3
-stride=1
+stride=2
 pad=1
 activation=leaky
 
 [convolutional]
 batch_normalize=1
-filters=256
+filters=128
 size=1
 stride=1
 pad=1
-activation=linear
-
-[shortcut]
-from = -4
 activation=leaky
 
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
 
-##### Conv 3_x #####
+[shortcut]
+from=-3
+activation=linear
 
 [convolutional]
 batch_normalize=1
@@ -148,34 +149,35 @@ activation=leaky
 
 [convolutional]
 batch_normalize=1
-filters=128
+filters=256
 size=3
-stride=2
+stride=1
 pad=1
 activation=leaky
 
+[shortcut]
+from=-3
+activation=linear
+
 [convolutional]
 batch_normalize=1
-filters=512
+filters=128
 size=1
 stride=1
 pad=1
-activation=linear
-
-[route]
-layers=-4
+activation=leaky
 
 [convolutional]
 batch_normalize=1
-size=1
-stride=2
+filters=256
+size=3
+stride=1
 pad=1
-activation=linear
-filters=512
+activation=leaky
 
 [shortcut]
-from = -3
-activation=leaky
+from=-3
+activation=linear
 
 [convolutional]
 batch_normalize=1
@@ -187,51 +189,56 @@ activation=leaky
 
 [convolutional]
 batch_normalize=1
-filters=128
+filters=256
 size=3
 stride=1
 pad=1
 activation=leaky
 
+[shortcut]
+from=-3
+activation=linear
+
+
 [convolutional]
 batch_normalize=1
-filters=512
+filters=128
 size=1
 stride=1
 pad=1
-activation=linear
-
-[shortcut]
-from = -4
 activation=leaky
 
 [convolutional]
 batch_normalize=1
-filters=128
-size=1
+filters=256
+size=3
 stride=1
 pad=1
 activation=leaky
 
+[shortcut]
+from=-3
+activation=linear
+
 [convolutional]
 batch_normalize=1
 filters=128
-size=3
+size=1
 stride=1
 pad=1
 activation=leaky
 
 [convolutional]
 batch_normalize=1
-filters=512
-size=1
+filters=256
+size=3
 stride=1
 pad=1
-activation=linear
+activation=leaky
 
 [shortcut]
-from = -4
-activation=leaky
+from=-3
+activation=linear
 
 [convolutional]
 batch_normalize=1
@@ -243,38 +250,41 @@ activation=leaky
 
 [convolutional]
 batch_normalize=1
-filters=128
+filters=256
 size=3
 stride=1
 pad=1
 activation=leaky
 
+[shortcut]
+from=-3
+activation=linear
+
 [convolutional]
 batch_normalize=1
-filters=512
+filters=128
 size=1
 stride=1
 pad=1
-activation=linear
-
-[shortcut]
-from = -4
 activation=leaky
 
-
-##### Conv 4_x #####
-
 [convolutional]
 batch_normalize=1
 filters=256
-size=1
+size=3
 stride=1
 pad=1
 activation=leaky
 
+[shortcut]
+from=-3
+activation=linear
+
+# Downsample
+
 [convolutional]
 batch_normalize=1
-filters=256
+filters=512
 size=3
 stride=2
 pad=1
@@ -282,26 +292,23 @@ activation=leaky
 
 [convolutional]
 batch_normalize=1
-filters=1024
+filters=256
 size=1
 stride=1
 pad=1
-activation=linear
-
-[route]
-layers=-4
+activation=leaky
 
 [convolutional]
 batch_normalize=1
-size=1
-stride=2
+filters=512
+size=3
+stride=1
 pad=1
-activation=linear
-filters=1024
+activation=leaky
 
 [shortcut]
-from = -3
-activation=leaky
+from=-3
+activation=linear
 
 
 [convolutional]
@@ -314,23 +321,16 @@ activation=leaky
 
 [convolutional]
 batch_normalize=1
-filters=256
+filters=512
 size=3
 stride=1
 pad=1
 activation=leaky
 
-[convolutional]
-batch_normalize=1
-filters=1024
-size=1
-stride=1
-pad=1
+[shortcut]
+from=-3
 activation=linear
 
-[shortcut]
-from = -4
-activation=leaky
 
 [convolutional]
 batch_normalize=1
@@ -342,23 +342,16 @@ activation=leaky
 
 [convolutional]
 batch_normalize=1
-filters=256
+filters=512
 size=3
 stride=1
 pad=1
 activation=leaky
 
-[convolutional]
-batch_normalize=1
-filters=1024
-size=1
-stride=1
-pad=1
+[shortcut]
+from=-3
 activation=linear
 
-[shortcut]
-from = -4
-activation=leaky
 
 [convolutional]
 batch_normalize=1
@@ -370,51 +363,57 @@ activation=leaky
 
 [convolutional]
 batch_normalize=1
-filters=256
+filters=512
 size=3
 stride=1
 pad=1
 activation=leaky
 
+[shortcut]
+from=-3
+activation=linear
+
 [convolutional]
 batch_normalize=1
-filters=1024
+filters=256
 size=1
 stride=1
 pad=1
-activation=linear
-
-[shortcut]
-from = -4
 activation=leaky
 
 [convolutional]
 batch_normalize=1
-filters=256
-size=1
+filters=512
+size=3
 stride=1
 pad=1
 activation=leaky
 
+[shortcut]
+from=-3
+activation=linear
+
+
 [convolutional]
 batch_normalize=1
 filters=256
-size=3
+size=1
 stride=1
 pad=1
 activation=leaky
 
 [convolutional]
 batch_normalize=1
-filters=1024
-size=1
+filters=512
+size=3
 stride=1
 pad=1
-activation=linear
+activation=leaky
 
 [shortcut]
-from = -4
-activation=leaky
+from=-3
+activation=linear
+
 
 [convolutional]
 batch_normalize=1
@@ -426,38 +425,41 @@ activation=leaky
 
 [convolutional]
 batch_normalize=1
-filters=256
+filters=512
 size=3
 stride=1
 pad=1
 activation=leaky
 
+[shortcut]
+from=-3
+activation=linear
+
 [convolutional]
 batch_normalize=1
-filters=1024
+filters=256
 size=1
 stride=1
 pad=1
-activation=linear
-
-[shortcut]
-from = -4
 activation=leaky
 
-
-##### Conv 5_x #####
-
 [convolutional]
 batch_normalize=1
 filters=512
-size=1
+size=3
 stride=1
 pad=1
 activation=leaky
 
+[shortcut]
+from=-3
+activation=linear
+
+# Downsample
+
 [convolutional]
 batch_normalize=1
-filters=512
+filters=1024
 size=3
 stride=2
 pad=1
@@ -465,28 +467,23 @@ activation=leaky
 
 [convolutional]
 batch_normalize=1
-filters=2048
+filters=512
 size=1
 stride=1
 pad=1
-activation=linear
-
-
-[route]
-layers=-4
+activation=leaky
 
 [convolutional]
 batch_normalize=1
-size=1
-stride=2
+filters=1024
+size=3
+stride=1
 pad=1
-activation=linear
-filters=2048
-
-[shortcut]
-from = -3
 activation=leaky
 
+[shortcut]
+from=-3
+activation=linear
 
 [convolutional]
 batch_normalize=1
@@ -498,61 +495,65 @@ activation=leaky
 
 [convolutional]
 batch_normalize=1
-filters=512
+filters=1024
 size=3
 stride=1
 pad=1
 activation=leaky
 
+[shortcut]
+from=-3
+activation=linear
+
 [convolutional]
 batch_normalize=1
-filters=2048
+filters=512
 size=1
 stride=1
 pad=1
-activation=linear
-
-[shortcut]
-from = -4
 activation=leaky
 
 [convolutional]
 batch_normalize=1
-filters=512
-size=1
+filters=1024
+size=3
 stride=1
 pad=1
 activation=leaky
 
+[shortcut]
+from=-3
+activation=linear
+
 [convolutional]
 batch_normalize=1
 filters=512
-size=3
+size=1
 stride=1
 pad=1
 activation=leaky
 
 [convolutional]
 batch_normalize=1
-filters=2048
-size=1
+filters=1024
+size=3
 stride=1
 pad=1
-activation=linear
+activation=leaky
 
 [shortcut]
-from = -4
-activation=leaky
+from=-3
+activation=linear
 
 [avgpool]
 
-[connected]
-output=1000
-activation=leaky
+[convolutional]
+filters=1000
+size=1
+stride=1
+pad=1
+activation=linear
 
 [softmax]
 groups=1
 
-[cost]
-type=sse
-
diff --git a/image.darknet/inst/include/darknet/cfg/yolov1/yolo2.cfg b/image.darknet/inst/include/darknet/cfg/darknet9000.cfg
similarity index 66%
rename from image.darknet/inst/include/darknet/cfg/yolov1/yolo2.cfg
rename to image.darknet/inst/include/darknet/cfg/darknet9000.cfg
index b46a0d6..9dd2dfb 100644
--- a/image.darknet/inst/include/darknet/cfg/yolov1/yolo2.cfg
+++ b/image.darknet/inst/include/darknet/cfg/darknet9000.cfg
@@ -1,23 +1,33 @@
 [net]
-batch=1
-subdivisions=1
+# Training
+# batch=128
+# subdivisions=4
+# Testing
+batch = 1
+subdivisions = 1
 height=448
 width=448
+max_crop=512
 channels=3
 momentum=0.9
 decay=0.0005
 
-learning_rate=0.0005
-policy=steps
-steps=200,400,600,20000,30000
-scales=2.5,2,2,.1,.1
-max_batches = 40000
+learning_rate=0.001
+policy=poly
+power=4
+max_batches=100000
+
+angle=7
+hue=.1
+saturation=.75
+exposure=.75
+aspect=.75
 
 [convolutional]
 batch_normalize=1
-filters=64
-size=7
-stride=2
+filters=32
+size=3
+stride=1
 pad=1
 activation=leaky
 
@@ -27,7 +37,7 @@ stride=2
 
 [convolutional]
 batch_normalize=1
-filters=192
+filters=64
 size=3
 stride=1
 pad=1
@@ -40,14 +50,6 @@ stride=2
 [convolutional]
 batch_normalize=1
 filters=128
-size=1
-stride=1
-pad=1
-activation=leaky
-
-[convolutional]
-batch_normalize=1
-filters=256
 size=3
 stride=1
 pad=1
@@ -55,7 +57,7 @@ activation=leaky
 
 [convolutional]
 batch_normalize=1
-filters=256
+filters=64
 size=1
 stride=1
 pad=1
@@ -63,7 +65,7 @@ activation=leaky
 
 [convolutional]
 batch_normalize=1
-filters=512
+filters=128
 size=3
 stride=1
 pad=1
@@ -76,14 +78,6 @@ stride=2
 [convolutional]
 batch_normalize=1
 filters=256
-size=1
-stride=1
-pad=1
-activation=leaky
-
-[convolutional]
-batch_normalize=1
-filters=512
 size=3
 stride=1
 pad=1
@@ -91,7 +85,7 @@ activation=leaky
 
 [convolutional]
 batch_normalize=1
-filters=256
+filters=128
 size=1
 stride=1
 pad=1
@@ -99,19 +93,15 @@ activation=leaky
 
 [convolutional]
 batch_normalize=1
-filters=512
+filters=256
 size=3
 stride=1
 pad=1
 activation=leaky
 
-[convolutional]
-batch_normalize=1
-filters=256
-size=1
-stride=1
-pad=1
-activation=leaky
+[maxpool]
+size=2
+stride=2
 
 [convolutional]
 batch_normalize=1
@@ -139,7 +129,7 @@ activation=leaky
 
 [convolutional]
 batch_normalize=1
-filters=512
+filters=256
 size=1
 stride=1
 pad=1
@@ -147,7 +137,7 @@ activation=leaky
 
 [convolutional]
 batch_normalize=1
-filters=1024
+filters=512
 size=3
 stride=1
 pad=1
@@ -157,14 +147,6 @@ activation=leaky
 size=2
 stride=2
 
-[convolutional]
-batch_normalize=1
-filters=512
-size=1
-stride=1
-pad=1
-activation=leaky
-
 [convolutional]
 batch_normalize=1
 filters=1024
@@ -189,63 +171,35 @@ stride=1
 pad=1
 activation=leaky
 
-#######
-
 [convolutional]
 batch_normalize=1
-size=3
+filters=512
+size=1
 stride=1
 pad=1
-filters=1024
 activation=leaky
 
 [convolutional]
 batch_normalize=1
-size=3
-stride=2
-pad=1
 filters=1024
-activation=leaky
-
-[convolutional]
-batch_normalize=1
 size=3
 stride=1
 pad=1
-filters=1024
 activation=leaky
 
 [convolutional]
-batch_normalize=1
-size=3
+filters=9418
+size=1
 stride=1
 pad=1
-filters=1024
-activation=leaky
+activation=linear
 
-[local]
-size=3
-stride=1
-pad=1
-filters=256
-activation=leaky
+[avgpool]
 
-[connected]
-output= 1715
-activation=linear
+[softmax]
+groups=1
+tree=data/9k.tree
 
-[detection]
-classes=20
-coords=4
-rescore=1
-side=7
-num=3
-softmax=0
-sqrt=1
-jitter=.2
-
-object_scale=1
-noobject_scale=.5
-class_scale=1
-coord_scale=5
+[cost]
+type=masked
 
diff --git a/image.darknet/inst/include/darknet/cfg/densenet201.cfg b/image.darknet/inst/include/darknet/cfg/densenet201.cfg
new file mode 100644
index 0000000..65b4aec
--- /dev/null
+++ b/image.darknet/inst/include/darknet/cfg/densenet201.cfg
@@ -0,0 +1,1951 @@
+[net]
+# Training
+# batch=128
+# subdivisions=4
+
+# Testing
+batch=1
+subdivisions=1
+
+height=256
+width=256
+max_crop=448
+channels=3
+momentum=0.9
+decay=0.0005
+
+burn_in=1000
+learning_rate=0.1
+policy=poly
+power=4
+max_batches=1600000
+
+angle=7
+hue=.1
+saturation=.75
+exposure=.75
+aspect=.75
+
+[convolutional]
+batch_normalize=1
+filters=64
+size=7
+stride=2
+pad=1
+activation=leaky
+
+[maxpool]
+size=2
+stride=2
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[maxpool]
+size=2
+stride=2
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[maxpool]
+size=2
+stride=2
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[maxpool]
+size=2
+stride=2
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[route]
+layers=-1,-3
+
+
+[convolutional]
+filters=1000
+size=1
+stride=1
+pad=1
+activation=linear
+
+[avgpool]
+
+[softmax]
+groups=1
+
diff --git a/image.darknet/inst/include/darknet/cfg/extraction.cfg b/image.darknet/inst/include/darknet/cfg/extraction.cfg
index 94e1067..66cb15f 100644
--- a/image.darknet/inst/include/darknet/cfg/extraction.cfg
+++ b/image.darknet/inst/include/darknet/cfg/extraction.cfg
@@ -1,6 +1,12 @@
 [net]
-batch=128
+# Training
+# batch=128
+# subdivisions=4
+
+# Testing
+batch=1
 subdivisions=1
+
 height=224
 width=224
 max_crop=320
@@ -201,6 +207,3 @@ activation=leaky
 [softmax]
 groups=1
 
-[cost]
-type=sse
-
diff --git a/image.darknet/inst/include/darknet/cfg/extraction22k.cfg b/image.darknet/inst/include/darknet/cfg/extraction22k.cfg
index 4cec6da..b5f5409 100644
--- a/image.darknet/inst/include/darknet/cfg/extraction22k.cfg
+++ b/image.darknet/inst/include/darknet/cfg/extraction22k.cfg
@@ -204,6 +204,3 @@ activation=leaky
 [softmax]
 groups=1
 
-[cost]
-type=sse
-
diff --git a/image.darknet/inst/include/darknet/cfg/go.cfg b/image.darknet/inst/include/darknet/cfg/go.cfg
new file mode 100644
index 0000000..c730092
--- /dev/null
+++ b/image.darknet/inst/include/darknet/cfg/go.cfg
@@ -0,0 +1,132 @@
+[net]
+batch=512
+subdivisions=1
+height=19
+width=19
+channels=1
+momentum=0.9
+decay=0.0005
+
+burn_in=1000
+learning_rate=0.1
+policy=poly
+power=4
+max_batches=10000000
+
+[convolutional]
+filters=256
+size=3
+stride=1
+pad=1
+activation=relu
+batch_normalize=1
+
+[convolutional]
+filters=256
+size=3
+stride=1
+pad=1
+activation=relu
+batch_normalize=1
+
+[convolutional]
+filters=256
+size=3
+stride=1
+pad=1
+activation=relu
+batch_normalize=1
+
+[convolutional]
+filters=256
+size=3
+stride=1
+pad=1
+activation=relu
+batch_normalize=1
+
+[convolutional]
+filters=256
+size=3
+stride=1
+pad=1
+activation=relu
+batch_normalize=1
+
+[convolutional]
+filters=256
+size=3
+stride=1
+pad=1
+activation=relu
+batch_normalize=1
+
+[convolutional]
+filters=256
+size=3
+stride=1
+pad=1
+activation=relu
+batch_normalize=1
+
+[convolutional]
+filters=256
+size=3
+stride=1
+pad=1
+activation=relu
+batch_normalize=1
+
+[convolutional]
+filters=256
+size=3
+stride=1
+pad=1
+activation=relu
+batch_normalize=1
+
+[convolutional]
+filters=256
+size=3
+stride=1
+pad=1
+activation=relu
+batch_normalize=1
+
+[convolutional]
+filters=256
+size=3
+stride=1
+pad=1
+activation=relu
+batch_normalize=1
+
+[convolutional]
+filters=256
+size=3
+stride=1
+pad=1
+activation=relu
+batch_normalize=1
+
+[convolutional]
+filters=256
+size=3
+stride=1
+pad=1
+activation=relu
+batch_normalize=1
+
+[convolutional]
+filters=1
+size=1
+stride=1
+pad=1
+activation=linear
+
+[reorg]
+extra=1
+stride=1
+
+[softmax]
+
diff --git a/image.darknet/inst/include/darknet/cfg/go.test.cfg b/image.darknet/inst/include/darknet/cfg/go.test.cfg
index 6b92d33..1e4e438 100644
--- a/image.darknet/inst/include/darknet/cfg/go.test.cfg
+++ b/image.darknet/inst/include/darknet/cfg/go.test.cfg
@@ -7,13 +7,13 @@ channels=1
 momentum=0.9
 decay=0.0005
 
-learning_rate=0.1
+learning_rate=0.01
 policy=poly
 power=4
-max_batches=400000
+max_batches=100000
 
 [convolutional]
-filters=192
+filters=256
 size=3
 stride=1
 pad=1
@@ -21,7 +21,7 @@ activation=relu
 batch_normalize=1
 
 [convolutional]
-filters=192
+filters=256
 size=3
 stride=1
 pad=1
@@ -29,7 +29,7 @@ activation=relu
 batch_normalize=1
 
 [convolutional]
-filters=192
+filters=256
 size=3
 stride=1
 pad=1
@@ -37,7 +37,7 @@ activation=relu
 batch_normalize=1
 
 [convolutional]
-filters=192
+filters=256
 size=3
 stride=1
 pad=1
@@ -45,7 +45,7 @@ activation=relu
 batch_normalize=1
 
 [convolutional]
-filters=192
+filters=256
 size=3
 stride=1
 pad=1
@@ -53,7 +53,7 @@ activation=relu
 batch_normalize=1
 
 [convolutional]
-filters=192
+filters=256
 size=3
 stride=1
 pad=1
@@ -61,7 +61,7 @@ activation=relu
 batch_normalize=1
 
 [convolutional]
-filters=192
+filters=256
 size=3
 stride=1
 pad=1
@@ -69,7 +69,7 @@ activation=relu
 batch_normalize=1
 
 [convolutional]
-filters=192
+filters=256
 size=3
 stride=1
 pad=1
@@ -77,7 +77,7 @@ activation=relu
 batch_normalize=1
 
 [convolutional]
-filters=192
+filters=256
 size=3
 stride=1
 pad=1
@@ -85,7 +85,7 @@ activation=relu
 batch_normalize=1
 
 [convolutional]
-filters=192
+filters=256
 size=3
 stride=1
 pad=1
@@ -93,7 +93,7 @@ activation=relu
 batch_normalize=1
 
 [convolutional]
-filters=192
+filters=256
 size=3
 stride=1
 pad=1
@@ -101,7 +101,7 @@ activation=relu
 batch_normalize=1
 
 [convolutional]
-filters=192
+filters=256
 size=3
 stride=1
 pad=1
@@ -109,14 +109,13 @@ activation=relu
 batch_normalize=1
 
 [convolutional]
-filters=192
+filters=256
 size=3
 stride=1
 pad=1
 activation=relu
 batch_normalize=1
 
-
 [convolutional]
 filters=1
 size=1
@@ -124,8 +123,10 @@ stride=1
 pad=1
 activation=linear
 
+[reorg]
+extra=1
+stride=1
+
 [softmax]
 
-[cost]
-type=sse
 
diff --git a/image.darknet/inst/include/darknet/cfg/gru.cfg b/image.darknet/inst/include/darknet/cfg/gru.cfg
index f9a0699..6064221 100644
--- a/image.darknet/inst/include/darknet/cfg/gru.cfg
+++ b/image.darknet/inst/include/darknet/cfg/gru.cfg
@@ -1,27 +1,25 @@
 [net]
-subdivisions=1
 inputs=256
-batch = 1
 momentum=0.9
-decay=0.001
+decay=0.0
+subdivisions=1
+batch = 1
 time_steps=1
-learning_rate=0.5
+learning_rate=.002
+adam=1
 
-policy=poly
+policy=constant
 power=4
-max_batches=2000
+max_batches=1000000
 
 [gru]
-batch_normalize=1
-output = 1024
+output = 256
 
 [gru]
-batch_normalize=1
-output = 1024
+output = 256
 
 [gru]
-batch_normalize=1
-output = 1024
+output = 256
 
 [connected]
 output=256
@@ -29,6 +27,3 @@ activation=linear
 
 [softmax]
 
-[cost]
-type=sse
-
diff --git a/image.darknet/inst/include/darknet/cfg/imagenet22k.dataset b/image.darknet/inst/include/darknet/cfg/imagenet22k.dataset
index 920785d..e25ef00 100644
--- a/image.darknet/inst/include/darknet/cfg/imagenet22k.dataset
+++ b/image.darknet/inst/include/darknet/cfg/imagenet22k.dataset
@@ -1,6 +1,7 @@
 classes=21842
 train  = /data/imagenet/imagenet22k.train.list
 valid  = /data/imagenet/imagenet22k.valid.list
+#valid  = /data/imagenet/imagenet1k.valid.list
 backup = /home/pjreddie/backup/
 labels = data/imagenet.labels.list
 names  = data/imagenet.shortnames.list
diff --git a/image.darknet/inst/include/darknet/cfg/imagenet9k.hierarchy.dataset b/image.darknet/inst/include/darknet/cfg/imagenet9k.hierarchy.dataset
new file mode 100644
index 0000000..41fb71b
--- /dev/null
+++ b/image.darknet/inst/include/darknet/cfg/imagenet9k.hierarchy.dataset
@@ -0,0 +1,9 @@
+classes=9418
+train  = data/9k.train.list
+valid  = /data/imagenet/imagenet1k.valid.list
+leaves = data/imagenet1k.labels
+backup = /home/pjreddie/backup/
+labels = data/9k.labels
+names  = data/9k.names
+top=5
+
diff --git a/image.darknet/inst/include/darknet/cfg/openimages.data b/image.darknet/inst/include/darknet/cfg/openimages.data
new file mode 100644
index 0000000..fa80e5a
--- /dev/null
+++ b/image.darknet/inst/include/darknet/cfg/openimages.data
@@ -0,0 +1,8 @@
+classes= 601
+train  = /home/pjreddie/data/openimsv4/openimages.train.list
+#valid  = coco_testdev
+valid = data/coco_val_5k.list
+names = data/openimages.names
+backup = /home/pjreddie/backup/
+eval=coco
+
diff --git a/image.darknet/inst/include/darknet/cfg/resnet101.cfg b/image.darknet/inst/include/darknet/cfg/resnet101.cfg
new file mode 100644
index 0000000..de45882
--- /dev/null
+++ b/image.darknet/inst/include/darknet/cfg/resnet101.cfg
@@ -0,0 +1,990 @@
+[net]
+# Training
+# batch=128
+# subdivisions=2
+
+# Testing
+batch=1
+subdivisions=1
+
+height=256
+width=256
+channels=3
+min_crop=128
+max_crop=448
+
+burn_in=1000
+learning_rate=0.1
+policy=poly
+power=4
+max_batches=800000
+momentum=0.9
+decay=0.0005
+
+angle=7
+hue=.1
+saturation=.75
+exposure=.75
+aspect=.75
+
+
+
+[convolutional]
+batch_normalize=1
+filters=64
+size=7
+stride=2
+pad=1
+activation=leaky
+
+[maxpool]
+size=2
+stride=2
+
+[convolutional]
+batch_normalize=1
+filters=64
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=64
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=64
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=64
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=64
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=64
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=3
+stride=2
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+
+# Conv 4
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=2
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+#Conv 5
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=2
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+
+
+
+
+
+[convolutional]
+filters=1000
+size=1
+stride=1
+pad=1
+activation=linear
+
+[avgpool]
+
+[softmax]
+groups=1
+
+[cost]
+type=sse
+
diff --git a/image.darknet/inst/include/darknet/cfg/msr_152.cfg b/image.darknet/inst/include/darknet/cfg/resnet152.cfg
similarity index 92%
rename from image.darknet/inst/include/darknet/cfg/msr_152.cfg
rename to image.darknet/inst/include/darknet/cfg/resnet152.cfg
index b19c999..e8e3297 100644
--- a/image.darknet/inst/include/darknet/cfg/msr_152.cfg
+++ b/image.darknet/inst/include/darknet/cfg/resnet152.cfg
@@ -1,26 +1,30 @@
 [net]
-batch=128
-subdivisions=8
+# Training
+# batch=128
+# subdivisions=8
+
+# Testing
+batch=1
+subdivisions=1
+
 height=256
 width=256
+max_crop=448
 channels=3
 momentum=0.9
-decay=0.0001
+decay=0.0005
 
+burn_in=1000
 learning_rate=0.1
 policy=poly
 power=4
-max_batches=500000
-
-[crop]
-crop_height=224
-crop_width=224
-flip=1
-saturation=1
-exposure=1
-angle=0
+max_batches=1600000
 
-##### Conv 1 #####
+angle=7
+hue=.1
+saturation=.75
+exposure=.75
+aspect=.75
 
 [convolutional]
 batch_normalize=1
@@ -31,13 +35,9 @@ pad=1
 activation=leaky
 
 [maxpool]
-size=3
+size=2
 stride=2
 
-
-##### Conv 2_x #####
-
-
 [convolutional]
 batch_normalize=1
 filters=64
@@ -62,19 +62,8 @@ stride=1
 pad=1
 activation=linear
 
-[route]
-layers=-4
-
-[convolutional]
-batch_normalize=1
-size=1
-stride=1
-pad=1
-activation=linear
-filters=256
-
 [shortcut]
-from = -3
+from=-4
 activation=leaky
 
 [convolutional]
@@ -102,8 +91,7 @@ pad=1
 activation=linear
 
 [shortcut]
-from = -4
-
+from=-4
 activation=leaky
 
 [convolutional]
@@ -131,13 +119,9 @@ pad=1
 activation=linear
 
 [shortcut]
-from = -4
-
+from=-4
 activation=leaky
 
-
-##### Conv 3_x #####
-
 [convolutional]
 batch_normalize=1
 filters=128
@@ -162,23 +146,10 @@ stride=1
 pad=1
 activation=linear
 
-
-[route]
-layers=-4
-
-[convolutional]
-batch_normalize=1
-size=1
-stride=2
-pad=1
-activation=linear
-filters=512
-
 [shortcut]
-from = -3
+from=-4
 activation=leaky
 
-
 [convolutional]
 batch_normalize=1
 filters=128
@@ -204,11 +175,9 @@ pad=1
 activation=linear
 
 [shortcut]
-from = -4
-
+from=-4
 activation=leaky
 
-
 [convolutional]
 batch_normalize=1
 filters=128
@@ -234,11 +203,9 @@ pad=1
 activation=linear
 
 [shortcut]
-from = -4
-
+from=-4
 activation=leaky
 
-
 [convolutional]
 batch_normalize=1
 filters=128
@@ -264,11 +231,9 @@ pad=1
 activation=linear
 
 [shortcut]
-from = -4
-
+from=-4
 activation=leaky
 
-
 [convolutional]
 batch_normalize=1
 filters=128
@@ -294,11 +259,9 @@ pad=1
 activation=linear
 
 [shortcut]
-from = -4
-
+from=-4
 activation=leaky
 
-
 [convolutional]
 batch_normalize=1
 filters=128
@@ -324,11 +287,9 @@ pad=1
 activation=linear
 
 [shortcut]
-from = -4
-
+from=-4
 activation=leaky
 
-
 [convolutional]
 batch_normalize=1
 filters=128
@@ -354,11 +315,9 @@ pad=1
 activation=linear
 
 [shortcut]
-from = -4
-
+from=-4
 activation=leaky
 
-
 [convolutional]
 batch_normalize=1
 filters=128
@@ -384,14 +343,11 @@ pad=1
 activation=linear
 
 [shortcut]
-from = -4
-
+from=-4
 activation=leaky
 
 
-
-##### Conv 4_x #####
-
+# Conv 4
 [convolutional]
 batch_normalize=1
 filters=256
@@ -416,23 +372,10 @@ stride=1
 pad=1
 activation=linear
 
-
-[route]
-layers=-4
-
-[convolutional]
-batch_normalize=1
-size=1
-stride=2
-pad=1
-activation=linear
-filters=1024
-
 [shortcut]
-from = -3
+from=-4
 activation=leaky
 
-
 [convolutional]
 batch_normalize=1
 filters=256
@@ -458,11 +401,9 @@ pad=1
 activation=linear
 
 [shortcut]
-from = -4
-
+from=-4
 activation=leaky
 
-
 [convolutional]
 batch_normalize=1
 filters=256
@@ -488,11 +429,9 @@ pad=1
 activation=linear
 
 [shortcut]
-from = -4
-
+from=-4
 activation=leaky
 
-
 [convolutional]
 batch_normalize=1
 filters=256
@@ -518,11 +457,9 @@ pad=1
 activation=linear
 
 [shortcut]
-from = -4
-
+from=-4
 activation=leaky
 
-
 [convolutional]
 batch_normalize=1
 filters=256
@@ -548,11 +485,9 @@ pad=1
 activation=linear
 
 [shortcut]
-from = -4
-
+from=-4
 activation=leaky
 
-
 [convolutional]
 batch_normalize=1
 filters=256
@@ -578,11 +513,9 @@ pad=1
 activation=linear
 
 [shortcut]
-from = -4
-
+from=-4
 activation=leaky
 
-
 [convolutional]
 batch_normalize=1
 filters=256
@@ -608,11 +541,9 @@ pad=1
 activation=linear
 
 [shortcut]
-from = -4
-
+from=-4
 activation=leaky
 
-
 [convolutional]
 batch_normalize=1
 filters=256
@@ -638,11 +569,9 @@ pad=1
 activation=linear
 
 [shortcut]
-from = -4
-
+from=-4
 activation=leaky
 
-
 [convolutional]
 batch_normalize=1
 filters=256
@@ -668,11 +597,9 @@ pad=1
 activation=linear
 
 [shortcut]
-from = -4
-
+from=-4
 activation=leaky
 
-
 [convolutional]
 batch_normalize=1
 filters=256
@@ -698,11 +625,9 @@ pad=1
 activation=linear
 
 [shortcut]
-from = -4
-
+from=-4
 activation=leaky
 
-
 [convolutional]
 batch_normalize=1
 filters=256
@@ -728,11 +653,9 @@ pad=1
 activation=linear
 
 [shortcut]
-from = -4
-
+from=-4
 activation=leaky
 
-
 [convolutional]
 batch_normalize=1
 filters=256
@@ -758,11 +681,9 @@ pad=1
 activation=linear
 
 [shortcut]
-from = -4
-
+from=-4
 activation=leaky
 
-
 [convolutional]
 batch_normalize=1
 filters=256
@@ -788,11 +709,9 @@ pad=1
 activation=linear
 
 [shortcut]
-from = -4
-
+from=-4
 activation=leaky
 
-
 [convolutional]
 batch_normalize=1
 filters=256
@@ -818,11 +737,9 @@ pad=1
 activation=linear
 
 [shortcut]
-from = -4
-
+from=-4
 activation=leaky
 
-
 [convolutional]
 batch_normalize=1
 filters=256
@@ -848,11 +765,9 @@ pad=1
 activation=linear
 
 [shortcut]
-from = -4
-
+from=-4
 activation=leaky
 
-
 [convolutional]
 batch_normalize=1
 filters=256
@@ -878,11 +793,9 @@ pad=1
 activation=linear
 
 [shortcut]
-from = -4
-
+from=-4
 activation=leaky
 
-
 [convolutional]
 batch_normalize=1
 filters=256
@@ -908,11 +821,9 @@ pad=1
 activation=linear
 
 [shortcut]
-from = -4
-
+from=-4
 activation=leaky
 
-
 [convolutional]
 batch_normalize=1
 filters=256
@@ -938,11 +849,9 @@ pad=1
 activation=linear
 
 [shortcut]
-from = -4
-
+from=-4
 activation=leaky
 
-
 [convolutional]
 batch_normalize=1
 filters=256
@@ -968,11 +877,9 @@ pad=1
 activation=linear
 
 [shortcut]
-from = -4
-
+from=-4
 activation=leaky
 
-
 [convolutional]
 batch_normalize=1
 filters=256
@@ -998,11 +905,9 @@ pad=1
 activation=linear
 
 [shortcut]
-from = -4
-
+from=-4
 activation=leaky
 
-
 [convolutional]
 batch_normalize=1
 filters=256
@@ -1028,11 +933,9 @@ pad=1
 activation=linear
 
 [shortcut]
-from = -4
-
+from=-4
 activation=leaky
 
-
 [convolutional]
 batch_normalize=1
 filters=256
@@ -1058,11 +961,9 @@ pad=1
 activation=linear
 
 [shortcut]
-from = -4
-
+from=-4
 activation=leaky
 
-
 [convolutional]
 batch_normalize=1
 filters=256
@@ -1088,11 +989,9 @@ pad=1
 activation=linear
 
 [shortcut]
-from = -4
-
+from=-4
 activation=leaky
 
-
 [convolutional]
 batch_normalize=1
 filters=256
@@ -1118,11 +1017,9 @@ pad=1
 activation=linear
 
 [shortcut]
-from = -4
-
+from=-4
 activation=leaky
 
-
 [convolutional]
 batch_normalize=1
 filters=256
@@ -1148,11 +1045,9 @@ pad=1
 activation=linear
 
 [shortcut]
-from = -4
-
+from=-4
 activation=leaky
 
-
 [convolutional]
 batch_normalize=1
 filters=256
@@ -1178,11 +1073,9 @@ pad=1
 activation=linear
 
 [shortcut]
-from = -4
-
+from=-4
 activation=leaky
 
-
 [convolutional]
 batch_normalize=1
 filters=256
@@ -1208,11 +1101,9 @@ pad=1
 activation=linear
 
 [shortcut]
-from = -4
-
+from=-4
 activation=leaky
 
-
 [convolutional]
 batch_normalize=1
 filters=256
@@ -1238,11 +1129,9 @@ pad=1
 activation=linear
 
 [shortcut]
-from = -4
-
+from=-4
 activation=leaky
 
-
 [convolutional]
 batch_normalize=1
 filters=256
@@ -1268,11 +1157,9 @@ pad=1
 activation=linear
 
 [shortcut]
-from = -4
-
+from=-4
 activation=leaky
 
-
 [convolutional]
 batch_normalize=1
 filters=256
@@ -1298,11 +1185,9 @@ pad=1
 activation=linear
 
 [shortcut]
-from = -4
-
+from=-4
 activation=leaky
 
-
 [convolutional]
 batch_normalize=1
 filters=256
@@ -1328,11 +1213,9 @@ pad=1
 activation=linear
 
 [shortcut]
-from = -4
-
+from=-4
 activation=leaky
 
-
 [convolutional]
 batch_normalize=1
 filters=256
@@ -1358,11 +1241,9 @@ pad=1
 activation=linear
 
 [shortcut]
-from = -4
-
+from=-4
 activation=leaky
 
-
 [convolutional]
 batch_normalize=1
 filters=256
@@ -1388,11 +1269,9 @@ pad=1
 activation=linear
 
 [shortcut]
-from = -4
-
+from=-4
 activation=leaky
 
-
 [convolutional]
 batch_normalize=1
 filters=256
@@ -1418,11 +1297,9 @@ pad=1
 activation=linear
 
 [shortcut]
-from = -4
-
+from=-4
 activation=leaky
 
-
 [convolutional]
 batch_normalize=1
 filters=256
@@ -1448,11 +1325,9 @@ pad=1
 activation=linear
 
 [shortcut]
-from = -4
-
+from=-4
 activation=leaky
 
-
 [convolutional]
 batch_normalize=1
 filters=256
@@ -1478,13 +1353,10 @@ pad=1
 activation=linear
 
 [shortcut]
-from = -4
-
+from=-4
 activation=leaky
 
-
-##### Conv 5_x #####
-
+#Conv 5
 [convolutional]
 batch_normalize=1
 filters=512
@@ -1509,23 +1381,10 @@ stride=1
 pad=1
 activation=linear
 
-
-[route]
-layers=-4
-
-[convolutional]
-batch_normalize=1
-size=1
-stride=2
-pad=1
-activation=linear
-filters=2048
-
 [shortcut]
-from = -3
+from=-4
 activation=leaky
 
-
 [convolutional]
 batch_normalize=1
 filters=512
@@ -1551,8 +1410,7 @@ pad=1
 activation=linear
 
 [shortcut]
-from = -4
-
+from=-4
 activation=leaky
 
 [convolutional]
@@ -1580,19 +1438,23 @@ pad=1
 activation=linear
 
 [shortcut]
-from = -4
-
+from=-4
 activation=leaky
 
-[avgpool]
 
-[connected]
-output=1000
-activation=leaky
+
+
+
+
+[convolutional]
+filters=1000
+size=1
+stride=1
+pad=1
+activation=linear
+
+[avgpool]
 
 [softmax]
 groups=1
 
-[cost]
-type=sse
-
diff --git a/image.darknet/inst/include/darknet/cfg/yolov1/yolo-coco.cfg b/image.darknet/inst/include/darknet/cfg/resnet18.cfg
similarity index 57%
rename from image.darknet/inst/include/darknet/cfg/yolov1/yolo-coco.cfg
rename to image.darknet/inst/include/darknet/cfg/resnet18.cfg
index ed3f2d6..275f4bd 100644
--- a/image.darknet/inst/include/darknet/cfg/yolov1/yolo-coco.cfg
+++ b/image.darknet/inst/include/darknet/cfg/resnet18.cfg
@@ -1,21 +1,32 @@
 [net]
-batch=64
-subdivisions=4
-height=448
-width=448
+# Training
+# batch=128
+# subdivisions=1
+
+# Testing
+batch=1
+subdivisions=1
+
+height=256
+width=256
 channels=3
+min_crop=128
+max_crop=448
+
+burn_in=1000
+learning_rate=0.1
+policy=poly
+power=4
+max_batches=800000
 momentum=0.9
 decay=0.0005
 
-hue = .1
+angle=7
+hue=.1
 saturation=.75
 exposure=.75
+aspect=.75
 
-learning_rate=0.0005
-policy=steps
-steps=200,400,600,800,100000,150000
-scales=2.5,2,2,2,.1,.1
-max_batches = 200000
 
 [convolutional]
 batch_normalize=1
@@ -29,29 +40,32 @@ activation=leaky
 size=2
 stride=2
 
+
+# Residual Block
 [convolutional]
 batch_normalize=1
-filters=192
+filters=64
 size=3
 stride=1
 pad=1
 activation=leaky
 
-[maxpool]
-size=2
-stride=2
-
 [convolutional]
 batch_normalize=1
-filters=128
-size=1
+filters=64
+size=3
 stride=1
 pad=1
+activation=linear
+
+[shortcut]
 activation=leaky
+from=-3
 
+# Residual Block
 [convolutional]
 batch_normalize=1
-filters=256
+filters=64
 size=3
 stride=1
 pad=1
@@ -59,51 +73,41 @@ activation=leaky
 
 [convolutional]
 batch_normalize=1
-filters=256
-size=1
+filters=64
+size=3
 stride=1
 pad=1
+activation=linear
+
+[shortcut]
 activation=leaky
+from=-3
 
+# Strided Residual Block
 [convolutional]
 batch_normalize=1
-filters=512
+filters=128
 size=3
-stride=1
-pad=1
-activation=leaky
-
-[maxpool]
-size=2
 stride=2
-
-[convolutional]
-batch_normalize=1
-filters=256
-size=1
-stride=1
 pad=1
 activation=leaky
 
 [convolutional]
 batch_normalize=1
-filters=512
+filters=128
 size=3
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
-[convolutional]
-batch_normalize=1
-filters=256
-size=1
-stride=1
-pad=1
+[shortcut]
 activation=leaky
+from=-3
 
+# Residual Block
 [convolutional]
 batch_normalize=1
-filters=512
+filters=128
 size=3
 stride=1
 pad=1
@@ -111,145 +115,114 @@ activation=leaky
 
 [convolutional]
 batch_normalize=1
-filters=256
-size=1
-stride=1
-pad=1
-activation=leaky
-
-[convolutional]
-batch_normalize=1
-filters=512
+filters=128
 size=3
 stride=1
 pad=1
+activation=linear
+
+[shortcut]
 activation=leaky
+from=-3
 
+
+# Strided Residual Block
 [convolutional]
 batch_normalize=1
 filters=256
-size=1
-stride=1
+size=3
+stride=2
 pad=1
 activation=leaky
 
 [convolutional]
 batch_normalize=1
-filters=512
+filters=256
 size=3
 stride=1
 pad=1
+activation=linear
+
+[shortcut]
 activation=leaky
+from=-3
 
+# Residual Block
 [convolutional]
 batch_normalize=1
-filters=512
-size=1
+filters=256
+size=3
 stride=1
 pad=1
 activation=leaky
 
 [convolutional]
 batch_normalize=1
-filters=1024
+filters=256
 size=3
 stride=1
 pad=1
+activation=linear
+
+[shortcut]
 activation=leaky
+from=-3
 
-[maxpool]
-size=2
-stride=2
 
+# Strided Residual Block
 [convolutional]
 batch_normalize=1
 filters=512
-size=1
-stride=1
+size=3
+stride=2
 pad=1
 activation=leaky
 
 [convolutional]
 batch_normalize=1
-filters=1024
+filters=512
 size=3
 stride=1
 pad=1
+activation=linear
+
+[shortcut]
 activation=leaky
+from=-3
 
+# Residual Block
 [convolutional]
 batch_normalize=1
 filters=512
-size=1
+size=3
 stride=1
 pad=1
 activation=leaky
 
 [convolutional]
 batch_normalize=1
-filters=1024
+filters=512
 size=3
 stride=1
 pad=1
+activation=linear
+
+[shortcut]
 activation=leaky
+from=-3
 
-#######
 
-[convolutional]
-batch_normalize=1
-size=3
-stride=1
-pad=1
-filters=1024
-activation=leaky
 
-[convolutional]
-batch_normalize=1
-size=3
-stride=2
-pad=1
-filters=1024
-activation=leaky
 
-[convolutional]
-batch_normalize=1
-size=3
-stride=1
-pad=1
-filters=1024
-activation=leaky
+[avgpool]
 
 [convolutional]
-batch_normalize=1
-size=3
-stride=1
-pad=1
-filters=1024
-activation=leaky
-
-[local]
-size=3
+filters=1000
+size=1
 stride=1
 pad=1
-filters=256
-activation=leaky
-
-[connected]
-output= 4655
 activation=linear
 
-[detection]
-classes=80
-coords=4
-rescore=1
-side=7
-num=3
-softmax=0
-sqrt=1
-jitter=.2
-
-object_scale=1
-noobject_scale=.5
-class_scale=1
-coord_scale=5
+[softmax]
+groups=1
 
diff --git a/image.darknet/inst/include/darknet/cfg/msr_34.cfg b/image.darknet/inst/include/darknet/cfg/resnet34.cfg
similarity index 77%
rename from image.darknet/inst/include/darknet/cfg/msr_34.cfg
rename to image.darknet/inst/include/darknet/cfg/resnet34.cfg
index 5ae23cf..9f68f09 100644
--- a/image.darknet/inst/include/darknet/cfg/msr_34.cfg
+++ b/image.darknet/inst/include/darknet/cfg/resnet34.cfg
@@ -1,24 +1,32 @@
 [net]
-batch=128
+# Training
+# batch=128
+# subdivisions=2
+
+# Testing
+batch=1
 subdivisions=1
+
 height=256
 width=256
 channels=3
-momentum=0.9
-decay=0.0005
+min_crop=128
+max_crop=448
 
+burn_in=1000
 learning_rate=0.1
 policy=poly
 power=4
-max_batches=500000
+max_batches=800000
+momentum=0.9
+decay=0.0005
+
+angle=7
+hue=.1
+saturation=.75
+exposure=.75
+aspect=.75
 
-[crop]
-crop_height=224
-crop_width=224
-flip=1
-saturation=1
-exposure=1
-angle=0
 
 [convolutional]
 batch_normalize=1
@@ -29,9 +37,10 @@ pad=1
 activation=leaky
 
 [maxpool]
-size=3
+size=2
 stride=2
 
+# Residual Block
 [convolutional]
 batch_normalize=1
 filters=64
@@ -46,11 +55,13 @@ filters=64
 size=3
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
-from = -3
+activation=leaky
+from=-3
 
+# Residual Block
 [convolutional]
 batch_normalize=1
 filters=64
@@ -65,11 +76,13 @@ filters=64
 size=3
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
-from = -3
+activation=leaky
+from=-3
 
+# Residual Block
 [convolutional]
 batch_normalize=1
 filters=64
@@ -84,14 +97,13 @@ filters=64
 size=3
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
-from = -3
-
-
-
+activation=leaky
+from=-3
 
+# Strided Residual Block
 [convolutional]
 batch_normalize=1
 filters=128
@@ -106,11 +118,13 @@ filters=128
 size=3
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
-from = -3
+activation=leaky
+from=-3
 
+# Residual Block
 [convolutional]
 batch_normalize=1
 filters=128
@@ -125,11 +139,13 @@ filters=128
 size=3
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
-from = -3
+activation=leaky
+from=-3
 
+# Residual Block
 [convolutional]
 batch_normalize=1
 filters=128
@@ -144,11 +160,13 @@ filters=128
 size=3
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
-from = -3
+activation=leaky
+from=-3
 
+# Residual Block
 [convolutional]
 batch_normalize=1
 filters=128
@@ -163,16 +181,13 @@ filters=128
 size=3
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
-from = -3
-
-
-
-
-
+activation=leaky
+from=-3
 
+# Strided Residual Block
 [convolutional]
 batch_normalize=1
 filters=256
@@ -187,11 +202,13 @@ filters=256
 size=3
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
-from = -3
+activation=leaky
+from=-3
 
+# Residual Block
 [convolutional]
 batch_normalize=1
 filters=256
@@ -206,11 +223,13 @@ filters=256
 size=3
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
-from = -3
+activation=leaky
+from=-3
 
+# Residual Block
 [convolutional]
 batch_normalize=1
 filters=256
@@ -225,11 +244,13 @@ filters=256
 size=3
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
-from = -3
+activation=leaky
+from=-3
 
+# Residual Block
 [convolutional]
 batch_normalize=1
 filters=256
@@ -244,11 +265,13 @@ filters=256
 size=3
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
-from = -3
+activation=leaky
+from=-3
 
+# Residual Block
 [convolutional]
 batch_normalize=1
 filters=256
@@ -263,11 +286,13 @@ filters=256
 size=3
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
-from = -3
+activation=leaky
+from=-3
 
+# Residual Block
 [convolutional]
 batch_normalize=1
 filters=256
@@ -282,19 +307,13 @@ filters=256
 size=3
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
-from = -3
-
-
-
-
-
-
-
-
+activation=leaky
+from=-3
 
+# Residual Block
 [convolutional]
 batch_normalize=1
 filters=512
@@ -309,11 +328,13 @@ filters=512
 size=3
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
-from = -3
+activation=leaky
+from=-3
 
+# Residual Block
 [convolutional]
 batch_normalize=1
 filters=512
@@ -328,11 +349,13 @@ filters=512
 size=3
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
-from = -3
+activation=leaky
+from=-3
 
+# Residual Block
 [convolutional]
 batch_normalize=1
 filters=512
@@ -347,20 +370,23 @@ filters=512
 size=3
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
-from = -3
+activation=leaky
+from=-3
+
+
 
 [avgpool]
 
-[connected]
-output=1000
-activation=leaky
+[convolutional]
+filters=1000
+size=1
+stride=1
+pad=1
+activation=linear
 
 [softmax]
 groups=1
 
-[cost]
-type=sse
-
diff --git a/image.darknet/inst/include/darknet/cfg/resnet50.cfg b/image.darknet/inst/include/darknet/cfg/resnet50.cfg
new file mode 100644
index 0000000..d0d7c51
--- /dev/null
+++ b/image.darknet/inst/include/darknet/cfg/resnet50.cfg
@@ -0,0 +1,510 @@
+[net]
+# Training
+# batch=128
+# subdivisions=4
+
+# Testing
+batch=1
+subdivisions=1
+
+height=256
+width=256
+channels=3
+min_crop=128
+max_crop=448
+
+burn_in=1000
+learning_rate=0.1
+policy=poly
+power=4
+max_batches=800000
+momentum=0.9
+decay=0.0005
+
+angle=7
+hue=.1
+saturation=.75
+exposure=.75
+aspect=.75
+
+
+[convolutional]
+batch_normalize=1
+filters=64
+size=7
+stride=2
+pad=1
+activation=leaky
+
+[maxpool]
+size=2
+stride=2
+
+[convolutional]
+batch_normalize=1
+filters=64
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=64
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=64
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=64
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=64
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=64
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=3
+stride=2
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+
+# Conv 4
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=2
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+#Conv 5
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=2
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+
+
+
+
+[avgpool]
+
+[convolutional]
+filters=1000
+size=1
+stride=1
+pad=1
+activation=linear
+
+[softmax]
+groups=1
+
+
diff --git a/image.darknet/inst/include/darknet/cfg/resnext101-32x4d.cfg b/image.darknet/inst/include/darknet/cfg/resnext101-32x4d.cfg
new file mode 100644
index 0000000..8538ccc
--- /dev/null
+++ b/image.darknet/inst/include/darknet/cfg/resnext101-32x4d.cfg
@@ -0,0 +1,1053 @@
+[net]
+# Training
+# batch=128
+# subdivisions=8
+
+# Testing
+batch=1
+subdivisions=1
+
+height=256
+width=256
+channels=3
+min_crop=128
+max_crop=448
+
+burn_in=1000
+learning_rate=0.1
+policy=poly
+power=4
+max_batches=800000
+momentum=0.9
+decay=0.0005
+
+angle=7
+hue=.1
+saturation=.75
+exposure=.75
+aspect=.75
+
+
+
+[convolutional]
+batch_normalize=1
+filters=64
+size=7
+stride=2
+pad=1
+activation=leaky
+
+[maxpool]
+size=2
+stride=2
+
+[convolutional]
+batch_normalize=1
+filters=64
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=64
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=64
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=64
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=64
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=64
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=128
+size=3
+stride=2
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=128
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=128
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=128
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=2
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=512
+size=3
+stride=2
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=4096
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=512
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=4096
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=512
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=4096
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+
+
+[avgpool]
+
+[convolutional]
+filters=1000
+size=1
+stride=1
+pad=1
+activation=linear
+
+[softmax]
+groups=1
+
diff --git a/image.darknet/inst/include/darknet/cfg/resnext152-32x4d.cfg b/image.darknet/inst/include/darknet/cfg/resnext152-32x4d.cfg
new file mode 100644
index 0000000..48279fd
--- /dev/null
+++ b/image.darknet/inst/include/darknet/cfg/resnext152-32x4d.cfg
@@ -0,0 +1,1562 @@
+[net]
+# Training
+# batch=128
+# subdivisions=16
+
+# Testing
+batch=1
+subdivisions=1
+
+height=256
+width=256
+channels=3
+min_crop=128
+max_crop=448
+
+burn_in=1000
+learning_rate=0.1
+policy=poly
+power=4
+max_batches=800000
+momentum=0.9
+decay=0.0005
+
+angle=7
+hue=.1
+saturation=.75
+exposure=.75
+aspect=.75
+
+
+[convolutional]
+batch_normalize=1
+filters=64
+size=7
+stride=2
+pad=1
+activation=leaky
+
+[maxpool]
+size=2
+stride=2
+
+[convolutional]
+batch_normalize=1
+filters=64
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=64
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=64
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=64
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=64
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=64
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=128
+size=3
+stride=2
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=128
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=128
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=128
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=128
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=128
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=128
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=128
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=2
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=512
+size=3
+stride=2
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=4096
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=512
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=4096
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+groups = 32
+batch_normalize=1
+filters=512
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=4096
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+    
+
+
+[avgpool]
+
+[convolutional]
+filters=1000
+size=1
+stride=1
+pad=1
+activation=linear
+
+[softmax]
+groups=1
+
diff --git a/image.darknet/inst/include/darknet/cfg/resnext50.cfg b/image.darknet/inst/include/darknet/cfg/resnext50.cfg
new file mode 100644
index 0000000..12aebdf
--- /dev/null
+++ b/image.darknet/inst/include/darknet/cfg/resnext50.cfg
@@ -0,0 +1,523 @@
+[net]
+# Training
+# batch=128
+# subdivisions=4
+
+# Testing
+batch=1
+subdivisions=1
+
+height=256
+width=256
+channels=3
+min_crop=128
+max_crop=448
+
+burn_in=1000
+learning_rate=0.1
+policy=poly
+power=4
+max_batches=800000
+momentum=0.9
+decay=0.0005
+
+angle=7
+hue=.1
+saturation=.75
+exposure=.75
+aspect=.75
+
+
+
+[convolutional]
+batch_normalize=1
+filters=64
+size=7
+stride=2
+pad=1
+activation=leaky
+
+[maxpool]
+size=2
+stride=2
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=3
+groups=32
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=3
+groups=32
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=3
+groups=32
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+groups=32
+stride=2
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+groups=32
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+groups=32
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+groups=32
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+
+# Conv 4
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+groups=32
+stride=2
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+groups=32
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+groups=32
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+groups=32
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+groups=32
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+groups=32
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+# Conv 5
+[convolutional]
+batch_normalize=1
+filters=1024
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=3
+groups=32
+stride=2
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=3
+groups=32
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=3
+groups=32
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=2048
+size=1
+stride=1
+pad=1
+activation=linear
+
+[shortcut]
+from=-4
+activation=leaky
+
+[avgpool]
+
+[convolutional]
+filters=1000
+size=1
+stride=1
+pad=1
+activation=linear
+
+[softmax]
+groups=1
+
+
diff --git a/image.darknet/inst/include/darknet/cfg/rnn.cfg b/image.darknet/inst/include/darknet/cfg/rnn.cfg
index 68c032d..61b202f 100644
--- a/image.darknet/inst/include/darknet/cfg/rnn.cfg
+++ b/image.darknet/inst/include/darknet/cfg/rnn.cfg
@@ -35,6 +35,4 @@ activation=leaky
 
 [softmax]
 
-[cost]
-type=sse
 
diff --git a/image.darknet/inst/include/darknet/cfg/rnn.train.cfg b/image.darknet/inst/include/darknet/cfg/rnn.train.cfg
index 9139757..b974899 100644
--- a/image.darknet/inst/include/darknet/cfg/rnn.train.cfg
+++ b/image.darknet/inst/include/darknet/cfg/rnn.train.cfg
@@ -35,6 +35,4 @@ activation=leaky
 
 [softmax]
 
-[cost]
-type=sse
 
diff --git a/image.darknet/inst/include/darknet/cfg/strided.cfg b/image.darknet/inst/include/darknet/cfg/strided.cfg
index a52700b..2f74508 100644
--- a/image.darknet/inst/include/darknet/cfg/strided.cfg
+++ b/image.darknet/inst/include/darknet/cfg/strided.cfg
@@ -180,6 +180,3 @@ activation=ramp
 
 [softmax]
 
-[cost]
-type=sse
-
diff --git a/image.darknet/inst/include/darknet/cfg/tiny.cfg b/image.darknet/inst/include/darknet/cfg/tiny.cfg
index 99c2603..f97327c 100644
--- a/image.darknet/inst/include/darknet/cfg/tiny.cfg
+++ b/image.darknet/inst/include/darknet/cfg/tiny.cfg
@@ -1,6 +1,10 @@
 [net]
+# Train
 batch=128
 subdivisions=1
+# Test
+# batch=1
+# subdivisions=1
 height=224
 width=224
 channels=3
@@ -167,6 +171,4 @@ activation=linear
 [softmax]
 groups=1
 
-[cost]
-type=sse
 
diff --git a/image.darknet/inst/include/darknet/cfg/vgg-16.cfg b/image.darknet/inst/include/darknet/cfg/vgg-16.cfg
index 2b6f702..c73b17b 100644
--- a/image.darknet/inst/include/darknet/cfg/vgg-16.cfg
+++ b/image.darknet/inst/include/darknet/cfg/vgg-16.cfg
@@ -1,6 +1,12 @@
 [net]
-batch=128
-subdivisions=4
+# Training
+# batch=128
+# subdivisions=4
+
+# Testing
+batch=1
+subdivisions=1
+
 height=256
 width=256
 channels=3
@@ -148,6 +154,4 @@ activation=linear
 [softmax]
 groups=1
 
-[cost]
-type=sse
 
diff --git a/image.darknet/inst/include/darknet/cfg/yolo9000.cfg b/image.darknet/inst/include/darknet/cfg/yolo9000.cfg
index 981491d..e745f78 100644
--- a/image.darknet/inst/include/darknet/cfg/yolo9000.cfg
+++ b/image.darknet/inst/include/darknet/cfg/yolo9000.cfg
@@ -1,17 +1,24 @@
 [net]
+# Testing
 batch=1
 subdivisions=1
-height=416
-width=416
+# Training
+# batch=64
+# subdivisions=8
+batch=1
+subdivisions=1
+height=544
+width=544
 channels=3
 momentum=0.9
 decay=0.0005
 
-learning_rate=0.00001
-max_batches = 242200
+learning_rate=0.001
+burn_in=1000
+max_batches = 500200
 policy=steps
-steps=500,200000,240000
-scales=10,.1,.1
+steps=400000,450000
+scales=.1,.1
 
 hue=.1
 saturation=.75
diff --git a/image.darknet/inst/include/darknet/cfg/yolov1/tiny-yolo.cfg b/image.darknet/inst/include/darknet/cfg/yolov1-tiny.cfg
similarity index 94%
rename from image.darknet/inst/include/darknet/cfg/yolov1/tiny-yolo.cfg
rename to image.darknet/inst/include/darknet/cfg/yolov1-tiny.cfg
index ac4b346..a5e7b49 100644
--- a/image.darknet/inst/include/darknet/cfg/yolov1/tiny-yolo.cfg
+++ b/image.darknet/inst/include/darknet/cfg/yolov1-tiny.cfg
@@ -1,6 +1,10 @@
 [net]
-batch=64
-subdivisions=2
+# Testing
+batch=1
+subdivisions=1
+# Training
+# batch=64
+# subdivisions=8
 height=448
 width=448
 channels=3
diff --git a/image.darknet/inst/include/darknet/cfg/yolov1/yolo.cfg b/image.darknet/inst/include/darknet/cfg/yolov1.cfg
similarity index 98%
rename from image.darknet/inst/include/darknet/cfg/yolov1/yolo.cfg
rename to image.darknet/inst/include/darknet/cfg/yolov1.cfg
index c4f415c..06cf6e6 100644
--- a/image.darknet/inst/include/darknet/cfg/yolov1/yolo.cfg
+++ b/image.darknet/inst/include/darknet/cfg/yolov1.cfg
@@ -1,6 +1,10 @@
 [net]
+# Testing
 batch=1
 subdivisions=1
+# Training
+# batch=64
+# subdivisions=8
 height=448
 width=448
 channels=3
diff --git a/image.darknet/inst/include/darknet/cfg/yolov1/tiny-coco.cfg b/image.darknet/inst/include/darknet/cfg/yolov1/tiny-coco.cfg
deleted file mode 100644
index e58c73a..0000000
--- a/image.darknet/inst/include/darknet/cfg/yolov1/tiny-coco.cfg
+++ /dev/null
@@ -1,125 +0,0 @@
-[net]
-batch=64
-subdivisions=2
-height=448
-width=448
-channels=3
-momentum=0.9
-decay=0.0005
-
-hue = .1
-saturation=.75
-exposure=.75
-
-learning_rate=0.0005
-policy=steps
-steps=200,400,600,800,100000,150000
-scales=2.5,2,2,2,.1,.1
-max_batches = 200000
-
-[convolutional]
-batch_normalize=1
-filters=16
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[maxpool]
-size=2
-stride=2
-
-[convolutional]
-batch_normalize=1
-filters=32
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[maxpool]
-size=2
-stride=2
-
-[convolutional]
-batch_normalize=1
-filters=64
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[maxpool]
-size=2
-stride=2
-
-[convolutional]
-batch_normalize=1
-filters=128
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[maxpool]
-size=2
-stride=2
-
-[convolutional]
-batch_normalize=1
-filters=256
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[maxpool]
-size=2
-stride=2
-
-[convolutional]
-batch_normalize=1
-filters=512
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[maxpool]
-size=2
-stride=2
-
-[convolutional]
-batch_normalize=1
-size=3
-stride=1
-pad=1
-filters=1024
-activation=leaky
-
-[convolutional]
-batch_normalize=1
-size=3
-stride=1
-pad=1
-filters=256
-activation=leaky
-
-[connected]
-output= 4655
-activation=linear
-
-[detection]
-classes=80
-coords=4
-rescore=1
-side=7
-num=3
-softmax=0
-sqrt=1
-jitter=.2
-
-object_scale=1
-noobject_scale=.5
-class_scale=1
-coord_scale=5
diff --git a/image.darknet/inst/include/darknet/cfg/yolov1/xyolo.test.cfg b/image.darknet/inst/include/darknet/cfg/yolov1/xyolo.test.cfg
deleted file mode 100644
index 5f3e6f4..0000000
--- a/image.darknet/inst/include/darknet/cfg/yolov1/xyolo.test.cfg
+++ /dev/null
@@ -1,143 +0,0 @@
-[net]
-batch=1
-subdivisions=1
-height=448
-width=448
-channels=3
-momentum=0.9
-decay=0.0005
-
-learning_rate=0.0001
-policy=steps
-steps=20,40,60,80,20000,30000
-scales=5,5,2,2,.1,.1
-max_batches = 40000
-
-[convolutional]
-batch_normalize=1
-filters=16
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[maxpool]
-size=2
-stride=2
-
-[batchnorm]
-
-[convolutional]
-xnor = 1
-batch_normalize=1
-filters=32
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[maxpool]
-size=2
-stride=2
-
-[batchnorm]
-
-[convolutional]
-xnor = 1
-batch_normalize=1
-filters=64
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[maxpool]
-size=2
-stride=2
-
-[batchnorm]
-
-[convolutional]
-xnor = 1
-batch_normalize=1
-filters=128
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[maxpool]
-size=2
-stride=2
-
-[batchnorm]
-
-[convolutional]
-xnor = 1
-batch_normalize=1
-filters=256
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[maxpool]
-size=2
-stride=2
-
-[batchnorm]
-
-[convolutional]
-xnor = 1
-batch_normalize=1
-filters=512
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[maxpool]
-size=2
-stride=2
-
-[batchnorm]
-
-[convolutional]
-xnor = 1
-batch_normalize=1
-filters=1024
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[batchnorm]
-
-[convolutional]
-xnor = 1
-batch_normalize=1
-filters=256
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[connected]
-output= 1470
-activation=linear
-
-[detection]
-classes=20
-coords=4
-rescore=1
-side=7
-num=2
-softmax=0
-sqrt=1
-jitter=.2
-
-object_scale=1
-noobject_scale=.5
-class_scale=1
-coord_scale=5
-
diff --git a/image.darknet/inst/include/darknet/cfg/yolov1/yolo-small.cfg b/image.darknet/inst/include/darknet/cfg/yolov1/yolo-small.cfg
deleted file mode 100644
index 2a84485..0000000
--- a/image.darknet/inst/include/darknet/cfg/yolov1/yolo-small.cfg
+++ /dev/null
@@ -1,239 +0,0 @@
-[net]
-batch=64
-subdivisions=64
-height=448
-width=448
-channels=3
-momentum=0.9
-decay=0.0005
-
-learning_rate=0.001
-policy=steps
-steps=200,400,600,20000,30000
-scales=2.5,2,2,.1,.1
-max_batches = 40000
-
-[crop]
-crop_width=448
-crop_height=448
-flip=0
-angle=0
-saturation = 1.5
-exposure = 1.5
-
-[convolutional]
-filters=64
-size=7
-stride=2
-pad=1
-activation=leaky
-
-[maxpool]
-size=2
-stride=2
-
-[convolutional]
-filters=192
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[maxpool]
-size=2
-stride=2
-
-[convolutional]
-filters=128
-size=1
-stride=1
-pad=1
-activation=leaky
-
-[convolutional]
-filters=256
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[convolutional]
-filters=256
-size=1
-stride=1
-pad=1
-activation=leaky
-
-[convolutional]
-filters=512
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[maxpool]
-size=2
-stride=2
-
-[convolutional]
-filters=256
-size=1
-stride=1
-pad=1
-activation=leaky
-
-[convolutional]
-filters=512
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[convolutional]
-filters=256
-size=1
-stride=1
-pad=1
-activation=leaky
-
-[convolutional]
-filters=512
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[convolutional]
-filters=256
-size=1
-stride=1
-pad=1
-activation=leaky
-
-[convolutional]
-filters=512
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[convolutional]
-filters=256
-size=1
-stride=1
-pad=1
-activation=leaky
-
-[convolutional]
-filters=512
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[convolutional]
-filters=512
-size=1
-stride=1
-pad=1
-activation=leaky
-
-[convolutional]
-filters=1024
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[maxpool]
-size=2
-stride=2
-
-[convolutional]
-filters=512
-size=1
-stride=1
-pad=1
-activation=leaky
-
-[convolutional]
-filters=1024
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[convolutional]
-filters=512
-size=1
-stride=1
-pad=1
-activation=leaky
-
-[convolutional]
-filters=1024
-size=3
-stride=1
-pad=1
-activation=leaky
-
-#######
-
-[convolutional]
-size=3
-stride=1
-pad=1
-filters=1024
-activation=leaky
-
-[convolutional]
-size=3
-stride=2
-pad=1
-filters=1024
-activation=leaky
-
-[convolutional]
-size=3
-stride=1
-pad=1
-filters=1024
-activation=leaky
-
-[convolutional]
-size=3
-stride=1
-pad=1
-filters=1024
-activation=leaky
-
-[connected]
-output=512
-activation=leaky
-
-[connected]
-output=4096
-activation=leaky
-
-[dropout]
-probability=.5
-
-[connected]
-output= 1470
-activation=linear
-
-[detection]
-classes=20
-coords=4
-rescore=1
-side=7
-num=2
-softmax=0
-sqrt=1
-jitter=.2
-
-object_scale=1
-noobject_scale=.5
-class_scale=1
-coord_scale=5
-
diff --git a/image.darknet/inst/include/darknet/cfg/yolov1/yolo.train.cfg b/image.darknet/inst/include/darknet/cfg/yolov1/yolo.train.cfg
deleted file mode 100644
index 01aeb5e..0000000
--- a/image.darknet/inst/include/darknet/cfg/yolov1/yolo.train.cfg
+++ /dev/null
@@ -1,257 +0,0 @@
-[net]
-batch=64
-subdivisions=4
-height=448
-width=448
-channels=3
-momentum=0.9
-decay=0.0005
-saturation=1.5
-exposure=1.5
-hue=.1
-
-learning_rate=0.0005
-policy=steps
-steps=200,400,600,20000,30000
-scales=2.5,2,2,.1,.1
-max_batches = 40000
-
-[convolutional]
-batch_normalize=1
-filters=64
-size=7
-stride=2
-pad=1
-activation=leaky
-
-[maxpool]
-size=2
-stride=2
-
-[convolutional]
-batch_normalize=1
-filters=192
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[maxpool]
-size=2
-stride=2
-
-[convolutional]
-batch_normalize=1
-filters=128
-size=1
-stride=1
-pad=1
-activation=leaky
-
-[convolutional]
-batch_normalize=1
-filters=256
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[convolutional]
-batch_normalize=1
-filters=256
-size=1
-stride=1
-pad=1
-activation=leaky
-
-[convolutional]
-batch_normalize=1
-filters=512
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[maxpool]
-size=2
-stride=2
-
-[convolutional]
-batch_normalize=1
-filters=256
-size=1
-stride=1
-pad=1
-activation=leaky
-
-[convolutional]
-batch_normalize=1
-filters=512
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[convolutional]
-batch_normalize=1
-filters=256
-size=1
-stride=1
-pad=1
-activation=leaky
-
-[convolutional]
-batch_normalize=1
-filters=512
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[convolutional]
-batch_normalize=1
-filters=256
-size=1
-stride=1
-pad=1
-activation=leaky
-
-[convolutional]
-batch_normalize=1
-filters=512
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[convolutional]
-batch_normalize=1
-filters=256
-size=1
-stride=1
-pad=1
-activation=leaky
-
-[convolutional]
-batch_normalize=1
-filters=512
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[convolutional]
-batch_normalize=1
-filters=512
-size=1
-stride=1
-pad=1
-activation=leaky
-
-[convolutional]
-batch_normalize=1
-filters=1024
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[maxpool]
-size=2
-stride=2
-
-[convolutional]
-batch_normalize=1
-filters=512
-size=1
-stride=1
-pad=1
-activation=leaky
-
-[convolutional]
-batch_normalize=1
-filters=1024
-size=3
-stride=1
-pad=1
-activation=leaky
-
-[convolutional]
-batch_normalize=1
-filters=512
-size=1
-stride=1
-pad=1
-activation=leaky
-
-[convolutional]
-batch_normalize=1
-filters=1024
-size=3
-stride=1
-pad=1
-activation=leaky
-
-#######
-
-[convolutional]
-batch_normalize=1
-size=3
-stride=1
-pad=1
-filters=1024
-activation=leaky
-
-[convolutional]
-batch_normalize=1
-size=3
-stride=2
-pad=1
-filters=1024
-activation=leaky
-
-[convolutional]
-batch_normalize=1
-size=3
-stride=1
-pad=1
-filters=1024
-activation=leaky
-
-[convolutional]
-batch_normalize=1
-size=3
-stride=1
-pad=1
-filters=1024
-activation=leaky
-
-[local]
-size=3
-stride=1
-pad=1
-filters=256
-activation=leaky
-
-[dropout]
-probability=.5
-
-[connected]
-output= 1715
-activation=linear
-
-[detection]
-classes=20
-coords=4
-rescore=1
-side=7
-num=3
-softmax=0
-sqrt=1
-jitter=.2
-
-object_scale=1
-noobject_scale=.5
-class_scale=1
-coord_scale=5
-
diff --git a/image.darknet/inst/include/darknet/cfg/tiny-yolo-voc.cfg b/image.darknet/inst/include/darknet/cfg/yolov2-tiny-voc.cfg
similarity index 93%
rename from image.darknet/inst/include/darknet/cfg/tiny-yolo-voc.cfg
rename to image.darknet/inst/include/darknet/cfg/yolov2-tiny-voc.cfg
index 1f33c35..c4c127c 100644
--- a/image.darknet/inst/include/darknet/cfg/tiny-yolo-voc.cfg
+++ b/image.darknet/inst/include/darknet/cfg/yolov2-tiny-voc.cfg
@@ -1,6 +1,10 @@
 [net]
-batch=64
-subdivisions=8
+# Testing
+batch=1
+subdivisions=1
+# Training
+# batch=64
+# subdivisions=2
 width=416
 height=416
 channels=3
@@ -12,7 +16,7 @@ exposure = 1.5
 hue=.1
 
 learning_rate=0.001
-max_batches = 40100
+max_batches = 40200
 policy=steps
 steps=-1,100,20000,30000
 scales=.1,10,.1,.1
diff --git a/image.darknet/inst/include/darknet/cfg/tiny-yolo.cfg b/image.darknet/inst/include/darknet/cfg/yolov2-tiny.cfg
similarity index 82%
rename from image.darknet/inst/include/darknet/cfg/tiny-yolo.cfg
rename to image.darknet/inst/include/darknet/cfg/yolov2-tiny.cfg
index 5580098..81d0ac4 100644
--- a/image.darknet/inst/include/darknet/cfg/tiny-yolo.cfg
+++ b/image.darknet/inst/include/darknet/cfg/yolov2-tiny.cfg
@@ -1,6 +1,10 @@
 [net]
-batch=64
-subdivisions=8
+# Testing
+batch=1
+subdivisions=1
+# Training
+# batch=64
+# subdivisions=2
 width=416
 height=416
 channels=3
@@ -12,10 +16,11 @@ exposure = 1.5
 hue=.1
 
 learning_rate=0.001
-max_batches = 120000
+burn_in=1000
+max_batches = 500200
 policy=steps
-steps=-1,100,80000,100000
-scales=.1,10,.1,.1
+steps=400000,450000
+scales=.1,.1
 
 [convolutional]
 batch_normalize=1
@@ -104,7 +109,7 @@ batch_normalize=1
 size=3
 stride=1
 pad=1
-filters=1024
+filters=512
 activation=leaky
 
 [convolutional]
@@ -115,14 +120,14 @@ filters=425
 activation=linear
 
 [region]
-anchors = 0.738768,0.874946,  2.42204,2.65704,  4.30971,7.04493,  10.246,4.59428,  12.6868,11.8741
+anchors =  0.57273, 0.677385, 1.87446, 2.06253, 3.33843, 5.47434, 7.88282, 3.52778, 9.77052, 9.16828
 bias_match=1
 classes=80
 coords=4
 num=5
 softmax=1
 jitter=.2
-rescore=1
+rescore=0
 
 object_scale=5
 noobject_scale=1
diff --git a/image.darknet/inst/include/darknet/cfg/yolo.cfg b/image.darknet/inst/include/darknet/cfg/yolov2-voc.cfg
similarity index 87%
rename from image.darknet/inst/include/darknet/cfg/yolo.cfg
rename to image.darknet/inst/include/darknet/cfg/yolov2-voc.cfg
index fda339a..dbf2de2 100644
--- a/image.darknet/inst/include/darknet/cfg/yolo.cfg
+++ b/image.darknet/inst/include/darknet/cfg/yolov2-voc.cfg
@@ -1,8 +1,12 @@
 [net]
+# Testing
 batch=1
 subdivisions=1
-width=416
+# Training
+# batch=64
+# subdivisions=8
 height=416
+width=416
 channels=3
 momentum=0.9
 decay=0.0005
@@ -12,10 +16,11 @@ exposure = 1.5
 hue=.1
 
 learning_rate=0.001
-max_batches = 120000
+burn_in=1000
+max_batches = 80200
 policy=steps
-steps=-1,100,80000,100000
-scales=.1,10,.1,.1
+steps=40000,60000
+scales=.1,.1
 
 [convolutional]
 batch_normalize=1
@@ -203,11 +208,19 @@ activation=leaky
 [route]
 layers=-9
 
+[convolutional]
+batch_normalize=1
+size=1
+stride=1
+pad=1
+filters=64
+activation=leaky
+
 [reorg]
 stride=2
 
 [route]
-layers=-1,-3
+layers=-1,-4
 
 [convolutional]
 batch_normalize=1
@@ -221,17 +234,18 @@ activation=leaky
 size=1
 stride=1
 pad=1
-filters=425
+filters=125
 activation=linear
 
+
 [region]
-anchors = 0.738768,0.874946,  2.42204,2.65704,  4.30971,7.04493,  10.246,4.59428,  12.6868,11.8741
+anchors =  1.3221, 1.73145, 3.19275, 4.00944, 5.05587, 8.09892, 9.47112, 4.84053, 11.2364, 10.0071
 bias_match=1
-classes=80
+classes=20
 coords=4
 num=5
 softmax=1
-jitter=.2
+jitter=.3
 rescore=1
 
 object_scale=5
@@ -241,4 +255,4 @@ coord_scale=1
 
 absolute=1
 thresh = .6
-random=0
+random=1
diff --git a/image.darknet/inst/include/darknet/cfg/yolo-voc.cfg b/image.darknet/inst/include/darknet/cfg/yolov2.cfg
similarity index 84%
rename from image.darknet/inst/include/darknet/cfg/yolo-voc.cfg
rename to image.darknet/inst/include/darknet/cfg/yolov2.cfg
index ceb3f2a..088edf8 100644
--- a/image.darknet/inst/include/darknet/cfg/yolo-voc.cfg
+++ b/image.darknet/inst/include/darknet/cfg/yolov2.cfg
@@ -1,8 +1,12 @@
 [net]
-batch=64
-subdivisions=8
-height=416
-width=416
+# Testing
+batch=1
+subdivisions=1
+# Training
+# batch=64
+# subdivisions=8
+width=608
+height=608
 channels=3
 momentum=0.9
 decay=0.0005
@@ -11,11 +15,12 @@ saturation = 1.5
 exposure = 1.5
 hue=.1
 
-learning_rate=0.0001
-max_batches = 45000
+learning_rate=0.001
+burn_in=1000
+max_batches = 500200
 policy=steps
-steps=100,25000,35000
-scales=10,.1,.1
+steps=400000,450000
+scales=.1,.1
 
 [convolutional]
 batch_normalize=1
@@ -203,11 +208,19 @@ activation=leaky
 [route]
 layers=-9
 
+[convolutional]
+batch_normalize=1
+size=1
+stride=1
+pad=1
+filters=64
+activation=leaky
+
 [reorg]
 stride=2
 
 [route]
-layers=-1,-3
+layers=-1,-4
 
 [convolutional]
 batch_normalize=1
@@ -221,17 +234,18 @@ activation=leaky
 size=1
 stride=1
 pad=1
-filters=125
+filters=425
 activation=linear
 
+
 [region]
-anchors = 1.08,1.19,  3.42,4.41,  6.63,11.38,  9.42,5.11,  16.62,10.52
+anchors =  0.57273, 0.677385, 1.87446, 2.06253, 3.33843, 5.47434, 7.88282, 3.52778, 9.77052, 9.16828
 bias_match=1
-classes=20
+classes=80
 coords=4
 num=5
 softmax=1
-jitter=.2
+jitter=.3
 rescore=1
 
 object_scale=5
@@ -241,4 +255,4 @@ coord_scale=1
 
 absolute=1
 thresh = .6
-random=0
+random=1
diff --git a/image.darknet/inst/include/darknet/cfg/yolov3-openimages.cfg b/image.darknet/inst/include/darknet/cfg/yolov3-openimages.cfg
new file mode 100644
index 0000000..65d241a
--- /dev/null
+++ b/image.darknet/inst/include/darknet/cfg/yolov3-openimages.cfg
@@ -0,0 +1,789 @@
+[net]
+# Testing
+batch=1
+subdivisions=1
+# Training
+# batch=64
+# subdivisions=16
+width=608
+height=608
+channels=3
+momentum=0.9
+decay=0.0005
+angle=0
+saturation = 1.5
+exposure = 1.5
+hue=.1
+
+learning_rate=0.001
+burn_in=5000
+max_batches = 500200
+policy=steps
+steps=400000,450000
+scales=.1,.1
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+# Downsample
+
+[convolutional]
+batch_normalize=1
+filters=64
+size=3
+stride=2
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=64
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+# Downsample
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=3
+stride=2
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=64
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=64
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+# Downsample
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=2
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+# Downsample
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=2
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+# Downsample
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=3
+stride=2
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+######################
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+size=3
+stride=1
+pad=1
+filters=1024
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+size=3
+stride=1
+pad=1
+filters=1024
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+size=3
+stride=1
+pad=1
+filters=1024
+activation=leaky
+
+[convolutional]
+size=1
+stride=1
+pad=1
+filters=1818
+activation=linear
+
+
+[yolo]
+mask = 6,7,8
+anchors = 10,13,  16,30,  33,23,  30,61,  62,45,  59,119,  116,90,  156,198,  373,326
+classes=601
+num=9
+jitter=.3
+ignore_thresh = .7
+truth_thresh = 1
+random=1
+
+
+[route]
+layers = -4
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[upsample]
+stride=2
+
+[route]
+layers = -1, 61
+
+
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+size=3
+stride=1
+pad=1
+filters=512
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+size=3
+stride=1
+pad=1
+filters=512
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+size=3
+stride=1
+pad=1
+filters=512
+activation=leaky
+
+[convolutional]
+size=1
+stride=1
+pad=1
+filters=1818
+activation=linear
+
+
+[yolo]
+mask = 3,4,5
+anchors = 10,13,  16,30,  33,23,  30,61,  62,45,  59,119,  116,90,  156,198,  373,326
+classes=601
+num=9
+jitter=.3
+ignore_thresh = .7
+truth_thresh = 1
+random=1
+
+
+
+[route]
+layers = -4
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[upsample]
+stride=2
+
+[route]
+layers = -1, 36
+
+
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+size=3
+stride=1
+pad=1
+filters=256
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+size=3
+stride=1
+pad=1
+filters=256
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+size=3
+stride=1
+pad=1
+filters=256
+activation=leaky
+
+[convolutional]
+size=1
+stride=1
+pad=1
+filters=1818
+activation=linear
+
+
+[yolo]
+mask = 0,1,2
+anchors = 10,13,  16,30,  33,23,  30,61,  62,45,  59,119,  116,90,  156,198,  373,326
+classes=601
+num=9
+jitter=.3
+ignore_thresh = .7
+truth_thresh = 1
+random=1
+
diff --git a/image.darknet/inst/include/darknet/cfg/yolov3-spp.cfg b/image.darknet/inst/include/darknet/cfg/yolov3-spp.cfg
new file mode 100644
index 0000000..4ad2a05
--- /dev/null
+++ b/image.darknet/inst/include/darknet/cfg/yolov3-spp.cfg
@@ -0,0 +1,822 @@
+[net]
+# Testing
+batch=1
+subdivisions=1
+# Training
+# batch=64
+# subdivisions=16
+width=608
+height=608
+channels=3
+momentum=0.9
+decay=0.0005
+angle=0
+saturation = 1.5
+exposure = 1.5
+hue=.1
+
+learning_rate=0.001
+burn_in=1000
+max_batches = 500200
+policy=steps
+steps=400000,450000
+scales=.1,.1
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+# Downsample
+
+[convolutional]
+batch_normalize=1
+filters=64
+size=3
+stride=2
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=64
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+# Downsample
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=3
+stride=2
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=64
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=64
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+# Downsample
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=2
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+# Downsample
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=2
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+# Downsample
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=3
+stride=2
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+######################
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+size=3
+stride=1
+pad=1
+filters=1024
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=leaky
+
+### SPP ###
+[maxpool]
+stride=1
+size=5
+
+[route]
+layers=-2
+
+[maxpool]
+stride=1
+size=9
+
+[route]
+layers=-4
+
+[maxpool]
+stride=1
+size=13
+
+[route]
+layers=-1,-3,-5,-6
+
+### End SPP ###
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=leaky
+
+
+[convolutional]
+batch_normalize=1
+size=3
+stride=1
+pad=1
+filters=1024
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+size=3
+stride=1
+pad=1
+filters=1024
+activation=leaky
+
+[convolutional]
+size=1
+stride=1
+pad=1
+filters=255
+activation=linear
+
+
+[yolo]
+mask = 6,7,8
+anchors = 10,13,  16,30,  33,23,  30,61,  62,45,  59,119,  116,90,  156,198,  373,326
+classes=80
+num=9
+jitter=.3
+ignore_thresh = .7
+truth_thresh = 1
+random=1
+
+
+[route]
+layers = -4
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[upsample]
+stride=2
+
+[route]
+layers = -1, 61
+
+
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+size=3
+stride=1
+pad=1
+filters=512
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+size=3
+stride=1
+pad=1
+filters=512
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+size=3
+stride=1
+pad=1
+filters=512
+activation=leaky
+
+[convolutional]
+size=1
+stride=1
+pad=1
+filters=255
+activation=linear
+
+
+[yolo]
+mask = 3,4,5
+anchors = 10,13,  16,30,  33,23,  30,61,  62,45,  59,119,  116,90,  156,198,  373,326
+classes=80
+num=9
+jitter=.3
+ignore_thresh = .7
+truth_thresh = 1
+random=1
+
+
+
+[route]
+layers = -4
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[upsample]
+stride=2
+
+[route]
+layers = -1, 36
+
+
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+size=3
+stride=1
+pad=1
+filters=256
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+size=3
+stride=1
+pad=1
+filters=256
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+size=3
+stride=1
+pad=1
+filters=256
+activation=leaky
+
+[convolutional]
+size=1
+stride=1
+pad=1
+filters=255
+activation=linear
+
+
+[yolo]
+mask = 0,1,2
+anchors = 10,13,  16,30,  33,23,  30,61,  62,45,  59,119,  116,90,  156,198,  373,326
+classes=80
+num=9
+jitter=.3
+ignore_thresh = .7
+truth_thresh = 1
+random=1
+
diff --git a/image.darknet/inst/include/darknet/cfg/yolov3-tiny.cfg b/image.darknet/inst/include/darknet/cfg/yolov3-tiny.cfg
new file mode 100644
index 0000000..cfca3cf
--- /dev/null
+++ b/image.darknet/inst/include/darknet/cfg/yolov3-tiny.cfg
@@ -0,0 +1,182 @@
+[net]
+# Testing
+batch=1
+subdivisions=1
+# Training
+# batch=64
+# subdivisions=2
+width=416
+height=416
+channels=3
+momentum=0.9
+decay=0.0005
+angle=0
+saturation = 1.5
+exposure = 1.5
+hue=.1
+
+learning_rate=0.001
+burn_in=1000
+max_batches = 500200
+policy=steps
+steps=400000,450000
+scales=.1,.1
+
+[convolutional]
+batch_normalize=1
+filters=16
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[maxpool]
+size=2
+stride=2
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[maxpool]
+size=2
+stride=2
+
+[convolutional]
+batch_normalize=1
+filters=64
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[maxpool]
+size=2
+stride=2
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[maxpool]
+size=2
+stride=2
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[maxpool]
+size=2
+stride=2
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[maxpool]
+size=2
+stride=1
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=3
+stride=1
+pad=1
+activation=leaky
+
+###########
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+size=1
+stride=1
+pad=1
+filters=255
+activation=linear
+
+
+
+[yolo]
+mask = 3,4,5
+anchors = 10,14,  23,27,  37,58,  81,82,  135,169,  344,319
+classes=80
+num=6
+jitter=.3
+ignore_thresh = .7
+truth_thresh = 1
+random=1
+
+[route]
+layers = -4
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[upsample]
+stride=2
+
+[route]
+layers = -1, 8
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+size=1
+stride=1
+pad=1
+filters=255
+activation=linear
+
+[yolo]
+mask = 0,1,2
+anchors = 10,14,  23,27,  37,58,  81,82,  135,169,  344,319
+classes=80
+num=6
+jitter=.3
+ignore_thresh = .7
+truth_thresh = 1
+random=1
diff --git a/image.darknet/inst/include/darknet/cfg/yolov3-voc.cfg b/image.darknet/inst/include/darknet/cfg/yolov3-voc.cfg
new file mode 100644
index 0000000..3f3e8df
--- /dev/null
+++ b/image.darknet/inst/include/darknet/cfg/yolov3-voc.cfg
@@ -0,0 +1,785 @@
+[net]
+# Testing
+ batch=1
+ subdivisions=1
+# Training
+# batch=64
+# subdivisions=16
+width=416
+height=416
+channels=3
+momentum=0.9
+decay=0.0005
+angle=0
+saturation = 1.5
+exposure = 1.5
+hue=.1
+
+learning_rate=0.001
+burn_in=1000
+max_batches = 50200
+policy=steps
+steps=40000,45000
+scales=.1,.1
+
+
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+# Downsample
+
+[convolutional]
+batch_normalize=1
+filters=64
+size=3
+stride=2
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=64
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+# Downsample
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=3
+stride=2
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=64
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=64
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+# Downsample
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=2
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+# Downsample
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=2
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+# Downsample
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=3
+stride=2
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+######################
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+size=3
+stride=1
+pad=1
+filters=1024
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+size=3
+stride=1
+pad=1
+filters=1024
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+size=3
+stride=1
+pad=1
+filters=1024
+activation=leaky
+
+[convolutional]
+size=1
+stride=1
+pad=1
+filters=75
+activation=linear
+
+[yolo]
+mask = 6,7,8
+anchors = 10,13,  16,30,  33,23,  30,61,  62,45,  59,119,  116,90,  156,198,  373,326
+classes=20
+num=9
+jitter=.3
+ignore_thresh = .5
+truth_thresh = 1
+random=1
+
+[route]
+layers = -4
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[upsample]
+stride=2
+
+[route]
+layers = -1, 61
+
+
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+size=3
+stride=1
+pad=1
+filters=512
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+size=3
+stride=1
+pad=1
+filters=512
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+size=3
+stride=1
+pad=1
+filters=512
+activation=leaky
+
+[convolutional]
+size=1
+stride=1
+pad=1
+filters=75
+activation=linear
+
+[yolo]
+mask = 3,4,5
+anchors = 10,13,  16,30,  33,23,  30,61,  62,45,  59,119,  116,90,  156,198,  373,326
+classes=20
+num=9
+jitter=.3
+ignore_thresh = .5
+truth_thresh = 1
+random=1
+
+[route]
+layers = -4
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[upsample]
+stride=2
+
+[route]
+layers = -1, 36
+
+
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+size=3
+stride=1
+pad=1
+filters=256
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+size=3
+stride=1
+pad=1
+filters=256
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+size=3
+stride=1
+pad=1
+filters=256
+activation=leaky
+
+[convolutional]
+size=1
+stride=1
+pad=1
+filters=75
+activation=linear
+
+[yolo]
+mask = 0,1,2
+anchors = 10,13,  16,30,  33,23,  30,61,  62,45,  59,119,  116,90,  156,198,  373,326
+classes=20
+num=9
+jitter=.3
+ignore_thresh = .5
+truth_thresh = 1
+random=1
+
diff --git a/image.darknet/inst/include/darknet/cfg/yolov3.cfg b/image.darknet/inst/include/darknet/cfg/yolov3.cfg
new file mode 100644
index 0000000..938ffff
--- /dev/null
+++ b/image.darknet/inst/include/darknet/cfg/yolov3.cfg
@@ -0,0 +1,789 @@
+[net]
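+# Testing (inference): uncomment the two lines below and comment out the Training pair
+# batch=1
+# subdivisions=1
+# Training (the batch/subdivisions values that are currently active)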
+batch=64
+subdivisions=16
+width=608
+height=608
+channels=3
+momentum=0.9
+decay=0.0005
+angle=0
+saturation = 1.5
+exposure = 1.5
+hue=.1
+
+learning_rate=0.001
+burn_in=1000
+max_batches = 500200
+policy=steps
+steps=400000,450000
+scales=.1,.1
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+# Downsample
+
+[convolutional]
+batch_normalize=1
+filters=64
+size=3
+stride=2
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=64
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+# Downsample
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=3
+stride=2
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=64
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=64
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+# Downsample
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=2
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+# Downsample
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=2
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+# Downsample
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=3
+stride=2
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[shortcut]
+from=-3
+activation=linear
+
+######################
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+size=3
+stride=1
+pad=1
+filters=1024
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+size=3
+stride=1
+pad=1
+filters=1024
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+size=3
+stride=1
+pad=1
+filters=1024
+activation=leaky
+
+[convolutional]
+size=1
+stride=1
+pad=1
+filters=255
+activation=linear
+
+
+[yolo]
+mask = 6,7,8
+anchors = 10,13,  16,30,  33,23,  30,61,  62,45,  59,119,  116,90,  156,198,  373,326
+classes=80
+num=9
+jitter=.3
+ignore_thresh = .7
+truth_thresh = 1
+random=1
+
+
+[route]
+layers = -4
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[upsample]
+stride=2
+
+[route]
+layers = -1, 61
+
+
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+size=3
+stride=1
+pad=1
+filters=512
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+size=3
+stride=1
+pad=1
+filters=512
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+size=3
+stride=1
+pad=1
+filters=512
+activation=leaky
+
+[convolutional]
+size=1
+stride=1
+pad=1
+filters=255
+activation=linear
+
+
+[yolo]
+mask = 3,4,5
+anchors = 10,13,  16,30,  33,23,  30,61,  62,45,  59,119,  116,90,  156,198,  373,326
+classes=80
+num=9
+jitter=.3
+ignore_thresh = .7
+truth_thresh = 1
+random=1
+
+
+
+[route]
+layers = -4
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[upsample]
+stride=2
+
+[route]
+layers = -1, 36
+
+
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+size=3
+stride=1
+pad=1
+filters=256
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+size=3
+stride=1
+pad=1
+filters=256
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+size=3
+stride=1
+pad=1
+filters=256
+activation=leaky
+
+[convolutional]
+size=1
+stride=1
+pad=1
+filters=255
+activation=linear
+
+
+[yolo]
+mask = 0,1,2
+anchors = 10,13,  16,30,  33,23,  30,61,  62,45,  59,119,  116,90,  156,198,  373,326
+classes=80
+num=9
+jitter=.3
+ignore_thresh = .7
+truth_thresh = 1
+random=1
+
diff --git a/image.darknet/inst/include/darknet/data/kite.jpg b/image.darknet/inst/include/darknet/data/kite.jpg
new file mode 100644
index 0000000..9eb325a
Binary files /dev/null and b/image.darknet/inst/include/darknet/data/kite.jpg differ
diff --git a/image.darknet/inst/include/darknet/data/openimages.names b/image.darknet/inst/include/darknet/data/openimages.names
new file mode 100644
index 0000000..ddfd8f2
--- /dev/null
+++ b/image.darknet/inst/include/darknet/data/openimages.names
@@ -0,0 +1,601 @@
+Tortoise
+Container
+Magpie
+Sea turtle
+Football
+Ambulance
+Ladder
+Toothbrush
+Syringe
+Sink
+Toy
+Organ
+Cassette deck
+Apple
+Human eye
+Cosmetics
+Paddle
+Snowman
+Beer
+Chopsticks
+Human beard
+Bird
+Parking meter
+Traffic light
+Croissant
+Cucumber
+Radish
+Towel
+Doll
+Skull
+Washing machine
+Glove
+Tick
+Belt
+Sunglasses
+Banjo
+Cart
+Ball
+Backpack
+Bicycle
+Home appliance
+Centipede
+Boat
+Surfboard
+Boot
+Headphones
+Hot dog
+Shorts
+Fast food
+Bus
+Boy
+Screwdriver
+Bicycle wheel
+Barge
+Laptop
+Miniskirt
+Drill
+Dress
+Bear
+Waffle
+Pancake
+Brown bear
+Woodpecker
+Blue jay
+Pretzel
+Bagel
+Tower
+Teapot
+Person
+Bow and arrow
+Swimwear
+Beehive
+Brassiere
+Bee
+Bat
+Starfish
+Popcorn
+Burrito
+Chainsaw
+Balloon
+Wrench
+Tent
+Vehicle registration plate
+Lantern
+Toaster
+Flashlight
+Billboard
+Tiara
+Limousine
+Necklace
+Carnivore
+Scissors
+Stairs
+Computer keyboard
+Printer
+Traffic sign
+Chair
+Shirt
+Poster
+Cheese
+Sock
+Fire hydrant
+Land vehicle
+Earrings
+Tie
+Watercraft
+Cabinetry
+Suitcase
+Muffin
+Bidet
+Snack
+Snowmobile
+Clock
+Medical equipment
+Cattle
+Cello
+Jet ski
+Camel
+Coat
+Suit
+Desk
+Cat
+Bronze sculpture
+Juice
+Gondola
+Beetle
+Cannon
+Computer mouse
+Cookie
+Office building
+Fountain
+Coin
+Calculator
+Cocktail
+Computer monitor
+Box
+Stapler
+Christmas tree
+Cowboy hat
+Hiking equipment
+Studio couch
+Drum
+Dessert
+Wine rack
+Drink
+Zucchini
+Ladle
+Human mouth
+Dairy
+Dice
+Oven
+Dinosaur
+Ratchet
+Couch
+Cricket ball
+Winter melon
+Spatula
+Whiteboard
+Pencil sharpener
+Door
+Hat
+Shower
+Eraser
+Fedora
+Guacamole
+Dagger
+Scarf
+Dolphin
+Sombrero
+Tin can
+Mug
+Tap
+Harbor seal
+Stretcher
+Can opener
+Goggles
+Human body
+Roller skates
+Coffee cup
+Cutting board
+Blender
+Plumbing fixture
+Stop sign
+Office supplies
+Volleyball
+Vase
+Slow cooker
+Wardrobe
+Coffee
+Whisk
+Paper towel
+Personal care
+Food
+Sun hat
+Tree house
+Flying disc
+Skirt
+Gas stove
+Salt and pepper shakers
+Mechanical fan
+Face powder
+Fax
+Fruit
+French fries
+Nightstand
+Barrel
+Kite
+Tart
+Treadmill
+Fox
+Flag
+Horn
+Window blind
+Human foot
+Golf cart
+Jacket
+Egg
+Street light
+Guitar
+Pillow
+Human leg
+Isopod
+Grape
+Human ear
+Power plugs and sockets
+Panda
+Giraffe
+Woman
+Door handle
+Rhinoceros
+Bathtub
+Goldfish
+Houseplant
+Goat
+Baseball bat
+Baseball glove
+Mixing bowl
+Marine invertebrates
+Kitchen utensil
+Light switch
+House
+Horse
+Stationary bicycle
+Hammer
+Ceiling fan
+Sofa bed
+Adhesive tape
+Harp
+Sandal
+Bicycle helmet
+Saucer
+Harpsichord
+Human hair
+Heater
+Harmonica
+Hamster
+Curtain
+Bed
+Kettle
+Fireplace
+Scale
+Drinking straw
+Insect
+Hair dryer
+Kitchenware
+Indoor rower
+Invertebrate
+Food processor
+Bookcase
+Refrigerator
+Wood-burning stove
+Punching bag
+Common fig
+Cocktail shaker
+Jaguar
+Golf ball
+Fashion accessory
+Alarm clock
+Filing cabinet
+Artichoke
+Table
+Tableware
+Kangaroo
+Koala
+Knife
+Bottle
+Bottle opener
+Lynx
+Lavender
+Lighthouse
+Dumbbell
+Human head
+Bowl
+Humidifier
+Porch
+Lizard
+Billiard table
+Mammal
+Mouse
+Motorcycle
+Musical instrument
+Swim cap
+Frying pan
+Snowplow
+Bathroom cabinet
+Missile
+Bust
+Man
+Waffle iron
+Milk
+Ring binder
+Plate
+Mobile phone
+Baked goods
+Mushroom
+Crutch
+Pitcher
+Mirror
+Lifejacket
+Table tennis racket
+Pencil case
+Musical keyboard
+Scoreboard
+Briefcase
+Kitchen knife
+Nail
+Tennis ball
+Plastic bag
+Oboe
+Chest of drawers
+Ostrich
+Piano
+Girl
+Plant
+Potato
+Hair spray
+Sports equipment
+Pasta
+Penguin
+Pumpkin
+Pear
+Infant bed
+Polar bear
+Mixer
+Cupboard
+Jacuzzi
+Pizza
+Digital clock
+Pig
+Reptile
+Rifle
+Lipstick
+Skateboard
+Raven
+High heels
+Red panda
+Rose
+Rabbit
+Sculpture
+Saxophone
+Shotgun
+Seafood
+Submarine sandwich
+Snowboard
+Sword
+Picture frame
+Sushi
+Loveseat
+Ski
+Squirrel
+Tripod
+Stethoscope
+Submarine
+Scorpion
+Segway
+Training bench
+Snake
+Coffee table
+Skyscraper
+Sheep
+Television
+Trombone
+Tea
+Tank
+Taco
+Telephone
+Torch
+Tiger
+Strawberry
+Trumpet
+Tree
+Tomato
+Train
+Tool
+Picnic basket
+Cooking spray
+Trousers
+Bowling equipment
+Football helmet
+Truck
+Measuring cup
+Coffeemaker
+Violin
+Vehicle
+Handbag
+Paper cutter
+Wine
+Weapon
+Wheel
+Worm
+Wok
+Whale
+Zebra
+Auto part
+Jug
+Pizza cutter
+Cream
+Monkey
+Lion
+Bread
+Platter
+Chicken
+Eagle
+Helicopter
+Owl
+Duck
+Turtle
+Hippopotamus
+Crocodile
+Toilet
+Toilet paper
+Squid
+Clothing
+Footwear
+Lemon
+Spider
+Deer
+Frog
+Banana
+Rocket
+Wine glass
+Countertop
+Tablet computer
+Waste container
+Swimming pool
+Dog
+Book
+Elephant
+Shark
+Candle
+Leopard
+Axe
+Hand dryer
+Soap dispenser
+Porcupine
+Flower
+Canary
+Cheetah
+Palm tree
+Hamburger
+Maple
+Building
+Fish
+Lobster
+Asparagus
+Furniture
+Hedgehog
+Airplane
+Spoon
+Otter
+Bull
+Oyster
+Horizontal bar
+Convenience store
+Bomb
+Bench
+Ice cream
+Caterpillar
+Butterfly
+Parachute
+Orange
+Antelope
+Beaker
+Moths and butterflies
+Window
+Closet
+Castle
+Jellyfish
+Goose
+Mule
+Swan
+Peach
+Coconut
+Seat belt
+Raccoon
+Chisel
+Fork
+Lamp
+Camera
+Squash
+Racket
+Human face
+Human arm
+Vegetable
+Diaper
+Unicycle
+Falcon
+Chime
+Snail
+Shellfish
+Cabbage
+Carrot
+Mango
+Jeans
+Flowerpot
+Pineapple
+Drawer
+Stool
+Envelope
+Cake
+Dragonfly
+Sunflower
+Microwave oven
+Honeycomb
+Marine mammal
+Sea lion
+Ladybug
+Shelf
+Watch
+Candy
+Salad
+Parrot
+Handgun
+Sparrow
+Van
+Grinder
+Spice rack
+Light bulb
+Corded phone
+Sports uniform
+Tennis racket
+Wall clock
+Serving tray
+Kitchen & dining room table
+Dog bed
+Cake stand
+Cat furniture
+Bathroom accessory
+Facial tissue holder
+Pressure cooker
+Kitchen appliance
+Tire
+Ruler
+Luggage and bags
+Microphone
+Broccoli
+Umbrella
+Pastry
+Grapefruit
+Band-aid
+Animal
+Bell pepper
+Turkey
+Lily
+Pomegranate
+Doughnut
+Glasses
+Human nose
+Pen
+Ant
+Car
+Aircraft
+Human hand
+Skunk
+Teddy bear
+Watermelon
+Cantaloupe
+Dishwasher
+Flute
+Balance beam
+Sandwich
+Shrimp
+Sewing machine
+Binoculars
+Rays and skates
+Ipod
+Accordion
+Willow
+Crab
+Crown
+Seahorse
+Perfume
+Alpaca
+Taxi
+Canoe
+Remote control
+Wheelchair
+Rugby ball
+Armadillo
+Maracas
+Helmet
diff --git a/image.darknet/src/art.c b/image.darknet/inst/include/darknet/examples/art.c
similarity index 64%
rename from image.darknet/src/art.c
rename to image.darknet/inst/include/darknet/examples/art.c
index 71d3719..932688e 100644
--- a/image.darknet/src/art.c
+++ b/image.darknet/inst/include/darknet/examples/art.c
@@ -1,43 +1,26 @@
-#include "network.h"
-#include "utils.h"
-#include "parser.h"
-#include "option_list.h"
-#include "blas.h"
-#include "classifier.h"
-#include <sys/time.h>
-
-#ifdef OPENCV
-#include "opencv2/highgui/highgui_c.h"
-image get_image_from_stream(CvCapture *cap);
-#endif
+#include "darknet.h"
 
+#include <sys/time.h>
 
 void demo_art(char *cfgfile, char *weightfile, int cam_index)
 {
 #ifdef OPENCV
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    set_batch_network(&net, 1);
+    network *net = load_network(cfgfile, weightfile, 0);
+    set_batch_network(net, 1);
 
     srand(2222222);
-    CvCapture * cap;
 
-    cap = cvCaptureFromCAM(cam_index);
+    void * cap = open_video_stream(0, cam_index, 0,0,0);
 
     char *window = "ArtJudgementBot9000!!!";
     if(!cap) error("Couldn't connect to webcam.\n");
-    cvNamedWindow(window, CV_WINDOW_NORMAL); 
-    cvResizeWindow(window, 512, 512);
     int i;
     int idx[] = {37, 401, 434};
     int n = sizeof(idx)/sizeof(idx[0]);
 
     while(1){
         image in = get_image_from_stream(cap);
-        image in_s = resize_image(in, net.w, net.h);
-        show_image(in, window);
+        image in_s = resize_image(in, net->w, net->h);
 
         float *p = network_predict(net, in_s.data);
 
@@ -58,10 +41,9 @@ void demo_art(char *cfgfile, char *weightfile, int cam_index)
         }
         printf("]\n");
 
+        show_image(in, window, 1);
         free_image(in_s);
         free_image(in);
-
-        cvWaitKey(1);
     }
 #endif
 }
diff --git a/image.darknet/inst/include/darknet/examples/attention.c b/image.darknet/inst/include/darknet/examples/attention.c
new file mode 100644
index 0000000..cd1e579
--- /dev/null
+++ b/image.darknet/inst/include/darknet/examples/attention.c
@@ -0,0 +1,459 @@
+#include "darknet.h"
+
+#include <sys/time.h>
+#include <assert.h>
+
+void extend_data_truth(data *d, int n, float val)
+{
+    /* Widen every truth row of d by n columns, each new cell set to val. */
+    int row, k, old_cols = d->y.cols;
+    for(row = 0; row < d->y.rows; ++row){
+        float *grown = realloc(d->y.vals[row], (old_cols+n)*sizeof(float));
+        for(k = old_cols; k < old_cols + n; ++k) grown[k] = val;
+        d->y.vals[row] = grown;
+    }
+    d->y.cols = old_cols + n;
+}
+
+matrix network_loss_data(network *net, data test)
+{
+    /* Forward every row of test through net with train=0; returns a rows x 1 matrix of -sum(last layer output). */
+    int i,b;
+    matrix pred = make_matrix(test.X.rows, 1);
+    float *X = calloc(net->batch*test.X.cols, sizeof(float));
+    float *y = calloc(net->batch*test.y.cols, sizeof(float));
+    for(i = 0; i < test.X.rows; i += net->batch){
+        for(b = 0; b < net->batch; ++b){
+            if(i+b == test.X.rows) break;
+            memcpy(X+b*test.X.cols, test.X.vals[i+b], test.X.cols*sizeof(float));
+            memcpy(y+b*test.y.cols, test.y.vals[i+b], test.y.cols*sizeof(float));
+        }
+        /* Swap in the staging buffers for one forward pass, then restore the caller's network fields. */
+        network orig = *net;
+        net->input = X;
+        net->truth = y;
+        net->train = 0;
+        net->delta = 0;
+        forward_network(net);
+        *net = orig;
+        /* Score each sample from the output of the last layer (net->layers[net->n-1]). */
+        float *delta = net->layers[net->n-1].output;
+        for(b = 0; b < net->batch; ++b){
+            if(i+b == test.X.rows) break;
+            /* The former max_index(y + b*test.y.cols, 1000) lookup was dropped: its result was unused and it */
+            /* scanned 1000 floats from the row start regardless of test.y.cols, which can run past the y buffer. */
+            float err = sum_array(delta + b*net->outputs, net->outputs);
+            pred.vals[i+b][0] = -err;
+        }
+    }
+    free(X);
+    free(y);
+    return pred;
+}
+
+void train_attention(char *datacfg, char *cfgfile, char *weightfile, int *gpus, int ngpus, int clear)
+{
+    int i, j;
+    /* Per batch: score divs*divs tiles of each image, train on the max_index-selected tile, then train on the whole image with that tile as target. */
+    float avg_cls_loss = -1;
+    float avg_att_loss = -1;
+    char *base = basecfg(cfgfile);
+    printf("%s\n", base);
+    printf("%d\n", ngpus);
+    network **nets = calloc(ngpus, sizeof(network*));
+    /* srand(seed) is repeated before every load so each replica is loaded under the same rand() state. */
+    srand(time(0));
+    int seed = rand();
+    for(i = 0; i < ngpus; ++i){
+        srand(seed);
+#ifdef GPU
+        cuda_set_device(gpus[i]);
+#endif
+        nets[i] = load_network(cfgfile, weightfile, clear);
+        nets[i]->learning_rate *= ngpus;
+    }
+    srand(time(0));
+    network *net = nets[0];
+
+    int imgs = net->batch * net->subdivisions * ngpus;
+
+    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
+    list *options = read_data_cfg(datacfg);
+
+    char *backup_directory = option_find_str(options, "backup", "/backup/");
+    char *label_list = option_find_str(options, "labels", "data/labels.list");
+    char *train_list = option_find_str(options, "train", "data/train.list");
+    int classes = option_find_int(options, "classes", 2);
+
+    char **labels = get_labels(label_list);
+    list *plist = get_paths(train_list);
+    char **paths = (char **)list_to_array(plist);
+    printf("%d\n", plist->size);
+    int N = plist->size;
+    double time;
+
+    int divs=3;
+    int size=2;
+    /* Images are loaded at divs/size times the network resolution so they can be cut into tiles. */
+    load_args args = {0};
+    args.w = divs*net->w/size;
+    args.h = divs*net->h/size;
+    args.size = divs*net->w/size;
+    args.threads = 32;
+    args.hierarchy = net->hierarchy;
+
+    args.min = net->min_ratio*args.w;
+    args.max = net->max_ratio*args.w;
+    args.angle = net->angle;
+    args.aspect = net->aspect;
+    args.exposure = net->exposure;
+    args.saturation = net->saturation;
+    args.hue = net->hue;
+
+    args.paths = paths;
+    args.classes = classes;
+    args.n = imgs;
+    args.m = N;
+    args.labels = labels;
+    args.type = CLASSIFICATION_DATA;
+
+    data train;
+    data buffer;
+    pthread_t load_thread;
+    args.d = &buffer;
+    load_thread = load_data(args);
+
+    int epoch = (*net->seen)/N;
+    while(get_current_batch(net) < net->max_batches || net->max_batches == 0){
+        time = what_time_is_it_now();
+        /* Take the batch loaded in the background and immediately queue the next one. */
+        pthread_join(load_thread, 0);
+        train = buffer;
+        load_thread = load_data(args);
+        data resized = resize_data(train, net->w, net->h);
+        extend_data_truth(&resized, divs*divs, 0);
+        data *tiles = tile_data(train, divs, size);
+
+        printf("Loaded: %lf seconds\n", what_time_is_it_now()-time);
+        time = what_time_is_it_now();
+        /* Score every tile; resized's extra truth columns (after train.y.cols) receive one score per tile. */
+        float aloss = 0;
+        float closs = 0;
+        int z; /* declared outside the parallel loop, so it must be listed private there to avoid a race */
+        for (i = 0; i < divs*divs/ngpus; ++i) {
+#pragma omp parallel for private(z)
+            for(j = 0; j < ngpus; ++j){
+                int index = i*ngpus + j;
+                extend_data_truth(tiles+index, divs*divs, SECRET_NUM);
+                matrix deltas = network_loss_data(nets[j], tiles[index]);
+                for(z = 0; z < resized.y.rows; ++z){
+                    resized.y.vals[z][train.y.cols + index] = deltas.vals[z][0];
+                }
+                free_matrix(deltas);
+            }
+        }
+        int *inds = calloc(resized.y.rows, sizeof(int)); /* per-sample index of the selected tile */
+        for(z = 0; z < resized.y.rows; ++z){
+            int index = max_index(resized.y.vals[z] + train.y.cols, divs*divs);
+            inds[z] = index;
+            for(i = 0; i < divs*divs; ++i){
+                resized.y.vals[z][train.y.cols + i] = (i == index)? 1 : 0; /* scores become a one-hot target */
+            }
+        }
+        data best = select_data(tiles, inds);
+        free(inds);
+        #ifdef GPU
+        if (ngpus == 1) {
+            closs = train_network(net, best);
+        } else {
+            closs = train_networks(nets, ngpus, best, 4);
+        }
+        #endif
+        for (i = 0; i < divs*divs; ++i) {
+            printf("%.2f ", resized.y.vals[0][train.y.cols + i]);
+            if((i+1)%divs == 0) printf("\n");
+            free_data(tiles[i]);
+        }
+        free_data(best);
+        printf("\n");
+        image im = float_to_image(64,64,3,resized.X.vals[0]);
+        //show_image(im, "orig");
+        //cvWaitKey(100);
+        /*
+           image im1 = float_to_image(64,64,3,tiles[i].X.vals[0]);
+           image im2 = float_to_image(64,64,3,resized.X.vals[0]);
+           show_image(im1, "tile");
+           show_image(im2, "res");
+         */
+#ifdef GPU
+        if (ngpus == 1) {
+            aloss = train_network(net, resized);
+        } else {
+            aloss = train_networks(nets, ngpus, resized, 4);
+        }
+#endif
+        for(i = 0; i < divs*divs; ++i){
+            printf("%f ", nets[0]->output[1000 + i]); /* NOTE(review): offset 1000 looks like a hard-coded class count - confirm against `classes` */
+            if ((i+1) % divs == 0) printf("\n");
+        }
+        printf("\n");
+
+        free_data(resized);
+        free_data(train);
+        if(avg_cls_loss == -1) avg_cls_loss = closs;
+        if(avg_att_loss == -1) avg_att_loss = aloss;
+        avg_cls_loss = avg_cls_loss*.9 + closs*.1;
+        avg_att_loss = avg_att_loss*.9 + aloss*.1;
+
+        printf("%ld, %.3f: Att: %f, %f avg, Class: %f, %f avg, %f rate, %lf seconds, %ld images\n", get_current_batch(net), (float)(*net->seen)/N, aloss, avg_att_loss, closs, avg_cls_loss, get_current_rate(net), what_time_is_it_now()-time, *net->seen);
+        if(*net->seen/N > epoch){
+            epoch = *net->seen/N;
+            char buff[256];
+            snprintf(buff, sizeof(buff), "%s/%s_%d.weights",backup_directory,base, epoch); /* bounded: paths come from user config */
+            save_weights(net, buff);
+        }
+        if(get_current_batch(net)%1000 == 0){
+            char buff[256];
+            snprintf(buff, sizeof(buff), "%s/%s.backup",backup_directory,base);
+            save_weights(net, buff);
+        }
+    }
+    char buff[256];
+    snprintf(buff, sizeof(buff), "%s/%s.weights", backup_directory, base);
+    save_weights(net, buff);
+    pthread_join(load_thread, 0);
+
+    free_network(net);
+    free_ptrs((void**)labels, classes);
+    free_ptrs((void**)paths, plist->size);
+    free_list(plist);
+    free(base);
+}
+
+void validate_attention_single(char *datacfg, char *filename, char *weightfile)
+{
+    int i, j;
+    network *net = load_network(filename, weightfile, 0);
+    set_batch_network(net, 1);
+    srand(time(0));
+    /* Validates one image at a time: center crop -> network-sized input -> running top-1 / top-k accuracy printout. */
+    list *options = read_data_cfg(datacfg);
+
+    char *label_list = option_find_str(options, "labels", "data/labels.list");
+    char *leaf_list = option_find_str(options, "leaves", 0);
+    if(leaf_list) change_leaves(net->hierarchy, leaf_list);
+    char *valid_list = option_find_str(options, "valid", "data/train.list");
+    int classes = option_find_int(options, "classes", 2);
+    int topk = option_find_int(options, "top", 1);
+
+    char **labels = get_labels(label_list);
+    list *plist = get_paths(valid_list);
+
+    char **paths = (char **)list_to_array(plist);
+    int m = plist->size;
+    free_list(plist);
+
+    float avg_acc = 0;
+    float avg_topk = 0;
+    int *indexes = calloc(topk, sizeof(int));
+    int divs = 4;
+    int size = 2;
+    int extra = 0; /* how many top-ranked tiles get their own prediction; 0 leaves the tile loop below idle */
+    float *avgs = calloc(classes, sizeof(float));
+    int *inds = calloc(divs*divs, sizeof(int));
+
+    for(i = 0; i < m; ++i){
+        int class = -1;
+        char *path = paths[i];
+        for(j = 0; j < classes; ++j){
+            if(strstr(path, labels[j])){
+                class = j;
+                break;
+            }
+        }
+        image im = load_image_color(paths[i], 0, 0);
+        image resized = resize_min(im, net->w*divs/size);
+        image crop = crop_image(resized, (resized.w - net->w*divs/size)/2, (resized.h - net->h*divs/size)/2, net->w*divs/size, net->h*divs/size);
+        image rcrop = resize_image(crop, net->w, net->h);
+        float *pred = network_predict(net, rcrop.data);
+        //pred[classes + 56] = 0;
+        for(j = 0; j < divs*divs; ++j){
+            printf("%.2f ", pred[classes + j]);
+            if((j+1)%divs == 0) printf("\n");
+        }
+        printf("\n");
+        copy_cpu(classes, pred, 1, avgs, 1);
+        top_k(pred + classes, divs*divs, divs*divs, inds);
+        show_image(crop, "crop"); /* NOTE(review): art.c in this change passes a third argument to show_image - confirm this call against darknet.h */
+        for(j = 0; j < extra; ++j){
+            int index = inds[j];
+            int row = index / divs;
+            int col = index % divs;
+            int y = row * crop.h / divs - (net->h - crop.h/divs)/2;
+            int x = col * crop.w / divs - (net->w - crop.w/divs)/2;
+            printf("%d %d %d %d\n", row, col, y, x);
+            image tile = crop_image(crop, x, y, net->w, net->h);
+            float *pred = network_predict(net, tile.data);
+            axpy_cpu(classes, 1., pred, 1, avgs, 1);
+            show_image(tile, "tile");
+            free_image(tile); /* each tile is a fresh crop; release it before the next iteration */
+        }
+        if(net->hierarchy) hierarchy_predictions(pred, net->outputs, net->hierarchy, 1, 1);
+
+        if(rcrop.data != resized.data) free_image(rcrop);
+        if(resized.data != im.data) free_image(resized);
+        free_image(im);
+        free_image(crop);
+        top_k(pred, classes, topk, indexes);
+
+        if(indexes[0] == class) avg_acc += 1;
+        for(j = 0; j < topk; ++j){
+            if(indexes[j] == class) avg_topk += 1;
+        }
+
+        printf("%d: top 1: %f, top %d: %f\n", i, avg_acc/(i+1), topk, avg_topk/(i+1));
+    }
+    free(indexes);
+    free(avgs);
+    free(inds);
+}
+
+void validate_attention_multi(char *datacfg, char *filename, char *weightfile)
+{
+    /* Multi-scale validation: accumulates class scores over several resize scales, each also run */
+    /* after flip_image, and prints running top-1 / top-k accuracy after every image. */
+    network *net = load_network(filename, weightfile, 0);
+    set_batch_network(net, 1);
+    srand(time(0));
+
+    list *options = read_data_cfg(datacfg);
+    char *label_list = option_find_str(options, "labels", "data/labels.list");
+    char *valid_list = option_find_str(options, "valid", "data/train.list");
+    int classes = option_find_int(options, "classes", 2);
+    int topk = option_find_int(options, "top", 1);
+
+    char **labels = get_labels(label_list);
+    list *plist = get_paths(valid_list);
+    char **paths = (char **)list_to_array(plist);
+    int m = plist->size;
+    free_list(plist);
+
+    int scales[] = {224, 288, 320, 352, 384};
+    int nscales = sizeof(scales)/sizeof(scales[0]);
+    float top1_hits = 0;
+    float topk_hits = 0;
+    int *ranked = calloc(topk, sizeof(int));
+    int img, k, s;
+
+    for(img = 0; img < m; ++img){
+        /* Ground truth is the first label whose text occurs in the image path (-1 if none does). */
+        int truth = -1;
+        for(k = 0; k < classes && truth < 0; ++k){
+            if(strstr(paths[img], labels[k])) truth = k;
+        }
+
+        float *summed = calloc(classes, sizeof(float));
+        image im = load_image_color(paths[img], 0, 0);
+        for(s = 0; s < nscales; ++s){
+            image scaled = resize_min(im, scales[s]);
+            resize_network(net, scaled.w, scaled.h);
+            /* First pass on the image as resized; hierarchy_predictions is applied to this pass only. */
+            float *scores = network_predict(net, scaled.data);
+            if(net->hierarchy) hierarchy_predictions(scores, net->outputs, net->hierarchy, 1 , 1);
+            axpy_cpu(classes, 1, scores, 1, summed, 1);
+
+            /* Second pass on the same image after flip_image. */
+            flip_image(scaled);
+            scores = network_predict(net, scaled.data);
+            axpy_cpu(classes, 1, scores, 1, summed, 1);
+            if(scaled.data != im.data) free_image(scaled);
+        }
+        free_image(im);
+
+        top_k(summed, classes, topk, ranked);
+        free(summed);
+        top1_hits += (ranked[0] == truth);
+        for(k = 0; k < topk; ++k) topk_hits += (ranked[k] == truth);
+
+        printf("%d: top 1: %f, top %d: %f\n", img, top1_hits/(img+1), topk, topk_hits/(img+1));
+    }
+}
+
+void predict_attention(char *datacfg, char *cfgfile, char *weightfile, char *filename, int top)
+{
+    network *net = load_network(cfgfile, weightfile, 0);
+    set_batch_network(net, 1);
+    srand(2222222);
+    /* Prints the `top` highest-scoring labels for `filename`, or for each path read from stdin when filename is 0. */
+    list *options = read_data_cfg(datacfg);
+
+    char *name_list = option_find_str(options, "names", 0);
+    if(!name_list) name_list = option_find_str(options, "labels", "data/labels.list");
+    if(top == 0) top = option_find_int(options, "top", 1);
+
+    int i = 0;
+    char **names = get_labels(name_list);
+    clock_t time;
+    int *indexes = calloc(top, sizeof(int));
+    char buff[256];
+    char *input = buff;
+    while(1){
+        if(filename){
+            snprintf(input, sizeof(buff), "%s", filename); /* always NUL-terminated, unlike strncpy at the size limit */
+        }else{
+            printf("Enter Image Path: ");
+            fflush(stdout);
+            input = fgets(input, 256, stdin);
+            if(!input) break; /* fgets returned 0 (end of input or error): leave through the cleanup below */
+            strtok(input, "\n");
+        }
+        image im = load_image_color(input, 0, 0);
+        image r = letterbox_image(im, net->w, net->h);
+
+        /* r is the letterbox_image result at net->w x net->h; its pixel buffer is the network input. */
+        float *X = r.data;
+        time=clock();
+        float *predictions = network_predict(net, X);
+        if(net->hierarchy) hierarchy_predictions(predictions, net->outputs, net->hierarchy, 1, 1);
+        top_k(predictions, net->outputs, top, indexes);
+        fprintf(stderr, "%s: Predicted in %f seconds.\n", input, sec(clock()-time));
+        for(i = 0; i < top; ++i){
+            int index = indexes[i];
+            //if(net->hierarchy) printf("%d, %s: %f, parent: %s \n",index, names[index], predictions[index], (net->hierarchy->parent[index] >= 0) ? names[net->hierarchy->parent[index]] : "Root");
+            //else printf("%s: %f\n",names[index], predictions[index]);
+            printf("%5.2f%%: %s\n", predictions[index]*100, names[index]);
+        }
+        if(r.data != im.data) free_image(r);
+        free_image(im);
+        if (filename) break;
+    }
+    free(indexes);
+}
+
+
+void run_attention(int argc, char **argv)
+{
+    /* argv[2] selects the routine; argv[3] is the data cfg, argv[4] the network cfg, argv[5]/argv[6] optional weights and image path. */
+    if(argc < 5){
+        fprintf(stderr, "usage: %s %s [predict/train/valid/validmulti] [data] [cfg] [weights (optional)] [image (optional)]\n", argv[0], argv[1]);
+        return;
+    }
+
+    char *gpu_list = find_char_arg(argc, argv, "-gpus", 0);
+    int ngpus;
+    int *gpus = read_intlist(gpu_list, &ngpus, gpu_index);
+
+
+    int top = find_int_arg(argc, argv, "-t", 0);
+    int clear = find_arg(argc, argv, "-clear");
+    char *data = argv[3];
+    char *cfg = argv[4]; /* needs argc >= 5, hence the guard above */
+    char *weights = (argc > 5) ? argv[5] : 0;
+    char *filename = (argc > 6) ? argv[6]: 0;
+    if(0==strcmp(argv[2], "predict")) predict_attention(data, cfg, weights, filename, top);
+    else if(0==strcmp(argv[2], "train")) train_attention(data, cfg, weights, gpus, ngpus, clear);
+    else if(0==strcmp(argv[2], "valid")) validate_attention_single(data, cfg, weights);
+    else if(0==strcmp(argv[2], "validmulti")) validate_attention_multi(data, cfg, weights);
+}
+
+
diff --git a/image.darknet/inst/include/darknet/src/captcha.c b/image.darknet/inst/include/darknet/examples/captcha.c
similarity index 88%
rename from image.darknet/inst/include/darknet/src/captcha.c
rename to image.darknet/inst/include/darknet/examples/captcha.c
index 3d449b2..41d6d07 100644
--- a/image.darknet/inst/include/darknet/src/captcha.c
+++ b/image.darknet/inst/include/darknet/examples/captcha.c
@@ -1,6 +1,4 @@
-#include "network.h"
-#include "utils.h"
-#include "parser.h"
+#include "darknet.h"
 
 void fix_data_captcha(data d, int mask)
 {
@@ -32,13 +30,10 @@ void train_captcha(char *cfgfile, char *weightfile)
     float avg_loss = -1;
     char *base = basecfg(cfgfile);
     printf("%s\n", base);
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
+    network *net = load_network(cfgfile, weightfile, 0);
+    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
     int imgs = 1024;
-    int i = *net.seen/imgs;
+    int i = *net->seen/imgs;
     int solved = 1;
     list *plist;
     char **labels = get_labels("/data/captcha/reimgs.labels.list");
@@ -55,8 +50,8 @@ void train_captcha(char *cfgfile, char *weightfile)
     data buffer;
 
     load_args args = {0};
-    args.w = net.w;
-    args.h = net.h;
+    args.w = net->w;
+    args.h = net->h;
     args.paths = paths;
     args.classes = 26;
     args.n = imgs;
@@ -85,7 +80,7 @@ void train_captcha(char *cfgfile, char *weightfile)
         float loss = train_network(net, train);
         if(avg_loss == -1) avg_loss = loss;
         avg_loss = avg_loss*.9 + loss*.1;
-        printf("%d: %f, %f avg, %lf seconds, %d images\n", i, loss, avg_loss, sec(clock()-time), *net.seen);
+        printf("%d: %f, %f avg, %lf seconds, %ld images\n", i, loss, avg_loss, sec(clock()-time), *net->seen);
         free_data(train);
         if(i%100==0){
             char buff[256];
@@ -97,11 +92,8 @@ void train_captcha(char *cfgfile, char *weightfile)
 
 void test_captcha(char *cfgfile, char *weightfile, char *filename)
 {
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    set_batch_network(&net, 1);
+    network *net = load_network(cfgfile, weightfile, 0);
+    set_batch_network(net, 1);
     srand(2222222);
     int i = 0;
     char **names = get_labels("/data/captcha/reimgs.labels.list");
@@ -118,7 +110,7 @@ void test_captcha(char *cfgfile, char *weightfile, char *filename)
             if(!input) return;
             strtok(input, "\n");
         }
-        image im = load_image_color(input, net.w, net.h);
+        image im = load_image_color(input, net->w, net->h);
         float *X = im.data;
         float *predictions = network_predict(net, X);
         top_predictions(net, 26, indexes);
@@ -138,21 +130,18 @@ void test_captcha(char *cfgfile, char *weightfile, char *filename)
 void valid_captcha(char *cfgfile, char *weightfile, char *filename)
 {
     char **labels = get_labels("/data/captcha/reimgs.labels.list");
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
+    network *net = load_network(cfgfile, weightfile, 0);
     list *plist = get_paths("/data/captcha/reimgs.fg.list");
     char **paths = (char **)list_to_array(plist);
     int N = plist->size;
-    int outputs = net.outputs;
+    int outputs = net->outputs;
 
-    set_batch_network(&net, 1);
+    set_batch_network(net, 1);
     srand(2222222);
     int i, j;
     for(i = 0; i < N; ++i){
         if (i%100 == 0) fprintf(stderr, "%d\n", i);
-        image im = load_image_color(paths[i], net.w, net.h);
+        image im = load_image_color(paths[i], net->w, net->h);
         float *X = im.data;
         float *predictions = network_predict(net, X);
         //printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time));
@@ -187,9 +176,9 @@ void valid_captcha(char *cfgfile, char *weightfile, char *filename)
    if(weightfile){
    load_weights(&net, weightfile);
    }
-   printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
+   printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
    int imgs = 1024;
-   int i = net.seen/imgs;
+   int i = net->seen/imgs;
    list *plist = get_paths("/data/captcha/train.auto5");
    char **paths = (char **)list_to_array(plist);
    printf("%d\n", plist->size);
@@ -203,10 +192,10 @@ void valid_captcha(char *cfgfile, char *weightfile, char *filename)
    printf("Loaded: %lf seconds\n", sec(clock()-time));
    time=clock();
    float loss = train_network(net, train);
-   net.seen += imgs;
+   net->seen += imgs;
    if(avg_loss == -1) avg_loss = loss;
    avg_loss = avg_loss*.9 + loss*.1;
-   printf("%d: %f, %f avg, %lf seconds, %d images\n", i, loss, avg_loss, sec(clock()-time), net.seen);
+   printf("%d: %f, %f avg, %lf seconds, %d images\n", i, loss, avg_loss, sec(clock()-time), net->seen);
    free_data(train);
    if(i%10==0){
    char buff[256];
@@ -253,9 +242,9 @@ network net = parse_network_cfg(cfgfile);
 if(weightfile){
     load_weights(&net, weightfile);
 }
-printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
+printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
 int imgs = 1024;
-int i = net.seen/imgs;
+int i = net->seen/imgs;
 list *plist = get_paths("/data/captcha/encode.list");
 char **paths = (char **)list_to_array(plist);
 printf("%d\n", plist->size);
@@ -268,10 +257,10 @@ while(1){
     printf("Loaded: %lf seconds\n", sec(clock()-time));
     time=clock();
     float loss = train_network(net, train);
-    net.seen += imgs;
+    net->seen += imgs;
     if(avg_loss == -1) avg_loss = loss;
     avg_loss = avg_loss*.9 + loss*.1;
-    printf("%d: %f, %f avg, %lf seconds, %d images\n", i, loss, avg_loss, sec(clock()-time), net.seen);
+    printf("%d: %f, %f avg, %lf seconds, %d images\n", i, loss, avg_loss, sec(clock()-time), net->seen);
     free_matrix(train.X);
     if(i%100==0){
         char buff[256];
diff --git a/image.darknet/src/cifar.c b/image.darknet/inst/include/darknet/examples/cifar.c
similarity index 77%
rename from image.darknet/src/cifar.c
rename to image.darknet/inst/include/darknet/examples/cifar.c
index d0ac459..a5f5f24 100644
--- a/image.darknet/src/cifar.c
+++ b/image.darknet/inst/include/darknet/examples/cifar.c
@@ -1,12 +1,4 @@
-#include "network.h"
-#include "utils.h"
-#include "parser.h"
-#include "option_list.h"
-#include "blas.h"
-
-#ifdef OPENCV
-#include "opencv2/highgui/highgui_c.h"
-#endif
+#include "darknet.h"
 
 void train_cifar(char *cfgfile, char *weightfile)
 {
@@ -14,28 +6,25 @@ void train_cifar(char *cfgfile, char *weightfile)
     float avg_loss = -1;
     char *base = basecfg(cfgfile);
     printf("%s\n", base);
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
+    network *net = load_network(cfgfile, weightfile, 0);
+    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
 
     char *backup_directory = "/home/pjreddie/backup/";
     int classes = 10;
     int N = 50000;
 
     char **labels = get_labels("data/cifar/labels.txt");
-    int epoch = (*net.seen)/N;
+    int epoch = (*net->seen)/N;
     data train = load_all_cifar10();
-    while(get_current_batch(net) < net.max_batches || net.max_batches == 0){
+    while(get_current_batch(net) < net->max_batches || net->max_batches == 0){
         clock_t time=clock();
 
         float loss = train_network_sgd(net, train, 1);
         if(avg_loss == -1) avg_loss = loss;
         avg_loss = avg_loss*.95 + loss*.05;
-        printf("%d, %.3f: %f, %f avg, %f rate, %lf seconds, %d images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen);
-        if(*net.seen/N > epoch){
-            epoch = *net.seen/N;
+        printf("%ld, %.3f: %f, %f avg, %f rate, %lf seconds, %ld images\n", get_current_batch(net), (float)(*net->seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net->seen);
+        if(*net->seen/N > epoch){
+            epoch = *net->seen/N;
             char buff[256];
             sprintf(buff, "%s/%s_%d.weights",backup_directory,base, epoch);
             save_weights(net, buff);
@@ -62,18 +51,15 @@ void train_cifar_distill(char *cfgfile, char *weightfile)
     float avg_loss = -1;
     char *base = basecfg(cfgfile);
     printf("%s\n", base);
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
+    network *net = load_network(cfgfile, weightfile, 0);
+    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
 
     char *backup_directory = "/home/pjreddie/backup/";
     int classes = 10;
     int N = 50000;
 
     char **labels = get_labels("data/cifar/labels.txt");
-    int epoch = (*net.seen)/N;
+    int epoch = (*net->seen)/N;
 
     data train = load_all_cifar10();
     matrix soft = csv_to_matrix("results/ensemble.csv");
@@ -83,15 +69,15 @@ void train_cifar_distill(char *cfgfile, char *weightfile)
     scale_matrix(train.y, 1. - weight);
     matrix_add_matrix(soft, train.y);
 
-    while(get_current_batch(net) < net.max_batches || net.max_batches == 0){
+    while(get_current_batch(net) < net->max_batches || net->max_batches == 0){
         clock_t time=clock();
 
         float loss = train_network_sgd(net, train, 1);
         if(avg_loss == -1) avg_loss = loss;
         avg_loss = avg_loss*.95 + loss*.05;
-        printf("%d, %.3f: %f, %f avg, %f rate, %lf seconds, %d images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen);
-        if(*net.seen/N > epoch){
-            epoch = *net.seen/N;
+        printf("%ld, %.3f: %f, %f avg, %f rate, %lf seconds, %ld images\n", get_current_batch(net), (float)(*net->seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net->seen);
+        if(*net->seen/N > epoch){
+            epoch = *net->seen/N;
             char buff[256];
             sprintf(buff, "%s/%s_%d.weights",backup_directory,base, epoch);
             save_weights(net, buff);
@@ -114,11 +100,8 @@ void train_cifar_distill(char *cfgfile, char *weightfile)
 
 void test_cifar_multi(char *filename, char *weightfile)
 {
-    network net = parse_network_cfg(filename);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    set_batch_network(&net, 1);
+    network *net = load_network(filename, weightfile, 0);
+    set_batch_network(net, 1);
     srand(time(0));
 
     float avg_acc = 0;
@@ -146,10 +129,7 @@ void test_cifar_multi(char *filename, char *weightfile)
 
 void test_cifar(char *filename, char *weightfile)
 {
-    network net = parse_network_cfg(filename);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
+    network *net = load_network(filename, weightfile, 0);
     srand(time(0));
 
     clock_t time;
@@ -177,23 +157,20 @@ char *labels[] = {"airplane","automobile","bird","cat","deer","dog","frog","hors
         int class = max_index(train.y.vals[i], 10);
         char buff[256];
         sprintf(buff, "data/cifar/train/%d_%s",i,labels[class]);
-        save_image_png(im, buff);
+        save_image_options(im, buff, PNG, 0);
     }
     for(i = 0; i < test.X.rows; ++i){
         image im = float_to_image(32, 32, 3, test.X.vals[i]);
         int class = max_index(test.y.vals[i], 10);
         char buff[256];
         sprintf(buff, "data/cifar/test/%d_%s",i,labels[class]);
-        save_image_png(im, buff);
+        save_image_options(im, buff, PNG, 0);
     }
 }
 
 void test_cifar_csv(char *filename, char *weightfile)
 {
-    network net = parse_network_cfg(filename);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
+    network *net = load_network(filename, weightfile, 0);
     srand(time(0));
 
     data test = load_cifar10_data("data/cifar/cifar-10-batches-bin/test_batch.bin");
@@ -215,12 +192,9 @@ void test_cifar_csv(char *filename, char *weightfile)
     free_data(test);
 }
 
-void test_cifar_csvtrain(char *filename, char *weightfile)
+void test_cifar_csvtrain(char *cfg, char *weights)
 {
-    network net = parse_network_cfg(filename);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
+    network *net = load_network(cfg, weights, 0);
     srand(time(0));
 
     data test = load_all_cifar10();
diff --git a/image.darknet/inst/include/darknet/src/classifier.c b/image.darknet/inst/include/darknet/examples/classifier.c
similarity index 72%
rename from image.darknet/inst/include/darknet/src/classifier.c
rename to image.darknet/inst/include/darknet/examples/classifier.c
index 586530a..df91a08 100644
--- a/image.darknet/inst/include/darknet/src/classifier.c
+++ b/image.darknet/inst/include/darknet/examples/classifier.c
@@ -1,17 +1,7 @@
-#include "network.h"
-#include "utils.h"
-#include "parser.h"
-#include "option_list.h"
-#include "blas.h"
-#include "assert.h"
-#include "classifier.h"
-#include "cuda.h"
-#include <sys/time.h>
+#include "darknet.h"
 
-#ifdef OPENCV
-#include "opencv2/highgui/highgui_c.h"
-image get_image_from_stream(CvCapture *cap);
-#endif
+#include <sys/time.h>
+#include <assert.h>
 
 float *get_regression_values(char **labels, int n)
 {
@@ -33,7 +23,7 @@ void train_classifier(char *datacfg, char *cfgfile, char *weightfile, int *gpus,
     char *base = basecfg(cfgfile);
     printf("%s\n", base);
     printf("%d\n", ngpus);
-    network *nets = calloc(ngpus, sizeof(network));
+    network **nets = calloc(ngpus, sizeof(network*));
 
     srand(time(0));
     int seed = rand();
@@ -42,54 +32,61 @@ void train_classifier(char *datacfg, char *cfgfile, char *weightfile, int *gpus,
 #ifdef GPU
         cuda_set_device(gpus[i]);
 #endif
-        nets[i] = parse_network_cfg(cfgfile);
-        if(weightfile){
-            load_weights(&nets[i], weightfile);
-        }
-        if(clear) *nets[i].seen = 0;
-        nets[i].learning_rate *= ngpus;
+        nets[i] = load_network(cfgfile, weightfile, clear);
+        nets[i]->learning_rate *= ngpus;
     }
     srand(time(0));
-    network net = nets[0];
+    network *net = nets[0];
 
-    int imgs = net.batch * net.subdivisions * ngpus;
+    int imgs = net->batch * net->subdivisions * ngpus;
 
-    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
+    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
     list *options = read_data_cfg(datacfg);
 
     char *backup_directory = option_find_str(options, "backup", "/backup/");
+    int tag = option_find_int_quiet(options, "tag", 0);
     char *label_list = option_find_str(options, "labels", "data/labels.list");
     char *train_list = option_find_str(options, "train", "data/train.list");
+    char *tree = option_find_str(options, "tree", 0);
+    if (tree) net->hierarchy = read_tree(tree);
     int classes = option_find_int(options, "classes", 2);
 
-    char **labels = get_labels(label_list);
+    char **labels = 0;
+    if(!tag){
+        labels = get_labels(label_list);
+    }
     list *plist = get_paths(train_list);
     char **paths = (char **)list_to_array(plist);
     printf("%d\n", plist->size);
     int N = plist->size;
-    clock_t time;
+    double time;
 
     load_args args = {0};
-    args.w = net.w;
-    args.h = net.h;
+    args.w = net->w;
+    args.h = net->h;
     args.threads = 32;
-    args.hierarchy = net.hierarchy;
-
-    args.min = net.min_crop;
-    args.max = net.max_crop;
-    args.angle = net.angle;
-    args.aspect = net.aspect;
-    args.exposure = net.exposure;
-    args.saturation = net.saturation;
-    args.hue = net.hue;
-    args.size = net.w;
+    args.hierarchy = net->hierarchy;
+
+    args.min = net->min_ratio*net->w;
+    args.max = net->max_ratio*net->w;
+    printf("%d %d\n", args.min, args.max);
+    args.angle = net->angle;
+    args.aspect = net->aspect;
+    args.exposure = net->exposure;
+    args.saturation = net->saturation;
+    args.hue = net->hue;
+    args.size = net->w;
 
     args.paths = paths;
     args.classes = classes;
     args.n = imgs;
     args.m = N;
     args.labels = labels;
-    args.type = CLASSIFICATION_DATA;
+    if (tag){
+        args.type = TAG_DATA;
+    } else {
+        args.type = CLASSIFICATION_DATA;
+    }
 
     data train;
     data buffer;
@@ -97,16 +94,40 @@ void train_classifier(char *datacfg, char *cfgfile, char *weightfile, int *gpus,
     args.d = &buffer;
     load_thread = load_data(args);
 
-    int epoch = (*net.seen)/N;
-    while(get_current_batch(net) < net.max_batches || net.max_batches == 0){
-        time=clock();
+    int count = 0;
+    int epoch = (*net->seen)/N;
+    while(get_current_batch(net) < net->max_batches || net->max_batches == 0){
+        if(net->random && count++%40 == 0){
+            printf("Resizing\n");
+            int dim = (rand() % 11 + 4) * 32;
+            //if (get_current_batch(net)+200 > net->max_batches) dim = 608;
+            //int dim = (rand() % 4 + 16) * 32;
+            printf("%d\n", dim);
+            args.w = dim;
+            args.h = dim;
+            args.size = dim;
+            args.min = net->min_ratio*dim;
+            args.max = net->max_ratio*dim;
+            printf("%d %d\n", args.min, args.max);
+
+            pthread_join(load_thread, 0);
+            train = buffer;
+            free_data(train);
+            load_thread = load_data(args);
+
+            for(i = 0; i < ngpus; ++i){
+                resize_network(nets[i], dim, dim);
+            }
+            net = nets[0];
+        }
+        time = what_time_is_it_now();
 
         pthread_join(load_thread, 0);
         train = buffer;
         load_thread = load_data(args);
 
-        printf("Loaded: %lf seconds\n", sec(clock()-time));
-        time=clock();
+        printf("Loaded: %lf seconds\n", what_time_is_it_now()-time);
+        time = what_time_is_it_now();
 
         float loss = 0;
 #ifdef GPU
@@ -120,15 +141,15 @@ void train_classifier(char *datacfg, char *cfgfile, char *weightfile, int *gpus,
 #endif
         if(avg_loss == -1) avg_loss = loss;
         avg_loss = avg_loss*.9 + loss*.1;
-        printf("%d, %.3f: %f, %f avg, %f rate, %lf seconds, %d images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen);
+        printf("%ld, %.3f: %f, %f avg, %f rate, %lf seconds, %ld images\n", get_current_batch(net), (float)(*net->seen)/N, loss, avg_loss, get_current_rate(net), what_time_is_it_now()-time, *net->seen);
         free_data(train);
-        if(*net.seen/N > epoch){
-            epoch = *net.seen/N;
+        if(*net->seen/N > epoch){
+            epoch = *net->seen/N;
             char buff[256];
             sprintf(buff, "%s/%s_%d.weights",backup_directory,base, epoch);
             save_weights(net, buff);
         }
-        if(get_current_batch(net)%100 == 0){
+        if(get_current_batch(net)%1000 == 0){
             char buff[256];
             sprintf(buff, "%s/%s.backup",backup_directory,base);
             save_weights(net, buff);
@@ -137,132 +158,19 @@ void train_classifier(char *datacfg, char *cfgfile, char *weightfile, int *gpus,
     char buff[256];
     sprintf(buff, "%s/%s.weights", backup_directory, base);
     save_weights(net, buff);
+    pthread_join(load_thread, 0);
 
     free_network(net);
-    free_ptrs((void**)labels, classes);
+    if(labels) free_ptrs((void**)labels, classes);
     free_ptrs((void**)paths, plist->size);
     free_list(plist);
     free(base);
 }
 
-
-/*
-   void train_classifier(char *datacfg, char *cfgfile, char *weightfile, int clear)
-   {
-   srand(time(0));
-   float avg_loss = -1;
-   char *base = basecfg(cfgfile);
-   printf("%s\n", base);
-   network net = parse_network_cfg(cfgfile);
-   if(weightfile){
-   load_weights(&net, weightfile);
-   }
-   if(clear) *net.seen = 0;
-
-   int imgs = net.batch * net.subdivisions;
-
-   printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
-   list *options = read_data_cfg(datacfg);
-
-   char *backup_directory = option_find_str(options, "backup", "/backup/");
-   char *label_list = option_find_str(options, "labels", "data/labels.list");
-   char *train_list = option_find_str(options, "train", "data/train.list");
-   int classes = option_find_int(options, "classes", 2);
-
-   char **labels = get_labels(label_list);
-   list *plist = get_paths(train_list);
-   char **paths = (char **)list_to_array(plist);
-   printf("%d\n", plist->size);
-   int N = plist->size;
-   clock_t time;
-
-   load_args args = {0};
-   args.w = net.w;
-   args.h = net.h;
-   args.threads = 8;
-
-   args.min = net.min_crop;
-   args.max = net.max_crop;
-   args.angle = net.angle;
-   args.aspect = net.aspect;
-   args.exposure = net.exposure;
-   args.saturation = net.saturation;
-   args.hue = net.hue;
-   args.size = net.w;
-   args.hierarchy = net.hierarchy;
-
-   args.paths = paths;
-   args.classes = classes;
-   args.n = imgs;
-   args.m = N;
-   args.labels = labels;
-   args.type = CLASSIFICATION_DATA;
-
-   data train;
-   data buffer;
-   pthread_t load_thread;
-   args.d = &buffer;
-   load_thread = load_data(args);
-
-   int epoch = (*net.seen)/N;
-   while(get_current_batch(net) < net.max_batches || net.max_batches == 0){
-   time=clock();
-
-   pthread_join(load_thread, 0);
-   train = buffer;
-   load_thread = load_data(args);
-
-   printf("Loaded: %lf seconds\n", sec(clock()-time));
-   time=clock();
-
-#ifdef OPENCV
-if(0){
-int u;
-for(u = 0; u < imgs; ++u){
-    image im = float_to_image(net.w, net.h, 3, train.X.vals[u]);
-    show_image(im, "loaded");
-    cvWaitKey(0);
-}
-}
-#endif
-
-float loss = train_network(net, train);
-free_data(train);
-
-if(avg_loss == -1) avg_loss = loss;
-avg_loss = avg_loss*.9 + loss*.1;
-printf("%d, %.3f: %f, %f avg, %f rate, %lf seconds, %d images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen);
-if(*net.seen/N > epoch){
-    epoch = *net.seen/N;
-    char buff[256];
-    sprintf(buff, "%s/%s_%d.weights",backup_directory,base, epoch);
-    save_weights(net, buff);
-}
-if(get_current_batch(net)%100 == 0){
-    char buff[256];
-    sprintf(buff, "%s/%s.backup",backup_directory,base);
-    save_weights(net, buff);
-}
-}
-char buff[256];
-sprintf(buff, "%s/%s.weights", backup_directory, base);
-save_weights(net, buff);
-
-free_network(net);
-free_ptrs((void**)labels, classes);
-free_ptrs((void**)paths, plist->size);
-free_list(plist);
-free(base);
-}
-*/
-
 void validate_classifier_crop(char *datacfg, char *filename, char *weightfile)
 {
     int i = 0;
-    network net = parse_network_cfg(filename);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
+    network *net = load_network(filename, weightfile, 0);
     srand(time(0));
 
     list *options = read_data_cfg(datacfg);
@@ -288,8 +196,8 @@ void validate_classifier_crop(char *datacfg, char *filename, char *weightfile)
     data val, buffer;
 
     load_args args = {0};
-    args.w = net.w;
-    args.h = net.h;
+    args.w = net->w;
+    args.h = net->h;
 
     args.paths = paths;
     args.classes = classes;
@@ -326,11 +234,8 @@ void validate_classifier_crop(char *datacfg, char *filename, char *weightfile)
 void validate_classifier_10(char *datacfg, char *filename, char *weightfile)
 {
     int i, j;
-    network net = parse_network_cfg(filename);
-    set_batch_network(&net, 1);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
+    network *net = load_network(filename, weightfile, 0);
+    set_batch_network(net, 1);
     srand(time(0));
 
     list *options = read_data_cfg(datacfg);
@@ -360,8 +265,8 @@ void validate_classifier_10(char *datacfg, char *filename, char *weightfile)
                 break;
             }
         }
-        int w = net.w;
-        int h = net.h;
+        int w = net->w;
+        int h = net->h;
         int shift = 32;
         image im = load_image_color(paths[i], w+shift, h+shift);
         image images[10];
@@ -379,7 +284,7 @@ void validate_classifier_10(char *datacfg, char *filename, char *weightfile)
         float *pred = calloc(classes, sizeof(float));
         for(j = 0; j < 10; ++j){
             float *p = network_predict(net, images[j].data);
-            if(net.hierarchy) hierarchy_predictions(p, net.outputs, net.hierarchy, 1);
+            if(net->hierarchy) hierarchy_predictions(p, net->outputs, net->hierarchy, 1, 1);
             axpy_cpu(classes, 1, p, 1, pred, 1);
             free_image(images[j]);
         }
@@ -398,11 +303,8 @@ void validate_classifier_10(char *datacfg, char *filename, char *weightfile)
 void validate_classifier_full(char *datacfg, char *filename, char *weightfile)
 {
     int i, j;
-    network net = parse_network_cfg(filename);
-    set_batch_network(&net, 1);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
+    network *net = load_network(filename, weightfile, 0);
+    set_batch_network(net, 1);
     srand(time(0));
 
     list *options = read_data_cfg(datacfg);
@@ -423,7 +325,7 @@ void validate_classifier_full(char *datacfg, char *filename, char *weightfile)
     float avg_topk = 0;
     int *indexes = calloc(topk, sizeof(int));
 
-    int size = net.w;
+    int size = net->w;
     for(i = 0; i < m; ++i){
         int class = -1;
         char *path = paths[i];
@@ -435,12 +337,12 @@ void validate_classifier_full(char *datacfg, char *filename, char *weightfile)
         }
         image im = load_image_color(paths[i], 0, 0);
         image resized = resize_min(im, size);
-        resize_network(&net, resized.w, resized.h);
+        resize_network(net, resized.w, resized.h);
         //show_image(im, "orig");
         //show_image(crop, "cropped");
         //cvWaitKey(0);
         float *pred = network_predict(net, resized.data);
-        if(net.hierarchy) hierarchy_predictions(pred, net.outputs, net.hierarchy, 1);
+        if(net->hierarchy) hierarchy_predictions(pred, net->outputs, net->hierarchy, 1, 1);
 
         free_image(im);
         free_image(resized);
@@ -459,18 +361,15 @@ void validate_classifier_full(char *datacfg, char *filename, char *weightfile)
 void validate_classifier_single(char *datacfg, char *filename, char *weightfile)
 {
     int i, j;
-    network net = parse_network_cfg(filename);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    set_batch_network(&net, 1);
+    network *net = load_network(filename, weightfile, 0);
+    set_batch_network(net, 1);
     srand(time(0));
 
     list *options = read_data_cfg(datacfg);
 
     char *label_list = option_find_str(options, "labels", "data/labels.list");
     char *leaf_list = option_find_str(options, "leaves", 0);
-    if(leaf_list) change_leaves(net.hierarchy, leaf_list);
+    if(leaf_list) change_leaves(net->hierarchy, leaf_list);
     char *valid_list = option_find_str(options, "valid", "data/train.list");
     int classes = option_find_int(options, "classes", 2);
     int topk = option_find_int(options, "top", 1);
@@ -496,15 +395,14 @@ void validate_classifier_single(char *datacfg, char *filename, char *weightfile)
             }
         }
         image im = load_image_color(paths[i], 0, 0);
-        image resized = resize_min(im, net.w);
-        image crop = crop_image(resized, (resized.w - net.w)/2, (resized.h - net.h)/2, net.w, net.h);
+        image crop = center_crop_image(im, net->w, net->h);
+        //grayscale_image_3c(crop);
         //show_image(im, "orig");
         //show_image(crop, "cropped");
         //cvWaitKey(0);
         float *pred = network_predict(net, crop.data);
-        if(net.hierarchy) hierarchy_predictions(pred, net.outputs, net.hierarchy, 1);
+        if(net->hierarchy) hierarchy_predictions(pred, net->outputs, net->hierarchy, 1, 1);
 
-        if(resized.data != im.data) free_image(resized);
         free_image(im);
         free_image(crop);
         top_k(pred, classes, topk, indexes);
@@ -514,18 +412,16 @@ void validate_classifier_single(char *datacfg, char *filename, char *weightfile)
             if(indexes[j] == class) avg_topk += 1;
         }
 
+        printf("%s, %d, %f, %f, \n", paths[i], class, pred[0], pred[1]);
         printf("%d: top 1: %f, top %d: %f\n", i, avg_acc/(i+1), topk, avg_topk/(i+1));
     }
 }
 
-void validate_classifier_multi(char *datacfg, char *filename, char *weightfile)
+void validate_classifier_multi(char *datacfg, char *cfg, char *weights)
 {
     int i, j;
-    network net = parse_network_cfg(filename);
-    set_batch_network(&net, 1);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
+    network *net = load_network(cfg, weights, 0);
+    set_batch_network(net, 1);
     srand(time(0));
 
     list *options = read_data_cfg(datacfg);
@@ -537,7 +433,8 @@ void validate_classifier_multi(char *datacfg, char *filename, char *weightfile)
 
     char **labels = get_labels(label_list);
     list *plist = get_paths(valid_list);
-    int scales[] = {224, 288, 320, 352, 384};
+    //int scales[] = {224, 288, 320, 352, 384};
+    int scales[] = {224, 256, 288, 320};
     int nscales = sizeof(scales)/sizeof(scales[0]);
 
     char **paths = (char **)list_to_array(plist);
@@ -560,10 +457,10 @@ void validate_classifier_multi(char *datacfg, char *filename, char *weightfile)
         float *pred = calloc(classes, sizeof(float));
         image im = load_image_color(paths[i], 0, 0);
         for(j = 0; j < nscales; ++j){
-            image r = resize_min(im, scales[j]);
-            resize_network(&net, r.w, r.h);
+            image r = resize_max(im, scales[j]);
+            resize_network(net, r.w, r.h);
             float *p = network_predict(net, r.data);
-            if(net.hierarchy) hierarchy_predictions(p, net.outputs, net.hierarchy, 1);
+            if(net->hierarchy) hierarchy_predictions(p, net->outputs, net->hierarchy, 1 , 1);
             axpy_cpu(classes, 1, p, 1, pred, 1);
             flip_image(r);
             p = network_predict(net, r.data);
@@ -584,11 +481,8 @@ void validate_classifier_multi(char *datacfg, char *filename, char *weightfile)
 
 void try_classifier(char *datacfg, char *cfgfile, char *weightfile, char *filename, int layer_num)
 {
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    set_batch_network(&net, 1);
+    network *net = load_network(cfgfile, weightfile, 0);
+    set_batch_network(net, 1);
     srand(2222222);
 
     list *options = read_data_cfg(datacfg);
@@ -629,7 +523,7 @@ void try_classifier(char *datacfg, char *cfgfile, char *weightfile, char *filena
         time=clock();
         float *predictions = network_predict(net, X);
 
-        layer l = net.layers[layer_num];
+        layer l = net->layers[layer_num];
         for(i = 0; i < l.c; ++i){
             if(l.rolling_mean) printf("%f %f %f\n", l.rolling_mean[i], l.rolling_variance[i], l.scales[i]);
         }
@@ -665,11 +559,8 @@ void try_classifier(char *datacfg, char *cfgfile, char *weightfile, char *filena
 
 void predict_classifier(char *datacfg, char *cfgfile, char *weightfile, char *filename, int top)
 {
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    set_batch_network(&net, 1);
+    network *net = load_network(cfgfile, weightfile, 0);
+    set_batch_network(net, 1);
     srand(2222222);
 
     list *options = read_data_cfg(datacfg);
@@ -684,7 +575,6 @@ void predict_classifier(char *datacfg, char *cfgfile, char *weightfile, char *fi
     int *indexes = calloc(top, sizeof(int));
     char buff[256];
     char *input = buff;
-    int size = net.w;
     while(1){
         if(filename){
             strncpy(input, filename, 256);
@@ -696,20 +586,23 @@ void predict_classifier(char *datacfg, char *cfgfile, char *weightfile, char *fi
             strtok(input, "\n");
         }
         image im = load_image_color(input, 0, 0);
-        image r = resize_min(im, size);
-        resize_network(&net, r.w, r.h);
-        printf("%d %d\n", r.w, r.h);
+        image r = letterbox_image(im, net->w, net->h);
+        //image r = resize_min(im, 320);
+        //printf("%d %d\n", r.w, r.h);
+        //resize_network(net, r.w, r.h);
+        //printf("%d %d\n", r.w, r.h);
 
         float *X = r.data;
         time=clock();
         float *predictions = network_predict(net, X);
-        if(net.hierarchy) hierarchy_predictions(predictions, net.outputs, net.hierarchy, 0);
-        top_k(predictions, net.outputs, top, indexes);
-        printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time));
+        if(net->hierarchy) hierarchy_predictions(predictions, net->outputs, net->hierarchy, 1, 1);
+        top_k(predictions, net->outputs, top, indexes);
+        fprintf(stderr, "%s: Predicted in %f seconds.\n", input, sec(clock()-time));
         for(i = 0; i < top; ++i){
             int index = indexes[i];
-            if(net.hierarchy) printf("%d, %s: %f, parent: %s \n",index, names[index], predictions[index], (net.hierarchy->parent[index] >= 0) ? names[net.hierarchy->parent[index]] : "Root");
-            else printf("%s: %f\n",names[index], predictions[index]);
+            //if(net->hierarchy) printf("%d, %s: %f, parent: %s \n",index, names[index], predictions[index], (net->hierarchy->parent[index] >= 0) ? names[net->hierarchy->parent[index]] : "Root");
+            //else printf("%s: %f\n",names[index], predictions[index]);
+            printf("%5.2f%%: %s\n", predictions[index]*100, names[index]);
         }
         if(r.data != im.data) free_image(r);
         free_image(im);
@@ -721,11 +614,8 @@ void predict_classifier(char *datacfg, char *cfgfile, char *weightfile, char *fi
 void label_classifier(char *datacfg, char *filename, char *weightfile)
 {
     int i;
-    network net = parse_network_cfg(filename);
-    set_batch_network(&net, 1);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
+    network *net = load_network(filename, weightfile, 0);
+    set_batch_network(net, 1);
     srand(time(0));
 
     list *options = read_data_cfg(datacfg);
@@ -743,8 +633,8 @@ void label_classifier(char *datacfg, char *filename, char *weightfile)
 
     for(i = 0; i < m; ++i){
         image im = load_image_color(paths[i], 0, 0);
-        image resized = resize_min(im, net.w);
-        image crop = crop_image(resized, (resized.w - net.w)/2, (resized.h - net.h)/2, net.w, net.h);
+        image resized = resize_min(im, net->w);
+        image crop = crop_image(resized, (resized.w - net->w)/2, (resized.h - net->h)/2, net->w, net->h);
         float *pred = network_predict(net, crop.data);
 
         if(resized.data != im.data) free_image(resized);
@@ -756,14 +646,50 @@ void label_classifier(char *datacfg, char *filename, char *weightfile)
     }
 }
 
+void csv_classifier(char *datacfg, char *cfgfile, char *weightfile)
+{
+    int i,j;
+    network *net = load_network(cfgfile, weightfile, 0);
+    srand(time(0));
+
+    list *options = read_data_cfg(datacfg);
+
+    char *test_list = option_find_str(options, "test", "data/test.list");
+    int top = option_find_int(options, "top", 1);
+
+    list *plist = get_paths(test_list);
+
+    char **paths = (char **)list_to_array(plist);
+    int m = plist->size;
+    free_list(plist);
+    int *indexes = calloc(top, sizeof(int));
+
+    for(i = 0; i < m; ++i){
+        double time = what_time_is_it_now();
+        char *path = paths[i];
+        image im = load_image_color(path, 0, 0);
+        image r = letterbox_image(im, net->w, net->h);
+        float *predictions = network_predict(net, r.data);
+        if(net->hierarchy) hierarchy_predictions(predictions, net->outputs, net->hierarchy, 1, 1);
+        top_k(predictions, net->outputs, top, indexes);
+
+        printf("%s", path);
+        for(j = 0; j < top; ++j){
+            printf("\t%d", indexes[j]);
+        }
+        printf("\n");
+
+        free_image(im);
+        free_image(r);
+
+        fprintf(stderr, "%lf seconds, %d images, %d total\n", what_time_is_it_now() - time, i+1, m);
+    }
+}
 
 void test_classifier(char *datacfg, char *cfgfile, char *weightfile, int target_layer)
 {
     int curr = 0;
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
+    network *net = load_network(cfgfile, weightfile, 0);
     srand(time(0));
 
     list *options = read_data_cfg(datacfg);
@@ -782,18 +708,18 @@ void test_classifier(char *datacfg, char *cfgfile, char *weightfile, int target_
     data val, buffer;
 
     load_args args = {0};
-    args.w = net.w;
-    args.h = net.h;
+    args.w = net->w;
+    args.h = net->h;
     args.paths = paths;
     args.classes = classes;
-    args.n = net.batch;
+    args.n = net->batch;
     args.m = 0;
     args.labels = 0;
     args.d = &buffer;
     args.type = OLD_CLASSIFICATION_DATA;
 
     pthread_t load_thread = load_data_in_thread(args);
-    for(curr = net.batch; curr < m; curr += net.batch){
+    for(curr = net->batch; curr < m; curr += net->batch){
         time=clock();
 
         pthread_join(load_thread, 0);
@@ -801,7 +727,7 @@ void test_classifier(char *datacfg, char *cfgfile, char *weightfile, int target_
 
         if(curr < m){
             args.paths = paths + curr;
-            if (curr + net.batch > m) args.n = m - curr;
+            if (curr + net->batch > m) args.n = m - curr;
             load_thread = load_data_in_thread(args);
         }
         fprintf(stderr, "Loaded: %d images in %lf seconds\n", val.X.rows, sec(clock()-time));
@@ -811,11 +737,11 @@ void test_classifier(char *datacfg, char *cfgfile, char *weightfile, int target_
 
         int i, j;
         if (target_layer >= 0){
-            //layer l = net.layers[target_layer];
+            //layer l = net->layers[target_layer];
         }
 
         for(i = 0; i < pred.rows; ++i){
-            printf("%s", paths[curr-net.batch+i]);
+            printf("%s", paths[curr-net->batch+i]);
             for(j = 0; j < pred.cols; ++j){
                 printf("\t%g", pred.vals[i][j]);
             }
@@ -829,6 +755,44 @@ void test_classifier(char *datacfg, char *cfgfile, char *weightfile, int target_
     }
 }
 
+void file_output_classifier(char *datacfg, char *filename, char *weightfile, char *listfile)
+{
+    int i,j;
+    network *net = load_network(filename, weightfile, 0);
+    set_batch_network(net, 1);
+    srand(time(0));
+
+    list *options = read_data_cfg(datacfg);
+
+    //char *label_list = option_find_str(options, "names", "data/labels.list");
+    int classes = option_find_int(options, "classes", 2);
+
+    list *plist = get_paths(listfile);
+
+    char **paths = (char **)list_to_array(plist);
+    int m = plist->size;
+    free_list(plist);
+
+    for(i = 0; i < m; ++i){
+        image im = load_image_color(paths[i], 0, 0);
+        image resized = resize_min(im, net->w);
+        image crop = crop_image(resized, (resized.w - net->w)/2, (resized.h - net->h)/2, net->w, net->h);
+
+        float *pred = network_predict(net, crop.data);
+        if(net->hierarchy) hierarchy_predictions(pred, net->outputs, net->hierarchy, 0, 1);
+
+        if(resized.data != im.data) free_image(resized);
+        free_image(im);
+        free_image(crop);
+
+        printf("%s", paths[i]);
+        for(j = 0; j < classes; ++j){
+            printf("\t%g", pred[j]);
+        }
+        printf("\n");
+    }
+}
+
 
 void threat_classifier(char *datacfg, char *cfgfile, char *weightfile, int cam_index, const char *filename)
 {
@@ -837,21 +801,12 @@ void threat_classifier(char *datacfg, char *cfgfile, char *weightfile, int cam_i
     float roll = .2;
 
     printf("Classifier Demo\n");
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    set_batch_network(&net, 1);
+    network *net = load_network(cfgfile, weightfile, 0);
+    set_batch_network(net, 1);
     list *options = read_data_cfg(datacfg);
 
     srand(2222222);
-    CvCapture * cap;
-
-    if(filename){
-        cap = cvCaptureFromFile(filename);
-    }else{
-        cap = cvCaptureFromCAM(cam_index);
-    }
+    void * cap = open_video_stream(filename, cam_index, 0,0,0);
 
     int top = option_find_int(options, "top", 1);
 
@@ -875,7 +830,7 @@ void threat_classifier(char *datacfg, char *cfgfile, char *weightfile, int cam_i
 
         image in = get_image_from_stream(cap);
         if(!in.data) break;
-        image in_s = resize_image(in, net.w, net.h);
+        image in_s = resize_image(in, net->w, net->h);
 
         image out = in;
         int x1 = out.w / 20;
@@ -948,8 +903,7 @@ void threat_classifier(char *datacfg, char *cfgfile, char *weightfile, int cam_i
         }
 
         if(1){
-            show_image(out, "Threat");
-            cvWaitKey(10);
+            show_image(out, "Threat", 10);
         }
         free_image(in_s);
         free_image(in);
@@ -969,21 +923,12 @@ void gun_classifier(char *datacfg, char *cfgfile, char *weightfile, int cam_inde
     int bad_cats[] = {218, 539, 540, 1213, 1501, 1742, 1911, 2415, 4348, 19223, 368, 369, 370, 1133, 1200, 1306, 2122, 2301, 2537, 2823, 3179, 3596, 3639, 4489, 5107, 5140, 5289, 6240, 6631, 6762, 7048, 7171, 7969, 7984, 7989, 8824, 8927, 9915, 10270, 10448, 13401, 15205, 18358, 18894, 18895, 19249, 19697};
 
     printf("Classifier Demo\n");
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    set_batch_network(&net, 1);
+    network *net = load_network(cfgfile, weightfile, 0);
+    set_batch_network(net, 1);
     list *options = read_data_cfg(datacfg);
 
     srand(2222222);
-    CvCapture * cap;
-
-    if(filename){
-        cap = cvCaptureFromFile(filename);
-    }else{
-        cap = cvCaptureFromCAM(cam_index);
-    }
+    void * cap = open_video_stream(filename, cam_index, 0,0,0);
 
     int top = option_find_int(options, "top", 1);
 
@@ -993,8 +938,6 @@ void gun_classifier(char *datacfg, char *cfgfile, char *weightfile, int cam_inde
     int *indexes = calloc(top, sizeof(int));
 
     if(!cap) error("Couldn't connect to webcam.\n");
-    cvNamedWindow("Threat Detection", CV_WINDOW_NORMAL); 
-    cvResizeWindow("Threat Detection", 512, 512);
     float fps = 0;
     int i;
 
@@ -1003,8 +946,7 @@ void gun_classifier(char *datacfg, char *cfgfile, char *weightfile, int cam_inde
         gettimeofday(&tval_before, NULL);
 
         image in = get_image_from_stream(cap);
-        image in_s = resize_image(in, net.w, net.h);
-        show_image(in, "Threat Detection");
+        image in_s = resize_image(in, net->w, net->h);
 
         float *predictions = network_predict(net, in_s.data);
         top_predictions(net, top, indexes);
@@ -1029,11 +971,10 @@ void gun_classifier(char *datacfg, char *cfgfile, char *weightfile, int cam_inde
             }
         }
 
+        show_image(in, "Threat Detection", 10);
         free_image(in_s);
         free_image(in);
 
-        cvWaitKey(10);
-
         gettimeofday(&tval_after, NULL);
         timersub(&tval_after, &tval_before, &tval_result);
         float curr = 1000000.f/((long int)tval_result.tv_usec);
@@ -1045,33 +986,28 @@ void gun_classifier(char *datacfg, char *cfgfile, char *weightfile, int cam_inde
 void demo_classifier(char *datacfg, char *cfgfile, char *weightfile, int cam_index, const char *filename)
 {
 #ifdef OPENCV
+    char *base = basecfg(cfgfile);
+    image **alphabet = load_alphabet();
     printf("Classifier Demo\n");
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    set_batch_network(&net, 1);
+    network *net = load_network(cfgfile, weightfile, 0);
+    set_batch_network(net, 1);
     list *options = read_data_cfg(datacfg);
 
     srand(2222222);
-    CvCapture * cap;
 
-    if(filename){
-        cap = cvCaptureFromFile(filename);
-    }else{
-        cap = cvCaptureFromCAM(cam_index);
-    }
+    int w = 1280;
+    int h = 720;
+    void * cap = open_video_stream(filename, cam_index, w, h, 0);
 
     int top = option_find_int(options, "top", 1);
 
-    char *name_list = option_find_str(options, "names", 0);
+    char *label_list = option_find_str(options, "labels", 0);
+    char *name_list = option_find_str(options, "names", label_list);
     char **names = get_labels(name_list);
 
     int *indexes = calloc(top, sizeof(int));
 
     if(!cap) error("Couldn't connect to webcam.\n");
-    cvNamedWindow("Classifier", CV_WINDOW_NORMAL); 
-    cvResizeWindow("Classifier", 512, 512);
     float fps = 0;
     int i;
 
@@ -1080,27 +1016,38 @@ void demo_classifier(char *datacfg, char *cfgfile, char *weightfile, int cam_ind
         gettimeofday(&tval_before, NULL);
 
         image in = get_image_from_stream(cap);
-        image in_s = resize_image(in, net.w, net.h);
-        show_image(in, "Classifier");
+        //image in_s = resize_image(in, net->w, net->h);
+        image in_s = letterbox_image(in, net->w, net->h);
 
         float *predictions = network_predict(net, in_s.data);
-        if(net.hierarchy) hierarchy_predictions(predictions, net.outputs, net.hierarchy, 1);
+        if(net->hierarchy) hierarchy_predictions(predictions, net->outputs, net->hierarchy, 1, 1);
         top_predictions(net, top, indexes);
 
         printf("\033[2J");
         printf("\033[1;1H");
         printf("\nFPS:%.0f\n",fps);
 
+        int lh = in.h*.03;
+        int toph = 3*lh;
+
+        float rgb[3] = {1,1,1};
         for(i = 0; i < top; ++i){
+            printf("%d\n", toph);
             int index = indexes[i];
             printf("%.1f%%: %s\n", predictions[index]*100, names[index]);
+
+            char buff[1024];
+            sprintf(buff, "%3.1f%%: %s\n", predictions[index]*100, names[index]);
+            image label = get_label(alphabet, buff, lh);
+            draw_label(in, toph, lh, label, rgb);
+            toph += 2*lh;
+            free_image(label);
         }
 
+        show_image(in, base, 10);
         free_image(in_s);
         free_image(in);
 
-        cvWaitKey(10);
-
         gettimeofday(&tval_after, NULL);
         timersub(&tval_after, &tval_before, &tval_result);
         float curr = 1000000.f/((long int)tval_result.tv_usec);
@@ -1118,27 +1065,9 @@ void run_classifier(int argc, char **argv)
     }
 
     char *gpu_list = find_char_arg(argc, argv, "-gpus", 0);
-    int *gpus = 0;
-    int gpu = 0;
-    int ngpus = 0;
-    if(gpu_list){
-        printf("%s\n", gpu_list);
-        int len = strlen(gpu_list);
-        ngpus = 1;
-        int i;
-        for(i = 0; i < len; ++i){
-            if (gpu_list[i] == ',') ++ngpus;
-        }
-        gpus = calloc(ngpus, sizeof(int));
-        for(i = 0; i < ngpus; ++i){
-            gpus[i] = atoi(gpu_list);
-            gpu_list = strchr(gpu_list, ',')+1;
-        }
-    } else {
-        gpu = gpu_index;
-        gpus = &gpu;
-        ngpus = 1;
-    }
+    int ngpus;
+    int *gpus = read_intlist(gpu_list, &ngpus, gpu_index);
+
 
     int cam_index = find_int_arg(argc, argv, "-c", 0);
     int top = find_int_arg(argc, argv, "-t", 0);
@@ -1150,12 +1079,14 @@ void run_classifier(int argc, char **argv)
     char *layer_s = (argc > 7) ? argv[7]: 0;
     int layer = layer_s ? atoi(layer_s) : -1;
     if(0==strcmp(argv[2], "predict")) predict_classifier(data, cfg, weights, filename, top);
+    else if(0==strcmp(argv[2], "fout")) file_output_classifier(data, cfg, weights, filename);
     else if(0==strcmp(argv[2], "try")) try_classifier(data, cfg, weights, filename, atoi(layer_s));
     else if(0==strcmp(argv[2], "train")) train_classifier(data, cfg, weights, gpus, ngpus, clear);
     else if(0==strcmp(argv[2], "demo")) demo_classifier(data, cfg, weights, cam_index, filename);
     else if(0==strcmp(argv[2], "gun")) gun_classifier(data, cfg, weights, cam_index, filename);
     else if(0==strcmp(argv[2], "threat")) threat_classifier(data, cfg, weights, cam_index, filename);
     else if(0==strcmp(argv[2], "test")) test_classifier(data, cfg, weights, layer);
+    else if(0==strcmp(argv[2], "csv")) csv_classifier(data, cfg, weights);
     else if(0==strcmp(argv[2], "label")) label_classifier(data, cfg, weights);
     else if(0==strcmp(argv[2], "valid")) validate_classifier_single(data, cfg, weights);
     else if(0==strcmp(argv[2], "validmulti")) validate_classifier_multi(data, cfg, weights);
diff --git a/image.darknet/src/coco.c b/image.darknet/inst/include/darknet/examples/coco.c
similarity index 72%
rename from image.darknet/src/coco.c
rename to image.darknet/inst/include/darknet/examples/coco.c
index 8f3c968..6a50b89 100644
--- a/image.darknet/src/coco.c
+++ b/image.darknet/inst/include/darknet/examples/coco.c
@@ -1,16 +1,6 @@
-#include <stdio.h>
-
-#include "network.h"
-#include "detection_layer.h"
-#include "cost_layer.h"
-#include "utils.h"
-#include "parser.h"
-#include "box.h"
-#include "demo.h"
+#include "darknet.h"
 
-#ifdef OPENCV
-#include "opencv2/highgui/highgui_c.h"
-#endif
+#include <stdio.h>
 
 char *coco_classes[] = {"person","bicycle","car","motorcycle","airplane","bus","train","truck","boat","traffic light","fire hydrant","stop sign","parking meter","bench","bird","cat","dog","horse","sheep","cow","elephant","bear","zebra","giraffe","backpack","umbrella","handbag","tie","suitcase","frisbee","skis","snowboard","sports ball","kite","baseball bat","baseball glove","skateboard","surfboard","tennis racket","bottle","wine glass","cup","fork","knife","spoon","bowl","banana","apple","sandwich","orange","broccoli","carrot","hot dog","pizza","donut","cake","chair","couch","potted plant","bed","dining table","toilet","tv","laptop","mouse","remote","keyboard","cell phone","microwave","oven","toaster","sink","refrigerator","book","clock","vase","scissors","teddy bear","hair drier","toothbrush"};
 
@@ -27,17 +17,14 @@ void train_coco(char *cfgfile, char *weightfile)
     char *base = basecfg(cfgfile);
     printf("%s\n", base);
     float avg_loss = -1;
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
-    int imgs = net.batch*net.subdivisions;
-    int i = *net.seen/imgs;
+    network *net = load_network(cfgfile, weightfile, 0);
+    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
+    int imgs = net->batch*net->subdivisions;
+    int i = *net->seen/imgs;
     data train, buffer;
 
 
-    layer l = net.layers[net.n - 1];
+    layer l = net->layers[net->n - 1];
 
     int side = l.side;
     int classes = l.classes;
@@ -48,8 +35,8 @@ void train_coco(char *cfgfile, char *weightfile)
     char **paths = (char **)list_to_array(plist);
 
     load_args args = {0};
-    args.w = net.w;
-    args.h = net.h;
+    args.w = net->w;
+    args.h = net->h;
     args.paths = paths;
     args.n = imgs;
     args.m = plist->size;
@@ -59,15 +46,15 @@ void train_coco(char *cfgfile, char *weightfile)
     args.d = &buffer;
     args.type = REGION_DATA;
 
-    args.angle = net.angle;
-    args.exposure = net.exposure;
-    args.saturation = net.saturation;
-    args.hue = net.hue;
+    args.angle = net->angle;
+    args.exposure = net->exposure;
+    args.saturation = net->saturation;
+    args.hue = net->hue;
 
     pthread_t load_thread = load_data_in_thread(args);
     clock_t time;
     //while(i*imgs < N*120){
-    while(get_current_batch(net) < net.max_batches){
+    while(get_current_batch(net) < net->max_batches){
         i += 1;
         time=clock();
         pthread_join(load_thread, 0);
@@ -77,7 +64,7 @@ void train_coco(char *cfgfile, char *weightfile)
         printf("Loaded: %lf seconds\n", sec(clock()-time));
 
         /*
-           image im = float_to_image(net.w, net.h, 3, train.X.vals[113]);
+           image im = float_to_image(net->w, net->h, 3, train.X.vals[113]);
            image copy = copy_image(im);
            draw_coco(copy, train.y.vals[113], 7, "truth");
            cvWaitKey(0);
@@ -107,14 +94,14 @@ void train_coco(char *cfgfile, char *weightfile)
     save_weights(net, buff);
 }
 
-void print_cocos(FILE *fp, int image_id, box *boxes, float **probs, int num_boxes, int classes, int w, int h)
+static void print_cocos(FILE *fp, int image_id, detection *dets, int num_boxes, int classes, int w, int h)
 {
     int i, j;
     for(i = 0; i < num_boxes; ++i){
-        float xmin = boxes[i].x - boxes[i].w/2.;
-        float xmax = boxes[i].x + boxes[i].w/2.;
-        float ymin = boxes[i].y - boxes[i].h/2.;
-        float ymax = boxes[i].y + boxes[i].h/2.;
+        float xmin = dets[i].bbox.x - dets[i].bbox.w/2.;
+        float xmax = dets[i].bbox.x + dets[i].bbox.w/2.;
+        float ymin = dets[i].bbox.y - dets[i].bbox.h/2.;
+        float ymax = dets[i].bbox.y + dets[i].bbox.h/2.;
 
         if (xmin < 0) xmin = 0;
         if (ymin < 0) ymin = 0;
@@ -127,7 +114,7 @@ void print_cocos(FILE *fp, int image_id, box *boxes, float **probs, int num_boxe
         float bh = ymax - ymin;
 
         for(j = 0; j < classes; ++j){
-            if (probs[i][j]) fprintf(fp, "{\"image_id\":%d, \"category_id\":%d, \"bbox\":[%f, %f, %f, %f], \"score\":%f},\n", image_id, coco_ids[j], bx, by, bw, bh, probs[i][j]);
+            if (dets[i].prob[j]) fprintf(fp, "{\"image_id\":%d, \"category_id\":%d, \"bbox\":[%f, %f, %f, %f], \"score\":%f},\n", image_id, coco_ids[j], bx, by, bw, bh, dets[i].prob[j]);
         }
     }
 }
@@ -138,14 +125,11 @@ int get_coco_image_id(char *filename)
     return atoi(p+1);
 }
 
-void validate_coco(char *cfgfile, char *weightfile)
+void validate_coco(char *cfg, char *weights)
 {
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    set_batch_network(&net, 1);
-    fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
+    network *net = load_network(cfg, weights, 0);
+    set_batch_network(net, 1);
+    fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
     srand(time(0));
 
     char *base = "results/";
@@ -154,20 +138,14 @@ void validate_coco(char *cfgfile, char *weightfile)
     //list *plist = get_paths("/home/pjreddie/data/voc/test/2007_test.txt");
     char **paths = (char **)list_to_array(plist);
 
-    layer l = net.layers[net.n-1];
+    layer l = net->layers[net->n-1];
     int classes = l.classes;
-    int side = l.side;
 
-    int j;
     char buff[1024];
     snprintf(buff, 1024, "%s/coco_results.json", base);
     FILE *fp = fopen(buff, "w");
     fprintf(fp, "[\n");
 
-    box *boxes = calloc(side*side*l.n, sizeof(box));
-    float **probs = calloc(side*side*l.n, sizeof(float *));
-    for(j = 0; j < side*side*l.n; ++j) probs[j] = calloc(classes, sizeof(float *));
-
     int m = plist->size;
     int i=0;
     int t;
@@ -184,8 +162,8 @@ void validate_coco(char *cfgfile, char *weightfile)
     pthread_t *thr = calloc(nthreads, sizeof(pthread_t));
 
     load_args args = {0};
-    args.w = net.w;
-    args.h = net.h;
+    args.w = net->w;
+    args.h = net->h;
     args.type = IMAGE_DATA;
 
     for(t = 0; t < nthreads; ++t){
@@ -215,9 +193,11 @@ void validate_coco(char *cfgfile, char *weightfile)
             network_predict(net, X);
             int w = val[t].w;
             int h = val[t].h;
-            get_detection_boxes(l, w, h, thresh, probs, boxes, 0);
-            if (nms) do_nms_sort(boxes, probs, side*side*l.n, classes, iou_thresh);
-            print_cocos(fp, image_id, boxes, probs, side*side*l.n, classes, w, h);
+            int nboxes = 0;
+            detection *dets = get_network_boxes(net, w, h, thresh, 0, 0, 0, &nboxes);
+            if (nms) do_nms_sort(dets, l.side*l.side*l.n, classes, iou_thresh);
+            print_cocos(fp, image_id, dets, l.side*l.side*l.n, classes, w, h);
+            free_detections(dets, nboxes);
             free_image(val[t]);
             free_image(val_resized[t]);
         }
@@ -231,19 +211,16 @@ void validate_coco(char *cfgfile, char *weightfile)
 
 void validate_coco_recall(char *cfgfile, char *weightfile)
 {
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    set_batch_network(&net, 1);
-    fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
+    network *net = load_network(cfgfile, weightfile, 0);
+    set_batch_network(net, 1);
+    fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
     srand(time(0));
 
     char *base = "results/comp4_det_test_";
     list *plist = get_paths("/home/pjreddie/data/voc/test/2007_test.txt");
     char **paths = (char **)list_to_array(plist);
 
-    layer l = net.layers[net.n-1];
+    layer l = net->layers[net->n-1];
     int classes = l.classes;
     int side = l.side;
 
@@ -254,9 +231,6 @@ void validate_coco_recall(char *cfgfile, char *weightfile)
         snprintf(buff, 1024, "%s%s.txt", base, coco_classes[j]);
         fps[j] = fopen(buff, "w");
     }
-    box *boxes = calloc(side*side*l.n, sizeof(box));
-    float **probs = calloc(side*side*l.n, sizeof(float *));
-    for(j = 0; j < side*side*l.n; ++j) probs[j] = calloc(classes, sizeof(float *));
 
     int m = plist->size;
     int i=0;
@@ -264,7 +238,6 @@ void validate_coco_recall(char *cfgfile, char *weightfile)
     float thresh = .001;
     int nms = 0;
     float iou_thresh = .5;
-    float nms_thresh = .5;
 
     int total = 0;
     int correct = 0;
@@ -274,11 +247,13 @@ void validate_coco_recall(char *cfgfile, char *weightfile)
     for(i = 0; i < m; ++i){
         char *path = paths[i];
         image orig = load_image_color(path, 0, 0);
-        image sized = resize_image(orig, net.w, net.h);
+        image sized = resize_image(orig, net->w, net->h);
         char *id = basecfg(path);
         network_predict(net, sized.data);
-        get_detection_boxes(l, 1, 1, thresh, probs, boxes, 1);
-        if (nms) do_nms(boxes, probs, side*side*l.n, 1, nms_thresh);
+
+        int nboxes = 0;
+        detection *dets = get_network_boxes(net, orig.w, orig.h, thresh, 0, 0, 1, &nboxes);
+        if (nms) do_nms_obj(dets, side*side*l.n, 1, nms);
 
         char labelpath[4096];
         find_replace(path, "images", "labels", labelpath);
@@ -289,7 +264,7 @@ void validate_coco_recall(char *cfgfile, char *weightfile)
         int num_labels = 0;
         box_label *truth = read_boxes(labelpath, &num_labels);
         for(k = 0; k < side*side*l.n; ++k){
-            if(probs[k][0] > thresh){
+            if(dets[k].objectness > thresh){
                 ++proposals;
             }
         }
@@ -298,8 +273,8 @@ void validate_coco_recall(char *cfgfile, char *weightfile)
             box t = {truth[j].x, truth[j].y, truth[j].w, truth[j].h};
             float best_iou = 0;
             for(k = 0; k < side*side*l.n; ++k){
-                float iou = box_iou(boxes[k], t);
-                if(probs[k][0] > thresh && iou > best_iou){
+                float iou = box_iou(dets[k].bbox, t);
+                if(dets[k].objectness > thresh && iou > best_iou){
                     best_iou = iou;
                 }
             }
@@ -308,7 +283,7 @@ void validate_coco_recall(char *cfgfile, char *weightfile)
                 ++correct;
             }
         }
-
+        free_detections(dets, nboxes);
         fprintf(stderr, "%5d %5d %5d\tRPs/Img: %.2f\tIOU: %.2f%%\tRecall:%.2f%%\n", i, correct, total, (float)proposals/(i+1), avg_iou*100/total, 100.*correct/total);
         free(id);
         free_image(orig);
@@ -319,21 +294,14 @@ void validate_coco_recall(char *cfgfile, char *weightfile)
 void test_coco(char *cfgfile, char *weightfile, char *filename, float thresh)
 {
     image **alphabet = load_alphabet();
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    detection_layer l = net.layers[net.n-1];
-    set_batch_network(&net, 1);
+    network *net = load_network(cfgfile, weightfile, 0);
+    layer l = net->layers[net->n-1];
+    set_batch_network(net, 1);
     srand(2222222);
     float nms = .4;
     clock_t time;
     char buff[256];
     char *input = buff;
-    int j;
-    box *boxes = calloc(l.side*l.side*l.n, sizeof(box));
-    float **probs = calloc(l.side*l.side*l.n, sizeof(float *));
-    for(j = 0; j < l.side*l.side*l.n; ++j) probs[j] = calloc(l.classes, sizeof(float *));
     while(1){
         if(filename){
             strncpy(input, filename, 256);
@@ -345,22 +313,22 @@ void test_coco(char *cfgfile, char *weightfile, char *filename, float thresh)
             strtok(input, "\n");
         }
         image im = load_image_color(input,0,0);
-        image sized = resize_image(im, net.w, net.h);
+        image sized = resize_image(im, net->w, net->h);
         float *X = sized.data;
         time=clock();
         network_predict(net, X);
         printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time));
-        get_detection_boxes(l, 1, 1, thresh, probs, boxes, 0);
-        if (nms) do_nms_sort(boxes, probs, l.side*l.side*l.n, l.classes, nms);
-        draw_detections(im, l.side*l.side*l.n, thresh, boxes, probs, coco_classes, alphabet, 80);
+
+        int nboxes = 0;
+        detection *dets = get_network_boxes(net, 1, 1, thresh, 0, 0, 0, &nboxes);
+        if (nms) do_nms_sort(dets, l.side*l.side*l.n, l.classes, nms);
+
+        draw_detections(im, dets, l.side*l.side*l.n, thresh, coco_classes, alphabet, 80);
         save_image(im, "prediction");
-        show_image(im, "predictions");
+        show_image(im, "predictions", 0);
+        free_detections(dets, nboxes);
         free_image(im);
         free_image(sized);
-#ifdef OPENCV
-        cvWaitKey(0);
-        cvDestroyAllWindows();
-#endif
         if (filename) break;
     }
 }
@@ -380,9 +348,10 @@ void run_coco(int argc, char **argv)
     char *cfg = argv[3];
     char *weights = (argc > 4) ? argv[4] : 0;
     char *filename = (argc > 5) ? argv[5]: 0;
+    int avg = find_int_arg(argc, argv, "-avg", 1);
     if(0==strcmp(argv[2], "test")) test_coco(cfg, weights, filename, thresh);
     else if(0==strcmp(argv[2], "train")) train_coco(cfg, weights);
     else if(0==strcmp(argv[2], "valid")) validate_coco(cfg, weights);
     else if(0==strcmp(argv[2], "recall")) validate_coco_recall(cfg, weights);
-    else if(0==strcmp(argv[2], "demo")) demo(cfg, weights, thresh, cam_index, filename, coco_classes, 80, frame_skip, prefix, .5);
+    else if(0==strcmp(argv[2], "demo")) demo(cfg, weights, thresh, cam_index, filename, coco_classes, 80, frame_skip, prefix, avg, .5, 0,0,0,0);
 }
diff --git a/image.darknet/inst/include/darknet/src/darknet.c b/image.darknet/inst/include/darknet/examples/darknet.c
similarity index 62%
rename from image.darknet/inst/include/darknet/src/darknet.c
rename to image.darknet/inst/include/darknet/examples/darknet.c
index 6e56072..d538359 100644
--- a/image.darknet/inst/include/darknet/src/darknet.c
+++ b/image.darknet/inst/include/darknet/examples/darknet.c
@@ -1,56 +1,46 @@
+#include "darknet.h"
+
 #include <time.h>
 #include <stdlib.h>
 #include <stdio.h>
 
-#include "parser.h"
-#include "utils.h"
-#include "cuda.h"
-#include "blas.h"
-#include "connected_layer.h"
-
-#ifdef OPENCV
-#include "opencv2/highgui/highgui_c.h"
-#endif
-
 extern void predict_classifier(char *datacfg, char *cfgfile, char *weightfile, char *filename, int top);
-extern void test_detector(char *datacfg, char *cfgfile, char *weightfile, char *filename, float thresh, float hier_thresh);
-extern void run_voxel(int argc, char **argv);
+extern void test_detector(char *datacfg, char *cfgfile, char *weightfile, char *filename, float thresh, float hier_thresh, char *outfile, int fullscreen);
 extern void run_yolo(int argc, char **argv);
 extern void run_detector(int argc, char **argv);
 extern void run_coco(int argc, char **argv);
-extern void run_writing(int argc, char **argv);
-extern void run_captcha(int argc, char **argv);
 extern void run_nightmare(int argc, char **argv);
-extern void run_dice(int argc, char **argv);
-extern void run_compare(int argc, char **argv);
 extern void run_classifier(int argc, char **argv);
+extern void run_regressor(int argc, char **argv);
+extern void run_segmenter(int argc, char **argv);
+extern void run_isegmenter(int argc, char **argv);
 extern void run_char_rnn(int argc, char **argv);
-extern void run_vid_rnn(int argc, char **argv);
 extern void run_tag(int argc, char **argv);
 extern void run_cifar(int argc, char **argv);
 extern void run_go(int argc, char **argv);
 extern void run_art(int argc, char **argv);
 extern void run_super(int argc, char **argv);
+extern void run_lsd(int argc, char **argv);
 
 void average(int argc, char *argv[])
 {
     char *cfgfile = argv[2];
     char *outfile = argv[3];
     gpu_index = -1;
-    network net = parse_network_cfg(cfgfile);
-    network sum = parse_network_cfg(cfgfile);
+    network *net = parse_network_cfg(cfgfile);
+    network *sum = parse_network_cfg(cfgfile);
 
     char *weightfile = argv[4];   
-    load_weights(&sum, weightfile);
+    load_weights(sum, weightfile);
 
     int i, j;
     int n = argc - 5;
     for(i = 0; i < n; ++i){
         weightfile = argv[i+5];   
-        load_weights(&net, weightfile);
-        for(j = 0; j < net.n; ++j){
-            layer l = net.layers[j];
-            layer out = sum.layers[j];
+        load_weights(net, weightfile);
+        for(j = 0; j < net->n; ++j){
+            layer l = net->layers[j];
+            layer out = sum->layers[j];
             if(l.type == CONVOLUTIONAL){
                 int num = l.n*l.c*l.size*l.size;
                 axpy_cpu(l.n, 1, l.biases, 1, out.biases, 1);
@@ -68,8 +58,8 @@ void average(int argc, char *argv[])
         }
     }
     n = n+1;
-    for(j = 0; j < net.n; ++j){
-        layer l = sum.layers[j];
+    for(j = 0; j < net->n; ++j){
+        layer l = sum->layers[j];
         if(l.type == CONVOLUTIONAL){
             int num = l.n*l.c*l.size*l.size;
             scal_cpu(l.n, 1./n, l.biases, 1);
@@ -88,19 +78,57 @@ void average(int argc, char *argv[])
     save_weights(sum, outfile);
 }
 
+/* Approximate operation count of a network: two operations per weight use in
+ * convolutional, connected, RNN, GRU and LSTM layers. Any other layer type
+ * adds nothing to the total. */
+long numops(network *net)
+{
+    long total = 0;
+    int idx, k;
+
+    for(idx = 0; idx < net->n; ++idx){
+        layer *cur = &net->layers[idx];
+        if(cur->type == CONVOLUTIONAL){
+            /* n filters of size*size*c/groups weights at each out_h*out_w position */
+            total += 2l * cur->n * cur->size*cur->size*cur->c/cur->groups * cur->out_h*cur->out_w;
+        } else if(cur->type == CONNECTED){
+            /* one weight per input/output pair */
+            total += 2l * cur->inputs * cur->outputs;
+        } else if(cur->type == RNN){
+            /* three sub-layers, each counted as inputs*outputs */
+            layer *parts[] = {cur->input_layer, cur->self_layer, cur->output_layer};
+            for(k = 0; k < 3; ++k) total += 2l * parts[k]->inputs * parts[k]->outputs;
+        } else if(cur->type == GRU){
+            /* six sub-layers, each counted as inputs*outputs */
+            layer *parts[] = {cur->uz, cur->uh, cur->ur, cur->wz, cur->wh, cur->wr};
+            for(k = 0; k < 6; ++k) total += 2l * parts[k]->inputs * parts[k]->outputs;
+        } else if(cur->type == LSTM){
+            /* eight sub-layers, each counted as inputs*outputs */
+            layer *parts[] = {cur->uf, cur->ui, cur->ug, cur->uo,
+                              cur->wf, cur->wi, cur->wg, cur->wo};
+            for(k = 0; k < 8; ++k) total += 2l * parts[k]->inputs * parts[k]->outputs;
+        }
+    }
+
+    return total;
+}
+
 void speed(char *cfgfile, int tics)
 {
     if (tics == 0) tics = 1000;
-    network net = parse_network_cfg(cfgfile);
-    set_batch_network(&net, 1);
+    network *net = parse_network_cfg(cfgfile);
+    set_batch_network(net, 1);
     int i;
-    time_t start = time(0);
-    image im = make_image(net.w, net.h, net.c);
+    double time=what_time_is_it_now();
+    image im = make_image(net->w, net->h, net->c*net->batch);
     for(i = 0; i < tics; ++i){
         network_predict(net, im.data);
     }
-    double t = difftime(time(0), start);
+    double t = what_time_is_it_now() - time;
+    long ops = numops(net);
     printf("\n%d evals, %f Seconds\n", tics, t);
+    printf("Floating Point Operations: %.2f Bn\n", (float)ops/1000000000.);
+    printf("FLOPS: %.2f Bn\n", (float)ops/1000000000.*tics/t);
     printf("Speed: %f sec/eval\n", t/tics);
     printf("Speed: %f Hz\n", tics/t);
 }
@@ -108,17 +136,8 @@ void speed(char *cfgfile, int tics)
 void operations(char *cfgfile)
 {
     gpu_index = -1;
-    network net = parse_network_cfg(cfgfile);
-    int i;
-    long ops = 0;
-    for(i = 0; i < net.n; ++i){
-        layer l = net.layers[i];
-        if(l.type == CONVOLUTIONAL){
-            ops += 2l * l.n * l.size*l.size*l.c * l.out_h*l.out_w;
-        } else if(l.type == CONNECTED){
-            ops += 2l * l.inputs * l.outputs;
-        }
-    }
+    network *net = parse_network_cfg(cfgfile);
+    long ops = numops(net);
     printf("Floating Point Operations: %ld\n", ops);
     printf("Floating Point Operations: %.2f Bn\n", (float)ops/1000000000.);
 }
@@ -126,52 +145,75 @@ void operations(char *cfgfile)
 void oneoff(char *cfgfile, char *weightfile, char *outfile)
 {
     gpu_index = -1;
-    network net = parse_network_cfg(cfgfile);
-    int oldn = net.layers[net.n - 2].n;
-    int c = net.layers[net.n - 2].c;
-    scal_cpu(oldn*c, .1, net.layers[net.n - 2].weights, 1);
-    scal_cpu(oldn, 0, net.layers[net.n - 2].biases, 1);
-    net.layers[net.n - 2].n = 9418;
-    net.layers[net.n - 2].biases += 5;
-    net.layers[net.n - 2].weights += 5*c;
+    network *net = parse_network_cfg(cfgfile);
+    int oldn = net->layers[net->n - 2].n;
+    int c = net->layers[net->n - 2].c;
+    scal_cpu(oldn*c, .1, net->layers[net->n - 2].weights, 1);
+    scal_cpu(oldn, 0, net->layers[net->n - 2].biases, 1);
+    net->layers[net->n - 2].n = 11921;
+    net->layers[net->n - 2].biases += 5;
+    net->layers[net->n - 2].weights += 5*c;
     if(weightfile){
-        load_weights(&net, weightfile);
+        load_weights(net, weightfile);
     }
-    net.layers[net.n - 2].biases -= 5;
-    net.layers[net.n - 2].weights -= 5*c;
-    net.layers[net.n - 2].n = oldn;
+    net->layers[net->n - 2].biases -= 5;
+    net->layers[net->n - 2].weights -= 5*c;
+    net->layers[net->n - 2].n = oldn;
     printf("%d\n", oldn);
-    layer l = net.layers[net.n - 2];
+    layer l = net->layers[net->n - 2];
     copy_cpu(l.n/3, l.biases, 1, l.biases +   l.n/3, 1);
     copy_cpu(l.n/3, l.biases, 1, l.biases + 2*l.n/3, 1);
     copy_cpu(l.n/3*l.c, l.weights, 1, l.weights +   l.n/3*l.c, 1);
     copy_cpu(l.n/3*l.c, l.weights, 1, l.weights + 2*l.n/3*l.c, 1);
-    *net.seen = 0;
+    *net->seen = 0;
     save_weights(net, outfile);
 }
 
-void partial(char *cfgfile, char *weightfile, char *outfile, int max)
+void oneoff2(char *cfgfile, char *weightfile, char *outfile, int l)
 {
     gpu_index = -1;
-    network net = parse_network_cfg(cfgfile);
+    network *net = parse_network_cfg(cfgfile);
     if(weightfile){
-        load_weights_upto(&net, weightfile, max);
+        load_weights_upto(net, weightfile, 0, net->n);
+        load_weights_upto(net, weightfile, l, net->n);
     }
-    *net.seen = 0;
+    *net->seen = 0;
+    save_weights_upto(net, outfile, net->n);
+}
+
+void partial(char *cfgfile, char *weightfile, char *outfile, int max)
+{
+    gpu_index = -1;
+    network *net = load_network(cfgfile, weightfile, 1);
     save_weights_upto(net, outfile, max);
 }
 
-#include "convolutional_layer.h"
-void rescale_net(char *cfgfile, char *weightfile, char *outfile)
+void print_weights(char *cfgfile, char *weightfile, int n)
 {
     gpu_index = -1;
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
+    network *net = load_network(cfgfile, weightfile, 1);
+    layer l = net->layers[n];
+    int i, j;
+    //printf("[");
+    for(i = 0; i < l.n; ++i){
+        //printf("[");
+        for(j = 0; j < l.size*l.size*l.c; ++j){
+            //if(j > 0) printf(",");
+            printf("%g ", l.weights[i*l.size*l.size*l.c + j]);
+        }
+        printf("\n");
+        //printf("]%s\n", (i == l.n-1)?"":",");
     }
+    //printf("]");
+}
+
+void rescale_net(char *cfgfile, char *weightfile, char *outfile)
+{
+    gpu_index = -1;
+    network *net = load_network(cfgfile, weightfile, 0);
     int i;
-    for(i = 0; i < net.n; ++i){
-        layer l = net.layers[i];
+    for(i = 0; i < net->n; ++i){
+        layer l = net->layers[i];
         if(l.type == CONVOLUTIONAL){
             rescale_weights(l, 2, -.5);
             break;
@@ -183,13 +225,10 @@ void rescale_net(char *cfgfile, char *weightfile, char *outfile)
 void rgbgr_net(char *cfgfile, char *weightfile, char *outfile)
 {
     gpu_index = -1;
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
+    network *net = load_network(cfgfile, weightfile, 0);
     int i;
-    for(i = 0; i < net.n; ++i){
-        layer l = net.layers[i];
+    for(i = 0; i < net->n; ++i){
+        layer l = net->layers[i];
         if(l.type == CONVOLUTIONAL){
             rgbgr_weights(l);
             break;
@@ -201,13 +240,10 @@ void rgbgr_net(char *cfgfile, char *weightfile, char *outfile)
 void reset_normalize_net(char *cfgfile, char *weightfile, char *outfile)
 {
     gpu_index = -1;
-    network net = parse_network_cfg(cfgfile);
-    if (weightfile) {
-        load_weights(&net, weightfile);
-    }
+    network *net = load_network(cfgfile, weightfile, 0);
     int i;
-    for (i = 0; i < net.n; ++i) {
-        layer l = net.layers[i];
+    for (i = 0; i < net->n; ++i) {
+        layer l = net->layers[i];
         if (l.type == CONVOLUTIONAL && l.batch_normalize) {
             denormalize_convolutional_layer(l);
         }
@@ -242,18 +278,15 @@ layer normalize_layer(layer l, int n)
 void normalize_net(char *cfgfile, char *weightfile, char *outfile)
 {
     gpu_index = -1;
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
+    network *net = load_network(cfgfile, weightfile, 0);
     int i;
-    for(i = 0; i < net.n; ++i){
-        layer l = net.layers[i];
+    for(i = 0; i < net->n; ++i){
+        layer l = net->layers[i];
         if(l.type == CONVOLUTIONAL && !l.batch_normalize){
-            net.layers[i] = normalize_layer(l, l.n);
+            net->layers[i] = normalize_layer(l, l.n);
         }
         if (l.type == CONNECTED && !l.batch_normalize) {
-            net.layers[i] = normalize_layer(l, l.outputs);
+            net->layers[i] = normalize_layer(l, l.outputs);
         }
         if (l.type == GRU && l.batch_normalize) {
             *l.input_z_layer = normalize_layer(*l.input_z_layer, l.input_z_layer->outputs);
@@ -262,7 +295,7 @@ void normalize_net(char *cfgfile, char *weightfile, char *outfile)
             *l.state_z_layer = normalize_layer(*l.state_z_layer, l.state_z_layer->outputs);
             *l.state_r_layer = normalize_layer(*l.state_r_layer, l.state_r_layer->outputs);
             *l.state_h_layer = normalize_layer(*l.state_h_layer, l.state_h_layer->outputs);
-            net.layers[i].batch_normalize=1;
+            net->layers[i].batch_normalize=1;
         }
     }
     save_weights(net, outfile);
@@ -271,13 +304,10 @@ void normalize_net(char *cfgfile, char *weightfile, char *outfile)
 void statistics_net(char *cfgfile, char *weightfile)
 {
     gpu_index = -1;
-    network net = parse_network_cfg(cfgfile);
-    if (weightfile) {
-        load_weights(&net, weightfile);
-    }
+    network *net = load_network(cfgfile, weightfile, 0);
     int i;
-    for (i = 0; i < net.n; ++i) {
-        layer l = net.layers[i];
+    for (i = 0; i < net->n; ++i) {
+        layer l = net->layers[i];
         if (l.type == CONNECTED && l.batch_normalize) {
             printf("Connected Layer %d\n", i);
             statistics_connected_layer(l);
@@ -304,20 +334,17 @@ void statistics_net(char *cfgfile, char *weightfile)
 void denormalize_net(char *cfgfile, char *weightfile, char *outfile)
 {
     gpu_index = -1;
-    network net = parse_network_cfg(cfgfile);
-    if (weightfile) {
-        load_weights(&net, weightfile);
-    }
+    network *net = load_network(cfgfile, weightfile, 0);
     int i;
-    for (i = 0; i < net.n; ++i) {
-        layer l = net.layers[i];
-        if (l.type == CONVOLUTIONAL && l.batch_normalize) {
+    for (i = 0; i < net->n; ++i) {
+        layer l = net->layers[i];
+        if ((l.type == DECONVOLUTIONAL || l.type == CONVOLUTIONAL) && l.batch_normalize) {
             denormalize_convolutional_layer(l);
-            net.layers[i].batch_normalize=0;
+            net->layers[i].batch_normalize=0;
         }
         if (l.type == CONNECTED && l.batch_normalize) {
             denormalize_connected_layer(l);
-            net.layers[i].batch_normalize=0;
+            net->layers[i].batch_normalize=0;
         }
         if (l.type == GRU && l.batch_normalize) {
             denormalize_connected_layer(*l.input_z_layer);
@@ -332,22 +359,42 @@ void denormalize_net(char *cfgfile, char *weightfile, char *outfile)
             l.state_z_layer->batch_normalize = 0;
             l.state_r_layer->batch_normalize = 0;
             l.state_h_layer->batch_normalize = 0;
-            net.layers[i].batch_normalize=0;
+            net->layers[i].batch_normalize=0;
         }
     }
     save_weights(net, outfile);
 }
 
-void visualize(char *cfgfile, char *weightfile)
+/* Saves `num` w-by-h collages of randomly placed layer-0 weight images as <prefix>/gen_<z>. */
+void mkimg(char *cfgfile, char *weightfile, int h, int w, int num, char *prefix)
 {
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
+    network *net = load_network(cfgfile, weightfile, 0);
+    image *ims = get_weights(net->layers[0]);
+    int n = net->layers[0].n;
+    int z, i;
+    for(z = 0; z < num; ++z){
+        image im = make_image(w, h, 3); /* make_image takes (w, h, c); was called as (h, w, 3) */
+        fill_image(im, .5);
+        for(i = 0; i < 100; ++i){
+            image r = copy_image(ims[rand()%n]);
+            rotate_image_cw(r, rand()%4);
+            random_distort_image(r, 1, 1.5, 1.5);
+            int dx = (w > r.w) ? rand()%(w-r.w) : 0; /* avoid modulo by a value <= 0 */
+            int dy = (h > r.h) ? rand()%(h-r.h) : 0;
+            ghost_image(r, im, dx, dy);
+            free_image(r);
+        }
+        char buff[256];
+        snprintf(buff, sizeof(buff), "%s/gen_%d", prefix, z); /* bounded: prefix is caller supplied */
+        save_image(im, buff);
+        free_image(im);
     }
+}
+
+void visualize(char *cfgfile, char *weightfile)
+{
+    network *net = load_network(cfgfile, weightfile, 0);
     visualize_network(net);
-#ifdef OPENCV
-    cvWaitKey(0);
-#endif
 }
 
 int main(int argc, char **argv)
@@ -376,46 +423,44 @@ int main(int argc, char **argv)
         average(argc, argv);
     } else if (0 == strcmp(argv[1], "yolo")){
         run_yolo(argc, argv);
-    } else if (0 == strcmp(argv[1], "voxel")){
-        run_voxel(argc, argv);
     } else if (0 == strcmp(argv[1], "super")){
         run_super(argc, argv);
+    } else if (0 == strcmp(argv[1], "lsd")){
+        run_lsd(argc, argv);
     } else if (0 == strcmp(argv[1], "detector")){
         run_detector(argc, argv);
     } else if (0 == strcmp(argv[1], "detect")){
-        float thresh = find_float_arg(argc, argv, "-thresh", .24);
+        float thresh = find_float_arg(argc, argv, "-thresh", .5);
         char *filename = (argc > 4) ? argv[4]: 0;
-        test_detector("cfg/coco.data", argv[2], argv[3], filename, thresh, .5);
+        char *outfile = find_char_arg(argc, argv, "-out", 0);
+        int fullscreen = find_arg(argc, argv, "-fullscreen");
+        test_detector("cfg/coco.data", argv[2], argv[3], filename, thresh, .5, outfile, fullscreen);
     } else if (0 == strcmp(argv[1], "cifar")){
         run_cifar(argc, argv);
     } else if (0 == strcmp(argv[1], "go")){
         run_go(argc, argv);
     } else if (0 == strcmp(argv[1], "rnn")){
         run_char_rnn(argc, argv);
-    } else if (0 == strcmp(argv[1], "vid")){
-        run_vid_rnn(argc, argv);
     } else if (0 == strcmp(argv[1], "coco")){
         run_coco(argc, argv);
     } else if (0 == strcmp(argv[1], "classify")){
         predict_classifier("cfg/imagenet1k.data", argv[2], argv[3], argv[4], 5);
     } else if (0 == strcmp(argv[1], "classifier")){
         run_classifier(argc, argv);
+    } else if (0 == strcmp(argv[1], "regressor")){
+        run_regressor(argc, argv);
+    } else if (0 == strcmp(argv[1], "isegmenter")){
+        run_isegmenter(argc, argv);
+    } else if (0 == strcmp(argv[1], "segmenter")){
+        run_segmenter(argc, argv);
     } else if (0 == strcmp(argv[1], "art")){
         run_art(argc, argv);
     } else if (0 == strcmp(argv[1], "tag")){
         run_tag(argc, argv);
-    } else if (0 == strcmp(argv[1], "compare")){
-        run_compare(argc, argv);
-    } else if (0 == strcmp(argv[1], "dice")){
-        run_dice(argc, argv);
-    } else if (0 == strcmp(argv[1], "writing")){
-        run_writing(argc, argv);
     } else if (0 == strcmp(argv[1], "3d")){
         composite_3d(argv[2], argv[3], argv[4], (argc > 5) ? atof(argv[5]) : 0);
     } else if (0 == strcmp(argv[1], "test")){
         test_resize(argv[2]);
-    } else if (0 == strcmp(argv[1], "captcha")){
-        run_captcha(argc, argv);
     } else if (0 == strcmp(argv[1], "nightmare")){
         run_nightmare(argc, argv);
     } else if (0 == strcmp(argv[1], "rgbgr")){
@@ -436,12 +481,18 @@ int main(int argc, char **argv)
         speed(argv[2], (argc > 3 && argv[3]) ? atoi(argv[3]) : 0);
     } else if (0 == strcmp(argv[1], "oneoff")){
         oneoff(argv[2], argv[3], argv[4]);
+    } else if (0 == strcmp(argv[1], "oneoff2")){
+        oneoff2(argv[2], argv[3], argv[4], atoi(argv[5]));
+    } else if (0 == strcmp(argv[1], "print")){
+        print_weights(argv[2], argv[3], atoi(argv[4]));
     } else if (0 == strcmp(argv[1], "partial")){
         partial(argv[2], argv[3], argv[4], atoi(argv[5]));
     } else if (0 == strcmp(argv[1], "average")){
         average(argc, argv);
     } else if (0 == strcmp(argv[1], "visualize")){
         visualize(argv[2], (argc > 3) ? argv[3] : 0);
+    } else if (0 == strcmp(argv[1], "mkimg")){
+        mkimg(argv[2], argv[3], atoi(argv[4]), atoi(argv[5]), atoi(argv[6]), argv[7]);
     } else if (0 == strcmp(argv[1], "imtest")){
         test_resize(argv[2]);
     } else {
diff --git a/image.darknet/inst/include/darknet/examples/detector-scipy-opencv.py b/image.darknet/inst/include/darknet/examples/detector-scipy-opencv.py
new file mode 100644
index 0000000..3bfc591
--- /dev/null
+++ b/image.darknet/inst/include/darknet/examples/detector-scipy-opencv.py
@@ -0,0 +1,56 @@
+# Python path workaround: the script appends ./python/ to sys.path below.
+# Alternatively, put darknet.py somewhere on your Python path.
+# That alternative is untested; this example is a work in progress.
+# Use at your own risk.
+
+from scipy.misc import imread
+import cv2
+
+def array_to_image(arr):
+    """Wrap an HxWxC array as a dn.IMAGE, dividing every value by 255.
+
+    `dn` is imported further down in this script, before the first call.
+    """
+    chw = arr.transpose(2, 0, 1)  # channel axis first
+    c, h, w = chw.shape
+    flat = (chw / 255.0).flatten()
+    return dn.IMAGE(w, h, c, dn.c_array(dn.c_float, flat))
+
+def detect2(net, meta, image, thresh=.5, hier_thresh=.5, nms=.45):
+    """Detect on an in-memory image; returns (name, prob, (x, y, w, h)) tuples, best first."""
+    boxes = dn.make_boxes(net)
+    probs = dn.make_probs(net)
+    count = dn.num_boxes(net)
+    dn.network_detect(net, image, thresh, hier_thresh, nms, boxes, probs)
+    # one entry per (box, class) pair whose probability is positive
+    hits = [(meta.names[k], probs[b][k], (boxes[b].x, boxes[b].y, boxes[b].w, boxes[b].h))
+            for b in range(count) for k in range(meta.classes) if probs[b][k] > 0]
+    hits.sort(key=lambda hit: -hit[1])
+    # only the probs rows are released here; boxes is not freed by this example
+    dn.free_ptrs(dn.cast(probs, dn.POINTER(dn.c_void_p)), count)
+    return hits
+
+import sys, os
+sys.path.append(os.path.join(os.getcwd(),'python/'))
+
+import darknet as dn
+
+# Darknet: let the library load the image file itself
+net = dn.load_net("cfg/tiny-yolo.cfg", "tiny-yolo.weights", 0)
+meta = dn.load_meta("cfg/coco.data")
+r = dn.detect(net, meta, "data/dog.jpg")
+print(r)
+
+# scipy: load the image as an array, then convert it for darknet
+arr= imread('data/dog.jpg')
+im = array_to_image(arr)
+r = detect2(net, meta, im)
+print(r)
+
+# OpenCV: cv2.imread returns BGR channel order, hence the rgbgr_image call
+arr = cv2.imread('data/dog.jpg')
+im = array_to_image(arr)
+dn.rgbgr_image(im)
+r = detect2(net, meta, im)
+print(r)
+
diff --git a/image.darknet/inst/include/darknet/examples/detector.c b/image.darknet/inst/include/darknet/examples/detector.c
new file mode 100644
index 0000000..318f7fb
--- /dev/null
+++ b/image.darknet/inst/include/darknet/examples/detector.c
@@ -0,0 +1,850 @@
+#include "darknet.h"
+
+static int coco_ids[] = {1,2,3,4,5,6,7,8,9,10,11,13,14,15,16,17,18,19,20,21,22,23,24,25,27,28,31,32,33,34,35,36,37,38,39,40,41,42,43,44,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,67,70,72,73,74,75,76,77,78,79,80,81,82,84,85,86,87,88,89,90};
+
+
+/* Trains the detector described by cfgfile on the image list named by "train" in
+ * datacfg, using one network per entry of gpus, and saves periodic and final
+ * weights under the "backup" directory. weightfile/clear go to load_network. */
+void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, int ngpus, int clear)
+{
+    list *options = read_data_cfg(datacfg);
+    char *train_images = option_find_str(options, "train", "data/train.list");
+    char *backup_directory = option_find_str(options, "backup", "/backup/");
+
+    srand(time(0));
+    char *base = basecfg(cfgfile);
+    printf("%s\n", base);
+    float avg_loss = -1;
+    network **nets = calloc(ngpus, sizeof(network *)); /* array of pointers, not of structs */
+
+    srand(time(0));
+    int seed = rand();
+    int i;
+    for(i = 0; i < ngpus; ++i){
+        srand(seed); /* reseed so each load_network call sees the same rand() sequence */
+#ifdef GPU
+        cuda_set_device(gpus[i]);
+#endif
+        nets[i] = load_network(cfgfile, weightfile, clear);
+        nets[i]->learning_rate *= ngpus;
+    }
+    srand(time(0));
+    network *net = nets[0];
+
+    int imgs = net->batch * net->subdivisions * ngpus;
+    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
+    data train, buffer;
+
+    layer l = net->layers[net->n - 1];
+
+    int classes = l.classes;
+    float jitter = l.jitter;
+
+    list *plist = get_paths(train_images);
+    char **paths = (char **)list_to_array(plist);
+
+    load_args args = get_base_args(net);
+    args.coords = l.coords;
+    args.paths = paths;
+    args.n = imgs;
+    args.m = plist->size;
+    args.classes = classes;
+    args.jitter = jitter;
+    args.num_boxes = l.max_boxes;
+    args.d = &buffer;
+    args.type = DETECTION_DATA;
+    args.threads = 64;
+
+    pthread_t load_thread = load_data(args);
+    double time;
+    int count = 0;
+    while(get_current_batch(net) < net->max_batches){
+        if(l.random && count++%10 == 0){
+            printf("Resizing\n");
+            int dim = (rand() % 10 + 10) * 32;
+            if (get_current_batch(net)+200 > net->max_batches) dim = 608;
+            printf("%d\n", dim);
+            args.w = dim;
+            args.h = dim;
+
+            /* discard the batch loaded at the old size and start loading at the new one */
+            pthread_join(load_thread, 0);
+            train = buffer;
+            free_data(train);
+            load_thread = load_data(args);
+
+            #pragma omp parallel for
+            for(i = 0; i < ngpus; ++i){
+                resize_network(nets[i], dim, dim);
+            }
+            net = nets[0];
+        }
+        time=what_time_is_it_now();
+        pthread_join(load_thread, 0);
+        train = buffer;
+        load_thread = load_data(args);
+
+        /*
+           int k;
+           for(k = 0; k < l.max_boxes; ++k){
+           box b = float_to_box(train.y.vals[10] + 1 + k*5);
+           if(!b.x) break;
+           printf("loaded: %f %f %f %f\n", b.x, b.y, b.w, b.h);
+           }
+         */
+        /*
+           int zz;
+           for(zz = 0; zz < train.X.cols; ++zz){
+           image im = float_to_image(net->w, net->h, 3, train.X.vals[zz]);
+           int k;
+           for(k = 0; k < l.max_boxes; ++k){
+           box b = float_to_box(train.y.vals[zz] + k*5, 1);
+           printf("%f %f %f %f\n", b.x, b.y, b.w, b.h);
+           draw_bbox(im, b, 1, 1,0,0);
+           }
+           show_image(im, "truth11");
+           cvWaitKey(0);
+           save_image(im, "truth11");
+           }
+         */
+
+        printf("Loaded: %lf seconds\n", what_time_is_it_now()-time);
+
+        time=what_time_is_it_now();
+        float loss = 0;
+#ifdef GPU
+        if(ngpus == 1){
+            loss = train_network(net, train);
+        } else {
+            loss = train_networks(nets, ngpus, train, 4);
+        }
+#else
+        loss = train_network(net, train);
+#endif
+        if (avg_loss < 0) avg_loss = loss;
+        avg_loss = avg_loss*.9 + loss*.1;
+
+        i = get_current_batch(net);
+        printf("%d: %f, %f avg, %f rate, %lf seconds, %d images\n", i, loss, avg_loss, get_current_rate(net), what_time_is_it_now()-time, i*imgs);
+        if(i%100==0){
+#ifdef GPU
+            if(ngpus != 1) sync_nets(nets, ngpus, 0);
+#endif
+            char buff[256];
+            snprintf(buff, sizeof(buff), "%s/%s.backup", backup_directory, base);
+            save_weights(net, buff);
+        }
+        if(i%10000==0 || (i < 1000 && i%100 == 0)){
+#ifdef GPU
+            if(ngpus != 1) sync_nets(nets, ngpus, 0);
+#endif
+            char buff[256];
+            snprintf(buff, sizeof(buff), "%s/%s_%d.weights", backup_directory, base, i);
+            save_weights(net, buff);
+        }
+        free_data(train);
+    }
+#ifdef GPU
+    if(ngpus != 1) sync_nets(nets, ngpus, 0);
+#endif
+    char buff[256];
+    snprintf(buff, sizeof(buff), "%s/%s_final.weights", backup_directory, base);
+    save_weights(net, buff);
+}
+
+
+static int get_coco_image_id(char *filename)
+{
+    /* Id is the number after the last '_' (else the last '/'); bare names parse whole. */
+    char *sep = strrchr(filename, '_');
+    if(!sep) sep = strrchr(filename, '/');
+    return atoi(sep ? sep + 1 : filename);
+}
+
+/* Writes one JSON record per (box, class) pair with nonzero probability to fp.
+ * Each box is clipped to [0,w] x [0,h] and emitted as x, y, width, height. */
+static void print_cocos(FILE *fp, char *image_path, detection *dets, int num_boxes, int classes, int w, int h)
+{
+    int b, k;
+    int image_id = get_coco_image_id(image_path);
+    for(b = 0; b < num_boxes; ++b){
+        detection *d = &dets[b];
+        float left   = d->bbox.x - d->bbox.w/2.;
+        float right  = d->bbox.x + d->bbox.w/2.;
+        float top    = d->bbox.y - d->bbox.h/2.;
+        float bottom = d->bbox.y + d->bbox.h/2.;
+
+        left   = (left < 0)   ? 0 : left;
+        top    = (top < 0)    ? 0 : top;
+        right  = (right > w)  ? w : right;
+        bottom = (bottom > h) ? h : bottom;
+        float width  = right - left;
+        float height = bottom - top;
+        for(k = 0; k < classes; ++k){
+            if(!d->prob[k]) continue;
+            fprintf(fp, "{\"image_id\":%d, \"category_id\":%d, \"bbox\":[%f, %f, %f, %f], \"score\":%f},\n", image_id, coco_ids[k], left, top, width, height, d->prob[k]);
+        }
+    }
+}
+
+/* Appends "id prob xmin ymin xmax ymax" to fps[class] for every nonzero class probability; corners are shifted by +1 and clipped to [1,w] x [1,h]. */
+void print_detector_detections(FILE **fps, char *id, detection *dets, int total, int classes, int w, int h)
+{
+    int b, k;
+    for(b = 0; b < total; ++b){
+        detection *d = &dets[b];
+        float left   = d->bbox.x - d->bbox.w/2. + 1;
+        float right  = d->bbox.x + d->bbox.w/2. + 1;
+        float top    = d->bbox.y - d->bbox.h/2. + 1;
+        float bottom = d->bbox.y + d->bbox.h/2. + 1;
+        left   = (left < 1)   ? 1 : left;
+        top    = (top < 1)    ? 1 : top;
+        right  = (right > w)  ? w : right;
+        bottom = (bottom > h) ? h : bottom;
+        for(k = 0; k < classes; ++k){
+            if(!d->prob[k]) continue;
+            fprintf(fps[k], "%s %f %f %f %f %f\n", id, d->prob[k], left, top, right, bottom);
+        }
+    }
+}
+
+/* Writes "id class prob xmin ymin xmax ymax" (class is 1-based) for every nonzero class probability; boxes are clipped to [0,w] x [0,h]. */
+void print_imagenet_detections(FILE *fp, int id, detection *dets, int total, int classes, int w, int h)
+{
+    int b, k;
+    for(b = 0; b < total; ++b){
+        detection *d = &dets[b];
+        float left   = d->bbox.x - d->bbox.w/2.;
+        float right  = d->bbox.x + d->bbox.w/2.;
+        float top    = d->bbox.y - d->bbox.h/2.;
+        float bottom = d->bbox.y + d->bbox.h/2.;
+        left   = (left < 0)   ? 0 : left;
+        top    = (top < 0)    ? 0 : top;
+        right  = (right > w)  ? w : right;
+        bottom = (bottom > h) ? h : bottom;
+        for(k = 0; k < classes; ++k){
+            if(!d->prob[k]) continue;
+            fprintf(fp, "%d %d %f %f %f %f %f\n", id, k+1, d->prob[k],
+                    left, top, right, bottom);
+        }
+    }
+}
+
+/* Runs the detector on every "valid" image together with its flip_image() copy (batch of 2) and writes detections in coco, imagenet or per-class format. */
+void validate_detector_flip(char *datacfg, char *cfgfile, char *weightfile, char *outfile)
+{
+    int j;
+    list *options = read_data_cfg(datacfg);
+    char *valid_images = option_find_str(options, "valid", "data/train.list");
+    char *name_list = option_find_str(options, "names", "data/names.list");
+    char *prefix = option_find_str(options, "results", "results");
+    char **names = get_labels(name_list);
+    char *mapf = option_find_str(options, "map", 0);
+    int *map = 0;
+    if (mapf) map = read_map(mapf);
+
+    network *net = load_network(cfgfile, weightfile, 0);
+    set_batch_network(net, 2);
+    fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
+    srand(time(0));
+
+    list *plist = get_paths(valid_images);
+    char **paths = (char **)list_to_array(plist);
+
+    layer l = net->layers[net->n-1];
+    int classes = l.classes;
+
+    char buff[1024];
+    char *type = option_find_str(options, "eval", "voc");
+    FILE *fp = 0;
+    FILE **fps = 0;
+    int coco = 0;
+    int imagenet = 0;
+    if(0==strcmp(type, "coco")){
+        if(!outfile) outfile = "coco_results";
+        snprintf(buff, 1024, "%s/%s.json", prefix, outfile);
+        fp = fopen(buff, "w");
+        fprintf(fp, "[\n");
+        coco = 1;
+    } else if(0==strcmp(type, "imagenet")){
+        if(!outfile) outfile = "imagenet-detection";
+        snprintf(buff, 1024, "%s/%s.txt", prefix, outfile);
+        fp = fopen(buff, "w");
+        imagenet = 1;
+        classes = 200;
+    } else {
+        if(!outfile) outfile = "comp4_det_test_";
+        fps = calloc(classes, sizeof(FILE *));
+        for(j = 0; j < classes; ++j){
+            snprintf(buff, 1024, "%s/%s%s.txt", prefix, outfile, names[j]);
+            fps[j] = fopen(buff, "w");
+        }
+    }
+
+    int m = plist->size;
+    int i=0;
+    int t;
+
+    float thresh = .005;
+    float nms = .45;
+
+    int nthreads = 4;
+    image *val = calloc(nthreads, sizeof(image));
+    image *val_resized = calloc(nthreads, sizeof(image));
+    image *buf = calloc(nthreads, sizeof(image));
+    image *buf_resized = calloc(nthreads, sizeof(image));
+    pthread_t *thr = calloc(nthreads, sizeof(pthread_t));
+
+    image input = make_image(net->w, net->h, net->c*2);
+
+    load_args args = {0};
+    args.w = net->w;
+    args.h = net->h;
+    args.type = LETTERBOX_DATA;
+
+    /* prime the loaders; never index past the path list when m < nthreads */
+    for(t = 0; t < nthreads && t < m; ++t){
+        args.path = paths[i+t];
+        args.im = &buf[t];
+        args.resized = &buf_resized[t];
+        thr[t] = load_data_in_thread(args);
+    }
+    double start = what_time_is_it_now();
+    for(i = nthreads; i < m+nthreads; i += nthreads){
+        fprintf(stderr, "%d\n", i);
+        for(t = 0; t < nthreads && i+t-nthreads < m; ++t){
+            pthread_join(thr[t], 0);
+            val[t] = buf[t];
+            val_resized[t] = buf_resized[t];
+        }
+        for(t = 0; t < nthreads && i+t < m; ++t){
+            args.path = paths[i+t];
+            args.im = &buf[t];
+            args.resized = &buf_resized[t];
+            thr[t] = load_data_in_thread(args);
+        }
+        for(t = 0; t < nthreads && i+t-nthreads < m; ++t){
+            char *path = paths[i+t-nthreads];
+            char *id = basecfg(path);
+            /* first half of the input holds the image, second half its flipped copy */
+            copy_cpu(net->w*net->h*net->c, val_resized[t].data, 1, input.data, 1);
+            flip_image(val_resized[t]);
+            copy_cpu(net->w*net->h*net->c, val_resized[t].data, 1, input.data + net->w*net->h*net->c, 1);
+            network_predict(net, input.data);
+            int w = val[t].w;
+            int h = val[t].h;
+            int num = 0;
+            detection *dets = get_network_boxes(net, w, h, thresh, .5, map, 0, &num);
+            if (nms) do_nms_sort(dets, num, classes, nms);
+            if (coco){
+                print_cocos(fp, path, dets, num, classes, w, h);
+            } else if (imagenet){
+                print_imagenet_detections(fp, i+t-nthreads+1, dets, num, classes, w, h);
+            } else {
+                print_detector_detections(fps, id, dets, num, classes, w, h);
+            }
+            free_detections(dets, num);
+            free(id);
+            free_image(val[t]);
+            free_image(val_resized[t]);
+        }
+    }
+    for(j = 0; j < classes; ++j){
+        if(fps) fclose(fps[j]);
+    }
+    if(coco){
+        fseek(fp, -2, SEEK_CUR); /* step back over the ",\n" that ends the last record */
+        fprintf(fp, "\n]\n");
+    }
+    if(fp) fclose(fp); /* the imagenet output was previously left open */
+    fprintf(stderr, "Total Detection Time: %f Seconds\n", what_time_is_it_now() - start);
+}
+
+
+/*
+ * Run the detector over every image listed in the data config's "valid"
+ * entry and write the detections to result files below the "results"
+ * directory.
+ *
+ * datacfg    - data config file; read for "valid", "names", "results",
+ *              "map" and "eval".
+ * cfgfile    - network config handed to load_network().
+ * weightfile - weights handed to load_network().
+ * outfile    - base name of the result file(s); when NULL a default is
+ *              chosen per "eval" type.
+ *
+ * "eval" selects the output format: "coco" writes a single JSON file,
+ * "imagenet" writes a single text file (and forces 200 classes), anything
+ * else writes one text file per class.
+ *
+ * Images are loaded by up to `nthreads` background threads: while one group
+ * of images is being evaluated, the next group is already loading.
+ * If a result file cannot be opened, a message is printed to stderr and the
+ * function returns without evaluating anything.
+ */
+void validate_detector(char *datacfg, char *cfgfile, char *weightfile, char *outfile)
+{
+    int j;
+    list *options = read_data_cfg(datacfg);
+    char *valid_images = option_find_str(options, "valid", "data/train.list");
+    char *name_list = option_find_str(options, "names", "data/names.list");
+    char *prefix = option_find_str(options, "results", "results");
+    char **names = get_labels(name_list);
+    char *mapf = option_find_str(options, "map", 0);
+    int *map = 0;
+    if (mapf) map = read_map(mapf);
+
+    network *net = load_network(cfgfile, weightfile, 0);
+    set_batch_network(net, 1);
+    fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
+    srand(time(0));
+
+    list *plist = get_paths(valid_images);
+    char **paths = (char **)list_to_array(plist);
+
+    layer l = net->layers[net->n-1];
+    int classes = l.classes;
+
+    char buff[1024];
+    char *type = option_find_str(options, "eval", "voc");
+    FILE *fp = 0;
+    FILE **fps = 0;
+    int coco = 0;
+    int imagenet = 0;
+    if(0==strcmp(type, "coco")){
+        if(!outfile) outfile = "coco_results";
+        snprintf(buff, 1024, "%s/%s.json", prefix, outfile);
+        fp = fopen(buff, "w");
+        if(!fp){
+            fprintf(stderr, "Couldn't open file: %s\n", buff);
+            return;
+        }
+        fprintf(fp, "[\n");
+        coco = 1;
+    } else if(0==strcmp(type, "imagenet")){
+        if(!outfile) outfile = "imagenet-detection";
+        snprintf(buff, 1024, "%s/%s.txt", prefix, outfile);
+        fp = fopen(buff, "w");
+        if(!fp){
+            fprintf(stderr, "Couldn't open file: %s\n", buff);
+            return;
+        }
+        imagenet = 1;
+        classes = 200;
+    } else {
+        if(!outfile) outfile = "comp4_det_test_";
+        fps = calloc(classes, sizeof(FILE *));
+        for(j = 0; j < classes; ++j){
+            snprintf(buff, 1024, "%s/%s%s.txt", prefix, outfile, names[j]);
+            fps[j] = fopen(buff, "w");
+            if(!fps[j]){
+                fprintf(stderr, "Couldn't open file: %s\n", buff);
+                /* Close whatever was opened so far before giving up. */
+                while(j-- > 0) fclose(fps[j]);
+                free(fps);
+                return;
+            }
+        }
+    }
+
+
+    int m = plist->size;
+    int i=0;
+    int t;
+
+    float thresh = .005;
+    float nms = .45;
+
+    int nthreads = 4;
+    image *val = calloc(nthreads, sizeof(image));
+    image *val_resized = calloc(nthreads, sizeof(image));
+    image *buf = calloc(nthreads, sizeof(image));
+    image *buf_resized = calloc(nthreads, sizeof(image));
+    pthread_t *thr = calloc(nthreads, sizeof(pthread_t));
+
+    load_args args = {0};
+    args.w = net->w;
+    args.h = net->h;
+    //args.type = IMAGE_DATA;
+    args.type = LETTERBOX_DATA;
+
+    /* Prefetch the first group. Bounded by m so that a list with fewer than
+     * nthreads images neither reads past the end of `paths` nor starts
+     * loader threads that the loop below would never join. */
+    for(t = 0; t < nthreads && i+t < m; ++t){
+        args.path = paths[i+t];
+        args.im = &buf[t];
+        args.resized = &buf_resized[t];
+        thr[t] = load_data_in_thread(args);
+    }
+    double start = what_time_is_it_now();
+    for(i = nthreads; i < m+nthreads; i += nthreads){
+        fprintf(stderr, "%d\n", i);
+        /* Collect the group that finished loading ... */
+        for(t = 0; t < nthreads && i+t-nthreads < m; ++t){
+            pthread_join(thr[t], 0);
+            val[t] = buf[t];
+            val_resized[t] = buf_resized[t];
+        }
+        /* ... start loading the next group ... */
+        for(t = 0; t < nthreads && i+t < m; ++t){
+            args.path = paths[i+t];
+            args.im = &buf[t];
+            args.resized = &buf_resized[t];
+            thr[t] = load_data_in_thread(args);
+        }
+        /* ... and evaluate the collected group meanwhile. */
+        for(t = 0; t < nthreads && i+t-nthreads < m; ++t){
+            char *path = paths[i+t-nthreads];
+            char *id = basecfg(path);
+            float *X = val_resized[t].data;
+            network_predict(net, X);
+            int w = val[t].w;
+            int h = val[t].h;
+            int nboxes = 0;
+            detection *dets = get_network_boxes(net, w, h, thresh, .5, map, 0, &nboxes);
+            if (nms) do_nms_sort(dets, nboxes, classes, nms);
+            if (coco){
+                print_cocos(fp, path, dets, nboxes, classes, w, h);
+            } else if (imagenet){
+                print_imagenet_detections(fp, i+t-nthreads+1, dets, nboxes, classes, w, h);
+            } else {
+                print_detector_detections(fps, id, dets, nboxes, classes, w, h);
+            }
+            free_detections(dets, nboxes);
+            free(id);
+            free_image(val[t]);
+            free_image(val_resized[t]);
+        }
+    }
+    if(fps){
+        for(j = 0; j < classes; ++j){
+            fclose(fps[j]);
+        }
+        free(fps);
+    }
+    if(coco){
+        /* Step back over the last two characters written (presumably the
+         * ",\n" that follows the final record). When nothing was written
+         * after the opening "[\n" there is nothing to strip, and seeking
+         * back would overwrite the bracket itself. */
+        if(ftell(fp) > 2) fseek(fp, -2, SEEK_CUR);
+        fprintf(fp, "\n]\n");
+    }
+    /* Covers both the coco and the imagenet result file. */
+    if(fp) fclose(fp);
+
+    /* Every loader thread has been joined by now, so the staging arrays are
+     * no longer referenced. */
+    free(val);
+    free(val_resized);
+    free(buf);
+    free(buf_resized);
+    free(thr);
+    fprintf(stderr, "Total Detection Time: %f Seconds\n", what_time_is_it_now() - start);
+}
+
+/*
+ * Measure proposal recall of a detector on the images listed in
+ * "data/coco_val_5k.list".
+ *
+ * cfgfile    - network config handed to load_network().
+ * weightfile - weights handed to load_network().
+ *
+ * For every image the ground-truth boxes are read from the matching label
+ * file (image path with "images"/"JPEGImages" replaced by "labels" and the
+ * extension replaced by ".txt"). A ground-truth box counts as recalled when
+ * some detection with objectness above `thresh` overlaps it with an IoU
+ * above `iou_thresh`. A running summary line is printed to stderr after each
+ * image.
+ */
+void validate_detector_recall(char *cfgfile, char *weightfile)
+{
+    network *net = load_network(cfgfile, weightfile, 0);
+    set_batch_network(net, 1);
+    fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
+    srand(time(0));
+
+    list *plist = get_paths("data/coco_val_5k.list");
+    char **paths = (char **)list_to_array(plist);
+
+    int j, k;
+
+    int m = plist->size;
+    int i=0;
+
+    float thresh = .001;
+    float iou_thresh = .5;
+    float nms = .4;
+
+    int total = 0;
+    int correct = 0;
+    int proposals = 0;
+    float avg_iou = 0;
+
+    for(i = 0; i < m; ++i){
+        char *path = paths[i];
+        image orig = load_image_color(path, 0, 0);
+        image sized = resize_image(orig, net->w, net->h);
+        network_predict(net, sized.data);
+        int nboxes = 0;
+        detection *dets = get_network_boxes(net, sized.w, sized.h, thresh, .5, 0, 1, &nboxes);
+        if (nms) do_nms_obj(dets, nboxes, 1, nms);
+
+        char labelpath[4096];
+        find_replace(path, "images", "labels", labelpath);
+        find_replace(labelpath, "JPEGImages", "labels", labelpath);
+        find_replace(labelpath, ".jpg", ".txt", labelpath);
+        find_replace(labelpath, ".JPEG", ".txt", labelpath);
+
+        int num_labels = 0;
+        box_label *truth = read_boxes(labelpath, &num_labels);
+        for(k = 0; k < nboxes; ++k){
+            if(dets[k].objectness > thresh){
+                ++proposals;
+            }
+        }
+        for (j = 0; j < num_labels; ++j) {
+            ++total;
+            box t = {truth[j].x, truth[j].y, truth[j].w, truth[j].h};
+            float best_iou = 0;
+            /* Only `nboxes` detections exist; iterating up to the last
+             * layer's w*h*n could run past the end of `dets`. */
+            for(k = 0; k < nboxes; ++k){
+                float iou = box_iou(dets[k].bbox, t);
+                if(dets[k].objectness > thresh && iou > best_iou){
+                    best_iou = iou;
+                }
+            }
+            avg_iou += best_iou;
+            if(best_iou > iou_thresh){
+                ++correct;
+            }
+        }
+
+        fprintf(stderr, "%5d %5d %5d\tRPs/Img: %.2f\tIOU: %.2f%%\tRecall:%.2f%%\n", i, correct, total, (float)proposals/(i+1), avg_iou*100/total, 100.*correct/total);
+        /* Release the per-image allocations so memory does not grow with
+         * the size of the validation list. */
+        free_detections(dets, nboxes);
+        free(truth);
+        free_image(orig);
+        free_image(sized);
+    }
+}
+
+
+/*
+ * Run the detector on one image (or interactively on a series of images),
+ * draw the detections onto the image and save the result.
+ *
+ * datacfg     - data config file; read for the "names" label list.
+ * cfgfile     - network config handed to load_network().
+ * weightfile  - weights handed to load_network().
+ * filename    - image to process; when NULL, paths are read from stdin
+ *               until end of input.
+ * thresh      - detection threshold passed to get_network_boxes() and
+ *               draw_detections().
+ * hier_thresh - hierarchy threshold passed to get_network_boxes().
+ * outfile     - output image name; when NULL the result is saved as
+ *               "predictions" (and shown in a window in OPENCV builds).
+ * fullscreen  - not used by this function.
+ */
+void test_detector(char *datacfg, char *cfgfile, char *weightfile, char *filename, float thresh, float hier_thresh, char *outfile, int fullscreen)
+{
+    list *options = read_data_cfg(datacfg);
+    char *name_list = option_find_str(options, "names", "data/names.list");
+    char **names = get_labels(name_list);
+
+    image **alphabet = load_alphabet();
+    network *net = load_network(cfgfile, weightfile, 0);
+    set_batch_network(net, 1);
+    srand(2222222);
+    double time;
+    char buff[256];
+    char *input = buff;
+    float nms=.45;
+    while(1){
+        if(filename){
+            /* strncpy() does not terminate the destination when the source
+             * fills it completely, so terminate explicitly. */
+            strncpy(input, filename, sizeof(buff) - 1);
+            input[sizeof(buff) - 1] = '\0';
+        } else {
+            printf("Enter Image Path: ");
+            fflush(stdout);
+            input = fgets(input, 256, stdin);
+            if(!input) return;
+            /* Cut the line at its trailing newline. */
+            strtok(input, "\n");
+        }
+        image im = load_image_color(input,0,0);
+        image sized = letterbox_image(im, net->w, net->h);
+        //image sized = resize_image(im, net->w, net->h);
+        //image sized2 = resize_max(im, net->w);
+        //image sized = crop_image(sized2, -((net->w - sized2.w)/2), -((net->h - sized2.h)/2), net->w, net->h);
+        //resize_network(net, sized.w, sized.h);
+        layer l = net->layers[net->n-1];
+
+
+        float *X = sized.data;
+        time=what_time_is_it_now();
+        network_predict(net, X);
+        printf("%s: Predicted in %f seconds.\n", input, what_time_is_it_now()-time);
+        int nboxes = 0;
+        detection *dets = get_network_boxes(net, im.w, im.h, thresh, hier_thresh, 0, 1, &nboxes);
+        //printf("%d\n", nboxes);
+        //if (nms) do_nms_obj(boxes, probs, l.w*l.h*l.n, l.classes, nms);
+        if (nms) do_nms_sort(dets, nboxes, l.classes, nms);
+        draw_detections(im, dets, nboxes, thresh, names, alphabet, l.classes);
+        free_detections(dets, nboxes);
+        if(outfile){
+            save_image(im, outfile);
+        }
+        else{
+            save_image(im, "predictions");
+#ifdef OPENCV
+            make_window("predictions", 512, 512, 0);
+            show_image(im, "predictions", 0);
+#endif
+        }
+
+        free_image(im);
+        free_image(sized);
+        /* A file given on the command line is processed exactly once. */
+        if (filename) break;
+    }
+}
+
+/*
+void censor_detector(char *datacfg, char *cfgfile, char *weightfile, int cam_index, const char *filename, int class, float thresh, int skip)
+{
+#ifdef OPENCV
+    char *base = basecfg(cfgfile);
+    network *net = load_network(cfgfile, weightfile, 0);
+    set_batch_network(net, 1);
+
+    srand(2222222);
+    CvCapture * cap;
+
+    int w = 1280;
+    int h = 720;
+
+    if(filename){
+        cap = cvCaptureFromFile(filename);
+    }else{
+        cap = cvCaptureFromCAM(cam_index);
+    }
+
+    if(w){
+        cvSetCaptureProperty(cap, CV_CAP_PROP_FRAME_WIDTH, w);
+    }
+    if(h){
+        cvSetCaptureProperty(cap, CV_CAP_PROP_FRAME_HEIGHT, h);
+    }
+
+    if(!cap) error("Couldn't connect to webcam.\n");
+    cvNamedWindow(base, CV_WINDOW_NORMAL); 
+    cvResizeWindow(base, 512, 512);
+    float fps = 0;
+    int i;
+    float nms = .45;
+
+    while(1){
+        image in = get_image_from_stream(cap);
+        //image in_s = resize_image(in, net->w, net->h);
+        image in_s = letterbox_image(in, net->w, net->h);
+        layer l = net->layers[net->n-1];
+
+        float *X = in_s.data;
+        network_predict(net, X);
+        int nboxes = 0;
+        detection *dets = get_network_boxes(net, in.w, in.h, thresh, 0, 0, 0, &nboxes);
+        //if (nms) do_nms_obj(boxes, probs, l.w*l.h*l.n, l.classes, nms);
+        if (nms) do_nms_sort(dets, nboxes, l.classes, nms);
+
+        for(i = 0; i < nboxes; ++i){
+            if(dets[i].prob[class] > thresh){
+                box b = dets[i].bbox;
+                int left  = b.x-b.w/2.;
+                int top   = b.y-b.h/2.;
+                censor_image(in, left, top, b.w, b.h);
+            }
+        }
+        show_image(in, base);
+        cvWaitKey(10);
+        free_detections(dets, nboxes);
+
+
+        free_image(in_s);
+        free_image(in);
+
+
+        float curr = 0;
+        fps = .9*fps + .1*curr;
+        for(i = 0; i < skip; ++i){
+            image in = get_image_from_stream(cap);
+            free_image(in);
+        }
+    }
+    #endif
+}
+
+void extract_detector(char *datacfg, char *cfgfile, char *weightfile, int cam_index, const char *filename, int class, float thresh, int skip)
+{
+#ifdef OPENCV
+    char *base = basecfg(cfgfile);
+    network *net = load_network(cfgfile, weightfile, 0);
+    set_batch_network(net, 1);
+
+    srand(2222222);
+    CvCapture * cap;
+
+    int w = 1280;
+    int h = 720;
+
+    if(filename){
+        cap = cvCaptureFromFile(filename);
+    }else{
+        cap = cvCaptureFromCAM(cam_index);
+    }
+
+    if(w){
+        cvSetCaptureProperty(cap, CV_CAP_PROP_FRAME_WIDTH, w);
+    }
+    if(h){
+        cvSetCaptureProperty(cap, CV_CAP_PROP_FRAME_HEIGHT, h);
+    }
+
+    if(!cap) error("Couldn't connect to webcam.\n");
+    cvNamedWindow(base, CV_WINDOW_NORMAL); 
+    cvResizeWindow(base, 512, 512);
+    float fps = 0;
+    int i;
+    int count = 0;
+    float nms = .45;
+
+    while(1){
+        image in = get_image_from_stream(cap);
+        //image in_s = resize_image(in, net->w, net->h);
+        image in_s = letterbox_image(in, net->w, net->h);
+        layer l = net->layers[net->n-1];
+
+        show_image(in, base);
+
+        int nboxes = 0;
+        float *X = in_s.data;
+        network_predict(net, X);
+        detection *dets = get_network_boxes(net, in.w, in.h, thresh, 0, 0, 1, &nboxes);
+        //if (nms) do_nms_obj(boxes, probs, l.w*l.h*l.n, l.classes, nms);
+        if (nms) do_nms_sort(dets, nboxes, l.classes, nms);
+
+        for(i = 0; i < nboxes; ++i){
+            if(dets[i].prob[class] > thresh){
+                box b = dets[i].bbox;
+                int size = b.w*in.w > b.h*in.h ? b.w*in.w : b.h*in.h;
+                int dx  = b.x*in.w-size/2.;
+                int dy  = b.y*in.h-size/2.;
+                image bim = crop_image(in, dx, dy, size, size);
+                char buff[2048];
+                sprintf(buff, "results/extract/%07d", count);
+                ++count;
+                save_image(bim, buff);
+                free_image(bim);
+            }
+        }
+        free_detections(dets, nboxes);
+
+
+        free_image(in_s);
+        free_image(in);
+
+
+        float curr = 0;
+        fps = .9*fps + .1*curr;
+        for(i = 0; i < skip; ++i){
+            image in = get_image_from_stream(cap);
+            free_image(in);
+        }
+    }
+    #endif
+}
+*/
+
+/*
+void network_detect(network *net, image im, float thresh, float hier_thresh, float nms, detection *dets)
+{
+    network_predict_image(net, im);
+    layer l = net->layers[net->n-1];
+    int nboxes = num_boxes(net);
+    fill_network_boxes(net, im.w, im.h, thresh, hier_thresh, 0, 0, dets);
+    if (nms) do_nms_sort(dets, nboxes, l.classes, nms);
+}
+*/
+
+/*
+ * Command-line entry point for the "detector" sub-command.
+ *
+ * Expected layout: argv[2] is the mode (test / train / valid / valid2 /
+ * recall / demo), argv[3] the data config, argv[4] the network config,
+ * argv[5] the optional weights file and argv[6] an optional input file.
+ * Flags (-prefix, -thresh, -hier, -c, -s, -avg, -gpus, -out, -clear,
+ * -fullscreen, -w, -h, -fps) are looked up through the find_*_arg()
+ * helpers.
+ */
+void run_detector(int argc, char **argv)
+{
+    char *prefix = find_char_arg(argc, argv, "-prefix", 0);
+    float thresh = find_float_arg(argc, argv, "-thresh", .5);
+    float hier_thresh = find_float_arg(argc, argv, "-hier", .5);
+    int cam_index = find_int_arg(argc, argv, "-c", 0);
+    int frame_skip = find_int_arg(argc, argv, "-s", 0);
+    int avg = find_int_arg(argc, argv, "-avg", 3);
+    /* argv[3] (data) and argv[4] (cfg) are both read unconditionally below,
+     * so at least five arguments are required. */
+    if(argc < 5){
+        fprintf(stderr, "usage: %s %s [train/test/valid] [data] [cfg] [weights (optional)]\n", argv[0], argv[1]);
+        return;
+    }
+    char *gpu_list = find_char_arg(argc, argv, "-gpus", 0);
+    char *outfile = find_char_arg(argc, argv, "-out", 0);
+    int *gpus = 0;
+    int *gpus_owned = 0;    /* heap array to release before returning, if any */
+    int gpu = 0;
+    int ngpus = 0;
+    if(gpu_list){
+        printf("%s\n", gpu_list);
+        int len = strlen(gpu_list);
+        ngpus = 1;
+        int i;
+        for(i = 0; i < len; ++i){
+            if (gpu_list[i] == ',') ++ngpus;
+        }
+        gpus_owned = calloc(ngpus, sizeof(int));
+        gpus = gpus_owned;
+        char *cursor = gpu_list;
+        for(i = 0; i < ngpus; ++i){
+            gpus[i] = atoi(cursor);
+            /* There is no comma after the last id; stop instead of doing
+             * pointer arithmetic on a NULL result. */
+            char *comma = strchr(cursor, ',');
+            if(!comma) break;
+            cursor = comma + 1;
+        }
+    } else {
+        gpu = gpu_index;
+        gpus = &gpu;
+        ngpus = 1;
+    }
+
+    int clear = find_arg(argc, argv, "-clear");
+    int fullscreen = find_arg(argc, argv, "-fullscreen");
+    int width = find_int_arg(argc, argv, "-w", 0);
+    int height = find_int_arg(argc, argv, "-h", 0);
+    int fps = find_int_arg(argc, argv, "-fps", 0);
+    //int class = find_int_arg(argc, argv, "-class", 0);
+
+    char *datacfg = argv[3];
+    char *cfg = argv[4];
+    char *weights = (argc > 5) ? argv[5] : 0;
+    char *filename = (argc > 6) ? argv[6]: 0;
+    if(0==strcmp(argv[2], "test")) test_detector(datacfg, cfg, weights, filename, thresh, hier_thresh, outfile, fullscreen);
+    else if(0==strcmp(argv[2], "train")) train_detector(datacfg, cfg, weights, gpus, ngpus, clear);
+    else if(0==strcmp(argv[2], "valid")) validate_detector(datacfg, cfg, weights, outfile);
+    else if(0==strcmp(argv[2], "valid2")) validate_detector_flip(datacfg, cfg, weights, outfile);
+    else if(0==strcmp(argv[2], "recall")) validate_detector_recall(cfg, weights);
+    else if(0==strcmp(argv[2], "demo")) {
+        list *options = read_data_cfg(datacfg);
+        int classes = option_find_int(options, "classes", 20);
+        char *name_list = option_find_str(options, "names", "data/names.list");
+        char **names = get_labels(name_list);
+        demo(cfg, weights, thresh, cam_index, filename, names, classes, frame_skip, prefix, avg, hier_thresh, width, height, fps, fullscreen);
+    }
+    //else if(0==strcmp(argv[2], "extract")) extract_detector(datacfg, cfg, weights, cam_index, filename, class, thresh, frame_skip);
+    //else if(0==strcmp(argv[2], "censor")) censor_detector(datacfg, cfg, weights, cam_index, filename, class, thresh, frame_skip);
+
+    /* free(NULL) is a no-op, so this is safe when no -gpus list was given. */
+    free(gpus_owned);
+}
diff --git a/image.darknet/inst/include/darknet/examples/detector.py b/image.darknet/inst/include/darknet/examples/detector.py
new file mode 100644
index 0000000..40bb365
--- /dev/null
+++ b/image.darknet/inst/include/darknet/examples/detector.py
@@ -0,0 +1,27 @@
+# Example: load a detector through the darknet Python bindings and run it on
+# a few sample images.
+#
+# The bindings are expected in ./python relative to the current working
+# directory, so that directory is appended to the module search path before
+# importing them. Alternatively, place darknet.py somewhere that is already
+# on your Python path. This is a work in progress; use at your own risk.
+
+import sys, os
+sys.path.append(os.path.join(os.getcwd(),'python/'))
+
+import darknet as dn
+import pdb
+
+dn.set_gpu(0)
+net = dn.load_net("cfg/yolo-thor.cfg", "/home/pjreddie/backup/yolo-thor_final.weights", 0)
+meta = dn.load_meta("cfg/thor.data")
+r = dn.detect(net, meta, "data/bedroom.jpg")
+# print(...) with a single argument behaves the same on Python 2 and 3.
+print(r)
+
+# The loaded network can be reused for any number of further images:
+for image_path in ("data/eagle.jpg", "data/giraffe.jpg", "data/horses.jpg", "data/person.jpg"):
+    r = dn.detect(net, meta, image_path)
+    print(r)
+
diff --git a/image.darknet/inst/include/darknet/src/dice.c b/image.darknet/inst/include/darknet/examples/dice.c
similarity index 95%
rename from image.darknet/inst/include/darknet/src/dice.c
rename to image.darknet/inst/include/darknet/examples/dice.c
index 2286459..f56d76c 100644
--- a/image.darknet/inst/include/darknet/src/dice.c
+++ b/image.darknet/inst/include/darknet/examples/dice.c
@@ -1,6 +1,4 @@
-#include "network.h"
-#include "utils.h"
-#include "parser.h"
+#include "darknet.h"
 
 char *dice_labels[] = {"face1","face2","face3","face4","face5","face6"};
 
@@ -33,7 +31,7 @@ void train_dice(char *cfgfile, char *weightfile)
         float loss = train_network(net, train);
         if(avg_loss == -1) avg_loss = loss;
         avg_loss = avg_loss*.9 + loss*.1;
-        printf("%d: %f, %f avg, %lf seconds, %d images\n", i, loss, avg_loss, sec(clock()-time), *net.seen);
+        printf("%d: %f, %f avg, %lf seconds, %ld images\n", i, loss, avg_loss, sec(clock()-time), *net.seen);
         free_data(train);
         if((i % 100) == 0) net.learning_rate *= .1;
         if(i%100==0){
diff --git a/image.darknet/inst/include/darknet/examples/go.c b/image.darknet/inst/include/darknet/examples/go.c
new file mode 100644
index 0000000..688579d
--- /dev/null
+++ b/image.darknet/inst/include/darknet/examples/go.c
@@ -0,0 +1,1370 @@
+#include "darknet.h"
+
+#include <assert.h>
+#include <math.h>
+#include <unistd.h>
+
+/* When non-zero, print_board() labels the rows 19 down to 1 instead of 1 up to 19. */
+int inverted = 1;
+/* When non-zero, print_board() shifts the column letters after 'H' by one,
+ * so the letter 'I' is never used as a column label. */
+int noi = 1;
+/* Number of entries print_board() scans in its optional `indexes` array. */
+static const int nind = 10;
+/* Forward declarations; the definitions are not part of this section. */
+int legal_go(float *b, float *ko, int p, int r, int c);
+int check_ko(float *x, float *ko);
+
+/* A set of move records as produced by load_go_moves(). */
+typedef struct {
+    char **data;    /* array of n heap-allocated records (one per fgetgo() read) */
+    int n;          /* number of records in `data` */
+} moves;
+
+/*
+ * Read the next fixed-size (96 byte) record from `fp`.
+ *
+ * Returns a freshly malloc'ed buffer holding the record, which the caller
+ * must free, or 0 when the stream is already at end-of-file or fewer than
+ * 96 bytes could be read.
+ */
+char *fgetgo(FILE *fp)
+{
+    const size_t record_len = 96;
+    char *record = 0;
+
+    if(!feof(fp)){
+        record = malloc(record_len*sizeof(char));
+        size_t got = fread(record, sizeof(char), record_len, fp);
+        if(got != record_len){
+            /* Short read: discard the partial record. */
+            free(record);
+            record = 0;
+        }
+    }
+    return record;
+}
+
+/*
+ * Load every fixed-size move record from `filename`.
+ *
+ * Returns a `moves` value whose `data` array holds one heap-allocated
+ * record per entry and whose `n` is the number of records read. The record
+ * count is also printed to stdout.
+ *
+ * If the file cannot be opened, a message is printed to stderr and an empty
+ * set (n == 0, data == 0) is returned. The same empty set is returned for a
+ * file that contains no complete record.
+ */
+moves load_go_moves(char *filename)
+{
+    moves m;
+    m.n = 128;
+    m.data = calloc(128, sizeof(char*));
+    FILE *fp = fopen(filename, "rb");
+    if(!fp){
+        fprintf(stderr, "Couldn't open file: %s\n", filename);
+        free(m.data);
+        m.data = 0;
+        m.n = 0;
+        return m;
+    }
+    int count = 0;
+    char *line = 0;
+    while ((line = fgetgo(fp))) {
+        if (count >= m.n) {
+            /* Grow geometrically so appending stays amortised O(1). */
+            m.n *= 2;
+            m.data = realloc(m.data, m.n*sizeof(char*));
+        }
+        m.data[count] = line;
+        ++count;
+    }
+    fclose(fp);
+    printf("%d\n", count);
+    m.n = count;
+    if(count == 0){
+        /* realloc(ptr, 0) has implementation-defined behaviour; release the
+         * array explicitly instead. */
+        free(m.data);
+        m.data = 0;
+    } else {
+        /* Shrink the array to the number of records actually read. */
+        m.data = realloc(m.data, count*sizeof(char*));
+    }
+    return m;
+}
+
+/*
+ * Unpack a bit-packed position into two 19x19 float planes.
+ *
+ * s     - packed position: four points per byte, two bits per point. The
+ *         lower bit of a pair marks the first plane, the upper bit the
+ *         second plane. 91 bytes are read.
+ * board - output, 2*19*19 floats. It is cleared first; a point is then set
+ *         to 1 in the first plane if its lower bit is set, otherwise in the
+ *         second plane if its upper bit is set.
+ */
+void string_to_board(char *s, float *board)
+{
+    const int points = 19*19;
+    int p;
+
+    memset(board, 0, 2*points*sizeof(float));
+    for(p = 0; p < points; ++p){
+        char packed = s[p/4];
+        int shift = 2*(p%4);
+        if((packed >> shift) & 1){
+            board[p] = 1;
+        } else if((packed >> (shift + 1)) & 1){
+            board[p + points] = 1;
+        }
+    }
+}
+
+/*
+ * Pack two 19x19 float planes into a bit string (inverse layout of
+ * string_to_board()).
+ *
+ * s     - output, 19*19/4+1 (= 91) bytes. It is cleared first; each byte
+ *         then carries four points at two bits per point.
+ * board - input planes. A point whose first-plane value equals 1 sets the
+ *         lower bit of its pair, a second-plane value equal to 1 sets the
+ *         upper bit.
+ */
+void board_to_string(char *s, float *board)
+{
+    const int points = 19*19;
+    int p;
+
+    memset(s, 0, (points/4+1)*sizeof(char));
+    for(p = 0; p < points; ++p){
+        char *cell = s + p/4;
+        int shift = 2*(p%4);
+        if(board[p] == 1) *cell = *cell | (1<<shift);
+        if(board[p + points] == 1) *cell = *cell | (1<<(shift + 1));
+    }
+}
+
+/*
+ * Report who holds point `i` of board `b`: 1 if the first plane is non-zero
+ * there, -1 if only the second plane is non-zero, 0 if neither is.
+ */
+static int occupied(float *b, int i)
+{
+    return b[i] ? 1 : (b[i+19*19] ? -1 : 0);
+}
+
+/*
+ * Build a training batch of `n` samples drawn at random (with replacement)
+ * from the move set `m`.
+ *
+ * Each input row holds three 19x19 planes: the two planes unpacked from the
+ * record, plus a third plane filled with 1 when the record's player field
+ * is positive. Each label row holds 19*19+2 values: a 1 at the move's point
+ * (or at index 19*19 when row or column is 19 or more -- presumably a
+ * pass), and at index 19*19+1 whether the player field equals the result
+ * field. Every sample is then randomly flipped and rotated, input and label
+ * alike.
+ */
+data random_go_moves(moves m, int n)
+{
+    data batch = {0};
+    batch.X = make_matrix(n, 19*19*3);
+    batch.y = make_matrix(n, 19*19+2);
+    int s, k;
+    for(s = 0; s < n; ++s){
+        float *planes = batch.X.vals[s];
+        float *target = batch.y.vals[s];
+        char *rec = m.data[rand()%m.n];
+
+        /* Record header: player, result, row, column; the packed position
+         * starts at byte 4. */
+        int player = rec[0] - '0';
+        int result = rec[1] - '0';
+        int row = rec[2];
+        int col = rec[3];
+        string_to_board(rec+4, planes);
+        if(player > 0){
+            for(k = 0; k < 19*19; ++k){
+                planes[19*19*2 + k] = 1;
+            }
+        }
+        target[19*19+1] = (player==result);
+
+        int off_board = (row >= 19 || col >= 19);
+        if(off_board){
+            target[19*19] = 1;
+        } else {
+            int point = col + 19*row;
+            target[point] = 1;
+            if(occupied(planes, point)) printf("hey\n");
+        }
+
+        /* Apply the same random flip/rotation to input and label. */
+        int flip = rand()%2;
+        int rotate = rand()%4;
+        image in = float_to_image(19, 19, 3, planes);
+        image out = float_to_image(19, 19, 1, target);
+        if(flip){
+            flip_image(in);
+            flip_image(out);
+        }
+        rotate_image_cw(in, rotate);
+        rotate_image_cw(out, rotate);
+    }
+    return batch;
+}
+
+
+/*
+ * Train a network on Go move records.
+ *
+ * cfgfile    - network config handed to load_network().
+ * weightfile - initial weights handed to load_network().
+ * filename   - file of move records, read with load_go_moves().
+ * gpus       - device ids, one per network replica (used in GPU builds).
+ * ngpus      - number of replicas to load.
+ * clear      - passed through to load_network().
+ *
+ * Trains until get_current_batch() reaches net->max_batches (forever when
+ * max_batches is 0), writing checkpoints into a hard-coded backup directory.
+ */
+void train_go(char *cfgfile, char *weightfile, char *filename, int *gpus, int ngpus, int clear)
+{
+    int i;
+    float avg_loss = -1;
+    char *base = basecfg(cfgfile);
+    printf("%s\n", base);
+    printf("%d\n", ngpus);
+    network **nets = calloc(ngpus, sizeof(network*));
+
+    srand(time(0));
+    int seed = rand();
+    for(i = 0; i < ngpus; ++i){
+        /* Re-seed before every load so that each replica is loaded under
+         * the same random state. */
+        srand(seed);
+#ifdef GPU
+        cuda_set_device(gpus[i]);
+#endif
+        nets[i] = load_network(cfgfile, weightfile, clear);
+        /* The learning rate is scaled by the number of replicas. */
+        nets[i]->learning_rate *= ngpus;
+    }
+    network *net = nets[0];
+    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
+
+    /* NOTE(review): hard-coded path; it already ends in '/', and the format
+     * strings below add another '/'. */
+    char *backup_directory = "/home/pjreddie/backup/";
+
+    char buff[256];
+    moves m = load_go_moves(filename);
+    //moves m = load_go_moves("games.txt");
+
+    int N = m.n;
+    printf("Moves: %d\n", N);
+    /* An "epoch" here is N images seen, N being the number of records. */
+    int epoch = (*net->seen)/N;
+    while(get_current_batch(net) < net->max_batches || net->max_batches == 0){
+        double time=what_time_is_it_now();
+
+        /* One step consumes batch*subdivisions samples per replica. */
+        data train = random_go_moves(m, net->batch*net->subdivisions*ngpus);
+        printf("Loaded: %lf seconds\n", what_time_is_it_now() - time);
+        time=what_time_is_it_now();
+
+        float loss = 0;
+#ifdef GPU
+        if(ngpus == 1){
+            loss = train_network(net, train);
+        } else {
+            loss = train_networks(nets, ngpus, train, 10);
+        }
+#else
+        loss = train_network(net, train);
+#endif
+        free_data(train);
+
+        /* Exponential moving average of the loss, seeded with the first value. */
+        if(avg_loss == -1) avg_loss = loss;
+        avg_loss = avg_loss*.95 + loss*.05;
+        printf("%ld, %.3f: %f, %f avg, %f rate, %lf seconds, %ld images\n", get_current_batch(net), (float)(*net->seen)/N, loss, avg_loss, get_current_rate(net), what_time_is_it_now()-time, *net->seen);
+        /* Checkpoint once per completed epoch ... */
+        if(*net->seen/N > epoch){
+            epoch = *net->seen/N;
+            char buff[256];
+            sprintf(buff, "%s/%s_%d.weights", backup_directory,base, epoch);
+            save_weights(net, buff);
+
+        }
+        /* ... overwrite a rolling backup every 1000 batches ... */
+        if(get_current_batch(net)%1000 == 0){
+            char buff[256];
+            sprintf(buff, "%s/%s.backup",backup_directory,base);
+            save_weights(net, buff);
+        }
+        /* ... and keep a numbered backup every 10000 batches. */
+        if(get_current_batch(net)%10000 == 0){
+            char buff[256];
+            sprintf(buff, "%s/%s_%ld.backup",backup_directory,base,get_current_batch(net));
+            save_weights(net, buff);
+        }
+    }
+    sprintf(buff, "%s/%s.weights", backup_directory, base);
+    save_weights(net, buff);
+
+    /* NOTE(review): only nets[0] is released here; the other replicas, the
+     * `nets` array and the loaded move records are not freed. */
+    free_network(net);
+    free(base);
+}
+
+/*
+ * Flood-fill helper for calculate_liberties(): starting at (row, col), visit
+ * every connected point held by `side` that has not been visited yet and
+ * add one to its entry in `lib`.
+ *
+ * board   - position queried through occupied().
+ * lib     - per-point counters, incremented in place.
+ * visited - per-point flags, set for every point reached.
+ */
+static void propagate_liberty(float *board, int *lib, int *visited, int row, int col, int side)
+{
+    static const int step_row[4] = {1, -1, 0, 0};
+    static const int step_col[4] = {0, 0, 1, -1};
+    int d;
+
+    int on_board = (row >= 0 && row <= 18 && col >= 0 && col <= 18);
+    if (!on_board) return;
+
+    int index = row*19 + col;
+    if (occupied(board,index) != side || visited[index]) return;
+
+    visited[index] = 1;
+    ++lib[index];
+    for(d = 0; d < 4; ++d){
+        propagate_liberty(board, lib, visited, row + step_row[d], col + step_col[d], side);
+    }
+}
+
+
+/*
+ * Count, for every stone on `board`, how many empty points border the
+ * connected group it belongs to.
+ *
+ * Returns a calloc'ed array of 19*19 ints which the caller must free.
+ * Entries for empty points stay 0.
+ */
+static int *calculate_liberties(float *board)
+{
+    int *lib = calloc(19*19, sizeof(int));
+    int visited[19*19];
+    int index;
+
+    for(index = 0; index < 19*19; ++index){
+        int row = index/19;
+        int col = index%19;
+
+        /* Fresh visit flags per point, so a group touching the same empty
+         * point from several sides is counted once for it. */
+        memset(visited, 0, sizeof(visited));
+        if(occupied(board,index)) continue;
+
+        if(col > 0  && occupied(board,index - 1)){
+            propagate_liberty(board, lib, visited, row, col-1, occupied(board,index-1));
+        }
+        if(col < 18 && occupied(board,index + 1)){
+            propagate_liberty(board, lib, visited, row, col+1, occupied(board,index+1));
+        }
+        if(row > 0  && occupied(board,index - 19)){
+            propagate_liberty(board, lib, visited, row-1, col, occupied(board,index-19));
+        }
+        if(row < 18 && occupied(board,index + 19)){
+            propagate_liberty(board, lib, visited, row+1, col, occupied(board,index+19));
+        }
+    }
+    return lib;
+}
+
+/*
+ * Write a text rendering of `board` to `stream`.
+ *
+ * player  - side drawn as " X"; the opposite side is drawn as " O" and
+ *           empty points as " .".
+ * indexes - optional array of `nind` point indices; a point found at
+ *           position k is drawn as the number k+1 instead of a stone.
+ *           Pass 0 for none.
+ *
+ * Column letters skip one letter after 'H' when `noi` is set; row numbers
+ * run 19..1 when `inverted` is set and 1..19 otherwise.
+ * (Earlier experiments with Unicode digits and circle glyphs are not kept.)
+ */
+void print_board(FILE *stream, float *board, int player, int *indexes)
+{
+    int row, col, k;
+
+    /* Column header. */
+    fprintf(stream, "   ");
+    for(col = 0; col < 19; ++col){
+        int letter = 'A' + col;
+        if(col > 7 && noi) letter += 1;
+        fprintf(stream, "%c ", letter);
+    }
+    fprintf(stream, "\n");
+
+    for(row = 0; row < 19; ++row){
+        int label = inverted ? 19-row : row+1;
+        fprintf(stream, "%2d", label);
+        for(col = 0; col < 19; ++col){
+            int index = row*19 + col;
+            int marked = 0;
+            for(k = 0; indexes && k < nind; ++k){
+                if(indexes[k] != index) continue;
+                marked = 1;
+                fprintf(stream, " %d", k+1);
+            }
+            if(marked) continue;
+
+            int stone = occupied(board, index);
+            if(stone == player)       fprintf(stream, " X");
+            else if(stone == -player) fprintf(stream, " O");
+            else                      fprintf(stream, " .");
+        }
+        fprintf(stream, "\n");
+    }
+}
+
+/*
+ * Swap the two stone planes of a three-plane board in place and replace
+ * every value v of the third plane by 1 - v.
+ */
+void flip_board(float *board)
+{
+    const int points = 19*19;
+    float *first = board;
+    float *second = board + points;
+    float *third = board + 2*points;
+    int p;
+
+    for(p = 0; p < points; ++p){
+        float held = first[p];
+        first[p] = second[p];
+        second[p] = held;
+        third[p] = 1-third[p];
+    }
+}
+
+/*
+ * Run the network on `board` and write the predicted move scores to `move`.
+ *
+ * net   - network to evaluate.
+ * board - input position; occupied points are read from the first two
+ *         19x19 planes. The buffer is rotated/flipped in place temporarily
+ *         when `multi` is set.
+ * move  - output; the first 19*19+1 entries are written.
+ * multi - when non-zero, seven further passes over rotated/flipped views of
+ *         the board are added and the sum is divided by 8.
+ *
+ * Returns output[19*19 + 1] of the network (averaged over the 8 passes when
+ * `multi` is set). Scores of already occupied points are forced to 0.
+ */
+float predict_move2(network *net, float *board, float *move, int multi)
+{
+    float *output = network_predict(net, board);
+    copy_cpu(19*19+1, output, 1, move, 1);
+    float result = output[19*19 + 1];
+    int i;
+    if(multi){
+        /* `bim` wraps `board` itself, so transforming it changes what the
+         * network sees on the next network_predict() call. */
+        image bim = float_to_image(19, 19, 3, board);
+        for(i = 1; i < 8; ++i){
+            rotate_image_cw(bim, i);
+            if(i >= 4) flip_image(bim);
+
+            float *output = network_predict(net, board);
+            /* `oim` wraps the prediction so it can be transformed back in
+             * place before being accumulated. */
+            image oim = float_to_image(19, 19, 1, output);
+            result += output[19*19 + 1];
+
+            if(i >= 4) flip_image(oim);
+            rotate_image_cw(oim, -i);
+
+            axpy_cpu(19*19+1, 1, output, 1, move, 1);
+
+            /* Undo the input transform so `board` is restored. */
+            if(i >= 4) flip_image(bim);
+            rotate_image_cw(bim, -i);
+        }
+        result = result/8;
+        scal_cpu(19*19+1, 1./8., move, 1);
+    }
+    /* Never suggest a point that already holds a stone. */
+    for(i = 0; i < 19*19; ++i){
+        if(board[i] || board[i+19*19]) move[i] = 0;
+    }
+    return result;
+}
+
+/*
+ * Starting at (r, c), clear every connected stone of side `p` whose entry
+ * in `lib` equals 1. Both stone planes of `b` are zeroed at each removed
+ * point. Points off the board, points not held by `p` and points with any
+ * other `lib` value stop the recursion.
+ */
+static void remove_connected(float *b, int *lib, int p, int r, int c)
+{
+    static const int step_r[4] = {1, -1, 0, 0};
+    static const int step_c[4] = {0, 0, 1, -1};
+    int d;
+
+    int on_board = (r >= 0 && r < 19 && c >= 0 && c < 19);
+    if (!on_board) return;
+
+    int index = r*19 + c;
+    if (occupied(b, index) != p || lib[index] != 1) return;
+
+    b[index] = 0;
+    b[19*19 + index] = 0;
+    for(d = 0; d < 4; ++d){
+        remove_connected(b, lib, p, r + step_r[d], c + step_c[d]);
+    }
+}
+
+
+/*
+ * Place a stone for side `p` at (r, c) on board `b` and remove neighbouring
+ * groups of the other side via remove_connected().
+ *
+ * The liberty counts are taken before the stone is placed, so
+ * remove_connected() judges the neighbours by their pre-move counts.
+ * A positive `p` writes to the first plane, any other value to the second.
+ */
+void move_go(float *b, int p, int r, int c)
+{
+    int *liberties = calculate_liberties(b);
+    float *plane = (p > 0) ? b : b + 19*19;
+    int opponent = -p;
+
+    plane[r*19 + c] = 1;
+    remove_connected(b, liberties, opponent, r+1, c);
+    remove_connected(b, liberties, opponent, r-1, c);
+    remove_connected(b, liberties, opponent, r, c+1);
+    remove_connected(b, liberties, opponent, r, c-1);
+    free(liberties);
+}
+
+/*
+ * Return 1 when the two three-plane boards `a` and `b` are byte-for-byte
+ * identical, 0 otherwise.
+ */
+int compare_board(float *a, float *b)
+{
+    const size_t bytes = 19*19*3*sizeof(float);
+    return (memcmp(a, b, bytes) == 0) ? 1 : 0;
+}
+
+/*
+ * One node of the Monte Carlo search tree.  All per-move arrays hold
+ * 19*19+1 entries; select_mcts() treats index 19*19 as "pass".
+ */
+typedef struct mcts_tree{
+    float *board;                /* position for this node; released by free_mcts() */
+    struct mcts_tree **children; /* child per move index, NULL until expanded */
+    float *prior;                /* network move scores copied in expand(); zeroed for occupied/illegal points */
+    int *visit_count;            /* times each move was selected by select_mcts() */
+    float *value;                /* sum of the values backed up through each move */
+    float *mean;                 /* value / visit_count; initialised to `result` in expand() */
+    float *prob;                 /* selection score recomputed on every select_mcts() call */
+    int total_count;             /* starts at 1 in expand(), incremented per selection */
+    float result;                /* 2*pred[19*19 + 1] - 1 as computed in expand() */
+    int done;                    /* set by select_mcts() on a pass node whose parent was also a pass node */
+    int pass;                    /* set by select_mcts() when this node was reached by a pass */
+} mcts_tree;
+
+/*
+ * Releases a search tree: the node's board, every child subtree, all
+ * per-move arrays and finally the node itself.  A NULL `root` is a no-op.
+ */
+void free_mcts(mcts_tree *root)
+{
+    int k;
+    if(root == 0) return;
+
+    free(root->board);
+    for(k = 0; k <= 19*19; ++k){
+        /* the NULL guard above makes an explicit child check unnecessary */
+        free_mcts(root->children[k]);
+    }
+    free(root->children);
+    free(root->prior);
+    free(root->visit_count);
+    free(root->value);
+    free(root->mean);
+    free(root->prob);
+    free(root);
+}
+
+/*
+ * Builds a batch of net->batch transformed copies of the board `next`,
+ * runs the network once on the whole batch, transforms each per-sample
+ * output back and averages them into the first sample's slot.
+ *
+ * Returns `pred`, the pointer handed back by network_predict() (not a fresh
+ * allocation made here), whose first 19*19+2 floats hold the average.
+ *
+ * NOTE(review): inds comes from random_index_order(0, 8) and is indexed up to
+ * net->batch - 1, so this looks like it assumes net->batch <= 8 -- confirm.
+ */
+float *network_predict_rotations(network *net, float *next)
+{
+    int n = net->batch;
+    float *in = calloc(19*19*3*n, sizeof(float));
+    image im = float_to_image(19, 19, 3, next);
+    int i,j;
+    int *inds = random_index_order(0, 8);
+    for(j = 0; j < n; ++j){
+        i = inds[j];
+        // transform `next` in place, snapshot it into batch slot j, then undo the transform
+        rotate_image_cw(im, i);
+        if(i >= 4) flip_image(im);
+        memcpy(in + 19*19*3*j, im.data, 19*19*3*sizeof(float));
+        if(i >= 4) flip_image(im);
+        rotate_image_cw(im, -i);
+    }
+    float *pred = network_predict(net, in);
+    for(j = 0; j < n; ++j){
+        i = inds[j];
+        // each sample's output occupies 19*19 + 2 floats; this `im` shadows the outer one
+        image im = float_to_image(19, 19, 1, pred + j*(19*19 + 2));
+        if(i >= 4) flip_image(im);
+        rotate_image_cw(im, -i);
+        if(j > 0){
+            // accumulate samples 1..n-1 into sample 0, which is the returned buffer
+            axpy_cpu(19*19+2, 1, im.data, 1, pred, 1);
+        }
+    }
+    free(in);
+    free(inds);
+    scal_cpu(19*19+2, 1./n, pred, 1);
+    return pred;
+}
+
+/*
+ * Allocates a search node for the position `next` (the node takes the pointer
+ * as its board) and seeds it from network_predict_rotations(): priors come
+ * from the first 19*19+1 outputs and `result` is 2*pred[19*19 + 1] - 1.
+ * Every move starts with mean == result, except occupied points, which get
+ * value = mean = -1 and prior = 0.  `ko` is accepted but not used here.
+ */
+mcts_tree *expand(float *next, float *ko, network *net)
+{
+    const int moves = 19*19 + 1;
+    int k;
+    mcts_tree *node = calloc(1, sizeof(mcts_tree));
+
+    node->board = next;
+    node->children = calloc(moves, sizeof(mcts_tree*));
+    node->prior = calloc(moves, sizeof(float));
+    node->prob = calloc(moves, sizeof(float));
+    node->mean = calloc(moves, sizeof(float));
+    node->value = calloc(moves, sizeof(float));
+    node->visit_count = calloc(moves, sizeof(int));
+    node->total_count = 1;
+
+    float *pred = network_predict_rotations(net, next);
+    copy_cpu(moves, pred, 1, node->prior, 1);
+    float val = 2*pred[19*19 + 1] - 1;
+    node->result = val;
+
+    for(k = 0; k < moves; ++k){
+        int taken = (k < 19*19) && occupied(next, k);
+        node->visit_count[k] = 0;
+        if(taken){
+            node->value[k] = -1;
+            node->mean[k] = -1;
+            node->prior[k] = 0;
+        } else {
+            node->value[k] = 0;
+            node->mean[k] = val;
+        }
+    }
+    return node;
+}
+
+/*
+ * Returns a newly allocated copy of the 19*19*3-float board; the caller owns
+ * the returned buffer.
+ */
+float *copy_board(float *board)
+{
+    const int n = 19*19*3;
+    float *dup = calloc(n, sizeof(float));
+    copy_cpu(n, board, 1, dup, 1);
+    return dup;
+}
+
+/*
+ * Performs one search descent from `root`.
+ *
+ * Every move is scored as mean + cpuct * prior * sqrt(total_count) /
+ * (1 + visit_count) and the best one is followed.  If its child exists the
+ * descent recurses into it; otherwise the move is checked with legal_go()
+ * against `prev` (the parent position, used for the ko test) and either
+ * marked illegal and the selection retried, or a new child is created with
+ * expand() and its negated result used as the value.
+ *
+ * The value is accumulated into value[i]/mean[i] and its negation returned.
+ * For a `done` node the negated stored result is returned immediately.
+ */
+float select_mcts(mcts_tree *root, network *net, float *prev, float cpuct)
+{
+    if(root->done) return -root->result;
+    int i;
+    float max = -1000;
+    int max_i = 0;
+    for(i = 0; i < 19*19+1; ++i){
+        root->prob[i] = root->mean[i] + cpuct*root->prior[i] * sqrt(root->total_count) / (1. + root->visit_count[i]);
+        if(root->prob[i] > max){
+            max = root->prob[i];
+            max_i = i;
+        }
+    }
+    float val;
+    i = max_i;
+    root->visit_count[i]++;
+    root->total_count++;
+    if (root->children[i]) {
+        val = select_mcts(root->children[i], net, root->board, cpuct);
+    } else {
+        if(max_i < 19*19 && !legal_go(root->board, prev, 1, max_i/19, max_i%19)) {
+            root->mean[i]  = -1;
+            root->value[i] = -1;
+            root->prior[i] = 0;
+            /* Roll back BOTH counters: an illegal move was never simulated.
+             * Leaving visit_count[i] at 1 gave the move a non-zero weight in
+             * pick_move(), which samples from the visit counts. */
+            --root->visit_count[i];
+            --root->total_count;
+            return select_mcts(root, net, prev, cpuct);
+        } else {
+            float *next = copy_board(root->board);
+            if (max_i < 19*19) {
+                move_go(next, 1, max_i / 19, max_i % 19);
+            }
+            /* children store the position from the other side's point of view */
+            flip_board(next);
+            root->children[i] = expand(next, root->board, net);
+            val = -root->children[i]->result;
+            if(max_i == 19*19){
+                /* index 19*19 is the pass move; two passes in a row end the line */
+                root->children[i]->pass = 1;
+                if (root->pass){
+                    root->children[i]->done = 1;
+                }
+            }
+        }
+    }
+    root->value[i] += val;
+    root->mean[i] = root->value[i]/root->visit_count[i];
+    return -val;
+}
+
+/*
+ * Runs up to `n` search descents on `tree` (created with expand() from a copy
+ * of `board` when NULL) and returns the tree.
+ *
+ * For player < 0 the board is flipped for the duration of the search and
+ * flipped back before returning.  The loop stops early when `secs` > 0 and
+ * that much time has elapsed, or when the most-visited move already has at
+ * least `n` visits.  Asserts that tree->board matches `board`.
+ */
+mcts_tree *run_mcts(mcts_tree *tree, network *net, float *board, float *ko, int player, int n, float cpuct, float secs)
+{
+    double start = what_time_is_it_now();
+    int flipped = (player < 0);
+    int iter = 0;
+
+    if(flipped) flip_board(board);
+    if(tree == 0) tree = expand(copy_board(board), ko, net);
+    assert(compare_board(tree->board, board));
+
+    while(iter < n){
+        if (secs > 0 && (what_time_is_it_now() - start) > secs) break;
+        int busiest = max_int_index(tree->visit_count, 19*19+1);
+        if (tree->visit_count[busiest] >= n) break;
+        select_mcts(tree, net, ko, cpuct);
+        ++iter;
+    }
+
+    if(flipped) flip_board(board);
+    return tree;
+}
+
+/*
+ * Advances the tree by move `index`: the matching child is detached and
+ * returned, and the rest of the tree is freed.  When `tree` is NULL, `index`
+ * is outside 0..19*19, or that child was never expanded, the whole tree is
+ * freed and NULL is returned.
+ */
+mcts_tree *move_mcts(mcts_tree *tree, int index)
+{
+    mcts_tree *kept = 0;
+    if(tree && index >= 0 && index <= 19*19 && tree->children[index]){
+        kept = tree->children[index];
+        /* detach first so free_mcts() does not release the subtree we keep */
+        tree->children[index] = 0;
+    }
+    free_mcts(tree);
+    return kept;
+}
+
+/* Result of pick_move(): the chosen move plus two evaluations mapped to [0, 1]. */
+typedef struct {
+    float value; /* (tree->result + 1) / 2 of the searched node */
+    float mcts;  /* (tree->mean[index] + 1) / 2 for the chosen move */
+    int row;     /* index / 19; row 19 corresponds to index 19*19, which callers treat as pass */
+    int col;     /* index % 19 */
+} move;
+
+/* Logs the search statistics of move `idx` to stderr on a single line. */
+static void report_move_stats(mcts_tree *tree, float *probs, int idx)
+{
+    fprintf(stderr, "%d %d, Result: %f, Prior: %f, Prob: %f, Mean Value: %f, Child Result: %f, Visited: %d\n", idx/19, idx%19, tree->result, tree->prior[idx], probs[idx], tree->mean[idx], (tree->children[idx])?tree->children[idx]->result:0, tree->visit_count[idx]);
+}
+
+/*
+ * Samples a move from the root's visit counts: each count is raised to the
+ * power 1/temp, normalised, and passed to sample_array().  Returns the chosen
+ * row/column together with the node result and the move's mean value, both
+ * mapped from [-1, 1] to [0, 1].
+ *
+ * Also prints the board (with the top `nind` moves) and statistics for the
+ * sampled move, the most probable move and the highest-prior move to stderr.
+ */
+move pick_move(mcts_tree *tree, float temp, int player)
+{
+    const int moves = 19*19 + 1;
+    double weight[19*19+1];
+    float probs[19*19+1] = {0};
+    double total = 0;
+    move picked = {0};
+    int k;
+
+    for(k = 0; k < moves; ++k){
+        weight[k] = pow(tree->visit_count[k], 1./temp);
+        total += weight[k];
+    }
+    for(k = 0; k < moves; ++k){
+        probs[k] = weight[k] / total;
+    }
+
+    int index = sample_array(probs, moves);
+    picked.row = index / 19;
+    picked.col = index % 19;
+    picked.value = (tree->result+1.)/2.;
+    picked.mcts  = (tree->mean[index]+1.)/2.;
+
+    int indexes[nind];
+    top_k(probs, moves, nind, indexes);
+    print_board(stderr, tree->board, player, indexes);
+
+    report_move_stats(tree, probs, index);
+    report_move_stats(tree, probs, max_index(probs, moves));
+    report_move_stats(tree, probs, max_index(tree->prior, moves));
+    return picked;
+}
+
+/*
+   float predict_move(network *net, float *board, float *move, int multi, float *ko, float temp)
+   {
+
+   int i;
+
+   int max_v = 0;
+   int max_i = 0;
+   for(i = 0; i < 19*19+1; ++i){
+   if(root->visit_count[i] > max_v){
+   max_v = root->visit_count[i];
+   max_i = i;
+   }
+   }
+   fprintf(stderr, "%f Seconds\n", what_time_is_it_now() - t);
+   int ind = max_index(root->mean, 19*19+1);
+   fprintf(stderr, "%d %d, Result: %f, Prior: %f, Prob: %f, Mean Value: %f, Child Result: %f, Visited: %d\n", max_i/19, max_i%19, root->result, root->prior[max_i], root->prob[max_i], root->mean[max_i], (root->children[max_i])?root->children[max_i]->result:0, root->visit_count[max_i]);
+   fprintf(stderr, "%d %d, Result: %f, Prior: %f, Prob: %f, Mean Value: %f, Child Result: %f, Visited: %d\n", ind/19, ind%19, root->result, root->prior[ind], root->prob[ind], root->mean[ind], (root->children[ind])?root->children[ind]->result:0, root->visit_count[ind]);
+   ind = max_index(root->prior, 19*19+1);
+   fprintf(stderr, "%d %d, Result: %f, Prior: %f, Prob: %f, Mean Value: %f, Child Result: %f, Visited: %d\n", ind/19, ind%19, root->result, root->prior[ind], root->prob[ind], root->mean[ind], (root->children[ind])?root->children[ind]->result:0, root->visit_count[ind]);
+   if(root->result < -.9 && root->mean[max_i] < -.9) return -1000.f;
+
+   float val = root->result;
+   free_mcts(root);
+   return val;
+   }
+ */
+
+/*
+ * Tells whether the neighbour at (r, c) keeps a stone of player `p` placed
+ * next to it from being suicide: an opposing stone counts only when its
+ * liberty count is at most 1, an empty point always counts, and a friendly
+ * stone counts when its liberty count is above 1.  Off-board points return 0.
+ */
+static int makes_safe_go(float *b, int *lib, int p, int r, int c){
+    if (r < 0 || r >= 19 || c < 0 || c >= 19) return 0;
+
+    int idx = r*19 + c;
+    int owner = occupied(b, idx);
+    if (owner == -p) return lib[idx] > 1 ? 0 : 1;
+    if (!owner) return 1;
+    return lib[idx] > 1 ? 1 : 0;
+}
+
+/*
+ * Returns 1 when none of the four neighbours of (r, c) satisfies
+ * makes_safe_go() for player `p`, i.e. the move is treated as suicide;
+ * returns 0 otherwise.
+ */
+int suicide_go(float *b, int p, int r, int c)
+{
+    int *lib = calculate_liberties(b);
+    int safe = makes_safe_go(b, lib, p, r+1, c)
+            || makes_safe_go(b, lib, p, r-1, c)
+            || makes_safe_go(b, lib, p, r, c+1)
+            || makes_safe_go(b, lib, p, r, c-1);
+    free(lib);
+    return !safe;
+}
+
+/*
+ * Returns 1 when position `x` equals the reference position `ko`, 0
+ * otherwise (always 0 when `ko` is NULL).  A copy of `x` is flipped first if
+ * the first entry of its third plane differs from the one in `ko`.
+ */
+int check_ko(float *x, float *ko)
+{
+    float scratch[19*19*3];
+    if(ko == 0) return 0;
+
+    copy_cpu(19*19*3, x, 1, scratch, 1);
+    if(scratch[19*19*2] != ko[19*19*2]) flip_board(scratch);
+    return compare_board(scratch, ko) ? 1 : 0;
+}
+
+/*
+ * Returns 1 when player `p` may play at (r, c) on board `b`: the point must
+ * be empty, the position after the move must not match `ko` (check_ko), and
+ * the move must not be suicide (suicide_go, evaluated on the original
+ * board).  Returns 0 otherwise.
+ */
+int legal_go(float *b, float *ko, int p, int r, int c)
+{
+    float trial[19*19*3];
+    if (occupied(b, r*19+c)) return 0;
+
+    /* play the move on a scratch copy so `b` stays untouched */
+    copy_cpu(19*19*3, b, 1, trial, 1);
+    move_go(trial, p, r, c);
+    return !check_ko(trial, ko) && !suicide_go(b, p, r, c);
+}
+
+/*
+   move generate_move(mcts_tree *root, network *net, int player, float *board, int multi, float temp, float *ko, int print)
+   {
+   move m = {0};
+//root = run_mcts(tree, network *net, float *board, float *ko, int n, float cpuct)
+int i, j;
+int empty = 1;
+for(i = 0; i < 19*19; ++i){
+if (occupied(board, i)) {
+empty = 0;
+break;
+}
+}
+if(empty) {
+m.value = .5;
+m.mcts = .5;
+m.row = 3;
+m.col = 15;
+return m;
+}
+
+float move[362];
+if (player < 0) flip_board(board);
+float result = predict_move(net, board, move, multi, ko, temp);
+if (player < 0) flip_board(board);
+if(result == -1000.f) return -2;
+
+for(i = 0; i < 19; ++i){
+for(j = 0; j < 19; ++j){
+if (!legal_go(board, ko, player, i, j)) move[i*19 + j] = 0;
+}
+}
+
+int indexes[nind];
+top_k(move, 19*19+1, nind, indexes);
+
+
+int max = max_index(move, 19*19+1);
+int row = max / 19;
+int col = max % 19;
+int index = sample_array(move, 19*19+1);
+
+if(print){
+top_k(move, 19*19+1, nind, indexes);
+for(i = 0; i < nind; ++i){
+if (!move[indexes[i]]) indexes[i] = -1;
+}
+print_board(stderr, board, 1, indexes);
+fprintf(stderr, "%s To Move\n", player > 0 ? "X" : "O");
+fprintf(stderr, "%.2f%% Win Chance\n", (result+1)/2*100);
+for(i = 0; i < nind; ++i){
+int index = indexes[i];
+int row = index / 19;
+int col = index % 19;
+if(row == 19){
+fprintf(stderr, "%d: Pass, %.2f%%\n", i+1, move[index]*100);
+} else {
+fprintf(stderr, "%d: %c %d, %.2f%%\n", i+1, col + 'A' + 1*(col > 7 && noi), (inverted)?19 - row : row+1, move[index]*100);
+}
+}
+}
+if (row == 19) return -1;
+
+if (suicide_go(board, player, row, col)){
+return -1; 
+}
+
+if (suicide_go(board, player, index/19, index%19)){
+index = max;
+}
+if (index == 19*19) return -1;
+return index;
+}
+*/
+
+/*
+ * Measures move-prediction accuracy on the records loaded from `filename`.
+ *
+ * Each record stores the player in byte 0 (as an ASCII digit), the true row
+ * and column in bytes 2 and 3, and the encoded board from byte 4 on.  The
+ * running accuracy is printed after every record.  `multi` is forwarded to
+ * predict_move2().
+ */
+void valid_go(char *cfgfile, char *weightfile, int multi, char *filename)
+{
+    srand(time(0));
+    char *base = basecfg(cfgfile);
+    printf("%s\n", base);
+    network *net = load_network(cfgfile, weightfile, 0);
+    set_batch_network(net, 1);
+    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
+
+    float *board = calloc(19*19*3, sizeof(float));
+    float *move = calloc(19*19+2, sizeof(float));
+    // moves m = load_go_moves("/home/pjreddie/backup/go.test");
+    moves m = load_go_moves(filename);
+
+    int N = m.n;
+    int i,j;
+    int correct = 0;
+    for (i = 0; i <N; ++i) {
+        char *b = m.data[i];
+        int player = b[0] - '0';
+        //int result = b[1] - '0';
+        int row = b[2];
+        int col = b[3];
+        int truth = col + 19*row;
+        string_to_board(b+4, board);
+        /* NOTE(review): the third plane is only ever set to 1 here; whether
+         * string_to_board() resets it between records is not visible -- confirm. */
+        if(player > 0) for(j = 0; j < 19*19; ++j) board[19*19*2 + j] = 1;
+        predict_move2(net, board, move, multi);
+        int index = max_index(move, 19*19+1);
+        if(index == truth) ++correct;
+        printf("%d Accuracy %f\n", i, (float) correct/(i+1));
+    }
+    /* release the scratch buffers allocated above (previously leaked) */
+    free(board);
+    free(move);
+}
+
+/*
+ * Writes a command script to `fp` that recreates `board`: three header
+ * commands (komi, boardsize, clear_board) followed by one "play" line per
+ * stone.  Column letters skip 'I'; rows are numbered from 19 down to 1.
+ * Returns the number of lines written (3 plus the number of stones).
+ */
+int print_game(float *board, FILE *fp)
+{
+    int row, col;
+    int lines = 3;
+
+    fprintf(fp, "komi 6.5\n");
+    fprintf(fp, "boardsize 19\n");
+    fprintf(fp, "clear_board\n");
+    for(row = 0; row < 19; ++row){
+        for(col = 0; col < 19; ++col){
+            int stone = occupied(board, row*19 + col);
+            int letter = 'A'+col+(col>=8);
+            if(stone == 1) fprintf(fp, "play black %c%d\n", letter, 19-row);
+            if(stone == -1) fprintf(fp, "play white %c%d\n", letter, 19-row);
+            if(stone) ++lines;
+        }
+    }
+    return lines;
+}
+
+
+/*
+ * Polls standard input with a zero timeout.  Returns 1 when select() returns
+ * a non-zero value and 0 when it returns 0.
+ */
+int stdin_ready()
+{
+    struct timeval no_wait;
+    fd_set watched;
+
+    no_wait.tv_usec = 0;
+    no_wait.tv_sec = 0;
+    FD_ZERO(&watched);
+    FD_SET(STDIN_FILENO, &watched);
+
+    return select(1, &watched, NULL, NULL, &no_wait) ? 1 : 0;
+}
+
+/*
+ * Keeps searching in 0.1-second slices of run_mcts() until input is waiting
+ * on stdin or 120 seconds have elapsed, then reports on stderr how much
+ * total_count grew and returns the (possibly newly created) tree.
+ *
+ * `tree` may be NULL.  If input is already waiting the loop never runs and
+ * the tree is still NULL afterwards, so the report must not dereference it.
+ */
+mcts_tree *ponder(mcts_tree *tree, network *net, float *b, float *ko, int player, float cpuct)
+{
+    double t = what_time_is_it_now();
+    int count = 0;
+    if (tree) count = tree->total_count;
+    while(!stdin_ready()){
+        if (what_time_is_it_now() - t > 120) break;
+        tree = run_mcts(tree, net, b, ko, player, 100000, cpuct, .1);
+    }
+    fprintf(stderr, "Pondered %d moves...\n", tree ? tree->total_count - count : 0);
+    return tree;
+}
+
+/*
+ * Skips the first 2*skip lines printed by gnugo for game.txt and copies the
+ * remaining lines to stdout.  Returns 1 on success and 0 when gnugo could
+ * not be started.  The pipe is always closed with pclose().
+ */
+static int relay_gnugo_output(int skip)
+{
+    FILE *p = popen("./gnugo --mode gtp < game.txt", "r");
+    if(!p) return 0;
+    int i;
+    for(i = 0; i < skip; ++i){
+        free(fgetl(p));
+        free(fgetl(p));
+    }
+    char *l = 0;
+    while((l = fgetl(p))){
+        printf("%s\n", l);
+        free(l);
+    }
+    pclose(p);
+    return 1;
+}
+
+/*
+ * Command loop of the Go engine: reads GTP-style commands (optionally
+ * prefixed with a numeric id) from stdin and answers on stdout until "quit"
+ * or end of input.
+ *
+ * filename / weightfile  network config and weights
+ * mcts_iters, secs       per-move search budget passed to run_mcts()
+ * temp                   sampling temperature passed to pick_move()
+ * cpuct                  exploration constant passed to the search
+ * anon                   non-zero: answer "name"/"version" anonymously
+ * resign                 move number after which a hopeless position resigns
+ *
+ * After generating a move the engine ponders for the opponent until the
+ * next command arrives.  `one` and `two` hold the last two positions; `two`
+ * is what the search uses for the ko test.
+ *
+ * Input comes from an external controller, so every %s conversion is bounded
+ * to its buffer, the handicap count is clamped to the handicap table, and
+ * "play" coordinates are validated before they are used as board indices.
+ */
+void engine_go(char *filename, char *weightfile, int mcts_iters, float secs, float temp, float cpuct, int anon, int resign)
+{
+    mcts_tree *root = 0;
+    network *net = load_network(filename, weightfile, 0);
+    set_batch_network(net, 1);
+    srand(time(0));
+    float *board = calloc(19*19*3, sizeof(float));
+    flip_board(board);
+    float *one = calloc(19*19*3, sizeof(float));
+    float *two = calloc(19*19*3, sizeof(float));
+    int ponder_player = 0;
+    int passed = 0;
+    int move_num = 0;
+    int main_time = 0;
+    int byo_yomi_time = 0;
+    int byo_yomi_stones = 0;
+    int black_time_left = 0;
+    int black_stones_left = 0;
+    int white_time_left = 0;
+    int white_stones_left = 0;
+    float orig_time = secs;
+    int old_ponder = 0;
+    while(1){
+        if(ponder_player){
+            root = ponder(root, net, board, two, ponder_player, cpuct);
+        }
+        old_ponder = ponder_player;
+        ponder_player = 0;
+        char buff[256] = {0};
+        int id = 0;
+        int has_id = (scanf("%d", &id) == 1);
+        scanf("%255s", buff);
+        if (feof(stdin)) break;
+        fprintf(stderr, "%s\n", buff);
+        char ids[256];
+        sprintf(ids, "%d", id);
+        //fprintf(stderr, "%s\n", buff);
+        if (!has_id) ids[0] = 0;
+        if (!strcmp(buff, "protocol_version")){
+            printf("=%s 2\n\n", ids);
+        } else if (!strcmp(buff, "name")){
+            if(anon){
+                printf("=%s The Fool!\n\n", ids);
+            }else{
+                printf("=%s DarkGo\n\n", ids);
+            }
+        } else if (!strcmp(buff, "time_settings")){
+            /* bookkeeping commands must not interrupt pondering */
+            ponder_player = old_ponder;
+            scanf("%d %d %d", &main_time, &byo_yomi_time, &byo_yomi_stones);
+            printf("=%s \n\n", ids);
+        } else if (!strcmp(buff, "time_left")){
+            ponder_player = old_ponder;
+            char color[256] = {0};
+            int time = 0, stones = 0;
+            scanf("%255s %d %d", color, &time, &stones);
+            if (color[0] == 'b' || color[0] == 'B'){
+                black_time_left = time;
+                black_stones_left = stones;
+            } else {
+                white_time_left = time;
+                white_stones_left = stones;
+            }
+            printf("=%s \n\n", ids);
+        } else if (!strcmp(buff, "version")){
+            if(anon){
+                printf("=%s :-DDDD\n\n", ids);
+            }else {
+                printf("=%s 1.0. Want more DarkGo? You can find me on OGS, unlimited games, no waiting! https://online-go.com/user/view/434218\n\n", ids);
+            }
+        } else if (!strcmp(buff, "known_command")){
+            char comm[256] = {0};
+            scanf("%255s", comm);
+            int known = (!strcmp(comm, "protocol_version") || 
+                    !strcmp(comm, "name") || 
+                    !strcmp(comm, "version") || 
+                    !strcmp(comm, "known_command") || 
+                    !strcmp(comm, "list_commands") || 
+                    !strcmp(comm, "quit") || 
+                    !strcmp(comm, "boardsize") || 
+                    !strcmp(comm, "clear_board") || 
+                    !strcmp(comm, "komi") || 
+                    !strcmp(comm, "final_status_list") || 
+                    !strcmp(comm, "play") || 
+                    !strcmp(comm, "genmove_white") || 
+                    !strcmp(comm, "genmove_black") || 
+                    !strcmp(comm, "fixed_handicap") || 
+                    !strcmp(comm, "genmove"));
+            if(known) printf("=%s true\n\n", ids);
+            else printf("=%s false\n\n", ids);
+        } else if (!strcmp(buff, "list_commands")){
+            printf("=%s protocol_version\nshowboard\nname\nversion\nknown_command\nlist_commands\nquit\nboardsize\nclear_board\nkomi\nplay\ngenmove_black\ngenmove_white\ngenmove\nfinal_status_list\nfixed_handicap\n\n", ids);
+        } else if (!strcmp(buff, "quit")){
+            break;
+        } else if (!strcmp(buff, "boardsize")){
+            int boardsize = 0;
+            scanf("%d", &boardsize);
+            //fprintf(stderr, "%d\n", boardsize);
+            if(boardsize != 19){
+                printf("?%s unacceptable size\n\n", ids);
+            } else {
+                root = move_mcts(root, -1);
+                memset(board, 0, 3*19*19*sizeof(float));
+                flip_board(board);
+                move_num = 0;
+                printf("=%s \n\n", ids);
+            }
+        } else if (!strcmp(buff, "fixed_handicap")){
+            int handicap = 0;
+            scanf("%d", &handicap);
+            int indexes[] = {72, 288, 300, 60, 180, 174, 186, 66, 294};
+            int max_handicap = (int)(sizeof(indexes)/sizeof(indexes[0]));
+            /* never read past the handicap table, whatever the controller sends */
+            if(handicap > max_handicap) handicap = max_handicap;
+            int i;
+            for(i = 0; i < handicap; ++i){
+                board[indexes[i]] = 1;   
+                ++move_num;
+            }
+            root = move_mcts(root, -1);
+            /* NOTE(review): no reply is printed for this command, unlike every
+             * other branch -- confirm whether the controller expects one. */
+        } else if (!strcmp(buff, "clear_board")){
+            passed = 0;
+            memset(board, 0, 3*19*19*sizeof(float));
+            flip_board(board);
+            move_num = 0;
+            root = move_mcts(root, -1);
+            printf("=%s \n\n", ids);
+        } else if (!strcmp(buff, "komi")){
+            float komi = 0;
+            scanf("%f", &komi);
+            printf("=%s \n\n", ids);
+        } else if (!strcmp(buff, "showboard")){
+            printf("=%s \n", ids);
+            print_board(stdout, board, 1, 0);
+            printf("\n");
+        } else if (!strcmp(buff, "play") || !strcmp(buff, "black") || !strcmp(buff, "white")){
+            ++move_num;
+            char color[256] = {0};
+            if(!strcmp(buff, "play"))
+            {
+                scanf("%255s ", color);
+            } else {
+                scanf(" ");
+                color[0] = buff[0];
+            }
+            char c = 0;
+            int r = 0;
+            int count = scanf("%c%d", &c, &r);
+            int player = (color[0] == 'b' || color[0] == 'B') ? 1 : -1;
+            if((c == 'p' || c == 'P') && count < 2) {
+                passed = 1;
+                printf("=%s \n\n", ids);
+                char *line = fgetl(stdin);
+                free(line);
+                fflush(stdout);
+                fflush(stderr);
+                root = move_mcts(root, 19*19);
+                continue;
+            } else {
+                passed = 0;
+            }
+            if(c >= 'A' && c <= 'Z') c = c - 'A';
+            if(c >= 'a' && c <= 'z') c = c - 'a';
+            if(c >= 8) --c;
+            r = 19 - r;
+            if(count < 2 || r < 0 || r >= 19 || c < 0 || c >= 19){
+                /* reject the vertex instead of indexing outside the board */
+                --move_num;
+                free(fgetl(stdin));
+                printf("?%s illegal move\n\n", ids);
+                fflush(stdout);
+                fflush(stderr);
+                continue;
+            }
+            fprintf(stderr, "move: %d %d\n", r, c);
+
+            /* rotate the position history: `two` becomes the previous position */
+            float *swap = two;
+            two = one;
+            one = swap;
+            move_go(board, player, r, c);
+            copy_cpu(19*19*3, board, 1, one, 1);
+            if(root) fprintf(stderr, "Prior: %f\n", root->prior[r*19 + c]);
+            if(root) fprintf(stderr, "Mean: %f\n", root->mean[r*19 + c]);
+            if(root) fprintf(stderr, "Result: %f\n", root->result);
+            root = move_mcts(root, r*19 + c);
+            if(root) fprintf(stderr, "Visited: %d\n", root->total_count);
+            else fprintf(stderr, "NOT VISITED\n");
+
+            printf("=%s \n\n", ids);
+            //print_board(stderr, board, 1, 0);
+        } else if (!strcmp(buff, "genmove") || !strcmp(buff, "genmove_black") || !strcmp(buff, "genmove_white")){
+            ++move_num;
+            int player = 0;
+            if(!strcmp(buff, "genmove")){
+                char color[256] = {0};
+                scanf("%255s", color);
+                player = (color[0] == 'b' || color[0] == 'B') ? 1 : -1;
+            } else if (!strcmp(buff, "genmove_black")){
+                player = 1;
+            } else {
+                player = -1;
+            }
+            /* with 30 or fewer time units left on the clock, cap the search at 2.5 seconds */
+            if(player > 0){
+                if(black_time_left <= 30) secs = 2.5;
+                else secs = orig_time;
+            } else {
+                if(white_time_left <= 30) secs = 2.5;
+                else secs = orig_time;
+            }
+            ponder_player = -player;
+
+            //tree = generate_move(net, player, board, multi, .1, two, 1);
+            double t = what_time_is_it_now();
+            root = run_mcts(root, net, board, two, player, mcts_iters, cpuct, secs);
+            fprintf(stderr, "%f Seconds\n", what_time_is_it_now() - t);
+            move m = pick_move(root, temp, player);
+            root = move_mcts(root, m.row*19 + m.col);
+
+
+            if(move_num > resign && m.value < .1 && m.mcts < .1){
+                printf("=%s resign\n\n", ids);
+            } else if(m.row == 19){
+                printf("=%s pass\n\n", ids);
+                passed = 0;
+            } else {
+                int row = m.row;
+                int col = m.col;
+
+                float *swap = two;
+                two = one;
+                one = swap;
+
+                move_go(board, player, row, col);
+                copy_cpu(19*19*3, board, 1, one, 1);
+                row = 19 - row;
+                if (col >= 8) ++col;
+                printf("=%s %c%d\n\n", ids, 'A' + col, row);
+            }
+
+        } else if (!strcmp(buff, "p")){
+            //print_board(board, 1, 0);
+        } else if (!strcmp(buff, "final_status_list")){
+            char type[256] = {0};
+            scanf("%255s", type);
+            fprintf(stderr, "final_status\n");
+            char *line = fgetl(stdin);
+            free(line);
+            if(type[0] == 'd' || type[0] == 'D'){
+                /* delegate dead-stone detection to gnugo by replaying the position */
+                int relayed = 0;
+                FILE *f = fopen("game.txt", "w");
+                if(f){
+                    int count = print_game(board, f);
+                    fprintf(f, "%s final_status_list dead\n", ids);
+                    fclose(f);
+                    relayed = relay_gnugo_output(count);
+                }
+                if(!relayed) printf("?%s could not run gnugo\n\n", ids);
+            } else {
+                printf("?%s unknown command\n\n", ids);
+            }
+        } else if (!strcmp(buff, "kgs-genmove_cleanup")){
+            char type[256] = {0};
+            scanf("%255s", type);
+            fprintf(stderr, "kgs-genmove_cleanup\n");
+            char *line = fgetl(stdin);
+            free(line);
+            int relayed = 0;
+            FILE *f = fopen("game.txt", "w");
+            if(f){
+                int count = print_game(board, f);
+                fprintf(f, "%s kgs-genmove_cleanup %s\n", ids, type);
+                fclose(f);
+                relayed = relay_gnugo_output(count);
+            }
+            if(!relayed) printf("?%s could not run gnugo\n\n", ids);
+        } else {
+            char *line = fgetl(stdin);
+            free(line);
+            printf("?%s unknown command\n\n", ids);
+        }
+        fflush(stdout);
+        fflush(stderr);
+    }
+    printf("%d %d %d\n",passed, black_stones_left, white_stones_left);
+}
+
+/*
+ * Interactive console session: after every prediction the top `nind` moves
+ * are listed and one line is read from stdin.
+ *
+ *   empty line / number N   play the N-th suggestion (default 1)
+ *   "<COL> <ROW>"           play at that vertex (COL is 'A'..'T')
+ *   "p"                     pass
+ *   "b|w <COL> <ROW>"       place a stone of that colour without capturing
+ *   "c <COL> <ROW>"         clear a vertex
+ *
+ * The board is flipped after every turn, so the side to move always plays as
+ * player 1.  The session ends at end of input.  Out-of-range suggestion
+ * numbers and coordinates are ignored instead of indexing outside the
+ * suggestion list or the board.
+ *
+ * NOTE(review): `nind`, `noi` and `inverted` are defined outside this
+ * function; their exact meaning is not visible here.
+ */
+void test_go(char *cfg, char *weights, int multi)
+{
+    int i;
+    network *net = load_network(cfg, weights, 0);
+    set_batch_network(net, 1);
+    srand(time(0));
+    float *board = calloc(19*19*3, sizeof(float));
+    flip_board(board);
+    float *move = calloc(19*19+1, sizeof(float));
+    int color = 1;
+    while(1){
+        float result = predict_move2(net, board, move, multi);
+        printf("%.2f%% Win Chance\n", (result+1)/2*100);
+
+        int indexes[nind];
+        int row = 0, col = 0;
+        top_k(move, 19*19+1, nind, indexes);
+        print_board(stderr, board, color, indexes);
+        for(i = 0; i < nind; ++i){
+            int index = indexes[i];
+            row = index / 19;
+            col = index % 19;
+            if(row == 19){
+                printf("%d: Pass, %.2f%%\n", i+1, move[index]*100);
+            } else {
+                printf("%d: %c %d, %.2f%%\n", i+1, col + 'A' + 1*(col > 7 && noi), (inverted)?19 - row : row+1, move[index]*100);
+            }
+        }
+        //if(color == 1) printf("\u25EF Enter move: ");
+        //else printf("\u25C9 Enter move: ");
+        if(color == 1) printf("X Enter move: ");
+        else printf("O Enter move: ");
+
+        char c;
+        char *line = fgetl(stdin);
+        /* fgetl() yields NULL at end of input: stop instead of dereferencing it */
+        if(!line) break;
+        int picked = 1;
+        int dnum = sscanf(line, "%d", &picked);
+        int cnum = sscanf(line, "%c", &c);
+        if (strlen(line) == 0 || dnum) {
+            --picked;
+            /* guard both ends: "0" or a negative number must not index indexes[] */
+            if (picked >= 0 && picked < nind){
+                int index = indexes[picked];
+                row = index / 19;
+                col = index % 19;
+                if(row < 19){
+                    move_go(board, 1, row, col);
+                }
+            }
+        } else if (cnum){
+            if (c <= 'T' && c >= 'A'){
+                int num = sscanf(line, "%c %d", &c, &row);
+                row = (inverted)?19 - row : row-1;
+                col = c - 'A';
+                if (col > 7 && noi) col -= 1;
+                if (num == 2 && row >= 0 && row < 19 && col >= 0 && col < 19) move_go(board, 1, row, col);
+            } else if (c == 'p') {
+                // Pass
+            } else if(c=='b' || c == 'w'){
+                char g;
+                int num = sscanf(line, "%c %c %d", &g, &c, &row);
+                row = (inverted)?19 - row : row-1;
+                col = c - 'A';
+                if (col > 7 && noi) col -= 1;
+                if (num == 3 && row >= 0 && row < 19 && col >= 0 && col < 19) {
+                    int mc = (g == 'b') ? 1 : -1;
+                    if (mc == color) {
+                        board[row*19 + col] = 1;
+                    } else {
+                        board[19*19 + row*19 + col] = 1;
+                    }
+                }
+            } else if(c == 'c'){
+                char g;
+                int num = sscanf(line, "%c %c %d", &g, &c, &row);
+                row = (inverted)?19 - row : row-1;
+                col = c - 'A';
+                if (col > 7 && noi) col -= 1;
+                if (num == 3 && row >= 0 && row < 19 && col >= 0 && col < 19) {
+                    board[row*19 + col] = 0;
+                    board[19*19 + row*19 + col] = 0;
+                }
+            }
+        }
+        free(line);
+        flip_board(board);
+        color = -color;
+    }
+    free(board);
+    free(move);
+}
+
+/*
+ * Scores `board` with an external gnugo process: the position is written to
+ * game.txt followed by "final_score", gnugo's output for the replayed
+ * commands is skipped, and the first line matching "= <C>+<score>" is
+ * parsed.  The score is returned negated when the winner letter is 'W';
+ * 0 is returned if no such line is found.
+ */
+float score_game(float *board)
+{
+    FILE *script = fopen("game.txt", "w");
+    int skip = print_game(board, script);
+    fprintf(script, "final_score\n");
+    fclose(script);
+
+    FILE *gnugo = popen("./gnugo --mode gtp < game.txt", "r");
+    int k;
+    for(k = 0; k < skip; ++k){
+        /* two output lines are discarded per replayed command */
+        free(fgetl(gnugo));
+        free(fgetl(gnugo));
+    }
+
+    float score = 0;
+    char winner = 0;
+    char *reply;
+    for(reply = fgetl(gnugo); reply; reply = fgetl(gnugo)){
+        fprintf(stderr, "%s  \t", reply);
+        int matched = sscanf(reply, "= %c+%f", &winner, &score);
+        free(reply);
+        if (matched == 2) break;
+    }
+    if(winner == 'W') score = -score;
+    pclose(gnugo);
+    return score;
+}
+
+/*
+ * Endless self-play loop between two networks (`net` from filename/weightfile
+ * and `net2` from f2/w2, or a shallow copy of `net` when f2 is NULL).
+ *
+ * The two sides alternate colours every game and use different cpuct values
+ * (5 for the first network's side, 1 for the other).  A game ends as soon as
+ * a pass is picked; it is then scored with score_game() and the running win
+ * rates are printed to stderr.  `multi` is accepted but not used.
+ *
+ * Positions are recorded in `boards`, which has room for 600 entries;
+ * recording stops silently once it is full so that a very long game cannot
+ * overflow the stack buffer.
+ */
+void self_go(char *filename, char *weightfile, char *f2, char *w2, int multi)
+{
+    mcts_tree *tree1 = 0;
+    mcts_tree *tree2 = 0;
+    network *net = load_network(filename, weightfile, 0);
+    //set_batch_network(net, 1);
+
+    network *net2;
+    if (f2) {
+        net2 = parse_network_cfg(f2);
+        if(w2){
+            load_weights(net2, w2);
+        }
+    } else {
+        net2 = calloc(1, sizeof(network));
+        *net2 = *net;
+    }
+    srand(time(0));
+    char boards[600][93];
+    const int max_boards = (int)(sizeof(boards)/sizeof(boards[0]));
+    int count = 0;
+    //set_batch_network(net, 1);
+    //set_batch_network(net2, 1);
+    float *board = calloc(19*19*3, sizeof(float));
+    flip_board(board);
+    float *one = calloc(19*19*3, sizeof(float));
+    float *two = calloc(19*19*3, sizeof(float));
+    int done = 0;
+    int player = 1;
+    int p1 = 0;
+    int p2 = 0;
+    int total = 0;
+    float temp = .1;
+    int mcts_iters = 500;
+    float cpuct = 5;
+    while(1){
+        if (done){
+            tree1 = move_mcts(tree1, -1);
+            tree2 = move_mcts(tree2, -1);
+            float score = score_game(board);
+            /* the first network plays the first colour in even-numbered games */
+            if((score > 0) == (total%2==0)) ++p1;
+            else ++p2;
+            ++total;
+            fprintf(stderr, "Total: %d, Player 1: %f, Player 2: %f\n", total, (float)p1/total, (float)p2/total);
+            sleep(1);
+            /*
+               int i = (score > 0)? 0 : 1;
+               int j;
+               for(; i < count; i += 2){
+               for(j = 0; j < 93; ++j){
+               printf("%c", boards[i][j]);
+               }
+               printf("\n");
+               }
+             */
+            memset(board, 0, 3*19*19*sizeof(float));
+            flip_board(board);
+            player = 1;
+            done = 0;
+            count = 0;
+            fflush(stdout);
+            fflush(stderr);
+        }
+        //print_board(stderr, board, 1, 0);
+        //sleep(1);
+
+        if ((total%2==0) == (player==1)){
+            //mcts_iters = 4500;   
+            cpuct = 5;
+        } else {
+            //mcts_iters = 500;
+            cpuct = 1;
+        }
+        network *use = ((total%2==0) == (player==1)) ? net : net2;
+        mcts_tree *t = ((total%2==0) == (player==1)) ? tree1 : tree2;
+        t = run_mcts(t, use, board, two, player, mcts_iters, cpuct, 0);
+        move m = pick_move(t, temp, player);
+        if(((total%2==0) == (player==1))) tree1 = t;
+        else tree2 = t;
+
+        /* both trees follow the move that was actually played */
+        tree1 = move_mcts(tree1, m.row*19 + m.col);
+        tree2 = move_mcts(tree2, m.row*19 + m.col);
+
+        if(m.row == 19){
+            done = 1;
+            continue;
+        }
+        int row = m.row;
+        int col = m.col;
+
+        float *swap = two;
+        two = one;
+        one = swap;
+
+        if(count < max_boards){
+            if(player < 0) flip_board(board);
+            boards[count][0] = row;
+            boards[count][1] = col;
+            board_to_string(boards[count] + 2, board);
+            if(player < 0) flip_board(board);
+            ++count;
+        }
+
+        move_go(board, player, row, col);
+        copy_cpu(19*19*3, board, 1, one, 1);
+
+        player = -player;
+    }
+}
+
+/*
+ * Command-line entry point for the Go example.  argv[2] selects the mode
+ * ("train", "valid", "self", "test" or "engine"), argv[3] is the network
+ * config, and argv[4..6] are the optional weights, second config / data file
+ * and second weights.  Recognised options: -gpus, -clear, -multi, -anon,
+ * -iters, -resign, -cpuct, -temp and -time.
+ *
+ * An unrecognised mode does nothing.
+ */
+void run_go(int argc, char **argv)
+{
+    //boards_go();
+    if(argc < 4){
+        fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]);
+        return;
+    }
+
+    char *gpu_list = find_char_arg(argc, argv, "-gpus", 0);
+    int *gpus = 0;
+    int gpus_allocated = 0;
+    int gpu = 0;
+    int ngpus = 0;
+    if(gpu_list){
+        printf("%s\n", gpu_list);
+        int len = strlen(gpu_list);
+        ngpus = 1;
+        int i;
+        for(i = 0; i < len; ++i){
+            if (gpu_list[i] == ',') ++ngpus;
+        }
+        gpus = calloc(ngpus, sizeof(int));
+        gpus_allocated = 1;
+        for(i = 0; i < ngpus; ++i){
+            gpus[i] = atoi(gpu_list);
+            /* the last entry has no trailing comma: stop rather than compute NULL + 1 */
+            char *comma = strchr(gpu_list, ',');
+            if(!comma) break;
+            gpu_list = comma + 1;
+        }
+    } else {
+        gpu = gpu_index;
+        gpus = &gpu;
+        ngpus = 1;
+    }
+    int clear = find_arg(argc, argv, "-clear");
+
+    char *cfg = argv[3];
+    char *weights = (argc > 4) ? argv[4] : 0;
+    char *c2 = (argc > 5) ? argv[5] : 0;
+    char *w2 = (argc > 6) ? argv[6] : 0;
+    int multi = find_arg(argc, argv, "-multi");
+    int anon = find_arg(argc, argv, "-anon");
+    int iters = find_int_arg(argc, argv, "-iters", 500);
+    int resign = find_int_arg(argc, argv, "-resign", 175);
+    float cpuct = find_float_arg(argc, argv, "-cpuct", 5);
+    float temp = find_float_arg(argc, argv, "-temp", .1);
+    float time = find_float_arg(argc, argv, "-time", 0);
+    if(0==strcmp(argv[2], "train")) train_go(cfg, weights, c2, gpus, ngpus, clear);
+    else if(0==strcmp(argv[2], "valid")) valid_go(cfg, weights, multi, c2);
+    else if(0==strcmp(argv[2], "self")) self_go(cfg, weights, c2, w2, multi);
+    else if(0==strcmp(argv[2], "test")) test_go(cfg, weights, multi);
+    else if(0==strcmp(argv[2], "engine")) engine_go(cfg, weights, iters, time, temp, cpuct, anon, resign);
+
+    /* only the heap-allocated list is ours to free; otherwise gpus points at `gpu` */
+    if(gpus_allocated) free(gpus);
+}
+
+
diff --git a/image.darknet/inst/include/darknet/examples/instance-segmenter.c b/image.darknet/inst/include/darknet/examples/instance-segmenter.c
new file mode 100644
index 0000000..664e714
--- /dev/null
+++ b/image.darknet/inst/include/darknet/examples/instance-segmenter.c
@@ -0,0 +1,267 @@
+#include "darknet.h"
+#include <sys/time.h>
+#include <assert.h>
+
+void normalize_image2(image p);
+/* Train an instance-segmentation network: one replica per entry of gpus, batches streamed from the "train" list named in datacfg, weights saved under its "backup" directory. */
+void train_isegmenter(char *datacfg, char *cfgfile, char *weightfile, int *gpus, int ngpus, int clear, int display)
+{
+    int i;
+    float avg_loss = -1;
+    char *base = basecfg(cfgfile);
+    printf("%s\n", base);
+    printf("%d\n", ngpus);
+    network **nets = calloc(ngpus, sizeof(network*));
+
+    srand(time(0));
+    int seed = rand();
+    for(i = 0; i < ngpus; ++i){
+        srand(seed); /* identical rand() state before every load */
+#ifdef GPU
+        cuda_set_device(gpus[i]);
+#endif
+        nets[i] = load_network(cfgfile, weightfile, clear);
+        nets[i]->learning_rate *= ngpus;
+    }
+    srand(time(0));
+    network *net = nets[0];
+    image pred = get_network_image(net);
+
+    /* embed shares pred's buffer: a 3-channel view that starts 80 channels in. */
+    image embed = pred;
+    embed.c = 3;
+    embed.data += embed.w*embed.h*80;
+
+    /* The network input must be an exact integer multiple of the output map. */
+    int div = net->w/pred.w;
+    assert(pred.w * div == net->w);
+    assert(pred.h * div == net->h);
+
+    int imgs = net->batch * net->subdivisions * ngpus;
+    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
+    list *options = read_data_cfg(datacfg);
+    char *backup_directory = option_find_str(options, "backup", "/backup/");
+    char *train_list = option_find_str(options, "train", "data/train.list");
+
+    list *plist = get_paths(train_list);
+    char **paths = (char **)list_to_array(plist);
+    printf("%d\n", plist->size);
+    int N = plist->size;
+
+    load_args args = {0};
+    args.w = net->w;
+    args.h = net->h;
+    args.threads = 32;
+    args.scale = div;
+    args.num_boxes = 90;
+
+    args.min = net->min_crop;
+    args.max = net->max_crop;
+    args.angle = net->angle;
+    args.aspect = net->aspect;
+    args.exposure = net->exposure;
+    args.saturation = net->saturation;
+    args.hue = net->hue;
+    args.size = net->w;
+    args.classes = 80;
+
+    args.paths = paths;
+    args.n = imgs;
+    args.m = N;
+    args.type = ISEG_DATA;
+    /* The loader thread fills buffer while the previous batch (train) is being consumed. */
+    data train;
+    data buffer;
+    pthread_t load_thread;
+    args.d = &buffer;
+    load_thread = load_data(args);
+
+    int epoch = (*net->seen)/N;
+    while(get_current_batch(net) < net->max_batches || net->max_batches == 0){
+        double time = what_time_is_it_now();
+        pthread_join(load_thread, 0);
+        train = buffer;
+        load_thread = load_data(args);
+
+        printf("Loaded: %lf seconds\n", what_time_is_it_now()-time);
+        time = what_time_is_it_now();
+        float loss = 0;
+#ifdef GPU
+        if(ngpus == 1){
+            loss = train_network(net, train);
+        } else {
+            loss = train_networks(nets, ngpus, train, 4);
+        }
+#else
+        loss = train_network(net, train);
+#endif
+        if(display){
+            image tr = float_to_image(net->w/div, net->h/div, 80, train.y.vals[net->batch*(net->subdivisions-1)]);
+            image im = float_to_image(net->w, net->h, net->c, train.X.vals[net->batch*(net->subdivisions-1)]);
+            pred.c = 80;
+            image mask = mask_to_rgb(tr);
+            image prmask = mask_to_rgb(pred);
+            image ecopy = copy_image(embed);
+            normalize_image2(ecopy);
+            show_image(ecopy, "embed", 1);
+            free_image(ecopy);
+
+            show_image(im, "input", 1);
+            show_image(prmask, "pred", 1);
+            show_image(mask, "truth", 100);
+            free_image(mask);
+            free_image(prmask);
+        }
+        if(avg_loss == -1) avg_loss = loss;
+        avg_loss = avg_loss*.9 + loss*.1;
+        printf("%ld, %.3f: %f, %f avg, %f rate, %lf seconds, %ld images\n", get_current_batch(net), (float)(*net->seen)/N, loss, avg_loss, get_current_rate(net), what_time_is_it_now()-time, *net->seen);
+        free_data(train);
+        if(*net->seen/N > epoch){
+            epoch = *net->seen/N;
+            char buff[256];
+            snprintf(buff, sizeof(buff), "%s/%s_%d.weights",backup_directory,base, epoch);
+            save_weights(net, buff);
+        }
+        if(get_current_batch(net)%100 == 0){
+            char buff[256];
+            snprintf(buff, sizeof(buff), "%s/%s.backup",backup_directory,base);
+            save_weights(net, buff);
+        }
+    }
+    pthread_join(load_thread, 0); /* a load is always in flight here and it writes to the stack-local buffer */
+    free_data(buffer);
+    char buff[256];
+    snprintf(buff, sizeof(buff), "%s/%s.weights", backup_directory, base);
+    save_weights(net, buff);
+    free_network(net);
+    free(nets); /* releases the pointer array only; replicas past nets[0] are still not freed */
+    free_ptrs((void**)paths, plist->size);
+    free_list(plist);
+    free(base);
+}
+
+/* Run the segmenter on one image (filename) or, when filename is NULL, on paths read from stdin until EOF; shows the letterboxed input and the predicted mask. */
+void predict_isegmenter(char *datafile, char *cfg, char *weights, char *filename)
+{
+    network *net = load_network(cfg, weights, 0);
+    set_batch_network(net, 1);
+    srand(2222222);
+
+    clock_t time;
+    char buff[256];
+    char *input = buff;
+    while(1){
+        if(filename){
+            input = filename; /* use the caller's string as-is: copying into buff could truncate it or leave it unterminated */
+        }else{
+            printf("Enter Image Path: ");
+            fflush(stdout);
+            input = fgets(buff, sizeof(buff), stdin);
+            if(!input) return;
+            strtok(input, "\n");
+        }
+        image im = load_image_color(input, 0, 0);
+        image sized = letterbox_image(im, net->w, net->h);
+        float *X = sized.data;
+        time=clock();
+        float *predictions = network_predict(net, X);
+        image pred = get_network_image(net);
+        image prmask = mask_to_rgb(pred);
+        printf("Predicted: %f\n", predictions[0]);
+        printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time));
+        show_image(sized, "orig", 1);
+        show_image(prmask, "pred", 0);
+        free_image(im);
+        free_image(sized);
+        free_image(prmask);
+        if (filename) break;
+    }
+}
+
+
+/* Live demo: segment frames from a video file or camera cam_index and display the predicted mask with a smoothed FPS readout. The body only exists when OPENCV is defined. */
+void demo_isegmenter(char *datacfg, char *cfg, char *weights, int cam_index, const char *filename)
+{
+#ifdef OPENCV
+    printf("Classifier Demo\n");
+    network *net = load_network(cfg, weights, 0);
+    set_batch_network(net, 1);
+    srand(2222222);
+    void * cap = open_video_stream(filename, cam_index, 0,0,0);
+
+    if(!cap) error("Couldn't connect to webcam.\n");
+    float fps = 0;
+
+    while(1){
+        struct timeval tval_before, tval_after, tval_result;
+        gettimeofday(&tval_before, NULL);
+
+        image in = get_image_from_stream(cap);
+        image in_s = letterbox_image(in, net->w, net->h);
+
+        network_predict(net, in_s.data);
+
+        printf("\033[2J");
+        printf("\033[1;1H");
+        printf("\nFPS:%.0f\n",fps);
+
+        image pred = get_network_image(net);
+        image prmask = mask_to_rgb(pred);
+        show_image(prmask, "Segmenter", 10);
+
+        free_image(in_s);
+        free_image(in);
+        free_image(prmask);
+
+        gettimeofday(&tval_after, NULL);
+        timersub(&tval_after, &tval_before, &tval_result);
+        float elapsed = tval_result.tv_sec + tval_result.tv_usec/1000000.f; /* whole seconds count too, not just the microsecond part */
+        if(elapsed > 0) fps = .9*fps + .1*(1.f/elapsed); /* skip the update rather than divide by zero */
+    }
+#endif
+}
+
+
+/* Command-line entry point: argv[2] selects test/train/demo, argv[3] is the data cfg, argv[4] the network cfg, argv[5] optional weights, argv[6] optional input file. */
+void run_isegmenter(int argc, char **argv)
+{
+    if(argc < 5){
+        fprintf(stderr, "usage: %s %s [train/test/demo] [data] [cfg] [weights (optional)] [filename (optional)]\n", argv[0], argv[1]);
+        return;
+    }
+
+    char *gpu_list = find_char_arg(argc, argv, "-gpus", 0);
+    int *gpus = 0;
+    int gpu = 0;
+    int ngpus = 0;
+    if(gpu_list){
+        printf("%s\n", gpu_list);
+        int len = strlen(gpu_list);
+        ngpus = 1;
+        int i;
+        for(i = 0; i < len; ++i){
+            if (gpu_list[i] == ',') ++ngpus;
+        }
+        gpus = calloc(ngpus, sizeof(int));
+        for(i = 0; i < ngpus; ++i){
+            gpus[i] = atoi(gpu_list);
+            if(i+1 < ngpus) gpu_list = strchr(gpu_list, ',')+1; /* no comma follows the last id, so strchr would return NULL */
+        }
+    } else {
+        gpu = gpu_index;
+        gpus = &gpu;
+        ngpus = 1;
+    }
+    int cam_index = find_int_arg(argc, argv, "-c", 0);
+    int clear = find_arg(argc, argv, "-clear");
+    int display = find_arg(argc, argv, "-display");
+    char *data = argv[3];
+    char *cfg = argv[4];
+    char *weights = (argc > 5) ? argv[5] : 0;
+    char *filename = (argc > 6) ? argv[6]: 0;
+    if(0==strcmp(argv[2], "test")) predict_isegmenter(data, cfg, weights, filename);
+    else if(0==strcmp(argv[2], "train")) train_isegmenter(data, cfg, weights, gpus, ngpus, clear, display);
+    else if(0==strcmp(argv[2], "demo")) demo_isegmenter(data, cfg, weights, cam_index, filename);
+}
+
+
diff --git a/image.darknet/inst/include/darknet/examples/lsd.c b/image.darknet/inst/include/darknet/examples/lsd.c
new file mode 100644
index 0000000..4ab944c
--- /dev/null
+++ b/image.darknet/inst/include/darknet/examples/lsd.c
@@ -0,0 +1,1378 @@
+#include <math.h>
+#include "darknet.h"
+
+/*
+void train_lsd3(char *fcfg, char *fweight, char *gcfg, char *gweight, char *acfg, char *aweight, int clear)
+{
+#ifdef GPU
+    //char *train_images = "/home/pjreddie/data/coco/trainvalno5k.txt";
+    char *train_images = "/home/pjreddie/data/imagenet/imagenet1k.train.list";
+    //char *style_images = "/home/pjreddie/data/coco/trainvalno5k.txt";
+    char *style_images = "/home/pjreddie/zelda.txt";
+    char *backup_directory = "/home/pjreddie/backup/";
+    srand(time(0));
+    network fnet = load_network(fcfg, fweight, clear);
+    network gnet = load_network(gcfg, gweight, clear);
+    network anet = load_network(acfg, aweight, clear);
+    char *gbase = basecfg(gcfg);
+    char *abase = basecfg(acfg);
+
+    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", gnet->learning_rate, gnet->momentum, gnet->decay);
+    int imgs = gnet->batch*gnet->subdivisions;
+    int i = *gnet->seen/imgs;
+    data train, tbuffer;
+    data style, sbuffer;
+
+
+    list *slist = get_paths(style_images);
+    char **spaths = (char **)list_to_array(slist);
+
+    list *tlist = get_paths(train_images);
+    char **tpaths = (char **)list_to_array(tlist);
+
+    load_args targs= get_base_args(gnet);
+    targs.paths = tpaths;
+    targs.n = imgs;
+    targs.m = tlist->size;
+    targs.d = &tbuffer;
+    targs.type = CLASSIFICATION_DATA;
+    targs.classes = 1;
+    char *ls[1] = {"zelda"};
+    targs.labels = ls;
+
+    load_args sargs = get_base_args(gnet);
+    sargs.paths = spaths;
+    sargs.n = imgs;
+    sargs.m = slist->size;
+    sargs.d = &sbuffer;
+    sargs.type = CLASSIFICATION_DATA;
+    sargs.classes = 1;
+    sargs.labels = ls;
+
+    pthread_t tload_thread = load_data_in_thread(targs);
+    pthread_t sload_thread = load_data_in_thread(sargs);
+    clock_t time;
+
+    float aloss_avg = -1;
+    float floss_avg = -1;
+
+    fnet->train=1;
+    int x_size = fnet->inputs*fnet->batch;
+    int y_size = fnet->truths*fnet->batch;
+    float *X = calloc(x_size, sizeof(float));
+    float *y = calloc(y_size, sizeof(float));
+
+
+    int ax_size = anet->inputs*anet->batch;
+    int ay_size = anet->truths*anet->batch;
+    fill_gpu(ay_size, .9, anet->truth_gpu, 1);
+    anet->delta_gpu = cuda_make_array(0, ax_size);
+    anet->train = 1;
+
+    int gx_size = gnet->inputs*gnet->batch;
+    int gy_size = gnet->truths*gnet->batch;
+    gstate.input = cuda_make_array(0, gx_size);
+    gstate.truth = 0;
+    gstate.delta = 0;
+    gstate.train = 1;
+
+    while (get_current_batch(gnet) < gnet->max_batches) {
+        i += 1;
+        time=clock();
+        pthread_join(tload_thread, 0);
+        pthread_join(sload_thread, 0);
+        train = tbuffer;
+        style = sbuffer;
+        tload_thread = load_data_in_thread(targs);
+        sload_thread = load_data_in_thread(sargs);
+
+        printf("Loaded: %lf seconds\n", sec(clock()-time));
+
+        data generated = copy_data(train);
+        time=clock();
+
+        int j, k;
+        float floss = 0;
+        for(j = 0; j < fnet->subdivisions; ++j){
+            layer imlayer = gnet->layers[gnet->n - 1];
+            get_next_batch(train, fnet->batch, j*fnet->batch, X, y);
+
+            cuda_push_array(fstate.input, X, x_size);
+            cuda_push_array(gstate.input, X, gx_size);
+            *gnet->seen += gnet->batch;
+
+            forward_network_gpu(fnet, fstate);
+            float *feats = fnet->layers[fnet->n - 2].output_gpu;
+            copy_gpu(y_size, feats, 1, fstate.truth, 1);
+
+            forward_network_gpu(gnet, gstate);
+            float *gen = gnet->layers[gnet->n-1].output_gpu;
+            copy_gpu(x_size, gen, 1, fstate.input, 1);
+
+            fill_gpu(x_size, 0, fstate.delta, 1);
+            forward_network_gpu(fnet, fstate);
+            backward_network_gpu(fnet, fstate);
+            //HERE
+
+            astate.input = gen;
+            fill_gpu(ax_size, 0, astate.delta, 1);
+            forward_network_gpu(anet, astate);
+            backward_network_gpu(anet, astate);
+
+            float *delta = imlayer.delta_gpu;
+            fill_gpu(x_size, 0, delta, 1);
+            scal_gpu(x_size, 100, astate.delta, 1);
+            scal_gpu(x_size, .001, fstate.delta, 1);
+            axpy_gpu(x_size, 1, fstate.delta, 1, delta, 1);
+            axpy_gpu(x_size, 1, astate.delta, 1, delta, 1);
+
+            //fill_gpu(x_size, 0, delta, 1);
+            //cuda_push_array(delta, X, x_size);
+            //axpy_gpu(x_size, -1, imlayer.output_gpu, 1, delta, 1);
+            //printf("pix error: %f\n", cuda_mag_array(delta, x_size));
+            printf("fea error: %f\n", cuda_mag_array(fstate.delta, x_size));
+            printf("adv error: %f\n", cuda_mag_array(astate.delta, x_size));
+            //axpy_gpu(x_size, 1, astate.delta, 1, delta, 1);
+
+            backward_network_gpu(gnet, gstate);
+
+            floss += get_network_cost(fnet) /(fnet->subdivisions*fnet->batch);
+
+            cuda_pull_array(imlayer.output_gpu, imlayer.output, imlayer.outputs*imlayer.batch);
+            for(k = 0; k < gnet->batch; ++k){
+                int index = j*gnet->batch + k;
+                copy_cpu(imlayer.outputs, imlayer.output + k*imlayer.outputs, 1, generated.X.vals[index], 1);
+                generated.y.vals[index][0] = .1;
+                style.y.vals[index][0] = .9;
+            }
+        }
+
+*/
+/*
+        image sim = float_to_image(anet->w, anet->h, anet->c, style.X.vals[j]);
+        show_image(sim, "style");
+        cvWaitKey(0);
+        */
+        /*
+
+        harmless_update_network_gpu(anet);
+
+        data merge = concat_data(style, generated);
+        randomize_data(merge);
+        float aloss = train_network(anet, merge);
+
+        update_network_gpu(gnet);
+
+        free_data(merge);
+        free_data(train);
+        free_data(generated);
+        free_data(style);
+        if (aloss_avg < 0) aloss_avg = aloss;
+        if (floss_avg < 0) floss_avg = floss;
+        aloss_avg = aloss_avg*.9 + aloss*.1;
+        floss_avg = floss_avg*.9 + floss*.1;
+
+        printf("%d: gen: %f, adv: %f | gen_avg: %f, adv_avg: %f, %f rate, %lf seconds, %d images\n", i, floss, aloss, floss_avg, aloss_avg, get_current_rate(gnet), sec(clock()-time), i*imgs);
+        if(i%1000==0){
+            char buff[256];
+            sprintf(buff, "%s/%s_%d.weights", backup_directory, gbase, i);
+            save_weights(gnet, buff);
+            sprintf(buff, "%s/%s_%d.weights", backup_directory, abase, i);
+            save_weights(anet, buff);
+        }
+        if(i%100==0){
+            char buff[256];
+            sprintf(buff, "%s/%s.backup", backup_directory, gbase);
+            save_weights(gnet, buff);
+            sprintf(buff, "%s/%s.backup", backup_directory, abase);
+            save_weights(anet, buff);
+        }
+    }
+#endif
+}
+*/
+
+/*
+void train_pix2pix(char *cfg, char *weight, char *acfg, char *aweight, int clear)
+{
+#ifdef GPU
+    //char *train_images = "/home/pjreddie/data/coco/train1.txt";
+    //char *train_images = "/home/pjreddie/data/coco/trainvalno5k.txt";
+    char *train_images = "/home/pjreddie/data/imagenet/imagenet1k.train.list";
+    char *backup_directory = "/home/pjreddie/backup/";
+    srand(time(0));
+    char *base = basecfg(cfg);
+    char *abase = basecfg(acfg);
+    printf("%s\n", base);
+    network net = load_network(cfg, weight, clear);
+    network anet = load_network(acfg, aweight, clear);
+
+    int i, j, k;
+    layer imlayer = {0};
+    for (i = 0; i < net->n; ++i) {
+        if (net->layers[i].out_c == 3) {
+            imlayer = net->layers[i];
+            break;
+        }
+    }
+
+    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
+    int imgs = net->batch*net->subdivisions;
+    i = *net->seen/imgs;
+    data train, buffer;
+
+
+    list *plist = get_paths(train_images);
+    //int N = plist->size;
+    char **paths = (char **)list_to_array(plist);
+
+    load_args args = {0};
+    args.w = net->w;
+    args.h = net->h;
+    args.paths = paths;
+    args.n = imgs;
+    args.m = plist->size;
+    args.d = &buffer;
+
+    args.min = net->min_crop;
+    args.max = net->max_crop;
+    args.angle = net->angle;
+    args.aspect = net->aspect;
+    args.exposure = net->exposure;
+    args.saturation = net->saturation;
+    args.hue = net->hue;
+    args.size = net->w;
+    args.type = CLASSIFICATION_DATA;
+    args.classes = 1;
+    char *ls[1] = {"coco"};
+    args.labels = ls;
+
+    pthread_t load_thread = load_data_in_thread(args);
+    clock_t time;
+
+    network_state gstate = {0};
+    gstate.index = 0;
+    gstate.net = net;
+    int x_size = get_network_input_size(net)*net->batch;
+    int y_size = x_size;
+    gstate.input = cuda_make_array(0, x_size);
+    gstate.truth = cuda_make_array(0, y_size);
+    gstate.delta = 0;
+    gstate.train = 1;
+    float *pixs = calloc(x_size, sizeof(float));
+    float *graypixs = calloc(x_size, sizeof(float));
+    float *y = calloc(y_size, sizeof(float));
+
+    network_state astate = {0};
+    astate.index = 0;
+    astate.net = anet;
+    int ay_size = get_network_output_size(anet)*anet->batch;
+    astate.input = 0;
+    astate.truth = 0;
+    astate.delta = 0;
+    astate.train = 1;
+
+    float *imerror = cuda_make_array(0, imlayer.outputs);
+    float *ones_gpu = cuda_make_array(0, ay_size);
+    fill_gpu(ay_size, .9, ones_gpu, 1);
+
+    float aloss_avg = -1;
+    float gloss_avg = -1;
+
+    //data generated = copy_data(train);
+
+    while (get_current_batch(net) < net->max_batches) {
+        i += 1;
+        time=clock();
+        pthread_join(load_thread, 0);
+        train = buffer;
+        load_thread = load_data_in_thread(args);
+
+        printf("Loaded: %lf seconds\n", sec(clock()-time));
+
+        data gray = copy_data(train);
+        for(j = 0; j < imgs; ++j){
+            image gim = float_to_image(net->w, net->h, net->c, gray.X.vals[j]);
+            grayscale_image_3c(gim);
+            train.y.vals[j][0] = .9;
+
+            image yim = float_to_image(net->w, net->h, net->c, train.X.vals[j]);
+            //rgb_to_yuv(yim);
+        }
+        time=clock();
+        float gloss = 0;
+
+        for(j = 0; j < net->subdivisions; ++j){
+            get_next_batch(train, net->batch, j*net->batch, pixs, y);
+            get_next_batch(gray, net->batch, j*net->batch, graypixs, y);
+            cuda_push_array(gstate.input, graypixs, x_size);
+            cuda_push_array(gstate.truth, pixs, y_size);
+            */
+            /*
+            image origi = float_to_image(net->w, net->h, 3, pixs);
+            image grayi = float_to_image(net->w, net->h, 3, graypixs);
+            show_image(grayi, "gray");
+            show_image(origi, "orig");
+            cvWaitKey(0);
+            */
+            /*
+            *net->seen += net->batch;
+            forward_network_gpu(net, gstate);
+
+            fill_gpu(imlayer.outputs, 0, imerror, 1);
+            astate.input = imlayer.output_gpu;
+            astate.delta = imerror;
+            astate.truth = ones_gpu;
+            forward_network_gpu(anet, astate);
+            backward_network_gpu(anet, astate);
+
+            scal_gpu(imlayer.outputs, .1, net->layers[net->n-1].delta_gpu, 1);
+
+            backward_network_gpu(net, gstate);
+
+            scal_gpu(imlayer.outputs, 1000, imerror, 1);
+
+            printf("realness %f\n", cuda_mag_array(imerror, imlayer.outputs));
+            printf("features %f\n", cuda_mag_array(net->layers[net->n-1].delta_gpu, imlayer.outputs));
+
+            axpy_gpu(imlayer.outputs, 1, imerror, 1, imlayer.delta_gpu, 1);
+
+            gloss += get_network_cost(net) /(net->subdivisions*net->batch);
+
+            cuda_pull_array(imlayer.output_gpu, imlayer.output, imlayer.outputs*imlayer.batch);
+            for(k = 0; k < net->batch; ++k){
+                int index = j*net->batch + k;
+                copy_cpu(imlayer.outputs, imlayer.output + k*imlayer.outputs, 1, gray.X.vals[index], 1);
+                gray.y.vals[index][0] = .1;
+            }
+        }
+        harmless_update_network_gpu(anet);
+
+        data merge = concat_data(train, gray);
+        randomize_data(merge);
+        float aloss = train_network(anet, merge);
+
+        update_network_gpu(net);
+        update_network_gpu(anet);
+        free_data(merge);
+        free_data(train);
+        free_data(gray);
+        if (aloss_avg < 0) aloss_avg = aloss;
+        aloss_avg = aloss_avg*.9 + aloss*.1;
+        gloss_avg = gloss_avg*.9 + gloss*.1;
+
+        printf("%d: gen: %f, adv: %f | gen_avg: %f, adv_avg: %f, %f rate, %lf seconds, %d images\n", i, gloss, aloss, gloss_avg, aloss_avg, get_current_rate(net), sec(clock()-time), i*imgs);
+        if(i%1000==0){
+            char buff[256];
+            sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i);
+            save_weights(net, buff);
+            sprintf(buff, "%s/%s_%d.weights", backup_directory, abase, i);
+            save_weights(anet, buff);
+        }
+        if(i%100==0){
+            char buff[256];
+            sprintf(buff, "%s/%s.backup", backup_directory, base);
+            save_weights(net, buff);
+            sprintf(buff, "%s/%s.backup", backup_directory, abase);
+            save_weights(anet, buff);
+        }
+    }
+    char buff[256];
+    sprintf(buff, "%s/%s_final.weights", backup_directory, base);
+    save_weights(net, buff);
+#endif
+}
+*/
+
+/* Spherical interpolation between start and end (assumed unit length -- TODO confirm callers) at fraction s; writes n floats to out, then rescales out by 1/mag_array(out). */
+void slerp(float *start, float *end, float s, int n, float *out)
+{
+    float d = dot_cpu(n, start, 1, end, 1);
+    float omega = acos(d > 1 ? 1 : (d < -1 ? -1 : d)); /* clamp: rounding can push the dot product outside acos's domain */
+    float so = sin(omega);
+    fill_cpu(n, 0, out, 1);
+    axpy_cpu(n, (so > 1e-6f) ? sin((1-s)*omega)/so : 1-s, start, 1, out, 1); /* near-parallel inputs: use the linear limit instead of dividing by ~0 */
+    axpy_cpu(n, (so > 1e-6f) ? sin(s*omega)/so : s, end, 1, out, 1);
+    scale_array(out, n, 1./mag_array(out, n));
+}
+
+/* Build a w x h x c image of rand_normal() samples, then scale every value by 1/mag_array of the whole buffer. */
+image random_unit_vector_image(int w, int h, int c)
+{
+    image vec = make_image(w, h, c);
+    int k = 0;
+    while(k < vec.w*vec.h*vec.c) vec.data[k++] = rand_normal();
+
+    float norm = mag_array(vec.data, vec.w*vec.h*vec.c);
+    scale_array(vec.data, vec.w*vec.h*vec.c, 1./norm);
+    return vec;
+}
+
+/* Render an interpolation sequence: slerp between random unit input vectors, run the network on each step, and save/show the output frame as "out" and "outNNNNN". */
+void inter_dcgan(char *cfgfile, char *weightfile)
+{
+    network *net = load_network(cfgfile, weightfile, 0);
+    set_batch_network(net, 1);
+    srand(2222222);
+
+    clock_t time;
+    char buff[256];
+    char *input = buff;
+    int i, imlayer = 0;
+
+    /* The first layer with 3 output channels is used as the image layer (layer 0 if none matches). */
+    for (i = 0; i < net->n; ++i) {
+        if (net->layers[i].out_c == 3) {
+            imlayer = i;
+            printf("%d\n", i);
+            break;
+        }
+    }
+    image start = random_unit_vector_image(net->w, net->h, net->c);
+    image end = random_unit_vector_image(net->w, net->h, net->c);
+    image im = make_image(net->w, net->h, net->c);
+    image orig = copy_image(start);
+
+    /* Every max_count frames pick a new endpoint; after frame 300 head back to the first vector, then stop. */
+    int c = 0;
+    int count = 0;
+    int max_count = 15;
+    while(1){
+        ++c;
+
+        if(count == max_count){
+            count = 0;
+            free_image(start);
+            start = end;
+            /* Only allocate a random endpoint when it will be used; past frame 300 it was immediately replaced by orig and leaked. */
+            end = (c > 300) ? orig : random_unit_vector_image(net->w, net->h, net->c);
+            if(c>300 + max_count) return;
+        }
+        ++count;
+
+        slerp(start.data, end.data, (float)count / max_count, im.w*im.h*im.c, im.data);
+
+        float *X = im.data;
+        time=clock();
+        network_predict(net, X);
+        image out = get_network_image_layer(net, imlayer);
+        //yuv_to_rgb(out);
+        normalize_image(out);
+        sprintf(buff, "out%05d", c); /* fill the name before logging: buff was read uninitialised on the first frame */
+        printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time));
+        save_image(out, "out");
+        save_image(out, buff);
+        show_image(out, "out", 0);
+    }
+}
+
+/* Endlessly sample rand_normal() inputs, run the network on each, and save/show the last layer's normalized output as "out". */
+void test_dcgan(char *cfgfile, char *weightfile)
+{
+    network *net = load_network(cfgfile, weightfile, 0);
+    set_batch_network(net, 1);
+    srand(2222222);
+
+    clock_t time;
+    char *input = "out"; /* name the frame is saved under; previously an uninitialised buffer was printed */
+    int imlayer = 0;
+
+    imlayer = net->n-1;
+
+    while(1){
+        image im = make_image(net->w, net->h, net->c);
+        int i;
+        for(i = 0; i < im.w*im.h*im.c; ++i){
+            im.data[i] = rand_normal();
+        }
+        //float mag = mag_array(im.data, im.w*im.h*im.c);
+        //scale_array(im.data, im.w*im.h*im.c, 1./mag);
+
+        float *X = im.data;
+        time=clock();
+        network_predict(net, X);
+        image out = get_network_image_layer(net, imlayer);
+        //yuv_to_rgb(out);
+        normalize_image(out);
+        printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time));
+        save_image(out, "out");
+        show_image(out, "out", 0);
+
+        free_image(im);
+    }
+}
+
+/* Store alpha and beta on every SHORTCUT layer of net; layers of any other type are left untouched. */
+void set_network_alpha_beta(network *net, float alpha, float beta)
+{
+    int idx;
+    for(idx = 0; idx < net->n; ++idx){
+        if(net->layers[idx].type != SHORTCUT) continue;
+        net->layers[idx].alpha = alpha;
+        net->layers[idx].beta = beta;
+    }
+}
+
+/* Train generator cfg against adversary acfg on the images listed in train_images, re-weighting SHORTCUT layers via alpha/beta as training progresses. The body only exists when GPU is defined. */
+void train_prog(char *cfg, char *weight, char *acfg, char *aweight, int clear, int display, char *train_images, int maxbatch)
+{
+#ifdef GPU
+    char *backup_directory = "/home/pjreddie/backup/";
+    srand(time(0));
+    char *base = basecfg(cfg);
+    char *abase = basecfg(acfg);
+    printf("%s\n", base);
+    network *gnet = load_network(cfg, weight, clear);
+    network *anet = load_network(acfg, aweight, clear);
+
+    int i, j, k;
+    layer imlayer = gnet->layers[gnet->n-1];
+
+    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", gnet->learning_rate, gnet->momentum, gnet->decay);
+    int imgs = gnet->batch*gnet->subdivisions;
+    i = *gnet->seen/imgs;
+    data train, buffer;
+
+    list *plist = get_paths(train_images);
+    char **paths = (char **)list_to_array(plist);
+
+    load_args args= get_base_args(anet);
+    args.paths = paths;
+    args.n = imgs;
+    args.m = plist->size;
+    args.d = &buffer;
+    args.type = CLASSIFICATION_DATA;
+    args.threads=16;
+    args.classes = 1;
+    char *ls[2] = {"imagenet", "zzzzzzzz"};
+    args.labels = ls;
+
+    pthread_t load_thread = load_data_in_thread(args);
+    clock_t time;
+
+    gnet->train = 1;
+    anet->train = 1;
+
+    int x_size = gnet->inputs*gnet->batch;
+    int y_size = gnet->truths*gnet->batch;
+    float *imerror = cuda_make_array(0, y_size);
+
+    float aloss_avg = -1;
+
+    if (maxbatch == 0) maxbatch = gnet->max_batches;
+    while (get_current_batch(gnet) < maxbatch) {
+        {
+            int cb = get_current_batch(gnet);
+            float alpha = (float) cb / (maxbatch/2); /* ramps 0 -> 1 over the first maxbatch/2 batches, then is capped at 1 */
+            if(alpha > 1) alpha = 1;
+            float beta = 1 - alpha;
+            printf("%f %f\n", alpha, beta);
+            set_network_alpha_beta(gnet, alpha, beta);
+            set_network_alpha_beta(anet, beta, alpha);
+        }
+
+        i += 1;
+        time=clock();
+        pthread_join(load_thread, 0);
+        train = buffer;
+        load_thread = load_data_in_thread(args);
+
+        printf("Loaded: %lf seconds\n", sec(clock()-time));
+
+        data gen = copy_data(train);
+        for (j = 0; j < imgs; ++j) {
+            train.y.vals[j][0] = 1;
+            gen.y.vals[j][0] = 0;
+        }
+        time=clock();
+
+        for (j = 0; j < gnet->subdivisions; ++j) {
+            get_next_batch(train, gnet->batch, j*gnet->batch, gnet->truth, 0);
+            int z;
+            for(z = 0; z < x_size; ++z){
+                gnet->input[z] = rand_normal();
+            }
+            /*
+               for(z = 0; z < gnet->batch; ++z){
+               float mag = mag_array(gnet->input + z*gnet->inputs, gnet->inputs);
+               scale_array(gnet->input + z*gnet->inputs, gnet->inputs, 1./mag);
+               }
+             */
+            *gnet->seen += gnet->batch;
+            forward_network(gnet);
+            /* Run the adversary on the generated batch with truth set to 1; imerror is installed as its delta_gpu (presumably receiving the input gradient -- confirm). */
+            fill_gpu(imlayer.outputs*imlayer.batch, 0, imerror, 1);
+            fill_cpu(anet->truths*anet->batch, 1, anet->truth, 1);
+            copy_cpu(anet->inputs*anet->batch, imlayer.output, 1, anet->input, 1);
+            anet->delta_gpu = imerror;
+            forward_network(anet);
+            backward_network(anet);
+
+            //float genaloss = *anet->cost / anet->batch;
+            scal_gpu(imlayer.outputs*imlayer.batch, 1, imerror, 1);
+            scal_gpu(imlayer.outputs*imlayer.batch, 0, gnet->layers[gnet->n-1].delta_gpu, 1);
+
+            axpy_gpu(imlayer.outputs*imlayer.batch, 1, imerror, 1, gnet->layers[gnet->n-1].delta_gpu, 1);
+
+            backward_network(gnet);
+
+            for(k = 0; k < gnet->batch; ++k){
+                int index = j*gnet->batch + k;
+                copy_cpu(gnet->outputs, gnet->output + k*gnet->outputs, 1, gen.X.vals[index], 1);
+            }
+        }
+        harmless_update_network_gpu(anet);
+
+        data merge = concat_data(train, gen);
+        float aloss = train_network(anet, merge);
+
+#ifdef OPENCV
+        if(display){
+            image im = float_to_image(anet->w, anet->h, anet->c, gen.X.vals[0]);
+            image im2 = float_to_image(anet->w, anet->h, anet->c, train.X.vals[0]);
+            show_image(im, "gen", 1);
+            show_image(im2, "train", 1);
+            save_image(im, "gen");
+            save_image(im2, "train");
+        }
+#endif
+
+        update_network_gpu(gnet);
+
+        free_data(merge);
+        free_data(train);
+        free_data(gen);
+        if (aloss_avg < 0) aloss_avg = aloss;
+        aloss_avg = aloss_avg*.9 + aloss*.1;
+
+        printf("%d: adv: %f | adv_avg: %f, %f rate, %lf seconds, %d images\n", i, aloss, aloss_avg, get_current_rate(gnet), sec(clock()-time), i*imgs);
+        if(i%10000==0){
+            char buff[256];
+            snprintf(buff, sizeof(buff), "%s/%s_%d.weights", backup_directory, base, i);
+            save_weights(gnet, buff);
+            snprintf(buff, sizeof(buff), "%s/%s_%d.weights", backup_directory, abase, i);
+            save_weights(anet, buff);
+        }
+        if(i%1000==0){
+            char buff[256];
+            snprintf(buff, sizeof(buff), "%s/%s.backup", backup_directory, base);
+            save_weights(gnet, buff);
+            snprintf(buff, sizeof(buff), "%s/%s.backup", backup_directory, abase);
+            save_weights(anet, buff);
+        }
+    }
+    pthread_join(load_thread, 0); /* a load is always in flight here and it writes to the stack-local buffer */
+    free_data(buffer);
+    char buff[256];
+    snprintf(buff, sizeof(buff), "%s/%s_final.weights", backup_directory, base);
+    save_weights(gnet, buff);
+#endif
+}
+
+/*
+ * Adversarially train a generator network (cfg/weight) against an adversary
+ * network (acfg/aweight) on the images listed in train_images.
+ *
+ * clear    - forwarded to load_network() for both networks.
+ * display  - when non-zero and built with OPENCV, shows and saves the first
+ *            generated and the first real image of every iteration.
+ * maxbatch - training stops once get_current_batch(gnet) reaches this value;
+ *            0 means use gnet->max_batches.
+ *
+ * Checkpoints are written below the hard-coded backup_directory.  The whole
+ * body is compiled only when GPU is defined; otherwise the function is a
+ * no-op.
+ */
+void train_dcgan(char *cfg, char *weight, char *acfg, char *aweight, int clear, int display, char *train_images, int maxbatch)
+{
+#ifdef GPU
+    char *backup_directory = "/home/pjreddie/backup/";
+    srand(time(0));
+    char *base = basecfg(cfg);
+    char *abase = basecfg(acfg);
+    printf("%s\n", base);
+    network *gnet = load_network(cfg, weight, clear);
+    network *anet = load_network(acfg, aweight, clear);
+    //float orig_rate = anet->learning_rate;
+
+    int i, j, k;
+    // The "image layer" is the first generator layer with 3 output channels.
+    layer imlayer = {0};
+    for (i = 0; i < gnet->n; ++i) {
+        if (gnet->layers[i].out_c == 3) {
+            imlayer = gnet->layers[i];
+            break;
+        }
+    }
+
+    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", gnet->learning_rate, gnet->momentum, gnet->decay);
+    int imgs = gnet->batch*gnet->subdivisions;
+    i = *gnet->seen/imgs;
+    data train, buffer;
+
+
+    list *plist = get_paths(train_images);
+    //int N = plist->size;
+    char **paths = (char **)list_to_array(plist);
+
+    // Loader configuration: every request loads `imgs` images into `buffer`.
+    load_args args= get_base_args(anet);
+    args.paths = paths;
+    args.n = imgs;
+    args.m = plist->size;
+    args.d = &buffer;
+    args.type = CLASSIFICATION_DATA;
+    args.threads=16;
+    args.classes = 1;
+    char *ls[2] = {"imagenet", "zzzzzzzz"};
+    args.labels = ls;
+
+    pthread_t load_thread = load_data_in_thread(args);
+    clock_t time;
+
+    gnet->train = 1;
+    anet->train = 1;
+
+    int x_size = gnet->inputs*gnet->batch;
+    int y_size = gnet->truths*gnet->batch;
+    // GPU scratch buffer that anet->delta_gpu is pointed at below.
+    float *imerror = cuda_make_array(0, y_size);
+
+    //int ay_size = anet->truths*anet->batch;
+
+    float aloss_avg = -1;
+
+    //data generated = copy_data(train);
+
+    if (maxbatch == 0) maxbatch = gnet->max_batches;
+    while (get_current_batch(gnet) < maxbatch) {
+        i += 1;
+        time=clock();
+        // Take the data loaded in the background, then start the next load.
+        pthread_join(load_thread, 0);
+        train = buffer;
+
+        //translate_data_rows(train, -.5);
+        //scale_data_rows(train, 2);
+
+        load_thread = load_data_in_thread(args);
+
+        printf("Loaded: %lf seconds\n", sec(clock()-time));
+
+        // `gen` starts as a copy of the real data; its rows are overwritten
+        // with generator output below.  Real rows get label 1, generated 0.
+        data gen = copy_data(train);
+        for (j = 0; j < imgs; ++j) {
+            train.y.vals[j][0] = 1;
+            gen.y.vals[j][0] = 0;
+        }
+        time=clock();
+
+        for(j = 0; j < gnet->subdivisions; ++j){
+            get_next_batch(train, gnet->batch, j*gnet->batch, gnet->truth, 0);
+            int z;
+            // Generator input is random noise, scaled per sample by 1/magnitude.
+            for(z = 0; z < x_size; ++z){
+                gnet->input[z] = rand_normal();
+            }
+            for(z = 0; z < gnet->batch; ++z){
+                float mag = mag_array(gnet->input + z*gnet->inputs, gnet->inputs);
+                scale_array(gnet->input + z*gnet->inputs, gnet->inputs, 1./mag);
+            }
+            /*
+               for(z = 0; z < 100; ++z){
+               printf("%f, ", gnet->input[z]);
+               }
+               printf("\n");
+               printf("input: %f %f\n", mean_array(gnet->input, x_size), variance_array(gnet->input, x_size));
+             */
+
+            //cuda_push_array(gnet->input_gpu, gnet->input, x_size);
+            //cuda_push_array(gnet->truth_gpu, gnet->truth, y_size);
+            *gnet->seen += gnet->batch;
+            forward_network(gnet);
+
+            // Run the adversary on the generated images with an all-ones
+            // truth, with anet->delta_gpu pointed at imerror.
+            fill_gpu(imlayer.outputs*imlayer.batch, 0, imerror, 1);
+            fill_cpu(anet->truths*anet->batch, 1, anet->truth, 1);
+            copy_cpu(anet->inputs*anet->batch, imlayer.output, 1, anet->input, 1);
+            anet->delta_gpu = imerror;
+            forward_network(anet);
+            backward_network(anet);
+
+            //float genaloss = *anet->cost / anet->batch;
+            //printf("%f\n", genaloss);
+
+            // Zero the generator's last-layer delta and replace it with imerror.
+            scal_gpu(imlayer.outputs*imlayer.batch, 1, imerror, 1);
+            scal_gpu(imlayer.outputs*imlayer.batch, 0, gnet->layers[gnet->n-1].delta_gpu, 1);
+
+            //printf("realness %f\n", cuda_mag_array(imerror, imlayer.outputs*imlayer.batch));
+            //printf("features %f\n", cuda_mag_array(gnet->layers[gnet->n-1].delta_gpu, imlayer.outputs*imlayer.batch));
+
+            axpy_gpu(imlayer.outputs*imlayer.batch, 1, imerror, 1, gnet->layers[gnet->n-1].delta_gpu, 1);
+
+            backward_network(gnet);
+
+            /*
+               for(k = 0; k < gnet->n; ++k){
+               layer l = gnet->layers[k];
+               cuda_pull_array(l.output_gpu, l.output, l.outputs*l.batch);
+               printf("%d: %f %f\n", k, mean_array(l.output, l.outputs*l.batch), variance_array(l.output, l.outputs*l.batch));
+               }
+             */
+
+            // Store this sub-batch of generator output as rows of `gen`.
+            for(k = 0; k < gnet->batch; ++k){
+                int index = j*gnet->batch + k;
+                copy_cpu(gnet->outputs, gnet->output + k*gnet->outputs, 1, gen.X.vals[index], 1);
+            }
+        }
+        harmless_update_network_gpu(anet);
+
+        // Train the adversary on real and generated samples together.
+        data merge = concat_data(train, gen);
+        //randomize_data(merge);
+        float aloss = train_network(anet, merge);
+
+        //translate_image(im, 1);
+        //scale_image(im, .5);
+        //translate_image(im2, 1);
+        //scale_image(im2, .5);
+#ifdef OPENCV
+        if(display){
+            image im = float_to_image(anet->w, anet->h, anet->c, gen.X.vals[0]);
+            image im2 = float_to_image(anet->w, anet->h, anet->c, train.X.vals[0]);
+            show_image(im, "gen", 1);
+            show_image(im2, "train", 1);
+            save_image(im, "gen");
+            save_image(im2, "train");
+        }
+#endif
+
+        /*
+           if(aloss < .1){
+           anet->learning_rate = 0;
+           } else if (aloss > .3){
+           anet->learning_rate = orig_rate;
+           }
+         */
+
+        update_network_gpu(gnet);
+
+        free_data(merge);
+        free_data(train);
+        free_data(gen);
+        if (aloss_avg < 0) aloss_avg = aloss;
+        aloss_avg = aloss_avg*.9 + aloss*.1;
+
+        printf("%d: adv: %f | adv_avg: %f, %f rate, %lf seconds, %d images\n", i, aloss, aloss_avg, get_current_rate(gnet), sec(clock()-time), i*imgs);
+        // snprintf keeps long directory/config names from overrunning buff.
+        if(i%10000==0){
+            char buff[256];
+            snprintf(buff, sizeof(buff), "%s/%s_%d.weights", backup_directory, base, i);
+            save_weights(gnet, buff);
+            snprintf(buff, sizeof(buff), "%s/%s_%d.weights", backup_directory, abase, i);
+            save_weights(anet, buff);
+        }
+        if(i%1000==0){
+            char buff[256];
+            snprintf(buff, sizeof(buff), "%s/%s.backup", backup_directory, base);
+            save_weights(gnet, buff);
+            snprintf(buff, sizeof(buff), "%s/%s.backup", backup_directory, abase);
+            save_weights(anet, buff);
+        }
+    }
+    char buff[256];
+    snprintf(buff, sizeof(buff), "%s/%s_final.weights", backup_directory, base);
+    save_weights(gnet, buff);
+
+    // Exactly one background load is still outstanding here and it writes
+    // into the stack variable `buffer`; wait for it and release what it
+    // loaded before this frame goes away.
+    pthread_join(load_thread, 0);
+    free_data(buffer);
+    // Do not leave the adversary pointing at the scratch buffer we free.
+    if(anet->delta_gpu == imerror) anet->delta_gpu = 0;
+    cuda_free(imerror);
+#endif
+}
+
+/*
+ * Train a colorizing network (cfg/weight) together with an adversary network
+ * (acfg/aweight).
+ *
+ * Each iteration loads a set of images, builds a copy converted with
+ * grayscale_image_3c(), feeds the gray pixels to `net` with the original
+ * pixels as truth, and adds `imerror` (the buffer anet->delta_gpu points at
+ * during the adversary's backward pass) to net's last-layer delta.  The
+ * adversary is then trained on the real images (label .95) and on net's
+ * image-layer output (label .05).
+ *
+ * clear   - forwarded to load_network() for both networks.
+ * display - when non-zero and built with OPENCV, shows the first generated
+ *           and the first real image of every iteration.
+ *
+ * The training list and the backup directory are hard-coded paths.  The
+ * whole body is compiled only when GPU is defined; otherwise the function is
+ * a no-op.
+ */
+void train_colorizer(char *cfg, char *weight, char *acfg, char *aweight, int clear, int display)
+{
+#ifdef GPU
+    //char *train_images = "/home/pjreddie/data/coco/train1.txt";
+    //char *train_images = "/home/pjreddie/data/coco/trainvalno5k.txt";
+    char *train_images = "/home/pjreddie/data/imagenet/imagenet1k.train.list";
+    char *backup_directory = "/home/pjreddie/backup/";
+    srand(time(0));
+    char *base = basecfg(cfg);
+    char *abase = basecfg(acfg);
+    printf("%s\n", base);
+    network *net = load_network(cfg, weight, clear);
+    network *anet = load_network(acfg, aweight, clear);
+
+    int i, j, k;
+    // The "image layer" is the first layer of net with 3 output channels.
+    layer imlayer = {0};
+    for (i = 0; i < net->n; ++i) {
+        if (net->layers[i].out_c == 3) {
+            imlayer = net->layers[i];
+            break;
+        }
+    }
+
+    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
+    int imgs = net->batch*net->subdivisions;
+    i = *net->seen/imgs;
+    data train, buffer;
+
+
+    list *plist = get_paths(train_images);
+    //int N = plist->size;
+    char **paths = (char **)list_to_array(plist);
+
+    // Loader configuration: every request loads `imgs` images into `buffer`.
+    load_args args= get_base_args(net);
+    args.paths = paths;
+    args.n = imgs;
+    args.m = plist->size;
+    args.d = &buffer;
+
+    args.type = CLASSIFICATION_DATA;
+    args.classes = 1;
+    char *ls[2] = {"imagenet"};
+    args.labels = ls;
+
+    pthread_t load_thread = load_data_in_thread(args);
+    clock_t time;
+
+    int x_size = net->inputs*net->batch;
+    //int y_size = x_size;
+    net->delta = 0;
+    net->train = 1;
+    // Host staging buffers for one sub-batch of color and gray pixels.
+    float *pixs = calloc(x_size, sizeof(float));
+    float *graypixs = calloc(x_size, sizeof(float));
+    //float *y = calloc(y_size, sizeof(float));
+
+    //int ay_size = anet->outputs*anet->batch;
+    anet->delta = 0;
+    anet->train = 1;
+
+    // GPU scratch buffer that anet->delta_gpu is pointed at below.
+    float *imerror = cuda_make_array(0, imlayer.outputs*imlayer.batch);
+
+    float aloss_avg = -1;
+    float gloss_avg = -1;
+
+    //data generated = copy_data(train);
+
+    while (get_current_batch(net) < net->max_batches) {
+        i += 1;
+        time=clock();
+        // Take the data loaded in the background, then start the next load.
+        pthread_join(load_thread, 0);
+        train = buffer;
+        load_thread = load_data_in_thread(args);
+
+        printf("Loaded: %lf seconds\n", sec(clock()-time));
+
+        // `gray` holds grayscale versions of the loaded images; real rows
+        // are labelled .95 and gray rows .05 for the adversary.
+        data gray = copy_data(train);
+        for(j = 0; j < imgs; ++j){
+            image gim = float_to_image(net->w, net->h, net->c, gray.X.vals[j]);
+            grayscale_image_3c(gim);
+            train.y.vals[j][0] = .95;
+            gray.y.vals[j][0] = .05;
+        }
+        time=clock();
+        float gloss = 0;
+
+        for(j = 0; j < net->subdivisions; ++j){
+            get_next_batch(train, net->batch, j*net->batch, pixs, 0);
+            get_next_batch(gray, net->batch, j*net->batch, graypixs, 0);
+            cuda_push_array(net->input_gpu, graypixs, net->inputs*net->batch);
+            cuda_push_array(net->truth_gpu, pixs, net->truths*net->batch);
+            /*
+               image origi = float_to_image(net->w, net->h, 3, pixs);
+               image grayi = float_to_image(net->w, net->h, 3, graypixs);
+               show_image(grayi, "gray");
+               show_image(origi, "orig");
+               cvWaitKey(0);
+             */
+            *net->seen += net->batch;
+            forward_network_gpu(net);
+
+            fill_gpu(imlayer.outputs*imlayer.batch, 0, imerror, 1);
+            copy_gpu(anet->inputs*anet->batch, imlayer.output_gpu, 1, anet->input_gpu, 1);
+            // Fill the truth buffer with truths*batch values, the same extent
+            // train_dcgan uses for the adversary's truth; the previous
+            // inputs*batch count does not describe the truth buffer.
+            fill_gpu(anet->truths*anet->batch, .95, anet->truth_gpu, 1);
+            anet->delta_gpu = imerror;
+            forward_network_gpu(anet);
+            backward_network_gpu(anet);
+
+            // Scale net's own last-layer delta by 1/100 before adding imerror.
+            scal_gpu(imlayer.outputs*imlayer.batch, 1./100., net->layers[net->n-1].delta_gpu, 1);
+
+            scal_gpu(imlayer.outputs*imlayer.batch, 1, imerror, 1);
+
+            printf("realness %f\n", cuda_mag_array(imerror, imlayer.outputs*imlayer.batch));
+            printf("features %f\n", cuda_mag_array(net->layers[net->n-1].delta_gpu, imlayer.outputs*imlayer.batch));
+
+            axpy_gpu(imlayer.outputs*imlayer.batch, 1, imerror, 1, net->layers[net->n-1].delta_gpu, 1);
+
+            backward_network_gpu(net);
+
+
+            gloss += *net->cost /(net->subdivisions*net->batch);
+
+            // Replace the gray rows with the image layer's output so the
+            // adversary is trained on them below.
+            for(k = 0; k < net->batch; ++k){
+                int index = j*net->batch + k;
+                copy_cpu(imlayer.outputs, imlayer.output + k*imlayer.outputs, 1, gray.X.vals[index], 1);
+            }
+        }
+        harmless_update_network_gpu(anet);
+
+        data merge = concat_data(train, gray);
+        //randomize_data(merge);
+        float aloss = train_network(anet, merge);
+
+        update_network_gpu(net);
+
+#ifdef OPENCV
+        if(display){
+            image im = float_to_image(anet->w, anet->h, anet->c, gray.X.vals[0]);
+            image im2 = float_to_image(anet->w, anet->h, anet->c, train.X.vals[0]);
+            show_image(im, "gen", 1);
+            show_image(im2, "train", 1);
+        }
+#endif
+        free_data(merge);
+        free_data(train);
+        free_data(gray);
+        // Seed both running averages from the first observed loss so neither
+        // starts from the -1 sentinel.
+        if (aloss_avg < 0) aloss_avg = aloss;
+        if (gloss_avg < 0) gloss_avg = gloss;
+        aloss_avg = aloss_avg*.9 + aloss*.1;
+        gloss_avg = gloss_avg*.9 + gloss*.1;
+
+        printf("%d: gen: %f, adv: %f | gen_avg: %f, adv_avg: %f, %f rate, %lf seconds, %d images\n", i, gloss, aloss, gloss_avg, aloss_avg, get_current_rate(net), sec(clock()-time), i*imgs);
+        // snprintf keeps long directory/config names from overrunning buff.
+        if(i%1000==0){
+            char buff[256];
+            snprintf(buff, sizeof(buff), "%s/%s_%d.weights", backup_directory, base, i);
+            save_weights(net, buff);
+            snprintf(buff, sizeof(buff), "%s/%s_%d.weights", backup_directory, abase, i);
+            save_weights(anet, buff);
+        }
+        if(i%100==0){
+            char buff[256];
+            snprintf(buff, sizeof(buff), "%s/%s.backup", backup_directory, base);
+            save_weights(net, buff);
+            snprintf(buff, sizeof(buff), "%s/%s.backup", backup_directory, abase);
+            save_weights(anet, buff);
+        }
+    }
+    char buff[256];
+    snprintf(buff, sizeof(buff), "%s/%s_final.weights", backup_directory, base);
+    save_weights(net, buff);
+
+    // Exactly one background load is still outstanding here and it writes
+    // into the stack variable `buffer`; wait for it and release what it
+    // loaded before this frame goes away.
+    pthread_join(load_thread, 0);
+    free_data(buffer);
+    free(pixs);
+    free(graypixs);
+    // Do not leave the adversary pointing at the scratch buffer we free.
+    if(anet->delta_gpu == imerror) anet->delta_gpu = 0;
+    cuda_free(imerror);
+#endif
+}
+
+/*
+   void train_lsd2(char *cfgfile, char *weightfile, char *acfgfile, char *aweightfile, int clear)
+   {
+#ifdef GPU
+char *train_images = "/home/pjreddie/data/coco/trainvalno5k.txt";
+char *backup_directory = "/home/pjreddie/backup/";
+srand(time(0));
+char *base = basecfg(cfgfile);
+printf("%s\n", base);
+network net = parse_network_cfg(cfgfile);
+if(weightfile){
+load_weights(&net, weightfile);
+}
+if(clear) *net->seen = 0;
+
+char *abase = basecfg(acfgfile);
+network anet = parse_network_cfg(acfgfile);
+if(aweightfile){
+load_weights(&anet, aweightfile);
+}
+if(clear) *anet->seen = 0;
+
+int i, j, k;
+layer imlayer = {0};
+for (i = 0; i < net->n; ++i) {
+if (net->layers[i].out_c == 3) {
+imlayer = net->layers[i];
+break;
+}
+}
+
+printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
+int imgs = net->batch*net->subdivisions;
+i = *net->seen/imgs;
+data train, buffer;
+
+
+list *plist = get_paths(train_images);
+//int N = plist->size;
+char **paths = (char **)list_to_array(plist);
+
+load_args args = {0};
+args.w = net->w;
+args.h = net->h;
+args.paths = paths;
+args.n = imgs;
+args.m = plist->size;
+args.d = &buffer;
+
+args.min = net->min_crop;
+args.max = net->max_crop;
+args.angle = net->angle;
+args.aspect = net->aspect;
+args.exposure = net->exposure;
+args.saturation = net->saturation;
+args.hue = net->hue;
+args.size = net->w;
+args.type = CLASSIFICATION_DATA;
+args.classes = 1;
+char *ls[1] = {"coco"};
+args.labels = ls;
+
+pthread_t load_thread = load_data_in_thread(args);
+clock_t time;
+
+network_state gstate = {0};
+gstate.index = 0;
+gstate.net = net;
+int x_size = get_network_input_size(net)*net->batch;
+int y_size = 1*net->batch;
+gstate.input = cuda_make_array(0, x_size);
+gstate.truth = 0;
+gstate.delta = 0;
+gstate.train = 1;
+float *X = calloc(x_size, sizeof(float));
+float *y = calloc(y_size, sizeof(float));
+
+network_state astate = {0};
+astate.index = 0;
+astate.net = anet;
+int ay_size = get_network_output_size(anet)*anet->batch;
+astate.input = 0;
+astate.truth = 0;
+astate.delta = 0;
+astate.train = 1;
+
+float *imerror = cuda_make_array(0, imlayer.outputs);
+float *ones_gpu = cuda_make_array(0, ay_size);
+fill_gpu(ay_size, 1, ones_gpu, 1);
+
+float aloss_avg = -1;
+float gloss_avg = -1;
+
+//data generated = copy_data(train);
+
+while (get_current_batch(net) < net->max_batches) {
+    i += 1;
+    time=clock();
+    pthread_join(load_thread, 0);
+    train = buffer;
+    load_thread = load_data_in_thread(args);
+
+    printf("Loaded: %lf seconds\n", sec(clock()-time));
+
+    data generated = copy_data(train);
+    time=clock();
+    float gloss = 0;
+
+    for(j = 0; j < net->subdivisions; ++j){
+        get_next_batch(train, net->batch, j*net->batch, X, y);
+        cuda_push_array(gstate.input, X, x_size);
+        *net->seen += net->batch;
+        forward_network_gpu(net, gstate);
+
+        fill_gpu(imlayer.outputs, 0, imerror, 1);
+        astate.input = imlayer.output_gpu;
+        astate.delta = imerror;
+        astate.truth = ones_gpu;
+        forward_network_gpu(anet, astate);
+        backward_network_gpu(anet, astate);
+
+        scal_gpu(imlayer.outputs, 1, imerror, 1);
+        axpy_gpu(imlayer.outputs, 1, imerror, 1, imlayer.delta_gpu, 1);
+
+        backward_network_gpu(net, gstate);
+
+        printf("features %f\n", cuda_mag_array(imlayer.delta_gpu, imlayer.outputs));
+        printf("realness %f\n", cuda_mag_array(imerror, imlayer.outputs));
+
+        gloss += get_network_cost(net) /(net->subdivisions*net->batch);
+
+        cuda_pull_array(imlayer.output_gpu, imlayer.output, imlayer.outputs*imlayer.batch);
+        for(k = 0; k < net->batch; ++k){
+            int index = j*net->batch + k;
+            copy_cpu(imlayer.outputs, imlayer.output + k*imlayer.outputs, 1, generated.X.vals[index], 1);
+            generated.y.vals[index][0] = 0;
+        }
+    }
+    harmless_update_network_gpu(anet);
+
+    data merge = concat_data(train, generated);
+    randomize_data(merge);
+    float aloss = train_network(anet, merge);
+
+    update_network_gpu(net);
+    update_network_gpu(anet);
+    free_data(merge);
+    free_data(train);
+    free_data(generated);
+    if (aloss_avg < 0) aloss_avg = aloss;
+    aloss_avg = aloss_avg*.9 + aloss*.1;
+    gloss_avg = gloss_avg*.9 + gloss*.1;
+
+    printf("%d: gen: %f, adv: %f | gen_avg: %f, adv_avg: %f, %f rate, %lf seconds, %d images\n", i, gloss, aloss, gloss_avg, aloss_avg, get_current_rate(net), sec(clock()-time), i*imgs);
+    if(i%1000==0){
+        char buff[256];
+        sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i);
+        save_weights(net, buff);
+        sprintf(buff, "%s/%s_%d.weights", backup_directory, abase, i);
+        save_weights(anet, buff);
+    }
+    if(i%100==0){
+        char buff[256];
+        sprintf(buff, "%s/%s.backup", backup_directory, base);
+        save_weights(net, buff);
+        sprintf(buff, "%s/%s.backup", backup_directory, abase);
+        save_weights(anet, buff);
+    }
+}
+char buff[256];
+sprintf(buff, "%s/%s_final.weights", backup_directory, base);
+save_weights(net, buff);
+#endif
+}
+*/
+
+/*
+   void train_lsd(char *cfgfile, char *weightfile, int clear)
+   {
+   char *train_images = "/home/pjreddie/data/coco/trainvalno5k.txt";
+   char *backup_directory = "/home/pjreddie/backup/";
+   srand(time(0));
+   char *base = basecfg(cfgfile);
+   printf("%s\n", base);
+   float avg_loss = -1;
+   network net = parse_network_cfg(cfgfile);
+   if(weightfile){
+   load_weights(&net, weightfile);
+   }
+   if(clear) *net->seen = 0;
+   printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
+   int imgs = net->batch*net->subdivisions;
+   int i = *net->seen/imgs;
+   data train, buffer;
+
+
+   list *plist = get_paths(train_images);
+//int N = plist->size;
+char **paths = (char **)list_to_array(plist);
+
+load_args args = {0};
+args.w = net->w;
+args.h = net->h;
+args.paths = paths;
+args.n = imgs;
+args.m = plist->size;
+args.d = &buffer;
+
+args.min = net->min_crop;
+args.max = net->max_crop;
+args.angle = net->angle;
+args.aspect = net->aspect;
+args.exposure = net->exposure;
+args.saturation = net->saturation;
+args.hue = net->hue;
+args.size = net->w;
+args.type = CLASSIFICATION_DATA;
+args.classes = 1;
+char *ls[1] = {"coco"};
+args.labels = ls;
+
+pthread_t load_thread = load_data_in_thread(args);
+clock_t time;
+//while(i*imgs < N*120){
+while(get_current_batch(net) < net->max_batches){
+i += 1;
+time=clock();
+pthread_join(load_thread, 0);
+train = buffer;
+load_thread = load_data_in_thread(args);
+
+printf("Loaded: %lf seconds\n", sec(clock()-time));
+
+time=clock();
+float loss = train_network(net, train);
+if (avg_loss < 0) avg_loss = loss;
+avg_loss = avg_loss*.9 + loss*.1;
+
+printf("%d: %f, %f avg, %f rate, %lf seconds, %d images\n", i, loss, avg_loss, get_current_rate(net), sec(clock()-time), i*imgs);
+if(i%1000==0){
+char buff[256];
+sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i);
+save_weights(net, buff);
+}
+if(i%100==0){
+char buff[256];
+sprintf(buff, "%s/%s.backup", backup_directory, base);
+save_weights(net, buff);
+}
+free_data(train);
+}
+char buff[256];
+sprintf(buff, "%s/%s_final.weights", backup_directory, base);
+save_weights(net, buff);
+}
+*/
+
+/*
+ * Run a network on an image and show/save the output of its image layer.
+ *
+ * cfg, weights - network to load; its batch size is set to 1.
+ * filename     - image to process.  When NULL, image paths are read from
+ *                stdin in a loop until fgets() fails.
+ * gray         - when non-zero, the cropped input is converted with
+ *                grayscale_image_3c() before prediction.
+ *
+ * The "image layer" is the first layer whose out_c is 3 (layer 0 when no
+ * layer matches).  The result is written with save_image(out, "out").
+ */
+void test_lsd(char *cfg, char *weights, char *filename, int gray)
+{
+    network *net = load_network(cfg, weights, 0);
+    set_batch_network(net, 1);
+    srand(2222222);
+
+    clock_t time;
+    char buff[256];
+    char *input = buff;
+    int i, imlayer = 0;
+
+    for (i = 0; i < net->n; ++i) {
+        if (net->layers[i].out_c == 3) {
+            imlayer = i;
+            printf("%d\n", i);
+            break;
+        }
+    }
+
+    while(1){
+        if(filename){
+            // strncpy does not terminate the destination when the source
+            // fills it, so copy one byte less and terminate explicitly.
+            strncpy(input, filename, sizeof(buff) - 1);
+            input[sizeof(buff) - 1] = 0;
+        }else{
+            printf("Enter Image Path: ");
+            fflush(stdout);
+            input = fgets(input, 256, stdin);
+            if(!input) return;
+            // Cut the line at its trailing newline.
+            strtok(input, "\n");
+        }
+        // Resize so the smaller side matches net->w, then center-crop to
+        // the network's input dimensions.
+        image im = load_image_color(input, 0, 0);
+        image resized = resize_min(im, net->w);
+        image crop = crop_image(resized, (resized.w - net->w)/2, (resized.h - net->h)/2, net->w, net->h);
+        if(gray) grayscale_image_3c(crop);
+
+        float *X = crop.data;
+        time=clock();
+        network_predict(net, X);
+        image out = get_network_image_layer(net, imlayer);
+        //yuv_to_rgb(out);
+        constrain_image(out);
+        printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time));
+        save_image(out, "out");
+        show_image(out, "out", 1);
+        show_image(crop, "crop", 0);
+
+        free_image(im);
+        free_image(resized);
+        free_image(crop);
+        if (filename) break;
+    }
+}
+
+
+/*
+ * Command-line entry point for the "lsd" example.
+ *
+ * argv[2] selects the sub-command, argv[3] is the network cfg and argv[4]
+ * the optional weights file.  argv[5] is used as the image path by the
+ * "test"/"color" commands and as the adversary cfg by the training commands
+ * ("traingan", "trainprog", "traincolor"); argv[6] is the optional adversary
+ * weights file.
+ *
+ * Flags read here: -clear, -display, -b <max batches>, -file <image list>.
+ * An unrecognised sub-command does nothing.
+ */
+void run_lsd(int argc, char **argv)
+{
+    if(argc < 4){
+        fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]);
+        return;
+    }
+
+    int clear = find_arg(argc, argv, "-clear");
+    int display = find_arg(argc, argv, "-display");
+    int batches = find_int_arg(argc, argv, "-b", 0);
+    char *file = find_char_arg(argc, argv, "-file", "/home/pjreddie/data/imagenet/imagenet1k.train.list");
+
+    char *cfg = argv[3];
+    char *weights = (argc > 4) ? argv[4] : 0;
+    char *filename = (argc > 5) ? argv[5] : 0;
+    // Only read argv[5] when it exists; with argc == 4 it lies past the
+    // terminating NULL entry of argv.
+    char *acfg = (argc > 5) ? argv[5] : 0;
+    char *aweights = (argc > 6) ? argv[6] : 0;
+
+    // The training commands load a second network from acfg, so refuse to
+    // start them without it instead of passing a NULL cfg path on.
+    int training = 0==strcmp(argv[2], "traingan")
+                || 0==strcmp(argv[2], "trainprog")
+                || 0==strcmp(argv[2], "traincolor");
+    if(training && !acfg){
+        fprintf(stderr, "usage: %s %s %s [cfg] [weights] [adversary cfg] [adversary weights (optional)]\n", argv[0], argv[1], argv[2]);
+        return;
+    }
+
+    //if(0==strcmp(argv[2], "train")) train_lsd(cfg, weights, clear);
+    //else if(0==strcmp(argv[2], "train2")) train_lsd2(cfg, weights, acfg, aweights, clear);
+    //else if(0==strcmp(argv[2], "traincolor")) train_colorizer(cfg, weights, acfg, aweights, clear);
+    //else if(0==strcmp(argv[2], "train3")) train_lsd3(argv[3], argv[4], argv[5], argv[6], argv[7], argv[8], clear);
+    if(0==strcmp(argv[2], "traingan")) train_dcgan(cfg, weights, acfg, aweights, clear, display, file, batches);
+    else if(0==strcmp(argv[2], "trainprog")) train_prog(cfg, weights, acfg, aweights, clear, display, file, batches);
+    else if(0==strcmp(argv[2], "traincolor")) train_colorizer(cfg, weights, acfg, aweights, clear, display);
+    else if(0==strcmp(argv[2], "gan")) test_dcgan(cfg, weights);
+    else if(0==strcmp(argv[2], "inter")) inter_dcgan(cfg, weights);
+    else if(0==strcmp(argv[2], "test")) test_lsd(cfg, weights, filename, 0);
+    else if(0==strcmp(argv[2], "color")) test_lsd(cfg, weights, filename, 1);
+    /*
+       else if(0==strcmp(argv[2], "valid")) validate_lsd(cfg, weights);
+     */
+}
diff --git a/image.darknet/inst/include/darknet/src/nightmare.c b/image.darknet/inst/include/darknet/examples/nightmare.c
similarity index 58%
rename from image.darknet/inst/include/darknet/src/nightmare.c
rename to image.darknet/inst/include/darknet/examples/nightmare.c
index ec7166c..2978eb6 100644
--- a/image.darknet/inst/include/darknet/src/nightmare.c
+++ b/image.darknet/inst/include/darknet/examples/nightmare.c
@@ -1,12 +1,6 @@
+#include "darknet.h"
 
-#include "network.h"
-#include "parser.h"
-#include "blas.h"
-#include "utils.h"
-
-#ifdef OPENCV
-#include "opencv2/highgui/highgui_c.h"
-#endif
+#include <math.h>
 
 // ./darknet nightmare cfg/extractor.recon.cfg ~/trained/yolo-coco.conv frame6.png -reconstruct -iters 500 -i 3 -lambda .1 -rate .01 -smooth 2
 
@@ -51,31 +45,30 @@ void optimize_picture(network *net, image orig, int max_layer, float scale, floa
 
     image delta = make_image(im.w, im.h, im.c);
 
-    network_state state = {0};
-
 #ifdef GPU
-    state.input = cuda_make_array(im.data, im.w*im.h*im.c);
-    state.delta = cuda_make_array(im.data, im.w*im.h*im.c);
+    net->delta_gpu = cuda_make_array(delta.data, im.w*im.h*im.c);
+    copy_cpu(net->inputs, im.data, 1, net->input, 1);
 
-    forward_network_gpu(*net, state);
-    copy_ongpu(last.outputs, last.output_gpu, 1, last.delta_gpu, 1);
+    forward_network_gpu(net);
+    copy_gpu(last.outputs, last.output_gpu, 1, last.delta_gpu, 1);
 
     cuda_pull_array(last.delta_gpu, last.delta, last.outputs);
     calculate_loss(last.delta, last.delta, last.outputs, thresh);
     cuda_push_array(last.delta_gpu, last.delta, last.outputs);
 
-    backward_network_gpu(*net, state);
+    backward_network_gpu(net);
 
-    cuda_pull_array(state.delta, delta.data, im.w*im.h*im.c);
-    cuda_free(state.input);
-    cuda_free(state.delta);
+    cuda_pull_array(net->delta_gpu, delta.data, im.w*im.h*im.c);
+    cuda_free(net->delta_gpu);
+    net->delta_gpu = 0;
 #else
-    state.input = im.data;
-    state.delta = delta.data;
-    forward_network(*net, state);
+    printf("\nnet: %d %d %d im: %d %d %d\n", net->w, net->h, net->inputs, im.w, im.h, im.c);
+    copy_cpu(net->inputs, im.data, 1, net->input, 1);
+    net->delta = delta.data;
+    forward_network(net);
     copy_cpu(last.outputs, last.output, 1, last.delta, 1);
     calculate_loss(last.output, last.delta, last.outputs, thresh);
-    backward_network(*net, state);
+    backward_network(net);
 #endif
 
     if(flip) flip_image(delta);
@@ -90,6 +83,10 @@ void optimize_picture(network *net, image orig, int max_layer, float scale, floa
      */
 
     //rate = rate / abs_mean(out.data, out.w*out.h*out.c);
+    image gray = make_image(out.w, out.h, out.c);
+    fill_image(gray, .5);
+    axpy_cpu(orig.w*orig.h*orig.c, -1, orig.data, 1, gray.data, 1);
+    axpy_cpu(orig.w*orig.h*orig.c, .1, gray.data, 1, out.data, 1);
 
     if(norm) normalize_array(out.data, out.w*out.h*out.c);
     axpy_cpu(orig.w*orig.h*orig.c, rate, out.data, 1, orig.data, 1);
@@ -135,42 +132,44 @@ void smooth(image recon, image update, float lambda, int num)
     }
 }
 
-void reconstruct_picture(network net, float *features, image recon, image update, float rate, float momentum, float lambda, int smooth_size, int iters)
+void reconstruct_picture(network *net, float *features, image recon, image update, float rate, float momentum, float lambda, int smooth_size, int iters)
 {
     int iter = 0;
     for (iter = 0; iter < iters; ++iter) {
         image delta = make_image(recon.w, recon.h, recon.c);
 
-        network_state state = {0};
 #ifdef GPU
-        state.input = cuda_make_array(recon.data, recon.w*recon.h*recon.c);
-        state.delta = cuda_make_array(delta.data, delta.w*delta.h*delta.c);
-        state.truth = cuda_make_array(features, get_network_output_size(net));
+        layer l = get_network_output_layer(net);
+        cuda_push_array(net->input_gpu, recon.data, recon.w*recon.h*recon.c);
+        //cuda_push_array(net->truth_gpu, features, net->truths);
+        net->delta_gpu = cuda_make_array(delta.data, delta.w*delta.h*delta.c);
 
-        forward_network_gpu(net, state);
-        backward_network_gpu(net, state);
+        forward_network_gpu(net);
+        cuda_push_array(l.delta_gpu, features, l.outputs);
+        axpy_gpu(l.outputs, -1, l.output_gpu, 1, l.delta_gpu, 1);
+        backward_network_gpu(net);
 
-        cuda_pull_array(state.delta, delta.data, delta.w*delta.h*delta.c);
+        cuda_pull_array(net->delta_gpu, delta.data, delta.w*delta.h*delta.c);
 
-        cuda_free(state.input);
-        cuda_free(state.delta);
-        cuda_free(state.truth);
+        cuda_free(net->delta_gpu);
 #else
-        state.input = recon.data;
-        state.delta = delta.data;
-        state.truth = features;
+        net->input = recon.data;
+        net->delta = delta.data;
+        net->truth = features;
 
-        forward_network(net, state);
-        backward_network(net, state);
+        forward_network(net);
+        backward_network(net);
 #endif
 
+        //normalize_array(delta.data, delta.w*delta.h*delta.c);
         axpy_cpu(recon.w*recon.h*recon.c, 1, delta.data, 1, update.data, 1);
-        smooth(recon, update, lambda, smooth_size);
+        //smooth(recon, update, lambda, smooth_size);
 
         axpy_cpu(recon.w*recon.h*recon.c, rate, update.data, 1, recon.data, 1);
         scal_cpu(recon.w*recon.h*recon.c, momentum, update.data, 1);
 
-        //float mag = mag_array(recon.data, recon.w*recon.h*recon.c);
+        float mag = mag_array(delta.data, recon.w*recon.h*recon.c);
+        printf("mag: %f\n", mag);
         //scal_cpu(recon.w*recon.h*recon.c, 600/mag, recon.data, 1);
 
         constrain_image(recon);
@@ -178,6 +177,113 @@ void reconstruct_picture(network net, float *features, image recon, image update
     }
 }
 
+/*
+void run_lsd(int argc, char **argv)
+{
+    srand(0);
+    if(argc < 3){
+        fprintf(stderr, "usage: %s %s [cfg] [weights] [image] [options! (optional)]\n", argv[0], argv[1]);
+        return;
+    }
+
+    char *cfg = argv[2];
+    char *weights = argv[3];
+    char *input = argv[4];
+
+    int norm = find_int_arg(argc, argv, "-norm", 1);
+    int rounds = find_int_arg(argc, argv, "-rounds", 1);
+    int iters = find_int_arg(argc, argv, "-iters", 10);
+    float rate = find_float_arg(argc, argv, "-rate", .04);
+    float momentum = find_float_arg(argc, argv, "-momentum", .9);
+    float lambda = find_float_arg(argc, argv, "-lambda", .01);
+    char *prefix = find_char_arg(argc, argv, "-prefix", 0);
+    int reconstruct = find_arg(argc, argv, "-reconstruct");
+    int smooth_size = find_int_arg(argc, argv, "-smooth", 1);
+
+    network net = parse_network_cfg(cfg);
+    load_weights(&net, weights);
+    char *cfgbase = basecfg(cfg);
+    char *imbase = basecfg(input);
+
+    set_batch_network(&net, 1);
+    image im = load_image_color(input, 0, 0);
+
+    float *features = 0;
+    image update;
+    if (reconstruct){
+        im = letterbox_image(im, net->w, net->h);
+
+        int zz = 0;
+        network_predict(net, im.data);
+        image out_im = get_network_image(net);
+        image crop = crop_image(out_im, zz, zz, out_im.w-2*zz, out_im.h-2*zz);
+        //flip_image(crop);
+        image f_im = resize_image(crop, out_im.w, out_im.h);
+        free_image(crop);
+        printf("%d features\n", out_im.w*out_im.h*out_im.c);
+
+
+        im = resize_image(im, im.w, im.h);
+        f_im = resize_image(f_im, f_im.w, f_im.h);
+        features = f_im.data;
+
+        int i;
+        for(i = 0; i < 14*14*512; ++i){
+            features[i] += rand_uniform(-.19, .19);
+        }
+
+        free_image(im);
+        im = make_random_image(im.w, im.h, im.c);
+        update = make_image(im.w, im.h, im.c);
+
+    }
+
+    int e;
+    int n;
+    for(e = 0; e < rounds; ++e){
+        fprintf(stderr, "Iteration: ");
+        fflush(stderr);
+        for(n = 0; n < iters; ++n){  
+            fprintf(stderr, "%d, ", n);
+            fflush(stderr);
+            if(reconstruct){
+                reconstruct_picture(net, features, im, update, rate, momentum, lambda, smooth_size, 1);
+                //if ((n+1)%30 == 0) rate *= .5;
+                show_image(im, "reconstruction");
+#ifdef OPENCV
+                cvWaitKey(10);
+#endif
+            }else{
+                int layer = max_layer + rand()%range - range/2;
+                int octave = rand()%octaves;
+                optimize_picture(&net, im, layer, 1/pow(1.33333333, octave), rate, thresh, norm);
+            }
+        }
+        fprintf(stderr, "done\n");
+        char buff[256];
+        if (prefix){
+            sprintf(buff, "%s/%s_%s_%d_%06d",prefix, imbase, cfgbase, max_layer, e);
+        }else{
+            sprintf(buff, "%s_%s_%d_%06d",imbase, cfgbase, max_layer, e);
+        }
+        printf("%d %s\n", e, buff);
+        save_image(im, buff);
+        //show_image(im, buff);
+        //cvWaitKey(0);
+
+        if(rotate){
+            image rot = rotate_image(im, rotate);
+            free_image(im);
+            im = rot;
+        }
+        image crop = crop_image(im, im.w * (1. - zoom)/2., im.h * (1.-zoom)/2., im.w*zoom, im.h*zoom);
+        image resized = resize_image(crop, im.w, im.h);
+        free_image(im);
+        free_image(crop);
+        im = resized;
+    }
+}
+*/
 
 void run_nightmare(int argc, char **argv)
 {
@@ -207,12 +313,11 @@ void run_nightmare(int argc, char **argv)
     int reconstruct = find_arg(argc, argv, "-reconstruct");
     int smooth_size = find_int_arg(argc, argv, "-smooth", 1);
 
-    network net = parse_network_cfg(cfg);
-    load_weights(&net, weights);
+    network *net = load_network(cfg, weights, 0);
     char *cfgbase = basecfg(cfg);
     char *imbase = basecfg(input);
 
-    set_batch_network(&net, 1);
+    set_batch_network(net, 1);
     image im = load_image_color(input, 0, 0);
     if(0){
         float scale = 1;
@@ -224,35 +329,40 @@ void run_nightmare(int argc, char **argv)
         free_image(im);
         im = resized;
     }
+    //im = letterbox_image(im, net->w, net->h);
 
     float *features = 0;
     image update;
     if (reconstruct){
-        resize_network(&net, im.w, im.h);
+        net->n = max_layer;
+        im = letterbox_image(im, net->w, net->h);
+        //resize_network(&net, im.w, im.h);
 
-        int zz = 0;
         network_predict(net, im.data);
-        image out_im = get_network_image(net);
-        image crop = crop_image(out_im, zz, zz, out_im.w-2*zz, out_im.h-2*zz);
+        if(net->layers[net->n-1].type == REGION){
+            printf("region!\n");
+            zero_objectness(net->layers[net->n-1]);
+        }
+        image out_im = copy_image(get_network_image(net));
+        /*
+           image crop = crop_image(out_im, zz, zz, out_im.w-2*zz, out_im.h-2*zz);
         //flip_image(crop);
         image f_im = resize_image(crop, out_im.w, out_im.h);
         free_image(crop);
+         */
         printf("%d features\n", out_im.w*out_im.h*out_im.c);
 
+        features = out_im.data;
 
-        im = resize_image(im, im.w, im.h);
-        f_im = resize_image(f_im, f_im.w, f_im.h);
-        features = f_im.data;
-
+        /*
         int i;
-        for(i = 0; i < 14*14*512; ++i){
-            features[i] += rand_uniform(-.19, .19);
+           for(i = 0; i < 14*14*512; ++i){
+        //features[i] += rand_uniform(-.19, .19);
         }
-
         free_image(im);
         im = make_random_image(im.w, im.h, im.c);
+         */
         update = make_image(im.w, im.h, im.c);
-
     }
 
     int e;
@@ -266,14 +376,11 @@ void run_nightmare(int argc, char **argv)
             if(reconstruct){
                 reconstruct_picture(net, features, im, update, rate, momentum, lambda, smooth_size, 1);
                 //if ((n+1)%30 == 0) rate *= .5;
-                show_image(im, "reconstruction");
-#ifdef OPENCV
-                cvWaitKey(10);
-#endif
+                show_image(im, "reconstruction", 10);
             }else{
                 int layer = max_layer + rand()%range - range/2;
                 int octave = rand()%octaves;
-                optimize_picture(&net, im, layer, 1/pow(1.33333333, octave), rate, thresh, norm);
+                optimize_picture(net, im, layer, 1/pow(1.33333333, octave), rate, thresh, norm);
             }
         }
         fprintf(stderr, "done\n");
@@ -290,8 +397,7 @@ void run_nightmare(int argc, char **argv)
         }
         printf("%d %s\n", e, buff);
         save_image(im, buff);
-        //show_image(im, buff);
-        //cvWaitKey(0);
+        //show_image(im, buff, 0);
 
         if(rotate){
             image rot = rotate_image(im, rotate);
diff --git a/image.darknet/inst/include/darknet/examples/regressor.c b/image.darknet/inst/include/darknet/examples/regressor.c
new file mode 100644
index 0000000..20cec0f
--- /dev/null
+++ b/image.darknet/inst/include/darknet/examples/regressor.c
@@ -0,0 +1,240 @@
+#include "darknet.h"
+#include <sys/time.h>
+#include <assert.h>
+
+void train_regressor(char *datacfg, char *cfgfile, char *weightfile, int *gpus, int ngpus, int clear)
+{   // Train cfgfile's network on the "train" list from datacfg; saves weights per epoch, every 100 batches and at the end.
+    int i;
+
+    float avg_loss = -1;   // -1 means "no loss recorded yet" (tested inside the loop)
+    char *base = basecfg(cfgfile);
+    printf("%s\n", base);
+    printf("%d\n", ngpus);
+    network **nets = calloc(ngpus, sizeof(network*));   // ngpus networks, all loaded from the same cfg/weights
+
+    srand(time(0));
+    int seed = rand();
+    for(i = 0; i < ngpus; ++i){
+        srand(seed);   // reseed identically before each load_network call
+#ifdef GPU
+        cuda_set_device(gpus[i]);
+#endif
+        nets[i] = load_network(cfgfile, weightfile, clear);
+        nets[i]->learning_rate *= ngpus;
+    }
+    srand(time(0));
+    network *net = nets[0];   // nets[0] drives the loop and is the network that gets saved
+
+    int imgs = net->batch * net->subdivisions * ngpus;   // images requested per load_data call
+
+    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
+    list *options = read_data_cfg(datacfg);
+
+    char *backup_directory = option_find_str(options, "backup", "/backup/");
+    char *train_list = option_find_str(options, "train", "data/train.list");
+    int classes = option_find_int(options, "classes", 1);
+
+    list *plist = get_paths(train_list);
+    char **paths = (char **)list_to_array(plist);
+    printf("%d\n", plist->size);
+    int N = plist->size;   // NOTE(review): N is a divisor below; an empty train list would divide by zero
+    clock_t time;
+
+    load_args args = {0};
+    args.w = net->w;
+    args.h = net->h;
+    args.threads = 32;
+    args.classes = classes;
+
+    args.min = net->min_ratio*net->w;
+    args.max = net->max_ratio*net->w;
+    args.angle = net->angle;
+    args.aspect = net->aspect;
+    args.exposure = net->exposure;
+    args.saturation = net->saturation;
+    args.hue = net->hue;
+    args.size = net->w;
+
+    args.paths = paths;
+    args.n = imgs;
+    args.m = N;
+    args.type = REGRESSION_DATA;
+
+    data train;
+    data buffer;
+    pthread_t load_thread;
+    args.d = &buffer;   // the loader thread fills `buffer`; it is copied into `train` after each join
+    load_thread = load_data(args);
+
+    int epoch = (*net->seen)/N;
+    while(get_current_batch(net) < net->max_batches || net->max_batches == 0){   // max_batches == 0 never terminates
+        time=clock();
+
+        pthread_join(load_thread, 0);
+        train = buffer;
+        load_thread = load_data(args);   // start loading the next batch before training on this one
+
+        printf("Loaded: %lf seconds\n", sec(clock()-time));
+        time=clock();
+
+        float loss = 0;
+#ifdef GPU
+        if(ngpus == 1){
+            loss = train_network(net, train);
+        } else {
+            loss = train_networks(nets, ngpus, train, 4);
+        }
+#else
+        loss = train_network(net, train);
+#endif
+        if(avg_loss == -1) avg_loss = loss;   // first iteration: seed the running average
+        avg_loss = avg_loss*.9 + loss*.1;   // exponential moving average of the loss
+        printf("%ld, %.3f: %f, %f avg, %f rate, %lf seconds, %ld images\n", get_current_batch(net), (float)(*net->seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net->seen);
+        free_data(train);
+        if(*net->seen/N > epoch){   // crossed into a new epoch: write a numbered snapshot
+            epoch = *net->seen/N;
+            char buff[256];
+            snprintf(buff, sizeof(buff), "%s/%s_%d.weights",backup_directory,base, epoch);   // bounded: backup dir comes from the data cfg
+            save_weights(net, buff);
+        }
+        if(get_current_batch(net)%100 == 0){   // rolling backup every 100 batches
+            char buff[256];
+            snprintf(buff, sizeof(buff), "%s/%s.backup",backup_directory,base);   // truncates instead of overflowing buff
+            save_weights(net, buff);
+        }
+    }
+    char buff[256];
+    snprintf(buff, sizeof(buff), "%s/%s.weights", backup_directory, base);   // final weights; path truncated if over 255 chars
+    save_weights(net, buff);
+
+    free_network(net);
+    free_ptrs((void**)paths, plist->size);
+    free_list(plist);
+    free(base);
+}
+
+void predict_regressor(char *cfgfile, char *weightfile, char *filename)
+{   // Print predictions[0] for `filename`, or, when it is NULL, for each image path read from stdin until EOF.
+    network *net = load_network(cfgfile, weightfile, 0);
+    set_batch_network(net, 1);
+    srand(2222222);
+
+    clock_t time;
+    char buff[256];
+    char *input = buff;
+    while(1){
+        if(filename){
+            snprintf(input, sizeof(buff), "%s", filename);   // always NUL-terminated; strncpy was not for names of 256+ chars
+        }else{
+            printf("Enter Image Path: ");
+            fflush(stdout);
+            input = fgets(input, 256, stdin);
+            if(!input) return;   // EOF or read error ends the interactive loop
+            strtok(input, "\n");   // cut the line at the first newline
+        }
+        image im = load_image_color(input, 0, 0);
+        image sized = letterbox_image(im, net->w, net->h);   // fit the image to the network input size
+
+        float *X = sized.data;
+        time=clock();
+        float *predictions = network_predict(net, X);
+        printf("Predicted: %f\n", predictions[0]);
+        printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time));
+        free_image(im);
+        free_image(sized);
+        if (filename) break;   // single-shot mode: one prediction, then return
+    }
+}
+
+
+void demo_regressor(char *datacfg, char *cfgfile, char *weightfile, int cam_index, const char *filename)
+{   // Live demo: prints per-class predictions for each video frame. The body is compiled only when OPENCV is defined.
+#ifdef OPENCV
+    printf("Regressor Demo\n");
+    network *net = load_network(cfgfile, weightfile, 0);
+    set_batch_network(net, 1);
+
+    srand(2222222);
+    list *options = read_data_cfg(datacfg);
+    int classes = option_find_int(options, "classes", 1);
+    char *name_list = option_find_str(options, "names", 0);   // NOTE(review): no default; a missing "names" option passes 0 to get_labels
+    char **names = get_labels(name_list);
+
+    void * cap = open_video_stream(filename, cam_index, 0,0,0);
+    if(!cap) error("Couldn't connect to webcam.\n");
+    float fps = 0;
+
+    while(1){   // runs until the process is stopped; there is no exit condition
+        struct timeval tval_before, tval_after, tval_result;
+        gettimeofday(&tval_before, NULL);
+
+        image in = get_image_from_stream(cap);
+        image crop = center_crop_image(in, net->w, net->h);
+        grayscale_image_3c(crop);
+
+        float *predictions = network_predict(net, crop.data);
+
+        printf("\033[2J");   // ANSI: clear the screen
+        printf("\033[1;1H");   // ANSI: move the cursor to row 1, column 1
+        printf("\nFPS:%.0f\n",fps);
+
+        int i;
+        for(i = 0; i < classes; ++i){
+            printf("%s: %f\n", names[i], predictions[i]);
+        }
+
+        show_image(crop, "Regressor", 10);
+        free_image(in);
+        free_image(crop);
+
+        gettimeofday(&tval_after, NULL);
+        timersub(&tval_after, &tval_before, &tval_result);
+        float curr = 1000000.f/(tval_result.tv_sec*1000000L + (long int)tval_result.tv_usec);   // count whole seconds too, not just tv_usec
+        fps = .9*fps + .1*curr;   // smoothed frame rate
+    }
+#endif
+}
+
+
+void run_regressor(int argc, char **argv)
+{   // Command-line entry: parses -gpus / -c / -clear, then dispatches on argv[2] ("test", "train" or "demo").
+    if(argc < 4){
+        fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]);
+        return;
+    }
+
+    char *gpu_list = find_char_arg(argc, argv, "-gpus", 0);
+    int *gpus = 0;
+    int gpu = 0;
+    int ngpus = 0;
+    if(gpu_list){
+        printf("%s\n", gpu_list);
+        int len = strlen(gpu_list);
+        ngpus = 1;
+        int i;
+        for(i = 0; i < len; ++i){
+            if (gpu_list[i] == ',') ++ngpus;   // N commas -> N+1 ids
+        }
+        gpus = calloc(ngpus, sizeof(int));
+        for(i = 0; i < ngpus; ++i){
+            gpus[i] = atoi(gpu_list);   // atoi stops at the comma
+            if(i+1 < ngpus) gpu_list = strchr(gpu_list, ',')+1;   // no comma follows the last id, so do not advance from a NULL result
+        }
+    } else {
+        gpu = gpu_index;
+        gpus = &gpu;
+        ngpus = 1;
+    }
+
+    int cam_index = find_int_arg(argc, argv, "-c", 0);
+    int clear = find_arg(argc, argv, "-clear");
+    char *data = argv[3];
+    char *cfg = argv[4];   // argv[argc] is NULL, so this is NULL when only four arguments are given
+    char *weights = (argc > 5) ? argv[5] : 0;
+    char *filename = (argc > 6) ? argv[6]: 0;
+    if(0==strcmp(argv[2], "test")) predict_regressor(data, cfg, weights);   // predict_regressor takes (cfg, weights, image): argv[3..5] here
+    else if(0==strcmp(argv[2], "train")) train_regressor(data, cfg, weights, gpus, ngpus, clear);
+    else if(0==strcmp(argv[2], "demo")) demo_regressor(data, cfg, weights, cam_index, filename);
+}
+
+
diff --git a/image.darknet/src/rnn.c b/image.darknet/inst/include/darknet/examples/rnn.c
similarity index 72%
rename from image.darknet/src/rnn.c
rename to image.darknet/inst/include/darknet/examples/rnn.c
index eca6f55..5d49eaa 100644
--- a/image.darknet/src/rnn.c
+++ b/image.darknet/inst/include/darknet/examples/rnn.c
@@ -1,18 +1,26 @@
-#include "network.h"
-#include "cost_layer.h"
-#include "utils.h"
-#include "blas.h"
-#include "parser.h"
+#include "darknet.h"
 
-#ifdef OPENCV
-#include "opencv2/highgui/highgui_c.h"
-#endif
+#include <math.h>
 
 typedef struct {
     float *x;
     float *y;
 } float_pair;
 
+unsigned char **load_files(char *filename, int *n)
+{   // Reads every file listed in `filename` via read_file; returns the array of contents and stores its length in *n.
+    list *plist = get_paths(filename);
+    int count = plist->size;
+    unsigned char **contents = calloc(count, sizeof(char *));
+    node *cur = plist->front;
+    int k = 0;
+    for(; k < count; ++k, cur = cur->next){   // walk the path list in step with the output index
+        contents[k] = read_file((char *)cur->val);
+    }
+    *n = count;
+    return contents;
+}
+
 int *read_tokenized_data(char *filename, size_t *read)
 {
     size_t size = 512;
@@ -49,6 +57,7 @@ char **read_tokens(char *filename, size_t *read)
             size = size*2;
             d = realloc(d, size*sizeof(char *));
         }
+        if(0==strcmp(line, "<NEWLINE>")) line = "\n";
         d[count-1] = line;
     }
     fclose(fp);
@@ -57,6 +66,7 @@ char **read_tokens(char *filename, size_t *read)
     return d;
 }
 
+
 float_pair get_rnn_token_data(int *tokens, size_t *offsets, int characters, size_t len, int batch, int steps)
 {
     float *x = calloc(batch * steps * characters, sizeof(float));
@@ -83,6 +93,37 @@ float_pair get_rnn_token_data(int *tokens, size_t *offsets, int characters, size
     return p;
 }
 
+float_pair get_seq2seq_data(char **source, char **dest, int n, int characters, size_t len, int batch, int steps)
+{   // Builds one-hot x (from source) and y (from dest) for `batch` randomly chosen pairs, `steps` bytes each; `len` is unused.
+    int i,j;
+    float *x = calloc(batch * steps * characters, sizeof(float));
+    float *y = calloc(batch * steps * characters, sizeof(float));
+    for(i = 0; i < batch; ++i){
+        int index = rand()%n;   // pick a random source/dest pair for this batch slot
+        //int slen = strlen(source[index]);
+        //int dlen = strlen(dest[index]);
+        for(j = 0; j < steps; ++j){
+            unsigned char curr = source[index][j];
+            unsigned char next = dest[index][j];
+
+            if(curr == 0 || next == 0 || curr >= characters || next >= characters){   // check before indexing; assumes error() does not return
+                /*text[(index+j+2)%len] = 0;
+                printf("%ld %d %d %d %d\n", index, j, len, (int)text[index+j], (int)text[index+j+1]);
+                printf("%s", text+index);
+                */
+                error("Bad char");
+            }
+
+            x[(j*batch + i)*characters + curr] = 1;   // layout: step-major, then batch, then one-hot slot
+            y[(j*batch + i)*characters + next] = 1;
+        }
+    }
+    float_pair p;
+    p.x = x;
+    p.y = y;
+    return p;
+}
+
 float_pair get_rnn_data(unsigned char *text, size_t *offsets, int characters, size_t len, int batch, int steps)
 {
     float *x = calloc(batch * steps * characters, sizeof(float));
@@ -113,19 +154,6 @@ float_pair get_rnn_data(unsigned char *text, size_t *offsets, int characters, si
     return p;
 }
 
-void reset_rnn_state(network net, int b)
-{
-    int i;
-    for (i = 0; i < net.n; ++i) {
-        #ifdef GPU
-        layer l = net.layers[i];
-        if(l.state_gpu){
-            fill_ongpu(l.outputs, 0, l.state_gpu + l.outputs*b, 1);
-        }
-        #endif
-    }
-}
-
 void train_char_rnn(char *cfgfile, char *weightfile, char *filename, int clear, int tokenized)
 {
     srand(time(0));
@@ -135,32 +163,22 @@ void train_char_rnn(char *cfgfile, char *weightfile, char *filename, int clear,
     if(tokenized){
         tokens = read_tokenized_data(filename, &size);
     } else {
-        FILE *fp = fopen(filename, "rb");
-
-        fseek(fp, 0, SEEK_END); 
-        size = ftell(fp);
-        fseek(fp, 0, SEEK_SET); 
-
-        text = calloc(size+1, sizeof(char));
-        fread(text, 1, size, fp);
-        fclose(fp);
+        text = read_file(filename);
+        size = strlen((const char*)text);
     }
 
     char *backup_directory = "/home/pjreddie/backup/";
     char *base = basecfg(cfgfile);
     fprintf(stderr, "%s\n", base);
     float avg_loss = -1;
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
+    network *net = load_network(cfgfile, weightfile, clear);
 
-    int inputs = get_network_input_size(net);
-    fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
-    int batch = net.batch;
-    int steps = net.time_steps;
-    if(clear) *net.seen = 0;
-    int i = (*net.seen)/net.batch;
+    int inputs = net->inputs;
+    fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g, Inputs: %d %d %d\n", net->learning_rate, net->momentum, net->decay, inputs, net->batch, net->time_steps);
+    int batch = net->batch;
+    int steps = net->time_steps;
+    if(clear) *net->seen = 0;
+    int i = (*net->seen)/net->batch;
 
     int streams = batch/steps;
     size_t *offsets = calloc(streams, sizeof(size_t));
@@ -170,7 +188,7 @@ void train_char_rnn(char *cfgfile, char *weightfile, char *filename, int clear,
     }
 
     clock_t time;
-    while(get_current_batch(net) < net.max_batches){
+    while(get_current_batch(net) < net->max_batches){
         i += 1;
         time=clock();
         float_pair p;
@@ -180,30 +198,32 @@ void train_char_rnn(char *cfgfile, char *weightfile, char *filename, int clear,
             p = get_rnn_data(text, offsets, inputs, size, streams, steps);
         }
 
-        float loss = train_network_datum(net, p.x, p.y) / (batch);
+        copy_cpu(net->inputs*net->batch, p.x, 1, net->input, 1);
+        copy_cpu(net->truths*net->batch, p.y, 1, net->truth, 1);
+        float loss = train_network_datum(net) / (batch);
         free(p.x);
         free(p.y);
         if (avg_loss < 0) avg_loss = loss;
         avg_loss = avg_loss*.9 + loss*.1;
 
-        int chars = get_current_batch(net)*batch;
+        size_t chars = get_current_batch(net)*batch;
         fprintf(stderr, "%d: %f, %f avg, %f rate, %lf seconds, %f epochs\n", i, loss, avg_loss, get_current_rate(net), sec(clock()-time), (float) chars/size);
 
         for(j = 0; j < streams; ++j){
             //printf("%d\n", j);
-            if(rand()%10 == 0){
+            if(rand()%64 == 0){
                 //fprintf(stderr, "Reset\n");
                 offsets[j] = rand_size_t()%size;
-                reset_rnn_state(net, j);
+                reset_network_state(net, j);
             }
         }
 
-        if(i%1000==0){
+        if(i%10000==0){
             char buff[256];
             sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i);
             save_weights(net, buff);
         }
-        if(i%10==0){
+        if(i%100==0){
             char buff[256];
             sprintf(buff, "%s/%s.backup", backup_directory, base);
             save_weights(net, buff);
@@ -234,14 +254,11 @@ void test_char_rnn(char *cfgfile, char *weightfile, int num, char *seed, float t
     char *base = basecfg(cfgfile);
     fprintf(stderr, "%s\n", base);
 
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    int inputs = get_network_input_size(net);
+    network *net = load_network(cfgfile, weightfile, 0);
+    int inputs = net->inputs;
 
     int i, j;
-    for(i = 0; i < net.n; ++i) net.layers[i].temperature = temp;
+    for(i = 0; i < net->n; ++i) net->layers[i].temperature = temp;
     int c = 0;
     int len = strlen(seed);
     float *input = calloc(inputs, sizeof(float));
@@ -279,7 +296,7 @@ void test_char_rnn(char *cfgfile, char *weightfile, int num, char *seed, float t
     printf("\n");
 }
 
-void test_tactic_rnn(char *cfgfile, char *weightfile, int num, float temp, int rseed, char *token_file)
+void test_tactic_rnn_multi(char *cfgfile, char *weightfile, int num, float temp, int rseed, char *token_file)
 {
     char **tokens = 0;
     if(token_file){
@@ -291,14 +308,56 @@ void test_tactic_rnn(char *cfgfile, char *weightfile, int num, float temp, int r
     char *base = basecfg(cfgfile);
     fprintf(stderr, "%s\n", base);
 
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
+    network *net = load_network(cfgfile, weightfile, 0);
+    int inputs = net->inputs;
+
+    int i, j;
+    for(i = 0; i < net->n; ++i) net->layers[i].temperature = temp;
+    int c = 0;
+    float *input = calloc(inputs, sizeof(float));
+    float *out = 0;
+
+    while(1){
+        reset_network_state(net, 0);
+        while((c = getc(stdin)) != EOF && c != 0){
+            input[c] = 1;
+            out = network_predict(net, input);
+            input[c] = 0;
+        }
+        for(i = 0; i < num; ++i){
+            for(j = 0; j < inputs; ++j){
+                if (out[j] < .0001) out[j] = 0;
+            }
+            int next = sample_array(out, inputs);
+            if(c == '.' && next == '\n') break;
+            c = next;
+            print_symbol(c, tokens);
+
+            input[c] = 1;
+            out = network_predict(net, input);
+            input[c] = 0;
+        }
+        printf("\n");
+    }
+}
+
+void test_tactic_rnn(char *cfgfile, char *weightfile, int num, float temp, int rseed, char *token_file)
+{
+    char **tokens = 0;
+    if(token_file){
+        size_t n;
+        tokens = read_tokens(token_file, &n);
     }
-    int inputs = get_network_input_size(net);
+
+    srand(rseed);
+    char *base = basecfg(cfgfile);
+    fprintf(stderr, "%s\n", base);
+
+    network *net = load_network(cfgfile, weightfile, 0);
+    int inputs = net->inputs;
 
     int i, j;
-    for(i = 0; i < net.n; ++i) net.layers[i].temperature = temp;
+    for(i = 0; i < net->n; ++i) net->layers[i].temperature = temp;
     int c = 0;
     float *input = calloc(inputs, sizeof(float));
     float *out = 0;
@@ -329,11 +388,8 @@ void valid_tactic_rnn(char *cfgfile, char *weightfile, char *seed)
     char *base = basecfg(cfgfile);
     fprintf(stderr, "%s\n", base);
 
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    int inputs = get_network_input_size(net);
+    network *net = load_network(cfgfile, weightfile, 0);
+    int inputs = net->inputs;
 
     int count = 0;
     int words = 1;
@@ -381,11 +437,8 @@ void valid_char_rnn(char *cfgfile, char *weightfile, char *seed)
     char *base = basecfg(cfgfile);
     fprintf(stderr, "%s\n", base);
 
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    int inputs = get_network_input_size(net);
+    network *net = load_network(cfgfile, weightfile, 0);
+    int inputs = net->inputs;
 
     int count = 0;
     int words = 1;
@@ -413,7 +466,7 @@ void valid_char_rnn(char *cfgfile, char *weightfile, char *seed)
         input[c] = 0;
         sum += log(out[next])/log2;
         c = next;
-        printf("%d Perplexity: %4.4f    Word Perplexity: %4.4f\n", count, pow(2, -sum/count), pow(2, -sum/words));
+        printf("%d BPC: %4.4f   Perplexity: %4.4f    Word Perplexity: %4.4f\n", count, -sum/count, pow(2, -sum/count), pow(2, -sum/words));
     }
 }
 
@@ -422,11 +475,8 @@ void vec_char_rnn(char *cfgfile, char *weightfile, char *seed)
     char *base = basecfg(cfgfile);
     fprintf(stderr, "%s\n", base);
 
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    int inputs = get_network_input_size(net);
+    network *net = load_network(cfgfile, weightfile, 0);
+    int inputs = net->inputs;
 
     int c;
     int seed_len = strlen(seed);
@@ -434,7 +484,7 @@ void vec_char_rnn(char *cfgfile, char *weightfile, char *seed)
     int i;
     char *line;
     while((line=fgetl(stdin)) != 0){
-        reset_rnn_state(net, 0);
+        reset_network_state(net, 0);
         for(i = 0; i < seed_len; ++i){
             c = seed[i];
             input[(int)c] = 1;
@@ -454,7 +504,7 @@ void vec_char_rnn(char *cfgfile, char *weightfile, char *seed)
         network_predict(net, input);
         input[(int)c] = 0;
 
-        layer l = net.layers[0];
+        layer l = net->layers[0];
         #ifdef GPU
         cuda_pull_array(l.output_gpu, l.output, l.outputs);
         #endif
diff --git a/image.darknet/inst/include/darknet/src/rnn_vid.c b/image.darknet/inst/include/darknet/examples/rnn_vid.c
similarity index 96%
rename from image.darknet/inst/include/darknet/src/rnn_vid.c
rename to image.darknet/inst/include/darknet/examples/rnn_vid.c
index 36912d6..e887923 100644
--- a/image.darknet/inst/include/darknet/src/rnn_vid.c
+++ b/image.darknet/inst/include/darknet/examples/rnn_vid.c
@@ -1,11 +1,6 @@
-#include "network.h"
-#include "cost_layer.h"
-#include "utils.h"
-#include "parser.h"
-#include "blas.h"
+#include "darknet.h"
 
 #ifdef OPENCV
-#include "opencv2/highgui/highgui_c.h"
 image get_image_from_stream(CvCapture *cap);
 image ipl_to_image(IplImage* src);
 
@@ -104,7 +99,9 @@ void train_vid_rnn(char *cfgfile, char *weightfile)
         time=clock();
         float_pair p = get_rnn_vid_data(extractor, paths, N, batch, steps);
 
-        float loss = train_network_datum(net, p.x, p.y) / (net.batch);
+        copy_cpu(net.inputs*net.batch, p.x, 1, net.input, 1);
+        copy_cpu(net.truths*net.batch, p.y, 1, net.truth, 1);
+        float loss = train_network_datum(net) / (net.batch);
 
 
         free(p.x);
diff --git a/image.darknet/inst/include/darknet/examples/segmenter.c b/image.darknet/inst/include/darknet/examples/segmenter.c
new file mode 100644
index 0000000..2e7cea0
--- /dev/null
+++ b/image.darknet/inst/include/darknet/examples/segmenter.c
@@ -0,0 +1,255 @@
+#include "darknet.h"
+#include <sys/time.h>
+#include <assert.h>
+
+void train_segmenter(char *datacfg, char *cfgfile, char *weightfile, int *gpus, int ngpus, int clear, int display)
+{   // Train cfgfile's network on the "train" list from datacfg; saves weights per epoch, every 100 batches and at the end.
+    int i;
+
+    float avg_loss = -1;   // -1 means "no loss recorded yet" (tested inside the loop)
+    char *base = basecfg(cfgfile);
+    printf("%s\n", base);
+    printf("%d\n", ngpus);
+    network **nets = calloc(ngpus, sizeof(network*));   // ngpus networks, all loaded from the same cfg/weights
+
+    srand(time(0));
+    int seed = rand();
+    for(i = 0; i < ngpus; ++i){
+        srand(seed);   // reseed identically before each load_network call
+#ifdef GPU
+        cuda_set_device(gpus[i]);
+#endif
+        nets[i] = load_network(cfgfile, weightfile, clear);
+        nets[i]->learning_rate *= ngpus;
+    }
+    srand(time(0));
+    network *net = nets[0];   // nets[0] drives the loop and is the network that gets saved
+    image pred = get_network_image(net);
+
+    int div = net->w/pred.w;   // integer ratio of network input width to output width
+    assert(pred.w * div == net->w);   // the ratio must be exact in both dimensions
+    assert(pred.h * div == net->h);
+
+    int imgs = net->batch * net->subdivisions * ngpus;   // images requested per load_data call
+
+    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
+    list *options = read_data_cfg(datacfg);
+
+    char *backup_directory = option_find_str(options, "backup", "/backup/");
+    char *train_list = option_find_str(options, "train", "data/train.list");
+
+    list *plist = get_paths(train_list);
+    char **paths = (char **)list_to_array(plist);
+    printf("%d\n", plist->size);
+    int N = plist->size;   // NOTE(review): N is a divisor below; an empty train list would divide by zero
+
+    load_args args = {0};
+    args.w = net->w;
+    args.h = net->h;
+    args.threads = 32;
+    args.scale = div;
+
+    args.min = net->min_crop;
+    args.max = net->max_crop;
+    args.angle = net->angle;
+    args.aspect = net->aspect;
+    args.exposure = net->exposure;
+    args.saturation = net->saturation;
+    args.hue = net->hue;
+    args.size = net->w;
+    args.classes = 80;   // hard-coded; the same 80 is used for the truth image in the display branch
+
+    args.paths = paths;
+    args.n = imgs;
+    args.m = N;
+    args.type = SEGMENTATION_DATA;
+
+    data train;
+    data buffer;
+    pthread_t load_thread;
+    args.d = &buffer;   // the loader thread fills `buffer`; it is copied into `train` after each join
+    load_thread = load_data(args);
+
+    int epoch = (*net->seen)/N;
+    while(get_current_batch(net) < net->max_batches || net->max_batches == 0){   // max_batches == 0 never terminates
+        double time = what_time_is_it_now();
+
+        pthread_join(load_thread, 0);
+        train = buffer;
+        load_thread = load_data(args);   // start loading the next batch before training on this one
+
+        printf("Loaded: %lf seconds\n", what_time_is_it_now()-time);
+        time = what_time_is_it_now();
+
+        float loss = 0;
+#ifdef GPU
+        if(ngpus == 1){
+            loss = train_network(net, train);
+        } else {
+            loss = train_networks(nets, ngpus, train, 4);
+        }
+#else
+        loss = train_network(net, train);
+#endif
+        if(display){   // optional visual check of input, prediction and truth masks
+            image tr = float_to_image(net->w/div, net->h/div, 80, train.y.vals[net->batch*(net->subdivisions-1)]);
+            image im = float_to_image(net->w, net->h, net->c, train.X.vals[net->batch*(net->subdivisions-1)]);
+            image mask = mask_to_rgb(tr);
+            image prmask = mask_to_rgb(pred);
+            show_image(im, "input", 1);
+            show_image(prmask, "pred", 1);
+            show_image(mask, "truth", 100);
+            free_image(mask);   // only the two mask_to_rgb results are freed here
+            free_image(prmask);
+        }
+        if(avg_loss == -1) avg_loss = loss;   // first iteration: seed the running average
+        avg_loss = avg_loss*.9 + loss*.1;   // exponential moving average of the loss
+        printf("%ld, %.3f: %f, %f avg, %f rate, %lf seconds, %ld images\n", get_current_batch(net), (float)(*net->seen)/N, loss, avg_loss, get_current_rate(net), what_time_is_it_now()-time, *net->seen);
+        free_data(train);
+        if(*net->seen/N > epoch){   // crossed into a new epoch: write a numbered snapshot
+            epoch = *net->seen/N;
+            char buff[256];
+            snprintf(buff, sizeof(buff), "%s/%s_%d.weights",backup_directory,base, epoch);   // bounded: backup dir comes from the data cfg
+            save_weights(net, buff);
+        }
+        if(get_current_batch(net)%100 == 0){   // rolling backup every 100 batches
+            char buff[256];
+            snprintf(buff, sizeof(buff), "%s/%s.backup",backup_directory,base);   // truncates instead of overflowing buff
+            save_weights(net, buff);
+        }
+    }
+    char buff[256];
+    snprintf(buff, sizeof(buff), "%s/%s.weights", backup_directory, base);   // final weights; path truncated if over 255 chars
+    save_weights(net, buff);
+
+    free_network(net);
+    free_ptrs((void**)paths, plist->size);
+    free_list(plist);
+    free(base);
+}
+
+void predict_segmenter(char *datafile, char *cfg, char *weights, char *filename)
+{   // Shows the predicted mask for `filename`, or, when it is NULL, for each path read from stdin; `datafile` is unused.
+    network *net = load_network(cfg, weights, 0);
+    set_batch_network(net, 1);
+    srand(2222222);
+
+    clock_t time;
+    char buff[256];
+    char *input = buff;
+    while(1){
+        if(filename){
+            snprintf(input, sizeof(buff), "%s", filename);   // always NUL-terminated; strncpy was not for names of 256+ chars
+        }else{
+            printf("Enter Image Path: ");
+            fflush(stdout);
+            input = fgets(input, 256, stdin);
+            if(!input) return;   // EOF or read error ends the interactive loop
+            strtok(input, "\n");   // cut the line at the first newline
+        }
+        image im = load_image_color(input, 0, 0);
+        image sized = letterbox_image(im, net->w, net->h);   // fit the image to the network input size
+
+        float *X = sized.data;
+        time=clock();
+        float *predictions = network_predict(net, X);
+        image pred = get_network_image(net);   // not freed below, unlike the other images
+        image prmask = mask_to_rgb(pred);
+        printf("Predicted: %f\n", predictions[0]);
+        printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time));
+        show_image(sized, "orig", 1);
+        show_image(prmask, "pred", 0);
+        free_image(im);
+        free_image(sized);
+        free_image(prmask);
+        if (filename) break;   // single-shot mode: one prediction, then return
+    }
+}
+
+
+void demo_segmenter(char *datacfg, char *cfg, char *weights, int cam_index, const char *filename)
+{   // Live demo: shows the predicted mask for each video frame. Compiled only when OPENCV is defined; `datacfg` is unused.
+#ifdef OPENCV
+    printf("Segmenter Demo\n");
+    network *net = load_network(cfg, weights, 0);
+    set_batch_network(net, 1);
+
+    srand(2222222);
+    void * cap = open_video_stream(filename, cam_index, 0,0,0);
+
+    if(!cap) error("Couldn't connect to webcam.\n");
+    float fps = 0;
+
+    while(1){   // runs until the process is stopped; there is no exit condition
+        struct timeval tval_before, tval_after, tval_result;
+        gettimeofday(&tval_before, NULL);
+
+        image in = get_image_from_stream(cap);
+        image in_s = letterbox_image(in, net->w, net->h);   // fit the frame to the network input size
+
+        network_predict(net, in_s.data);
+
+        printf("\033[2J");   // ANSI: clear the screen
+        printf("\033[1;1H");   // ANSI: move the cursor to row 1, column 1
+        printf("\nFPS:%.0f\n",fps);
+
+        image pred = get_network_image(net);   // not freed below, unlike the other images
+        image prmask = mask_to_rgb(pred);
+        show_image(prmask, "Segmenter", 10);
+
+        free_image(in_s);
+        free_image(in);
+        free_image(prmask);
+
+        gettimeofday(&tval_after, NULL);
+        timersub(&tval_after, &tval_before, &tval_result);
+        float curr = 1000000.f/(tval_result.tv_sec*1000000L + (long int)tval_result.tv_usec);   // count whole seconds too, not just tv_usec
+        fps = .9*fps + .1*curr;   // smoothed frame rate
+    }
+#endif
+}
+
+
+void run_segmenter(int argc, char **argv)
+{   // Command-line entry: parses -gpus / -c / -clear / -display, then dispatches on argv[2] ("test", "train" or "demo").
+    if(argc < 4){
+        fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]);
+        return;
+    }
+
+    char *gpu_list = find_char_arg(argc, argv, "-gpus", 0);
+    int *gpus = 0;
+    int gpu = 0;
+    int ngpus = 0;
+    if(gpu_list){
+        printf("%s\n", gpu_list);
+        int len = strlen(gpu_list);
+        ngpus = 1;
+        int i;
+        for(i = 0; i < len; ++i){
+            if (gpu_list[i] == ',') ++ngpus;   // N commas -> N+1 ids
+        }
+        gpus = calloc(ngpus, sizeof(int));
+        for(i = 0; i < ngpus; ++i){
+            gpus[i] = atoi(gpu_list);   // atoi stops at the comma
+            if(i+1 < ngpus) gpu_list = strchr(gpu_list, ',')+1;   // no comma follows the last id, so do not advance from a NULL result
+        }
+    } else {
+        gpu = gpu_index;
+        gpus = &gpu;
+        ngpus = 1;
+    }
+
+    int cam_index = find_int_arg(argc, argv, "-c", 0);
+    int clear = find_arg(argc, argv, "-clear");
+    int display = find_arg(argc, argv, "-display");
+    char *data = argv[3];
+    char *cfg = argv[4];   // argv[argc] is NULL, so this is NULL when only four arguments are given
+    char *weights = (argc > 5) ? argv[5] : 0;
+    char *filename = (argc > 6) ? argv[6]: 0;
+    if(0==strcmp(argv[2], "test")) predict_segmenter(data, cfg, weights, filename);
+    else if(0==strcmp(argv[2], "train")) train_segmenter(data, cfg, weights, gpus, ngpus, clear, display);
+    else if(0==strcmp(argv[2], "demo")) demo_segmenter(data, cfg, weights, cam_index, filename);
+}
+
+
diff --git a/image.darknet/src/super.c b/image.darknet/inst/include/darknet/examples/super.c
similarity index 78%
rename from image.darknet/src/super.c
rename to image.darknet/inst/include/darknet/examples/super.c
index 63e9860..d34406b 100644
--- a/image.darknet/src/super.c
+++ b/image.darknet/inst/include/darknet/examples/super.c
@@ -1,13 +1,6 @@
-#include "network.h"
-#include "cost_layer.h"
-#include "utils.h"
-#include "parser.h"
+#include "darknet.h"
 
-#ifdef OPENCV
-#include "opencv2/highgui/highgui_c.h"
-#endif
-
-void train_super(char *cfgfile, char *weightfile)
+void train_super(char *cfgfile, char *weightfile, int clear)
 {
     char *train_images = "/data/imagenet/imagenet1k.train.list";
     char *backup_directory = "/home/pjreddie/backup/";
@@ -15,13 +8,10 @@ void train_super(char *cfgfile, char *weightfile)
     char *base = basecfg(cfgfile);
     printf("%s\n", base);
     float avg_loss = -1;
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
-    int imgs = net.batch*net.subdivisions;
-    int i = *net.seen/imgs;
+    network *net = load_network(cfgfile, weightfile, clear);
+    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
+    int imgs = net->batch*net->subdivisions;
+    int i = *net->seen/imgs;
     data train, buffer;
 
 
@@ -30,8 +20,8 @@ void train_super(char *cfgfile, char *weightfile)
     char **paths = (char **)list_to_array(plist);
 
     load_args args = {0};
-    args.w = net.w;
-    args.h = net.h;
+    args.w = net->w;
+    args.h = net->h;
     args.scale = 4;
     args.paths = paths;
     args.n = imgs;
@@ -42,7 +32,7 @@ void train_super(char *cfgfile, char *weightfile)
     pthread_t load_thread = load_data_in_thread(args);
     clock_t time;
     //while(i*imgs < N*120){
-    while(get_current_batch(net) < net.max_batches){
+    while(get_current_batch(net) < net->max_batches){
         i += 1;
         time=clock();
         pthread_join(load_thread, 0);
@@ -76,11 +66,8 @@ void train_super(char *cfgfile, char *weightfile)
 
 void test_super(char *cfgfile, char *weightfile, char *filename)
 {
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    set_batch_network(&net, 1);
+    network *net = load_network(cfgfile, weightfile, 0);
+    set_batch_network(net, 1);
     srand(2222222);
 
     clock_t time;
@@ -97,7 +84,7 @@ void test_super(char *cfgfile, char *weightfile, char *filename)
             strtok(input, "\n");
         }
         image im = load_image_color(input, 0, 0);
-        resize_network(&net, im.w, im.h);
+        resize_network(net, im.w, im.h);
         printf("%d %d\n", im.w, im.h);
 
         float *X = im.data;
@@ -106,6 +93,7 @@ void test_super(char *cfgfile, char *weightfile, char *filename)
         image out = get_network_image(net);
         printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time));
         save_image(out, "out");
+        show_image(out, "out", 0);
 
         free_image(im);
         if (filename) break;
@@ -123,7 +111,8 @@ void run_super(int argc, char **argv)
     char *cfg = argv[3];
     char *weights = (argc > 4) ? argv[4] : 0;
     char *filename = (argc > 5) ? argv[5] : 0;
-    if(0==strcmp(argv[2], "train")) train_super(cfg, weights);
+    int clear = find_arg(argc, argv, "-clear");
+    if(0==strcmp(argv[2], "train")) train_super(cfg, weights, clear);
     else if(0==strcmp(argv[2], "test")) test_super(cfg, weights, filename);
     /*
     else if(0==strcmp(argv[2], "valid")) validate_super(cfg, weights);
diff --git a/image.darknet/src/swag.c b/image.darknet/inst/include/darknet/examples/swag.c
similarity index 92%
rename from image.darknet/src/swag.c
rename to image.darknet/inst/include/darknet/examples/swag.c
index 2cb3093..c22d785 100644
--- a/image.darknet/src/swag.c
+++ b/image.darknet/inst/include/darknet/examples/swag.c
@@ -1,13 +1,5 @@
-#include "network.h"
-#include "detection_layer.h"
-#include "cost_layer.h"
-#include "utils.h"
-#include "parser.h"
-#include "box.h"
-
-#ifdef OPENCV
-#include "opencv2/highgui/highgui_c.h"
-#endif
+#include "darknet.h"
+#include <sys/time.h>
 
 void train_swag(char *cfgfile, char *weightfile)
 {
diff --git a/image.darknet/src/tag.c b/image.darknet/inst/include/darknet/examples/tag.c
similarity index 72%
rename from image.darknet/src/tag.c
rename to image.darknet/inst/include/darknet/examples/tag.c
index 1e43e7d..4caf8cb 100644
--- a/image.darknet/src/tag.c
+++ b/image.darknet/inst/include/darknet/examples/tag.c
@@ -1,10 +1,4 @@
-#include "network.h"
-#include "utils.h"
-#include "parser.h"
-
-#ifdef OPENCV
-#include "opencv2/highgui/highgui_c.h"
-#endif
+#include "darknet.h"
 
 void train_tag(char *cfgfile, char *weightfile, int clear)
 {
@@ -13,12 +7,8 @@ void train_tag(char *cfgfile, char *weightfile, int clear)
     char *base = basecfg(cfgfile);
     char *backup_directory = "/home/pjreddie/backup/";
     printf("%s\n", base);
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    if(clear) *net.seen = 0;
-    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
+    network *net = load_network(cfgfile, weightfile, clear);
+    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
     int imgs = 1024;
     list *plist = get_paths("/home/pjreddie/tag/train.list");
     char **paths = (char **)list_to_array(plist);
@@ -30,30 +20,30 @@ void train_tag(char *cfgfile, char *weightfile, int clear)
     data buffer;
 
     load_args args = {0};
-    args.w = net.w;
-    args.h = net.h;
+    args.w = net->w;
+    args.h = net->h;
 
-    args.min = net.w;
-    args.max = net.max_crop;
-    args.size = net.w;
+    args.min = net->w;
+    args.max = net->max_crop;
+    args.size = net->w;
 
     args.paths = paths;
-    args.classes = net.outputs;
+    args.classes = net->outputs;
     args.n = imgs;
     args.m = N;
     args.d = &buffer;
     args.type = TAG_DATA;
 
-    args.angle = net.angle;
-    args.exposure = net.exposure;
-    args.saturation = net.saturation;
-    args.hue = net.hue;
+    args.angle = net->angle;
+    args.exposure = net->exposure;
+    args.saturation = net->saturation;
+    args.hue = net->hue;
 
-    fprintf(stderr, "%d classes\n", net.outputs);
+    fprintf(stderr, "%d classes\n", net->outputs);
 
     load_thread = load_data_in_thread(args);
-    int epoch = (*net.seen)/N;
-    while(get_current_batch(net) < net.max_batches || net.max_batches == 0){
+    int epoch = (*net->seen)/N;
+    while(get_current_batch(net) < net->max_batches || net->max_batches == 0){
         time=clock();
         pthread_join(load_thread, 0);
         train = buffer;
@@ -64,10 +54,10 @@ void train_tag(char *cfgfile, char *weightfile, int clear)
         float loss = train_network(net, train);
         if(avg_loss == -1) avg_loss = loss;
         avg_loss = avg_loss*.9 + loss*.1;
-        printf("%d, %.3f: %f, %f avg, %f rate, %lf seconds, %d images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen);
+        printf("%ld, %.3f: %f, %f avg, %f rate, %lf seconds, %ld images\n", get_current_batch(net), (float)(*net->seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net->seen);
         free_data(train);
-        if(*net.seen/N > epoch){
-            epoch = *net.seen/N;
+        if(*net->seen/N > epoch){
+            epoch = *net->seen/N;
             char buff[256];
             sprintf(buff, "%s/%s_%d.weights",backup_directory,base, epoch);
             save_weights(net, buff);
@@ -92,11 +82,8 @@ void train_tag(char *cfgfile, char *weightfile, int clear)
 
 void test_tag(char *cfgfile, char *weightfile, char *filename)
 {
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    set_batch_network(&net, 1);
+    network *net = load_network(cfgfile, weightfile, 0);
+    set_batch_network(net, 1);
     srand(2222222);
     int i = 0;
     char **names = get_labels("data/tags.txt");
@@ -104,7 +91,7 @@ void test_tag(char *cfgfile, char *weightfile, char *filename)
     int indexes[10];
     char buff[256];
     char *input = buff;
-    int size = net.w;
+    int size = net->w;
     while(1){
         if(filename){
             strncpy(input, filename, 256);
@@ -117,7 +104,7 @@ void test_tag(char *cfgfile, char *weightfile, char *filename)
         }
         image im = load_image_color(input, 0, 0);
         image r = resize_min(im, size);
-        resize_network(&net, r.w, r.h);
+        resize_network(net, r.w, r.h);
         printf("%d %d\n", r.w, r.h);
 
         float *X = r.data;
diff --git a/image.darknet/src/voxel.c b/image.darknet/inst/include/darknet/examples/voxel.c
similarity index 96%
rename from image.darknet/src/voxel.c
rename to image.darknet/inst/include/darknet/examples/voxel.c
index 1b53880..01ea9bb 100644
--- a/image.darknet/src/voxel.c
+++ b/image.darknet/inst/include/darknet/examples/voxel.c
@@ -1,12 +1,4 @@
-#include "network.h"
-#include "cost_layer.h"
-#include "utils.h"
-#include "parser.h"
-
-#ifdef OPENCV
-#include "opencv2/highgui/highgui_c.h"
-image get_image_from_stream(CvCapture *cap);
-#endif
+#include "darknet.h"
 
 void extract_voxel(char *lfile, char *rfile, char *prefix)
 {
diff --git a/image.darknet/inst/include/darknet/src/writing.c b/image.darknet/inst/include/darknet/examples/writing.c
similarity index 90%
rename from image.darknet/inst/include/darknet/src/writing.c
rename to image.darknet/inst/include/darknet/examples/writing.c
index 0a76d48..1b6ff83 100644
--- a/image.darknet/inst/include/darknet/src/writing.c
+++ b/image.darknet/inst/include/darknet/examples/writing.c
@@ -1,10 +1,4 @@
-#include "network.h"
-#include "utils.h"
-#include "parser.h"
-
-#ifdef OPENCV
-#include "opencv2/highgui/highgui_c.h"
-#endif
+#include "darknet.h"
 
 void train_writing(char *cfgfile, char *weightfile)
 {
@@ -69,11 +63,11 @@ void train_writing(char *cfgfile, char *weightfile)
 
         if(avg_loss == -1) avg_loss = loss;
         avg_loss = avg_loss*.9 + loss*.1;
-        printf("%d, %.3f: %f, %f avg, %f rate, %lf seconds, %d images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen);
+        printf("%ld, %.3f: %f, %f avg, %f rate, %lf seconds, %ld images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen);
         free_data(train);
         if(get_current_batch(net)%100 == 0){
             char buff[256];
-            sprintf(buff, "%s/%s_batch_%d.weights", backup_directory, base, get_current_batch(net));
+            sprintf(buff, "%s/%s_batch_%ld.weights", backup_directory, base, get_current_batch(net));
             save_weights(net, buff);
         }
         if(*net.seen/N > epoch){
diff --git a/image.darknet/src/yolo.c b/image.darknet/inst/include/darknet/examples/yolo.c
similarity index 68%
rename from image.darknet/src/yolo.c
rename to image.darknet/inst/include/darknet/examples/yolo.c
index ee5f73b..4ddb69a 100644
--- a/image.darknet/src/yolo.c
+++ b/image.darknet/inst/include/darknet/examples/yolo.c
@@ -1,14 +1,4 @@
-#include "network.h"
-#include "detection_layer.h"
-#include "cost_layer.h"
-#include "utils.h"
-#include "parser.h"
-#include "box.h"
-#include "demo.h"
-
-#ifdef OPENCV
-#include "opencv2/highgui/highgui_c.h"
-#endif
+#include "darknet.h"
 
 char *voc_names[] = {"aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor"};
 
@@ -20,17 +10,14 @@ void train_yolo(char *cfgfile, char *weightfile)
     char *base = basecfg(cfgfile);
     printf("%s\n", base);
     float avg_loss = -1;
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
-    int imgs = net.batch*net.subdivisions;
-    int i = *net.seen/imgs;
+    network *net = load_network(cfgfile, weightfile, 0);
+    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
+    int imgs = net->batch*net->subdivisions;
+    int i = *net->seen/imgs;
     data train, buffer;
 
 
-    layer l = net.layers[net.n - 1];
+    layer l = net->layers[net->n - 1];
 
     int side = l.side;
     int classes = l.classes;
@@ -41,8 +28,8 @@ void train_yolo(char *cfgfile, char *weightfile)
     char **paths = (char **)list_to_array(plist);
 
     load_args args = {0};
-    args.w = net.w;
-    args.h = net.h;
+    args.w = net->w;
+    args.h = net->h;
     args.paths = paths;
     args.n = imgs;
     args.m = plist->size;
@@ -52,15 +39,15 @@ void train_yolo(char *cfgfile, char *weightfile)
     args.d = &buffer;
     args.type = REGION_DATA;
 
-    args.angle = net.angle;
-    args.exposure = net.exposure;
-    args.saturation = net.saturation;
-    args.hue = net.hue;
+    args.angle = net->angle;
+    args.exposure = net->exposure;
+    args.saturation = net->saturation;
+    args.hue = net->hue;
 
     pthread_t load_thread = load_data_in_thread(args);
     clock_t time;
     //while(i*imgs < N*120){
-    while(get_current_batch(net) < net.max_batches){
+    while(get_current_batch(net) < net->max_batches){
         i += 1;
         time=clock();
         pthread_join(load_thread, 0);
@@ -87,14 +74,14 @@ void train_yolo(char *cfgfile, char *weightfile)
     save_weights(net, buff);
 }
 
-void print_yolo_detections(FILE **fps, char *id, box *boxes, float **probs, int total, int classes, int w, int h)
+void print_yolo_detections(FILE **fps, char *id, int total, int classes, int w, int h, detection *dets)
 {
     int i, j;
     for(i = 0; i < total; ++i){
-        float xmin = boxes[i].x - boxes[i].w/2.;
-        float xmax = boxes[i].x + boxes[i].w/2.;
-        float ymin = boxes[i].y - boxes[i].h/2.;
-        float ymax = boxes[i].y + boxes[i].h/2.;
+        float xmin = dets[i].bbox.x - dets[i].bbox.w/2.;
+        float xmax = dets[i].bbox.x + dets[i].bbox.w/2.;
+        float ymin = dets[i].bbox.y - dets[i].bbox.h/2.;
+        float ymax = dets[i].bbox.y + dets[i].bbox.h/2.;
 
         if (xmin < 0) xmin = 0;
         if (ymin < 0) ymin = 0;
@@ -102,20 +89,17 @@ void print_yolo_detections(FILE **fps, char *id, box *boxes, float **probs, int
         if (ymax > h) ymax = h;
 
         for(j = 0; j < classes; ++j){
-            if (probs[i][j]) fprintf(fps[j], "%s %f %f %f %f %f\n", id, probs[i][j],
+            if (dets[i].prob[j]) fprintf(fps[j], "%s %f %f %f %f %f\n", id, dets[i].prob[j],
                     xmin, ymin, xmax, ymax);
         }
     }
 }
 
-void validate_yolo(char *cfgfile, char *weightfile)
+void validate_yolo(char *cfg, char *weights)
 {
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    set_batch_network(&net, 1);
-    fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
+    network *net = load_network(cfg, weights, 0);
+    set_batch_network(net, 1);
+    fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
     srand(time(0));
 
     char *base = "results/comp4_det_test_";
@@ -124,7 +108,7 @@ void validate_yolo(char *cfgfile, char *weightfile)
     //list *plist = get_paths("data/voc.2012.test");
     char **paths = (char **)list_to_array(plist);
 
-    layer l = net.layers[net.n-1];
+    layer l = net->layers[net->n-1];
     int classes = l.classes;
 
     int j;
@@ -134,9 +118,6 @@ void validate_yolo(char *cfgfile, char *weightfile)
         snprintf(buff, 1024, "%s%s.txt", base, voc_names[j]);
         fps[j] = fopen(buff, "w");
     }
-    box *boxes = calloc(l.side*l.side*l.n, sizeof(box));
-    float **probs = calloc(l.side*l.side*l.n, sizeof(float *));
-    for(j = 0; j < l.side*l.side*l.n; ++j) probs[j] = calloc(classes, sizeof(float *));
 
     int m = plist->size;
     int i=0;
@@ -154,8 +135,8 @@ void validate_yolo(char *cfgfile, char *weightfile)
     pthread_t *thr = calloc(nthreads, sizeof(pthread_t));
 
     load_args args = {0};
-    args.w = net.w;
-    args.h = net.h;
+    args.w = net->w;
+    args.h = net->h;
     args.type = IMAGE_DATA;
 
     for(t = 0; t < nthreads; ++t){
@@ -185,9 +166,11 @@ void validate_yolo(char *cfgfile, char *weightfile)
             network_predict(net, X);
             int w = val[t].w;
             int h = val[t].h;
-            get_detection_boxes(l, w, h, thresh, probs, boxes, 0);
-            if (nms) do_nms_sort(boxes, probs, l.side*l.side*l.n, classes, iou_thresh);
-            print_yolo_detections(fps, id, boxes, probs, l.side*l.side*l.n, classes, w, h);
+            int nboxes = 0;
+            detection *dets = get_network_boxes(net, w, h, thresh, 0, 0, 0, &nboxes);
+            if (nms) do_nms_sort(dets, l.side*l.side*l.n, classes, iou_thresh);
+            print_yolo_detections(fps, id, l.side*l.side*l.n, classes, w, h, dets);
+            free_detections(dets, nboxes);
             free(id);
             free_image(val[t]);
             free_image(val_resized[t]);
@@ -196,21 +179,18 @@ void validate_yolo(char *cfgfile, char *weightfile)
     fprintf(stderr, "Total Detection Time: %f Seconds\n", (double)(time(0) - start));
 }
 
-void validate_yolo_recall(char *cfgfile, char *weightfile)
+void validate_yolo_recall(char *cfg, char *weights)
 {
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    set_batch_network(&net, 1);
-    fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
+    network *net = load_network(cfg, weights, 0);
+    set_batch_network(net, 1);
+    fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
     srand(time(0));
 
     char *base = "results/comp4_det_test_";
     list *plist = get_paths("data/voc.2007.test");
     char **paths = (char **)list_to_array(plist);
 
-    layer l = net.layers[net.n-1];
+    layer l = net->layers[net->n-1];
     int classes = l.classes;
     int side = l.side;
 
@@ -221,9 +201,6 @@ void validate_yolo_recall(char *cfgfile, char *weightfile)
         snprintf(buff, 1024, "%s%s.txt", base, voc_names[j]);
         fps[j] = fopen(buff, "w");
     }
-    box *boxes = calloc(side*side*l.n, sizeof(box));
-    float **probs = calloc(side*side*l.n, sizeof(float *));
-    for(j = 0; j < side*side*l.n; ++j) probs[j] = calloc(classes, sizeof(float *));
 
     int m = plist->size;
     int i=0;
@@ -240,11 +217,13 @@ void validate_yolo_recall(char *cfgfile, char *weightfile)
     for(i = 0; i < m; ++i){
         char *path = paths[i];
         image orig = load_image_color(path, 0, 0);
-        image sized = resize_image(orig, net.w, net.h);
+        image sized = resize_image(orig, net->w, net->h);
         char *id = basecfg(path);
         network_predict(net, sized.data);
-        get_detection_boxes(l, orig.w, orig.h, thresh, probs, boxes, 1);
-        if (nms) do_nms(boxes, probs, side*side*l.n, 1, nms);
+
+        int nboxes = 0;
+        detection *dets = get_network_boxes(net, orig.w, orig.h, thresh, 0, 0, 1, &nboxes);
+        if (nms) do_nms_obj(dets, side*side*l.n, 1, nms);
 
         char labelpath[4096];
         find_replace(path, "images", "labels", labelpath);
@@ -255,7 +234,7 @@ void validate_yolo_recall(char *cfgfile, char *weightfile)
         int num_labels = 0;
         box_label *truth = read_boxes(labelpath, &num_labels);
         for(k = 0; k < side*side*l.n; ++k){
-            if(probs[k][0] > thresh){
+            if(dets[k].objectness > thresh){
                 ++proposals;
             }
         }
@@ -264,8 +243,8 @@ void validate_yolo_recall(char *cfgfile, char *weightfile)
             box t = {truth[j].x, truth[j].y, truth[j].w, truth[j].h};
             float best_iou = 0;
             for(k = 0; k < side*side*l.n; ++k){
-                float iou = box_iou(boxes[k], t);
-                if(probs[k][0] > thresh && iou > best_iou){
+                float iou = box_iou(dets[k].bbox, t);
+                if(dets[k].objectness > thresh && iou > best_iou){
                     best_iou = iou;
                 }
             }
@@ -276,6 +255,7 @@ void validate_yolo_recall(char *cfgfile, char *weightfile)
         }
 
         fprintf(stderr, "%5d %5d %5d\tRPs/Img: %.2f\tIOU: %.2f%%\tRecall:%.2f%%\n", i, correct, total, (float)proposals/(i+1), avg_iou*100/total, 100.*correct/total);
+        free_detections(dets, nboxes);
         free(id);
         free_image(orig);
         free_image(sized);
@@ -285,21 +265,14 @@ void validate_yolo_recall(char *cfgfile, char *weightfile)
 void test_yolo(char *cfgfile, char *weightfile, char *filename, float thresh)
 {
     image **alphabet = load_alphabet();
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    detection_layer l = net.layers[net.n-1];
-    set_batch_network(&net, 1);
+    network *net = load_network(cfgfile, weightfile, 0);
+    layer l = net->layers[net->n-1];
+    set_batch_network(net, 1);
     srand(2222222);
     clock_t time;
     char buff[256];
     char *input = buff;
-    int j;
     float nms=.4;
-    box *boxes = calloc(l.side*l.side*l.n, sizeof(box));
-    float **probs = calloc(l.side*l.side*l.n, sizeof(float *));
-    for(j = 0; j < l.side*l.side*l.n; ++j) probs[j] = calloc(l.classes, sizeof(float *));
     while(1){
         if(filename){
             strncpy(input, filename, 256);
@@ -311,24 +284,22 @@ void test_yolo(char *cfgfile, char *weightfile, char *filename, float thresh)
             strtok(input, "\n");
         }
         image im = load_image_color(input,0,0);
-        image sized = resize_image(im, net.w, net.h);
+        image sized = resize_image(im, net->w, net->h);
         float *X = sized.data;
         time=clock();
         network_predict(net, X);
         printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time));
-        get_detection_boxes(l, 1, 1, thresh, probs, boxes, 0);
-        if (nms) do_nms_sort(boxes, probs, l.side*l.side*l.n, l.classes, nms);
-        //draw_detections(im, l.side*l.side*l.n, thresh, boxes, probs, voc_names, alphabet, 20);
-        draw_detections(im, l.side*l.side*l.n, thresh, boxes, probs, voc_names, alphabet, 20);
-        save_image(im, "predictions");
-        show_image(im, "predictions");
 
+        int nboxes = 0;
+        detection *dets = get_network_boxes(net, 1, 1, thresh, 0, 0, 0, &nboxes);
+        if (nms) do_nms_sort(dets, l.side*l.side*l.n, l.classes, nms);
+
+        draw_detections(im, dets, l.side*l.side*l.n, thresh, voc_names, alphabet, 20);
+        save_image(im, "predictions");
+        show_image(im, "predictions", 0);
+        free_detections(dets, nboxes);
         free_image(im);
         free_image(sized);
-#ifdef OPENCV
-        cvWaitKey(0);
-        cvDestroyAllWindows();
-#endif
         if (filename) break;
     }
 }
@@ -344,6 +315,7 @@ void run_yolo(int argc, char **argv)
         return;
     }
 
+    int avg = find_int_arg(argc, argv, "-avg", 1);
     char *cfg = argv[3];
     char *weights = (argc > 4) ? argv[4] : 0;
     char *filename = (argc > 5) ? argv[5]: 0;
@@ -351,5 +323,5 @@ void run_yolo(int argc, char **argv)
     else if(0==strcmp(argv[2], "train")) train_yolo(cfg, weights);
     else if(0==strcmp(argv[2], "valid")) validate_yolo(cfg, weights);
     else if(0==strcmp(argv[2], "recall")) validate_yolo_recall(cfg, weights);
-    else if(0==strcmp(argv[2], "demo")) demo(cfg, weights, thresh, cam_index, filename, voc_names, 20, frame_skip, prefix, .5);
+    else if(0==strcmp(argv[2], "demo")) demo(cfg, weights, thresh, cam_index, filename, voc_names, 20, frame_skip, prefix, avg, .5, 0,0,0,0);
 }
diff --git a/image.darknet/inst/include/darknet/include/darknet.h b/image.darknet/inst/include/darknet/include/darknet.h
new file mode 100644
index 0000000..4390c61
--- /dev/null
+++ b/image.darknet/inst/include/darknet/include/darknet.h
@@ -0,0 +1,805 @@
+#ifndef DARKNET_API
+#define DARKNET_API
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <pthread.h>
+
+#ifdef GPU
+    #define BLOCK 512
+
+    #include "cuda_runtime.h"
+    #include "curand.h"
+    #include "cublas_v2.h"
+
+    #ifdef CUDNN
+    #include "cudnn.h"
+    #endif
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define SECRET_NUM -1234
+extern int gpu_index;
+
+typedef struct{  /* Value type returned by get_metadata(file). */
+    int classes;  /* presumably the number of entries in names -- TODO confirm */
+    char **names;
+} metadata;
+
+metadata get_metadata(char *file);
+
+typedef struct{  /* Returned by read_tree(); referenced as layer.softmax_tree and network.hierarchy. */
+    int *leaf;
+    int n;
+    int *parent;
+    int *child;
+    int *group;
+    char **name;
+
+    int groups;  /* presumably the length of group_size / group_offset -- TODO confirm */
+    int *group_size;
+    int *group_offset;
+} tree;
+tree *read_tree(char *filename);
+
+typedef enum{  /* Type of the layer.activation field. */
+    LOGISTIC, RELU, RELIE, LINEAR, RAMP, TANH, PLSE, LEAKY, ELU, LOGGY, STAIR, HARDTAN, LHTAN, SELU
+} ACTIVATION;
+
+typedef enum{  /* Image type tags; NOTE(review): presumably selects the file format when saving -- confirm. */
+    PNG, BMP, TGA, JPG
+} IMTYPE;
+
+typedef enum{  /* Operation tags; presumably multiply, add, subtract, divide -- TODO confirm against users. */
+    MULT, ADD, SUB, DIV
+} BINARY_ACTIVATION;
+
+typedef enum {  /* Type of the layer.type field. */
+    CONVOLUTIONAL,  /* first enumerator, so its value is 0 */
+    DECONVOLUTIONAL,
+    CONNECTED,
+    MAXPOOL,
+    SOFTMAX,
+    DETECTION,
+    DROPOUT,
+    CROP,
+    ROUTE,
+    COST,
+    NORMALIZATION,
+    AVGPOOL,
+    LOCAL,
+    SHORTCUT,
+    ACTIVE,
+    RNN,
+    GRU,
+    LSTM,
+    CRNN,
+    BATCHNORM,
+    NETWORK,
+    XNOR,
+    REGION,
+    YOLO,
+    ISEG,
+    REORG,
+    UPSAMPLE,
+    LOGXENT,
+    L2NORM,
+    BLANK  /* last enumerator */
+} LAYER_TYPE;
+
+typedef enum{  /* Type of the layer.cost_type field. */
+    SSE, MASKED, L1, SEG, SMOOTH,WGAN
+} COST_TYPE;
+
+typedef struct{  /* Argument bundle passed by value to a layer's update / update_gpu callback. */
+    int batch;
+    float learning_rate;
+    float momentum;
+    float decay;
+    int adam;  /* NOTE(review): adam, B1, B2, eps and t look like Adam-optimizer settings -- confirm */
+    float B1;
+    float B2;
+    float eps;
+    int t;
+} update_args;
+
+struct network;
+typedef struct network network;
+
+struct layer;
+typedef struct layer layer;
+
+struct layer{  /* One record per layer; network.layers points at an array of these. */
+    LAYER_TYPE type;
+    ACTIVATION activation;
+    COST_TYPE cost_type;
+    void (*forward)   (struct layer, struct network);  /* callbacks receive the layer and the network by value */
+    void (*backward)  (struct layer, struct network);
+    void (*update)    (struct layer, update_args);
+    void (*forward_gpu)   (struct layer, struct network);
+    void (*backward_gpu)  (struct layer, struct network);
+    void (*update_gpu)    (struct layer, update_args);
+    int batch_normalize;
+    int shortcut;
+    int batch;
+    int forced;
+    int flipped;
+    int inputs;
+    int outputs;
+    int nweights;
+    int nbiases;
+    int extra;
+    int truths;
+    int h,w,c;
+    int out_h, out_w, out_c;
+    int n;  /* examples/yolo.c passes side*side*n as the detection count for the last layer */
+    int max_boxes;
+    int groups;
+    int size;
+    int side;
+    int stride;
+    int reverse;
+    int flatten;
+    int spatial;
+    int pad;
+    int sqrt;
+    int flip;
+    int index;
+    int binary;
+    int xnor;
+    int steps;
+    int hidden;
+    int truth;
+    float smooth;
+    float dot;
+    float angle;
+    float jitter;
+    float saturation;
+    float exposure;
+    float shift;
+    float ratio;
+    float learning_rate_scale;
+    float clip;
+    int noloss;
+    int softmax;
+    int classes;  /* examples/yolo.c uses this as the per-detection class count */
+    int coords;
+    int background;
+    int rescore;
+    int objectness;
+    int joint;
+    int noadjust;
+    int reorg;
+    int log;
+    int tanh;
+    int *mask;
+    int total;
+
+    float alpha;
+    float beta;
+    float kappa;
+
+    float coord_scale;
+    float object_scale;
+    float noobject_scale;
+    float mask_scale;
+    float class_scale;
+    int bias_match;
+    int random;
+    float ignore_thresh;
+    float truth_thresh;
+    float thresh;
+    float focus;
+    int classfix;
+    int absolute;
+
+    int onlyforward;
+    int stopbackward;
+    int dontload;
+    int dontsave;
+    int dontloadscales;
+    int numload;
+
+    float temperature;
+    float probability;
+    float scale;
+
+    char  * cweights;
+    int   * indexes;
+    int   * input_layers;
+    int   * input_sizes;
+    int   * map;
+    int   * counts;
+    float ** sums;
+    float * rand;
+    float * cost;
+    float * state;
+    float * prev_state;
+    float * forgot_state;
+    float * forgot_delta;
+    float * state_delta;
+    float * combine_cpu;
+    float * combine_delta_cpu;
+
+    float * concat;
+    float * concat_delta;
+
+    float * binary_weights;
+
+    float * biases;
+    float * bias_updates;
+
+    float * scales;
+    float * scale_updates;
+
+    float * weights;
+    float * weight_updates;
+
+    float * delta;
+    float * output;
+    float * loss;
+    float * squared;
+    float * norms;
+
+    float * spatial_mean;
+    float * mean;
+    float * variance;
+
+    float * mean_delta;
+    float * variance_delta;
+
+    float * rolling_mean;
+    float * rolling_variance;
+
+    float * x;
+    float * x_norm;
+
+    float * m;
+    float * v;
+    
+    float * bias_m;
+    float * bias_v;
+    float * scale_m;
+    float * scale_v;
+
+
+    float *z_cpu;
+    float *r_cpu;
+    float *h_cpu;
+    float * prev_state_cpu;
+
+    float *temp_cpu;
+    float *temp2_cpu;
+    float *temp3_cpu;
+
+    float *dh_cpu;
+    float *hh_cpu;
+    float *prev_cell_cpu;
+    float *cell_cpu;
+    float *f_cpu;
+    float *i_cpu;
+    float *g_cpu;
+    float *o_cpu;
+    float *c_cpu;
+    float *dc_cpu; 
+
+    float * binary_input;
+
+    struct layer *input_layer;  /* a layer may point at further layer records (this and the members below) */
+    struct layer *self_layer;
+    struct layer *output_layer;
+
+    struct layer *reset_layer;
+    struct layer *update_layer;
+    struct layer *state_layer;
+
+    struct layer *input_gate_layer;
+    struct layer *state_gate_layer;
+    struct layer *input_save_layer;
+    struct layer *state_save_layer;
+    struct layer *input_state_layer;
+    struct layer *state_state_layer;
+
+    struct layer *input_z_layer;
+    struct layer *state_z_layer;
+
+    struct layer *input_r_layer;
+    struct layer *state_r_layer;
+
+    struct layer *input_h_layer;
+    struct layer *state_h_layer;
+	
+    struct layer *wz;
+    struct layer *uz;
+    struct layer *wr;
+    struct layer *ur;
+    struct layer *wh;
+    struct layer *uh;
+    struct layer *uo;
+    struct layer *wo;
+    struct layer *uf;
+    struct layer *wf;
+    struct layer *ui;
+    struct layer *wi;
+    struct layer *ug;
+    struct layer *wg;
+
+    tree *softmax_tree;
+
+    size_t workspace_size;
+
+#ifdef GPU  /* every member from here to the matching #endif exists only in GPU builds */
+    int *indexes_gpu;
+
+    float *z_gpu;
+    float *r_gpu;
+    float *h_gpu;
+
+    float *temp_gpu;
+    float *temp2_gpu;
+    float *temp3_gpu;
+
+    float *dh_gpu;
+    float *hh_gpu;
+    float *prev_cell_gpu;
+    float *cell_gpu;
+    float *f_gpu;
+    float *i_gpu;
+    float *g_gpu;
+    float *o_gpu;
+    float *c_gpu;
+    float *dc_gpu; 
+
+    float *m_gpu;
+    float *v_gpu;
+    float *bias_m_gpu;
+    float *scale_m_gpu;
+    float *bias_v_gpu;
+    float *scale_v_gpu;
+
+    float * combine_gpu;
+    float * combine_delta_gpu;
+
+    float * prev_state_gpu;
+    float * forgot_state_gpu;
+    float * forgot_delta_gpu;
+    float * state_gpu;
+    float * state_delta_gpu;
+    float * gate_gpu;
+    float * gate_delta_gpu;
+    float * save_gpu;
+    float * save_delta_gpu;
+    float * concat_gpu;
+    float * concat_delta_gpu;
+
+    float * binary_input_gpu;
+    float * binary_weights_gpu;
+
+    float * mean_gpu;
+    float * variance_gpu;
+
+    float * rolling_mean_gpu;
+    float * rolling_variance_gpu;
+
+    float * variance_delta_gpu;
+    float * mean_delta_gpu;
+
+    float * x_gpu;
+    float * x_norm_gpu;
+    float * weights_gpu;
+    float * weight_updates_gpu;
+    float * weight_change_gpu;
+
+    float * biases_gpu;
+    float * bias_updates_gpu;
+    float * bias_change_gpu;
+
+    float * scales_gpu;
+    float * scale_updates_gpu;
+    float * scale_change_gpu;
+
+    float * output_gpu;
+    float * loss_gpu;
+    float * delta_gpu;
+    float * rand_gpu;
+    float * squared_gpu;
+    float * norms_gpu;
+#ifdef CUDNN  /* cuDNN descriptors and algorithm selections; present only when GPU and CUDNN are both defined */
+    cudnnTensorDescriptor_t srcTensorDesc, dstTensorDesc;
+    cudnnTensorDescriptor_t dsrcTensorDesc, ddstTensorDesc;
+    cudnnTensorDescriptor_t normTensorDesc;
+    cudnnFilterDescriptor_t weightDesc;
+    cudnnFilterDescriptor_t dweightDesc;
+    cudnnConvolutionDescriptor_t convDesc;
+    cudnnConvolutionFwdAlgo_t fw_algo;
+    cudnnConvolutionBwdDataAlgo_t bd_algo;
+    cudnnConvolutionBwdFilterAlgo_t bf_algo;
+#endif
+#endif
+};
+
+void free_layer(layer);
+
+typedef enum { /* Learning-rate policy identifiers; the selected value is stored in network.policy. */
+    CONSTANT, STEP, EXP, POLY, STEPS, SIG, RANDOM
+} learning_rate_policy;
+
+typedef struct network{ /* Whole-network state; passed by pointer to forward_network, backward_network, update_network and the other network_* entry points declared below. */
+    int n;
+    int batch;
+    size_t *seen;
+    int *t;
+    float epoch;
+    int subdivisions;
+    layer *layers; /* presumably an array of n layers - TODO confirm against the cfg parser */
+    float *output;
+    learning_rate_policy policy; /* one of the learning_rate_policy enumerators */
+
+    float learning_rate;
+    float momentum;
+    float decay;
+    float gamma;
+    float scale;
+    float power;
+    int time_steps;
+    int step;
+    int max_batches;
+    float *scales;
+    int   *steps;
+    int num_steps; /* presumably the length of scales/steps - TODO confirm */
+    int burn_in;
+
+    int adam;
+    float B1;
+    float B2;
+    float eps;
+
+    int inputs;
+    int outputs;
+    int truths;
+    int notruth;
+    int h, w, c;
+    int max_crop;
+    int min_crop;
+    float max_ratio;
+    float min_ratio;
+    int center;
+    float angle;
+    float aspect;
+    float exposure;
+    float saturation;
+    float hue;
+    int random;
+
+    int gpu_index;
+    tree *hierarchy;
+
+    float *input; /* read by layer forward passes, e.g. forward_activation_layer copies net.input */
+    float *truth;
+    float *delta; /* written by layer backward passes, e.g. backward_activation_layer copies into net.delta */
+    float *workspace;
+    int train;
+    int index;
+    float *cost;
+    float clip;
+
+#ifdef GPU /* the following fields exist only when GPU is defined */
+    float *input_gpu;
+    float *truth_gpu;
+    float *delta_gpu;
+    float *output_gpu;
+#endif
+
+} network;
+
+typedef struct { /* NOTE(review): field names suggest one sample's augmentation parameters (size, scale, rotation, offsets, aspect) - confirm against the code that fills it. */
+    int w;
+    int h;
+    float scale;
+    float rad;
+    float dx;
+    float dy;
+    float aspect;
+} augment_args;
+
+typedef struct { /* Image value type returned by make_image, load_image, resize_image, etc.; python/darknet.py mirrors it field-for-field as ctypes IMAGE, so do not reorder. */
+    int w;
+    int h;
+    int c;
+    float *data; /* presumably w*h*c floats - TODO confirm layout */
+} image;
+
+typedef struct{ /* Four-float box; mirrored by ctypes BOX in python/darknet.py, so keep the x, y, w, h order. */
+    float x, y, w, h;
+} box;
+
+typedef struct detection{ /* One detection as returned by get_network_boxes; mirrored field-for-field by ctypes DETECTION in python/darknet.py. */
+    box bbox;
+    int classes;
+    float *prob; /* indexed by class id: darknet.py's detect() reads prob[i] for each class i */
+    float *mask;
+    float objectness;
+    int sort_class;
+} detection;
+
+typedef struct matrix{ /* Type of the X and y members of `data`; see make_matrix(rows, cols) and free_matrix. */
+    int rows, cols;
+    float **vals; /* presumably vals[row][col] - TODO confirm */
+} matrix;
+
+
+typedef struct{ /* Dataset bundle; passed by value to train_network, free_data, copy_data, concat_data, etc. */
+    int w, h;
+    matrix X;
+    matrix y;
+    int shallow;
+    int *num_boxes;
+    box **boxes;
+} data;
+
+typedef enum {
+    CLASSIFICATION_DATA, DETECTION_DATA, CAPTCHA_DATA, REGION_DATA, IMAGE_DATA, COMPARE_DATA, WRITING_DATA, SWAG_DATA, TAG_DATA, OLD_CLASSIFICATION_DATA, STUDY_DATA, DET_DATA, SUPER_DATA, LETTERBOX_DATA, REGRESSION_DATA, SEGMENTATION_DATA, INSTANCE_DATA, ISEG_DATA
+} data_type; /* stored in load_args.type */
+
+typedef struct load_args{ /* Argument bundle taken by value by load_data, load_data_in_thread and load_data_blocking; get_base_args(net) returns one. */
+    int threads;
+    char **paths;
+    char *path;
+    int n;
+    int m;
+    char **labels;
+    int h;
+    int w;
+    int out_w;
+    int out_h;
+    int nh;
+    int nw;
+    int num_boxes;
+    int min, max, size;
+    int classes;
+    int background;
+    int scale;
+    int center;
+    int coords;
+    float jitter;
+    float angle;
+    float aspect;
+    float saturation;
+    float exposure;
+    float hue;
+    data *d; /* NOTE(review): pointer members look like output slots filled by the loader - confirm */
+    image *im;
+    image *resized;
+    data_type type; /* one of the data_type enumerators */
+    tree *hierarchy;
+} load_args;
+
+typedef struct{ /* Element type of the array returned by read_boxes(filename, &n). */
+    int id;
+    float x,y,w,h;
+    float left, right, top, bottom;
+} box_label;
+
+
+network *load_network(char *cfg, char *weights, int clear);
+load_args get_base_args(network *net);
+
+void free_data(data d);
+
+typedef struct node{ /* Doubly linked node (next/prev) carrying an untyped payload pointer. */
+    void *val;
+    struct node *next;
+    struct node *prev;
+} node;
+
+typedef struct list{ /* List of `node`s; returned by read_cfg, read_data_cfg and get_paths, consumed by list_to_array and free_list. */
+    int size; /* presumably the number of nodes - TODO confirm */
+    node *front;
+    node *back;
+} list;
+
+pthread_t load_data(load_args args);
+list *read_data_cfg(char *filename);
+list *read_cfg(char *filename);
+unsigned char *read_file(char *filename);
+data resize_data(data orig, int w, int h);
+data *tile_data(data orig, int divs, int size);
+data select_data(data *orig, int *inds);
+
+void forward_network(network *net);
+void backward_network(network *net);
+void update_network(network *net);
+
+
+float dot_cpu(int N, float *X, int INCX, float *Y, int INCY);
+void axpy_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY);
+void copy_cpu(int N, float *X, int INCX, float *Y, int INCY);
+void scal_cpu(int N, float ALPHA, float *X, int INCX);
+void fill_cpu(int N, float ALPHA, float * X, int INCX);
+void normalize_cpu(float *x, float *mean, float *variance, int batch, int filters, int spatial);
+void softmax(float *input, int n, float temp, int stride, float *output);
+
+int best_3d_shift_r(image a, image b, int min, int max);
+#ifdef GPU
+void axpy_gpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY);
+void fill_gpu(int N, float ALPHA, float * X, int INCX);
+void scal_gpu(int N, float ALPHA, float * X, int INCX);
+void copy_gpu(int N, float * X, int INCX, float * Y, int INCY);
+
+void cuda_set_device(int n);
+void cuda_free(float *x_gpu);
+float *cuda_make_array(float *x, size_t n);
+void cuda_pull_array(float *x_gpu, float *x, size_t n);
+float cuda_mag_array(float *x_gpu, size_t n);
+void cuda_push_array(float *x_gpu, float *x, size_t n);
+
+void forward_network_gpu(network *net);
+void backward_network_gpu(network *net);
+void update_network_gpu(network *net);
+
+float train_networks(network **nets, int n, data d, int interval);
+void sync_nets(network **nets, int n, int interval);
+void harmless_update_network_gpu(network *net);
+#endif
+image get_label(image **characters, char *string, int size);
+void draw_label(image a, int r, int c, image label, const float *rgb);
+void save_image(image im, const char *name);
+void save_image_options(image im, const char *name, IMTYPE f, int quality);
+void get_next_batch(data d, int n, int offset, float *X, float *y);
+void grayscale_image_3c(image im);
+void normalize_image(image p);
+void matrix_to_csv(matrix m);
+float train_network_sgd(network *net, data d, int n);
+void rgbgr_image(image im);
+data copy_data(data d);
+data concat_data(data d1, data d2);
+data load_cifar10_data(char *filename);
+float matrix_topk_accuracy(matrix truth, matrix guess, int k);
+void matrix_add_matrix(matrix from, matrix to);
+void scale_matrix(matrix m, float scale);
+matrix csv_to_matrix(char *filename);
+float *network_accuracies(network *net, data d, int n);
+float train_network_datum(network *net);
+image make_random_image(int w, int h, int c);
+
+void denormalize_connected_layer(layer l);
+void denormalize_convolutional_layer(layer l);
+void statistics_connected_layer(layer l);
+void rescale_weights(layer l, float scale, float trans);
+void rgbgr_weights(layer l);
+image *get_weights(layer l);
+
+void demo(char *cfgfile, char *weightfile, float thresh, int cam_index, const char *filename, char **names, int classes, int frame_skip, char *prefix, int avg, float hier_thresh, int w, int h, int fps, int fullscreen);
+void get_detection_detections(layer l, int w, int h, float thresh, detection *dets);
+
+char *option_find_str(list *l, char *key, char *def);
+int option_find_int(list *l, char *key, int def);
+int option_find_int_quiet(list *l, char *key, int def);
+
+network *parse_network_cfg(char *filename);
+void save_weights(network *net, char *filename);
+void load_weights(network *net, char *filename);
+void save_weights_upto(network *net, char *filename, int cutoff);
+void load_weights_upto(network *net, char *filename, int start, int cutoff);
+
+void zero_objectness(layer l);
+void get_region_detections(layer l, int w, int h, int netw, int neth, float thresh, int *map, float tree_thresh, int relative, detection *dets);
+int get_yolo_detections(layer l, int w, int h, int netw, int neth, float thresh, int *map, int relative, detection *dets);
+void free_network(network *net);
+void set_batch_network(network *net, int b);
+void set_temp_network(network *net, float t);
+image load_image(char *filename, int w, int h, int c);
+image load_image_color(char *filename, int w, int h);
+image make_image(int w, int h, int c);
+image resize_image(image im, int w, int h);
+void censor_image(image im, int dx, int dy, int w, int h);
+image letterbox_image(image im, int w, int h);
+image crop_image(image im, int dx, int dy, int w, int h);
+image center_crop_image(image im, int w, int h);
+image resize_min(image im, int min);
+image resize_max(image im, int max);
+image threshold_image(image im, float thresh);
+image mask_to_rgb(image mask);
+int resize_network(network *net, int w, int h);
+void free_matrix(matrix m);
+void test_resize(char *filename);
+int show_image(image p, const char *name, int ms);
+image copy_image(image p);
+void draw_box_width(image a, int x1, int y1, int x2, int y2, int w, float r, float g, float b);
+float get_current_rate(network *net);
+void composite_3d(char *f1, char *f2, char *out, int delta);
+data load_data_old(char **paths, int n, int m, char **labels, int k, int w, int h);
+size_t get_current_batch(network *net);
+void constrain_image(image im);
+image get_network_image_layer(network *net, int i);
+layer get_network_output_layer(network *net);
+void top_predictions(network *net, int n, int *index);
+void flip_image(image a);
+image float_to_image(int w, int h, int c, float *data);
+void ghost_image(image source, image dest, int dx, int dy);
+float network_accuracy(network *net, data d);
+void random_distort_image(image im, float hue, float saturation, float exposure);
+void fill_image(image m, float s);
+image grayscale_image(image im);
+void rotate_image_cw(image im, int times);
+double what_time_is_it_now();
+image rotate_image(image m, float rad);
+void visualize_network(network *net);
+float box_iou(box a, box b);
+data load_all_cifar10();
+box_label *read_boxes(char *filename, int *n);
+box float_to_box(float *f, int stride);
+void draw_detections(image im, detection *dets, int num, float thresh, char **names, image **alphabet, int classes);
+
+matrix network_predict_data(network *net, data test);
+image **load_alphabet();
+image get_network_image(network *net);
+float *network_predict(network *net, float *input);
+
+int network_width(network *net);
+int network_height(network *net);
+float *network_predict_image(network *net, image im);
+void network_detect(network *net, image im, float thresh, float hier_thresh, float nms, detection *dets);
+detection *get_network_boxes(network *net, int w, int h, float thresh, float hier, int *map, int relative, int *num);
+void free_detections(detection *dets, int n);
+
+void reset_network_state(network *net, int b);
+
+char **get_labels(char *filename);
+void do_nms_obj(detection *dets, int total, int classes, float thresh);
+void do_nms_sort(detection *dets, int total, int classes, float thresh);
+
+matrix make_matrix(int rows, int cols);
+
+#ifdef OPENCV
+void *open_video_stream(const char *f, int c, int w, int h, int fps);
+image get_image_from_stream(void *p);
+void make_window(char *name, int w, int h, int fullscreen);
+#endif
+
+void free_image(image m);
+float train_network(network *net, data d);
+pthread_t load_data_in_thread(load_args args);
+void load_data_blocking(load_args args);
+list *get_paths(char *filename);
+void hierarchy_predictions(float *predictions, int n, tree *hier, int only_leaves, int stride);
+void change_leaves(tree *t, char *leaf_list);
+
+int find_int_arg(int argc, char **argv, char *arg, int def);
+float find_float_arg(int argc, char **argv, char *arg, float def);
+int find_arg(int argc, char* argv[], char *arg);
+char *find_char_arg(int argc, char **argv, char *arg, char *def);
+char *basecfg(char *cfgfile);
+void find_replace(char *str, char *orig, char *rep, char *output);
+void free_ptrs(void **ptrs, int n);
+char *fgetl(FILE *fp);
+void strip(char *s);
+float sec(clock_t clocks);
+void **list_to_array(list *l);
+void top_k(float *a, int n, int k, int *index);
+int *read_map(char *filename);
+void error(const char *s);
+int max_index(float *a, int n);
+int max_int_index(int *a, int n);
+int sample_array(float *a, int n);
+int *random_index_order(int min, int max);
+void free_list(list *l);
+float mse_array(float *a, int n);
+float variance_array(float *a, int n);
+float mag_array(float *a, int n);
+void scale_array(float *a, int n, float s);
+float mean_array(float *a, int n);
+float sum_array(float *a, int n);
+void normalize_array(float *a, int n);
+int *read_intlist(char *s, int *n, int d);
+size_t rand_size_t();
+float rand_normal();
+float rand_uniform(float min, float max);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/image.darknet/inst/include/darknet/python/darknet.py b/image.darknet/inst/include/darknet/python/darknet.py
new file mode 100644
index 0000000..88d84cd
--- /dev/null
+++ b/image.darknet/inst/include/darknet/python/darknet.py
@@ -0,0 +1,156 @@
+from ctypes import *
+import math
+import random
+
+def sample(probs):
+    """Draw index i with probability ~probs[i]/sum(probs); last index if rounding leaves mass over."""
+    total = sum(probs)
+    weights = [p/total for p in probs]
+    left = random.uniform(0, 1)
+    for idx, w in enumerate(weights):
+        left = left - w
+        if left <= 0: return idx
+    return len(probs)-1
+
+def c_array(ctype, values):  # copy `values` into a new ctypes array of element type `ctype`
+    buf = (ctype*len(values))()
+    buf[:] = values
+    return buf
+
+class BOX(Structure):  # ctypes mirror of the C `box` typedef in darknet.h; field order must match the C struct
+    _fields_ = [("x", c_float),
+                ("y", c_float),
+                ("w", c_float),
+                ("h", c_float)]
+
+class DETECTION(Structure):  # ctypes mirror of the C `detection` typedef in darknet.h; field order must match the C struct
+    _fields_ = [("bbox", BOX),
+                ("classes", c_int),
+                ("prob", POINTER(c_float)),
+                ("mask", POINTER(c_float)),
+                ("objectness", c_float),
+                ("sort_class", c_int)]
+
+
+class IMAGE(Structure):  # ctypes mirror of the C `image` typedef in darknet.h; passed and returned by value
+    _fields_ = [("w", c_int),
+                ("h", c_int),
+                ("c", c_int),
+                ("data", POINTER(c_float))]
+
+class METADATA(Structure):  # return type of lib.get_metadata (bound as load_meta); NOTE(review): C definition not visible here - confirm layout
+    _fields_ = [("classes", c_int),
+                ("names", POINTER(c_char_p))]
+
+    
+
+#lib = CDLL("/home/pjreddie/documents/darknet/libdarknet.so", RTLD_GLOBAL)
+lib = CDLL("libdarknet.so", RTLD_GLOBAL)
+lib.network_width.argtypes = [c_void_p]
+lib.network_width.restype = c_int
+lib.network_height.argtypes = [c_void_p]
+lib.network_height.restype = c_int
+
+predict = lib.network_predict
+predict.argtypes = [c_void_p, POINTER(c_float)]
+predict.restype = POINTER(c_float)
+
+set_gpu = getattr(lib, "cuda_set_device", None)  # None when the symbol is missing: darknet.h declares cuda_set_device only under #ifdef GPU, and a plain attribute lookup would abort the import
+if set_gpu is not None: set_gpu.argtypes = [c_int]
+
+make_image = lib.make_image
+make_image.argtypes = [c_int, c_int, c_int]
+make_image.restype = IMAGE
+
+get_network_boxes = lib.get_network_boxes
+get_network_boxes.argtypes = [c_void_p, c_int, c_int, c_float, c_float, POINTER(c_int), c_int, POINTER(c_int)]
+get_network_boxes.restype = POINTER(DETECTION)
+
+make_network_boxes = lib.make_network_boxes
+make_network_boxes.argtypes = [c_void_p]
+make_network_boxes.restype = POINTER(DETECTION)
+
+free_detections = lib.free_detections
+free_detections.argtypes = [POINTER(DETECTION), c_int]
+
+free_ptrs = lib.free_ptrs
+free_ptrs.argtypes = [POINTER(c_void_p), c_int]
+
+network_predict = lib.network_predict
+network_predict.argtypes = [c_void_p, POINTER(c_float)]
+
+reset_rnn = lib.reset_rnn
+reset_rnn.argtypes = [c_void_p]
+
+load_net = lib.load_network
+load_net.argtypes = [c_char_p, c_char_p, c_int]
+load_net.restype = c_void_p
+
+do_nms_obj = lib.do_nms_obj
+do_nms_obj.argtypes = [POINTER(DETECTION), c_int, c_int, c_float]
+
+do_nms_sort = lib.do_nms_sort
+do_nms_sort.argtypes = [POINTER(DETECTION), c_int, c_int, c_float]
+
+free_image = lib.free_image
+free_image.argtypes = [IMAGE]
+
+letterbox_image = lib.letterbox_image
+letterbox_image.argtypes = [IMAGE, c_int, c_int]
+letterbox_image.restype = IMAGE
+
+load_meta = lib.get_metadata
+lib.get_metadata.argtypes = [c_char_p]
+lib.get_metadata.restype = METADATA
+
+load_image = lib.load_image_color
+load_image.argtypes = [c_char_p, c_int, c_int]
+load_image.restype = IMAGE
+
+rgbgr_image = lib.rgbgr_image
+rgbgr_image.argtypes = [IMAGE]
+
+predict_image = lib.network_predict_image
+predict_image.argtypes = [c_void_p, IMAGE]
+predict_image.restype = POINTER(c_float)
+
+def classify(net, meta, im):
+    """Return a (name, score) pair for every class in `meta`, highest score first."""
+    scores = predict_image(net, im)
+    ranked = [(meta.names[k], scores[k]) for k in range(meta.classes)]
+    # list.sort is stable like sorted(), so equal scores keep class-index order
+    ranked.sort(key=lambda pair: -pair[1])
+    return ranked
+
+def detect(net, meta, image, thresh=.5, hier_thresh=.5, nms=.45):  # returns [(name, prob, (x, y, w, h)), ...] sorted by descending prob
+    im = load_image(image, 0, 0)  # `image` is a file path; load_image is bound to lib.load_image_color
+    num = c_int(0)
+    pnum = pointer(num)
+    predict_image(net, im)  # return value unused; the boxes are fetched from the network below
+    dets = get_network_boxes(net, im.w, im.h, thresh, hier_thresh, None, 0, pnum)
+    num = pnum[0]  # detection count written through pnum by get_network_boxes
+    if (nms): do_nms_obj(dets, num, meta.classes, nms);  # a falsy nms (0/None) skips this call
+
+    res = []
+    for j in range(num):
+        for i in range(meta.classes):
+            if dets[j].prob[i] > 0:  # one result per (detection, class) pair with non-zero probability
+                b = dets[j].bbox
+                res.append((meta.names[i], dets[j].prob[i], (b.x, b.y, b.w, b.h)))
+    res = sorted(res, key=lambda x: -x[1])
+    free_image(im)  # hand the C-side image buffer back
+    free_detections(dets, num)  # hand the C-side detection array back
+    return res
+    
+if __name__ == "__main__":
+    #net = load_net("cfg/densenet201.cfg", "/home/pjreddie/trained/densenet201.weights", 0)
+    #im = load_image("data/wolf.jpg", 0, 0)
+    #meta = load_meta("cfg/imagenet1k.data")
+    #r = classify(net, meta, im)
+    #print(r[:10])
+    net = load_net(b"cfg/tiny-yolo.cfg", b"tiny-yolo.weights", 0)  # bytes literals: c_char_p arguments reject str on Python 3; b"" is plain str on Python 2
+    meta = load_meta(b"cfg/coco.data")
+    r = detect(net, meta, b"data/dog.jpg")
+    print(r)  # call form parses on Python 2 and 3; the statement form made the whole module a SyntaxError on Python 3
+    
+
diff --git a/image.darknet/inst/include/darknet/python/proverbot.py b/image.darknet/inst/include/darknet/python/proverbot.py
new file mode 100644
index 0000000..095aae8
--- /dev/null
+++ b/image.darknet/inst/include/darknet/python/proverbot.py
@@ -0,0 +1,37 @@
+from darknet import *
+
+def predict_tactic(net, s):  # returns (generated text, summed log-probability of the sampled characters)
+    prob = 0
+    d = c_array(c_float, [0.0]*256)  # one-hot input buffer indexed by ord(character)
+    tac = ''
+    if not len(s):
+        s = '\n'  # empty prompt: seed with a newline
+    for c in s[:-1]:  # feed every prompt character but the last; these predictions are discarded
+        d[ord(c)] = 1
+        pred = predict(net, d)
+        d[ord(c)] = 0
+    c = s[-1]
+    while 1:
+        d[ord(c)] = 1
+        pred = predict(net, d)
+        d[ord(c)] = 0
+        pred = [pred[i] for i in range(256)]  # copy the first 256 floats of the C result into a list
+        ind = sample(pred)
+        c = chr(ind)
+        prob += math.log(pred[ind])
+        if len(tac) and tac[-1] == '.':  # NOTE(review): tested before appending, so the character sampled after '.' is scored but never added - confirm intended
+            break
+        tac = tac + c
+    return (tac, prob)
+
+def predict_tactics(net, s, n):  # n independent samples for prompt `s`, best log-probability first
+    candidates = []
+    for _ in range(n):
+        reset_rnn(net)
+        candidates.append(predict_tactic(net, s))
+    candidates.sort(key=lambda pair: -pair[1])
+    return candidates
+
+net = load_net(b"cfg/coq.test.cfg", b"/home/pjreddie/backup/coq.backup", 0)  # bytes literals: c_char_p arguments reject str on Python 3; b"" is plain str on Python 2
+t = predict_tactics(net, "+++++\n", 10)
+print(t)  # call form parses on Python 2 and 3
diff --git a/image.darknet/inst/include/darknet/scripts/get_coco_dataset.sh b/image.darknet/inst/include/darknet/scripts/get_coco_dataset.sh
new file mode 100644
index 0000000..2846301
--- /dev/null
+++ b/image.darknet/inst/include/darknet/scripts/get_coco_dataset.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+# Clone COCO API (a failed clone is tolerated so the script can be re-run to resume the wget -c downloads)
+git clone https://github.com/pdollar/coco
+cd coco || exit 1
+
+mkdir -p images
+cd images || exit 1
+
+# Download Images
+wget -c https://pjreddie.com/media/files/train2014.zip
+wget -c https://pjreddie.com/media/files/val2014.zip
+
+# Unzip
+unzip -q train2014.zip
+unzip -q val2014.zip
+
+cd ..
+
+# Download COCO Metadata
+wget -c https://pjreddie.com/media/files/instances_train-val2014.zip
+wget -c https://pjreddie.com/media/files/coco/5k.part
+wget -c https://pjreddie.com/media/files/coco/trainvalno5k.part
+wget -c https://pjreddie.com/media/files/coco/labels.tgz
+tar xzf labels.tgz
+unzip -q instances_train-val2014.zip
+
+# Set Up Image Lists
+paste <(awk "{print \"$PWD\"}" <5k.part) 5k.part | tr -d '\t' > 5k.txt
+paste <(awk "{print \"$PWD\"}" <trainvalno5k.part) trainvalno5k.part | tr -d '\t' > trainvalno5k.txt
+
diff --git a/image.darknet/inst/include/darknet/scripts/voc_label.py b/image.darknet/inst/include/darknet/scripts/voc_label.py
index d1e8823..679fc36 100644
--- a/image.darknet/inst/include/darknet/scripts/voc_label.py
+++ b/image.darknet/inst/include/darknet/scripts/voc_label.py
@@ -10,10 +10,10 @@
 
 
 def convert(size, box):
-    dw = 1./size[0]
-    dh = 1./size[1]
-    x = (box[0] + box[1])/2.0
-    y = (box[2] + box[3])/2.0
+    dw = 1./(size[0])
+    dh = 1./(size[1])
+    x = (box[0] + box[1])/2.0 - 1
+    y = (box[2] + box[3])/2.0 - 1
     w = box[1] - box[0]
     h = box[3] - box[2]
     x = x*dw
@@ -34,7 +34,7 @@ def convert_annotation(year, image_id):
     for obj in root.iter('object'):
         difficult = obj.find('difficult').text
         cls = obj.find('name').text
-        if cls not in classes or int(difficult) == 1:
+        if cls not in classes or int(difficult)==1:
             continue
         cls_id = classes.index(cls)
         xmlbox = obj.find('bndbox')
@@ -54,3 +54,6 @@ def convert_annotation(year, image_id):
         convert_annotation(year, image_id)
     list_file.close()
 
+os.system("cat 2007_train.txt 2007_val.txt 2012_train.txt 2012_val.txt > train.txt")  # train.txt: the 2007 and 2012 train and val lists
+os.system("cat 2007_train.txt 2007_val.txt 2007_test.txt 2012_train.txt 2012_val.txt > train.all.txt")  # train.all.txt: the same lists plus 2007_test.txt
+
diff --git a/image.darknet/inst/include/darknet/src/activation_kernels.cu b/image.darknet/inst/include/darknet/src/activation_kernels.cu
index 994e206..4dc5804 100644
--- a/image.darknet/inst/include/darknet/src/activation_kernels.cu
+++ b/image.darknet/inst/include/darknet/src/activation_kernels.cu
@@ -10,8 +10,8 @@ extern "C" {
 
 __device__ float lhtan_activate_kernel(float x)
 {
-    if(x < 0) return .001*x;
-    if(x > 1) return .001*(x-1) + 1;
+    if(x < 0) return .001f*x;
+    if(x > 1) return .001f*(x-1.f) + 1.f;
     return x;
 }
 __device__ float lhtan_gradient_kernel(float x)
@@ -27,25 +27,26 @@ __device__ float hardtan_activate_kernel(float x)
     return x;
 }
 __device__ float linear_activate_kernel(float x){return x;}
-__device__ float logistic_activate_kernel(float x){return 1./(1. + exp(-x));}
-__device__ float loggy_activate_kernel(float x){return 2./(1. + exp(-x)) - 1;}
+__device__ float logistic_activate_kernel(float x){return 1.f/(1.f + expf(-x));}
+__device__ float loggy_activate_kernel(float x){return 2.f/(1.f + expf(-x)) - 1;}
 __device__ float relu_activate_kernel(float x){return x*(x>0);}
-__device__ float elu_activate_kernel(float x){return (x >= 0)*x + (x < 0)*(exp(x)-1);}
-__device__ float relie_activate_kernel(float x){return (x>0) ? x : .01*x;}
-__device__ float ramp_activate_kernel(float x){return x*(x>0)+.1*x;}
-__device__ float leaky_activate_kernel(float x){return (x>0) ? x : .1*x;}
-__device__ float tanh_activate_kernel(float x){return (2/(1 + exp(-2*x)) - 1);}
+__device__ float elu_activate_kernel(float x){return (x >= 0)*x + (x < 0)*(expf(x)-1);}
+__device__ float selu_activate_kernel(float x){return (x >= 0)*1.0507f*x + (x < 0)*1.0507f*1.6732f*(expf(x)-1);}
+__device__ float relie_activate_kernel(float x){return (x>0) ? x : .01f*x;}
+__device__ float ramp_activate_kernel(float x){return x*(x>0)+.1f*x;}
+__device__ float leaky_activate_kernel(float x){return (x>0) ? x : .1f*x;}
+__device__ float tanh_activate_kernel(float x){return (2.f/(1 + expf(-2*x)) - 1);}
 __device__ float plse_activate_kernel(float x)
 {
-    if(x < -4) return .01 * (x + 4);
-    if(x > 4)  return .01 * (x - 4) + 1;
-    return .125*x + .5;
+    if(x < -4) return .01f * (x + 4);
+    if(x > 4)  return .01f * (x - 4) + 1;
+    return .125f*x + .5f;
 }
 __device__ float stair_activate_kernel(float x)
 {
-    int n = floor(x);
-    if (n%2 == 0) return floor(x/2.);
-    else return (x - n) + floor(x/2.);
+    int n = floorf(x);
+    if (n%2 == 0) return floorf(x/2);
+    else return (x - n) + floorf(x/2);
 }
  
 
@@ -58,19 +59,20 @@ __device__ float linear_gradient_kernel(float x){return 1;}
 __device__ float logistic_gradient_kernel(float x){return (1-x)*x;}
 __device__ float loggy_gradient_kernel(float x)
 {
-    float y = (x+1.)/2.;
+    float y = (x+1)/2;
     return 2*(1-y)*y;
 }
 __device__ float relu_gradient_kernel(float x){return (x>0);}
 __device__ float elu_gradient_kernel(float x){return (x >= 0) + (x < 0)*(x + 1);}
-__device__ float relie_gradient_kernel(float x){return (x>0) ? 1 : .01;}
-__device__ float ramp_gradient_kernel(float x){return (x>0)+.1;}
-__device__ float leaky_gradient_kernel(float x){return (x>0) ? 1 : .1;}
+__device__ float selu_gradient_kernel(float x){return (x >= 0)*1.0507f + (x < 0)*(x + 1.0507f*1.6732f);} // float literals keep the math in single precision, matching selu_activate_kernel
+__device__ float relie_gradient_kernel(float x){return (x>0) ? 1 : .01f;}
+__device__ float ramp_gradient_kernel(float x){return (x>0)+.1f;}
+__device__ float leaky_gradient_kernel(float x){return (x>0) ? 1 : .1f;}
 __device__ float tanh_gradient_kernel(float x){return 1-x*x;}
-__device__ float plse_gradient_kernel(float x){return (x < 0 || x > 1) ? .01 : .125;}
+__device__ float plse_gradient_kernel(float x){return (x < 0 || x > 1) ? .01f : .125f;}
 __device__ float stair_gradient_kernel(float x)
 {
-    if (floor(x) == x) return 0;
+    if (floorf(x) == x) return 0;
     return 1;
 }
 
@@ -87,6 +89,8 @@ __device__ float activate_kernel(float x, ACTIVATION a)
             return relu_activate_kernel(x);
         case ELU:
             return elu_activate_kernel(x);
+        case SELU:
+            return selu_activate_kernel(x);
         case RELIE:
             return relie_activate_kernel(x);
         case RAMP:
@@ -120,6 +124,8 @@ __device__ float gradient_kernel(float x, ACTIVATION a)
             return relu_gradient_kernel(x);
         case ELU:
             return elu_gradient_kernel(x);
+        case SELU:
+            return selu_gradient_kernel(x);
         case RELIE:
             return relie_gradient_kernel(x);
         case RAMP:
@@ -140,6 +146,41 @@ __device__ float gradient_kernel(float x, ACTIVATION a)
     return 0;
 }
 
+__global__ void binary_gradient_array_kernel(float *x, float *dy, int n, int s, BINARY_ACTIVATION a, float *dx)
+{
+    // Backward of y = x1*x2: dx1 = x2*dy, dx2 = x1*dy, with x2 stored s/2 after x1 (`a` is unused).
+    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if(id >= n) return; // ids past n exist (hence the original guard); bail out before they read x
+    int i = id % s;
+    int b = id / s;
+    float x1 = x[b*s + i];
+    float x2 = x[b*s + s/2 + i];
+    float de = dy[id];
+    dx[b*s + i] = x2*de;
+    dx[b*s + s/2 + i] = x1*de;
+}
+
+extern "C" void binary_gradient_array_gpu(float *x, float *dx, int n, int size, BINARY_ACTIVATION a, float *y) 
+{ /* Launches the kernel over n/2 elements. NOTE: this wrapper's `dx` is passed as the kernel's `dy` (read), and its `y` as the kernel's `dx` (written). */
+    binary_gradient_array_kernel<<<cuda_gridsize(n/2), BLOCK>>>(x, dx, n/2, size, a, y);
+    check_error(cudaPeekAtLastError());
+}
+__global__ void binary_activate_array_kernel(float *x, int n, int s, BINARY_ACTIVATION a, float *y)
+{
+    // y[id] = x1*x2, pairing x[b*s + i] with the element s/2 further on (`a` is unused).
+    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if(id >= n) return; // ids past n exist (hence the original guard); bail out before they read x
+    int i = id % s;
+    int b = id / s;
+    y[id] = x[b*s + i] * x[b*s + s/2 + i];
+}
+
+extern "C" void binary_activate_array_gpu(float *x, int n, int size, BINARY_ACTIVATION a, float *y) 
+{ /* Launches binary_activate_array_kernel over n/2 elements, passing `size` through as the kernel's `s`; reports any launch error via check_error. */
+    binary_activate_array_kernel<<<cuda_gridsize(n/2), BLOCK>>>(x, n/2, size, a, y);
+    check_error(cudaPeekAtLastError());
+}
+
 __global__ void activate_array_kernel(float *x, int n, ACTIVATION a)
 {
     int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
@@ -152,13 +193,13 @@ __global__ void gradient_array_kernel(float *x, int n, ACTIVATION a, float *delt
     if(i < n) delta[i] *= gradient_kernel(x[i], a);
 }
 
-extern "C" void activate_array_ongpu(float *x, int n, ACTIVATION a) 
+extern "C" void activate_array_gpu(float *x, int n, ACTIVATION a) 
 {
     activate_array_kernel<<<cuda_gridsize(n), BLOCK>>>(x, n, a);
     check_error(cudaPeekAtLastError());
 }
 
-extern "C" void gradient_array_ongpu(float *x, int n, ACTIVATION a, float *delta) 
+extern "C" void gradient_array_gpu(float *x, int n, ACTIVATION a, float *delta) 
 {
     gradient_array_kernel<<<cuda_gridsize(n), BLOCK>>>(x, n, a, delta);
     check_error(cudaPeekAtLastError());
diff --git a/image.darknet/inst/include/darknet/src/activation_layer.c b/image.darknet/inst/include/darknet/src/activation_layer.c
index 3430dac..b4ba953 100644
--- a/image.darknet/inst/include/darknet/src/activation_layer.c
+++ b/image.darknet/inst/include/darknet/src/activation_layer.c
@@ -35,29 +35,29 @@ layer make_activation_layer(int batch, int inputs, ACTIVATION activation)
     return l;
 }
 
-void forward_activation_layer(layer l, network_state state)
+void forward_activation_layer(layer l, network net)
 {
-    copy_cpu(l.outputs*l.batch, state.input, 1, l.output, 1);
+    copy_cpu(l.outputs*l.batch, net.input, 1, l.output, 1);
     activate_array(l.output, l.outputs*l.batch, l.activation);
 }
 
-void backward_activation_layer(layer l, network_state state)
+void backward_activation_layer(layer l, network net)
 {
     gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);
-    copy_cpu(l.outputs*l.batch, l.delta, 1, state.delta, 1);
+    copy_cpu(l.outputs*l.batch, l.delta, 1, net.delta, 1);
 }
 
 #ifdef GPU
 
-void forward_activation_layer_gpu(layer l, network_state state)
+void forward_activation_layer_gpu(layer l, network net)
 {
-    copy_ongpu(l.outputs*l.batch, state.input, 1, l.output_gpu, 1);
-    activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
+    copy_gpu(l.outputs*l.batch, net.input_gpu, 1, l.output_gpu, 1);
+    activate_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation);
 }
 
-void backward_activation_layer_gpu(layer l, network_state state)
+void backward_activation_layer_gpu(layer l, network net)
 {
-    gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
-    copy_ongpu(l.outputs*l.batch, l.delta_gpu, 1, state.delta, 1);
+    gradient_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
+    copy_gpu(l.outputs*l.batch, l.delta_gpu, 1, net.delta_gpu, 1);
 }
 #endif
diff --git a/image.darknet/inst/include/darknet/src/activation_layer.h b/image.darknet/inst/include/darknet/src/activation_layer.h
index a09756a..42118a8 100644
--- a/image.darknet/inst/include/darknet/src/activation_layer.h
+++ b/image.darknet/inst/include/darknet/src/activation_layer.h
@@ -7,12 +7,12 @@
 
 layer make_activation_layer(int batch, int inputs, ACTIVATION activation);
 
-void forward_activation_layer(layer l, network_state state);
-void backward_activation_layer(layer l, network_state state);
+void forward_activation_layer(layer l, network net);
+void backward_activation_layer(layer l, network net);
 
 #ifdef GPU
-void forward_activation_layer_gpu(layer l, network_state state);
-void backward_activation_layer_gpu(layer l, network_state state);
+void forward_activation_layer_gpu(layer l, network net);
+void backward_activation_layer_gpu(layer l, network net);
 #endif
 
 #endif
diff --git a/image.darknet/inst/include/darknet/src/activations.c b/image.darknet/inst/include/darknet/src/activations.c
index 0cbb2f5..da1a17a 100644
--- a/image.darknet/inst/include/darknet/src/activations.c
+++ b/image.darknet/inst/include/darknet/src/activations.c
@@ -16,6 +16,8 @@ char *get_activation_string(ACTIVATION a)
             return "relu";
         case ELU:
             return "elu";
+        case SELU:
+            return "selu";
         case RELIE:
             return "relie";
         case RAMP:
@@ -46,6 +48,7 @@ ACTIVATION get_activation(char *s)
     if (strcmp(s, "loggy")==0) return LOGGY;
     if (strcmp(s, "relu")==0) return RELU;
     if (strcmp(s, "elu")==0) return ELU;
+    if (strcmp(s, "selu")==0) return SELU;
     if (strcmp(s, "relie")==0) return RELIE;
     if (strcmp(s, "plse")==0) return PLSE;
     if (strcmp(s, "hardtan")==0) return HARDTAN;
@@ -72,6 +75,8 @@ float activate(float x, ACTIVATION a)
             return relu_activate(x);
         case ELU:
             return elu_activate(x);
+        case SELU:
+            return selu_activate(x);
         case RELIE:
             return relie_activate(x);
         case RAMP:
@@ -113,6 +118,8 @@ float gradient(float x, ACTIVATION a)
             return relu_gradient(x);
         case ELU:
             return elu_gradient(x);
+        case SELU:
+            return selu_gradient(x);
         case RELIE:
             return relie_gradient(x);
         case RAMP:
diff --git a/image.darknet/inst/include/darknet/src/activations.h b/image.darknet/inst/include/darknet/src/activations.h
index 1c36ff5..9780d2c 100644
--- a/image.darknet/inst/include/darknet/src/activations.h
+++ b/image.darknet/inst/include/darknet/src/activations.h
@@ -1,12 +1,9 @@
 #ifndef ACTIVATIONS_H
 #define ACTIVATIONS_H
+#include "darknet.h"
 #include "cuda.h"
 #include "math.h"
 
-typedef enum{
-    LOGISTIC, RELU, RELIE, LINEAR, RAMP, TANH, PLSE, LEAKY, ELU, LOGGY, STAIR, HARDTAN, LHTAN
-}ACTIVATION;
-
 ACTIVATION get_activation(char *s);
 
 char *get_activation_string(ACTIVATION a);
@@ -15,8 +12,8 @@ float gradient(float x, ACTIVATION a);
 void gradient_array(const float *x, const int n, const ACTIVATION a, float *delta);
 void activate_array(float *x, const int n, const ACTIVATION a);
 #ifdef GPU
-void activate_array_ongpu(float *x, int n, ACTIVATION a);
-void gradient_array_ongpu(float *x, int n, ACTIVATION a, float *delta);
+void activate_array_gpu(float *x, int n, ACTIVATION a);
+void gradient_array_gpu(float *x, int n, ACTIVATION a, float *delta);
 #endif
 
 static inline float stair_activate(float x)
@@ -36,6 +33,7 @@ static inline float logistic_activate(float x){return 1./(1. + exp(-x));}
 static inline float loggy_activate(float x){return 2./(1. + exp(-x)) - 1;}
 static inline float relu_activate(float x){return x*(x>0);}
 static inline float elu_activate(float x){return (x >= 0)*x + (x < 0)*(exp(x)-1);}
+static inline float selu_activate(float x){return (x >= 0) ? 1.0507*x : 1.0507*1.6732*(exp(x)-1);} /* SELU: 1.0507*x for x >= 0, else 1.0507*1.6732*(exp(x)-1); a branch (not a 0/1 mask) so the unused exp() term cannot produce 0*inf = NaN once exp(x) overflows */
 static inline float relie_activate(float x){return (x>0) ? x : .01*x;}
 static inline float ramp_activate(float x){return x*(x>0)+.1*x;}
 static inline float leaky_activate(float x){return (x>0) ? x : .1*x;}
@@ -78,6 +76,7 @@ static inline float stair_gradient(float x)
 }
 static inline float relu_gradient(float x){return (x>0);}
 static inline float elu_gradient(float x){return (x >= 0) + (x < 0)*(x + 1);}
+static inline float selu_gradient(float x){return (x >= 0)*1.0507 + (x < 0)*(x + 1.0507*1.6732);} /* 1.0507 for x >= 0, else x + 1.0507*1.6732. NOTE(review): that equals the SELU slope only if x is the activation's output rather than its input - confirm against the callers of gradient() */
 static inline float relie_gradient(float x){return (x>0) ? 1 : .01;}
 static inline float ramp_gradient(float x){return (x>0)+.1;}
 static inline float leaky_gradient(float x){return (x>0) ? 1 : .1;}
diff --git a/image.darknet/inst/include/darknet/src/art.c b/image.darknet/inst/include/darknet/src/art.c
deleted file mode 100644
index 71d3719..0000000
--- a/image.darknet/inst/include/darknet/src/art.c
+++ /dev/null
@@ -1,77 +0,0 @@
-#include "network.h"
-#include "utils.h"
-#include "parser.h"
-#include "option_list.h"
-#include "blas.h"
-#include "classifier.h"
-#include <sys/time.h>
-
-#ifdef OPENCV
-#include "opencv2/highgui/highgui_c.h"
-image get_image_from_stream(CvCapture *cap);
-#endif
-
-
-void demo_art(char *cfgfile, char *weightfile, int cam_index)
-{
-#ifdef OPENCV
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    set_batch_network(&net, 1);
-
-    srand(2222222);
-    CvCapture * cap;
-
-    cap = cvCaptureFromCAM(cam_index);
-
-    char *window = "ArtJudgementBot9000!!!";
-    if(!cap) error("Couldn't connect to webcam.\n");
-    cvNamedWindow(window, CV_WINDOW_NORMAL); 
-    cvResizeWindow(window, 512, 512);
-    int i;
-    int idx[] = {37, 401, 434};
-    int n = sizeof(idx)/sizeof(idx[0]);
-
-    while(1){
-        image in = get_image_from_stream(cap);
-        image in_s = resize_image(in, net.w, net.h);
-        show_image(in, window);
-
-        float *p = network_predict(net, in_s.data);
-
-        printf("\033[2J");
-        printf("\033[1;1H");
-
-        float score = 0;
-        for(i = 0; i < n; ++i){
-            float s = p[idx[i]];
-            if (s > score) score = s;
-        }
-        score = score;
-        printf("I APPRECIATE THIS ARTWORK: %10.7f%%\n", score*100);
-        printf("[");
-	int upper = 30;
-        for(i = 0; i < upper; ++i){
-            printf("%c", ((i+.5) < score*upper) ? 219 : ' ');
-        }
-        printf("]\n");
-
-        free_image(in_s);
-        free_image(in);
-
-        cvWaitKey(1);
-    }
-#endif
-}
-
-
-void run_art(int argc, char **argv)
-{
-    int cam_index = find_int_arg(argc, argv, "-c", 0);
-    char *cfg = argv[2];
-    char *weights = argv[3];
-    demo_art(cfg, weights, cam_index);
-}
-
diff --git a/image.darknet/inst/include/darknet/src/avgpool_layer.c b/image.darknet/inst/include/darknet/src/avgpool_layer.c
index b6932fe..83034db 100644
--- a/image.darknet/inst/include/darknet/src/avgpool_layer.c
+++ b/image.darknet/inst/include/darknet/src/avgpool_layer.c
@@ -37,7 +37,7 @@ void resize_avgpool_layer(avgpool_layer *l, int w, int h)
     l->inputs = h*w*l->c;
 }
 
-void forward_avgpool_layer(const avgpool_layer l, network_state state)
+void forward_avgpool_layer(const avgpool_layer l, network net)
 {
     int b,i,k;
 
@@ -47,14 +47,14 @@ void forward_avgpool_layer(const avgpool_layer l, network_state state)
             l.output[out_index] = 0;
             for(i = 0; i < l.h*l.w; ++i){
                 int in_index = i + l.h*l.w*(k + b*l.c);
-                l.output[out_index] += state.input[in_index];
+                l.output[out_index] += net.input[in_index];
             }
             l.output[out_index] /= l.h*l.w;
         }
     }
 }
 
-void backward_avgpool_layer(const avgpool_layer l, network_state state)
+void backward_avgpool_layer(const avgpool_layer l, network net)
 {
     int b,i,k;
 
@@ -63,7 +63,7 @@ void backward_avgpool_layer(const avgpool_layer l, network_state state)
             int out_index = k + b*l.c;
             for(i = 0; i < l.h*l.w; ++i){
                 int in_index = i + l.h*l.w*(k + b*l.c);
-                state.delta[in_index] += l.delta[out_index] / (l.h*l.w);
+                net.delta[in_index] += l.delta[out_index] / (l.h*l.w);
             }
         }
     }
diff --git a/image.darknet/inst/include/darknet/src/avgpool_layer.h b/image.darknet/inst/include/darknet/src/avgpool_layer.h
index f8329ae..3bd356c 100644
--- a/image.darknet/inst/include/darknet/src/avgpool_layer.h
+++ b/image.darknet/inst/include/darknet/src/avgpool_layer.h
@@ -11,12 +11,12 @@ typedef layer avgpool_layer;
 image get_avgpool_image(avgpool_layer l);
 avgpool_layer make_avgpool_layer(int batch, int w, int h, int c);
 void resize_avgpool_layer(avgpool_layer *l, int w, int h);
-void forward_avgpool_layer(const avgpool_layer l, network_state state);
-void backward_avgpool_layer(const avgpool_layer l, network_state state);
+void forward_avgpool_layer(const avgpool_layer l, network net);
+void backward_avgpool_layer(const avgpool_layer l, network net);
 
 #ifdef GPU
-void forward_avgpool_layer_gpu(avgpool_layer l, network_state state);
-void backward_avgpool_layer_gpu(avgpool_layer l, network_state state);
+void forward_avgpool_layer_gpu(avgpool_layer l, network net);
+void backward_avgpool_layer_gpu(avgpool_layer l, network net);
 #endif
 
 #endif
diff --git a/image.darknet/inst/include/darknet/src/avgpool_layer_kernels.cu b/image.darknet/inst/include/darknet/src/avgpool_layer_kernels.cu
index b7e2770..a7eca3a 100644
--- a/image.darknet/inst/include/darknet/src/avgpool_layer_kernels.cu
+++ b/image.darknet/inst/include/darknet/src/avgpool_layer_kernels.cu
@@ -43,19 +43,19 @@ __global__ void backward_avgpool_layer_kernel(int n, int w, int h, int c, float
     }
 }
 
-extern "C" void forward_avgpool_layer_gpu(avgpool_layer layer, network_state state)
+extern "C" void forward_avgpool_layer_gpu(avgpool_layer layer, network net) /* GPU forward pass: launch sized by n = layer.c*layer.batch */
 {
     size_t n = layer.c*layer.batch;
 
-    forward_avgpool_layer_kernel<<<cuda_gridsize(n), BLOCK>>>(n, layer.w, layer.h, layer.c, state.input, layer.output_gpu);
+    forward_avgpool_layer_kernel<<<cuda_gridsize(n), BLOCK>>>(n, layer.w, layer.h, layer.c, net.input_gpu, layer.output_gpu); /* kernel is handed the network's device input buffer and this layer's device output buffer */
     check_error(cudaPeekAtLastError());
 }
 
-extern "C" void backward_avgpool_layer_gpu(avgpool_layer layer, network_state state)
+extern "C" void backward_avgpool_layer_gpu(avgpool_layer layer, network net) /* GPU backward pass: launch sized by n = layer.c*layer.batch */
 {
     size_t n = layer.c*layer.batch;
 
-    backward_avgpool_layer_kernel<<<cuda_gridsize(n), BLOCK>>>(n, layer.w, layer.h, layer.c, state.delta, layer.delta_gpu);
+    backward_avgpool_layer_kernel<<<cuda_gridsize(n), BLOCK>>>(n, layer.w, layer.h, layer.c, net.delta_gpu, layer.delta_gpu); /* kernel is handed the network's device delta buffer and this layer's device delta buffer */
     check_error(cudaPeekAtLastError());
 }
 
diff --git a/image.darknet/inst/include/darknet/src/batchnorm_layer.c b/image.darknet/inst/include/darknet/src/batchnorm_layer.c
index b53548b..ebff387 100644
--- a/image.darknet/inst/include/darknet/src/batchnorm_layer.c
+++ b/image.darknet/inst/include/darknet/src/batchnorm_layer.c
@@ -1,3 +1,4 @@
+#include "convolutional_layer.h"
 #include "batchnorm_layer.h"
 #include "blas.h"
 #include <stdio.h>
@@ -5,55 +6,77 @@
+/*
+ * Build a standalone batch-normalization layer for `batch` inputs of w x h x c.
+ * The output keeps the input shape; scales start at 1, all other buffers at 0.
+ */
 layer make_batchnorm_layer(int batch, int w, int h, int c)
 {
     fprintf(stderr, "Batch Normalization Layer: %d x %d x %d image\n", w,h,c);
-    layer layer = {0};
-    layer.type = BATCHNORM;
-    layer.batch = batch;
-    layer.h = layer.out_h = h;
-    layer.w = layer.out_w = w;
-    layer.c = layer.out_c = c;
-    layer.output = calloc(h * w * c * batch, sizeof(float));
-    layer.delta  = calloc(h * w * c * batch, sizeof(float));
-    layer.inputs = w*h*c;
-    layer.outputs = layer.inputs;
-
-    layer.scales = calloc(c, sizeof(float));
-    layer.scale_updates = calloc(c, sizeof(float));
+    layer l = {0};
+    l.type = BATCHNORM;
+    l.batch = batch;
+    l.h = l.out_h = h;
+    l.w = l.out_w = w;
+    l.c = l.out_c = c;
+    l.output = calloc(h * w * c * batch, sizeof(float));
+    l.delta  = calloc(h * w * c * batch, sizeof(float));
+    l.inputs = w*h*c;
+    l.outputs = l.inputs;
+
+    l.scales = calloc(c, sizeof(float));
+    l.scale_updates = calloc(c, sizeof(float));
+    l.biases = calloc(c, sizeof(float));
+    l.bias_updates = calloc(c, sizeof(float));
     int i;
     for(i = 0; i < c; ++i){
-        layer.scales[i] = 1;
+        l.scales[i] = 1;
     }
 
-    layer.mean = calloc(c, sizeof(float));
-    layer.variance = calloc(c, sizeof(float));
+    l.mean = calloc(c, sizeof(float));
+    l.variance = calloc(c, sizeof(float));
 
-    layer.rolling_mean = calloc(c, sizeof(float));
-    layer.rolling_variance = calloc(c, sizeof(float));
+    l.rolling_mean = calloc(c, sizeof(float));
+    l.rolling_variance = calloc(c, sizeof(float));
+
+    /* CPU scratch that forward/backward_batchnorm_layer write to; left NULL, a BATCHNORM layer crashes on the CPU path */
+    l.mean_delta = calloc(c, sizeof(float));
+    l.variance_delta = calloc(c, sizeof(float));
+    l.x = calloc(h * w * c * batch, sizeof(float));
+    l.x_norm = calloc(h * w * c * batch, sizeof(float));
 
-    layer.forward = forward_batchnorm_layer;
-    layer.backward = backward_batchnorm_layer;
+    l.forward = forward_batchnorm_layer;
+    l.backward = backward_batchnorm_layer;
 #ifdef GPU
-    layer.forward_gpu = forward_batchnorm_layer_gpu;
-    layer.backward_gpu = backward_batchnorm_layer_gpu;
+    l.forward_gpu = forward_batchnorm_layer_gpu;
+    l.backward_gpu = backward_batchnorm_layer_gpu;
+
+    l.output_gpu =  cuda_make_array(l.output, h * w * c * batch);
+    l.delta_gpu =   cuda_make_array(l.delta, h * w * c * batch);
+
+    l.biases_gpu = cuda_make_array(l.biases, c);
+    l.bias_updates_gpu = cuda_make_array(l.bias_updates, c);
 
-    layer.output_gpu =  cuda_make_array(layer.output, h * w * c * batch);
-    layer.delta_gpu =   cuda_make_array(layer.delta, h * w * c * batch);
+    l.scales_gpu = cuda_make_array(l.scales, c);
+    l.scale_updates_gpu = cuda_make_array(l.scale_updates, c);
 
-    layer.scales_gpu = cuda_make_array(layer.scales, c);
-    layer.scale_updates_gpu = cuda_make_array(layer.scale_updates, c);
+    l.mean_gpu = cuda_make_array(l.mean, c);
+    l.variance_gpu = cuda_make_array(l.variance, c);
 
-    layer.mean_gpu = cuda_make_array(layer.mean, c);
-    layer.variance_gpu = cuda_make_array(layer.variance, c);
+    l.rolling_mean_gpu = cuda_make_array(l.rolling_mean, c);
+    l.rolling_variance_gpu = cuda_make_array(l.rolling_variance, c);
 
-    layer.rolling_mean_gpu = cuda_make_array(layer.mean, c);
-    layer.rolling_variance_gpu = cuda_make_array(layer.variance, c);
+    l.mean_delta_gpu = cuda_make_array(l.mean_delta, c);
+    l.variance_delta_gpu = cuda_make_array(l.variance_delta, c);
 
-    layer.mean_delta_gpu = cuda_make_array(layer.mean, c);
-    layer.variance_delta_gpu = cuda_make_array(layer.variance, c);
+    l.x_gpu = cuda_make_array(l.output, l.batch*l.outputs);
+    l.x_norm_gpu = cuda_make_array(l.output, l.batch*l.outputs);
+    #ifdef CUDNN
+    cudnnCreateTensorDescriptor(&l.normTensorDesc);
+    cudnnCreateTensorDescriptor(&l.dstTensorDesc);
+    cudnnSetTensor4dDescriptor(l.dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l.batch, l.out_c, l.out_h, l.out_w);
+    cudnnSetTensor4dDescriptor(l.normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l.out_c, 1, 1);
 
-    layer.x_gpu = cuda_make_array(layer.output, layer.batch*layer.outputs);
-    layer.x_norm_gpu = cuda_make_array(layer.output, layer.batch*layer.outputs);
+    #endif
 #endif
-    return layer;
+    return l;
 }
 
 void backward_scale_cpu(float *x_norm, float *delta, int batch, int n, int size, float *scale_updates)
@@ -108,7 +121,7 @@ void normalize_delta_cpu(float *x, float *mean, float *variance, float *mean_del
         for(f = 0; f < filters; ++f){
             for(k = 0; k < spatial; ++k){
                 int index = j*filters*spatial + f*spatial + k;
-                delta[index] = delta[index] * 1./(sqrt(variance[f]) + .00001f) + variance_delta[f] * 2. * (x[index] - mean[f]) / (spatial * batch) + mean_delta[f]/(spatial*batch);
+                delta[index] = delta[index] * 1./(sqrt(variance[f] + .00001f)) + variance_delta[f] * 2. * (x[index] - mean[f]) / (spatial * batch) + mean_delta[f]/(spatial*batch);
             }
         }
     }
@@ -119,33 +132,35 @@ void resize_batchnorm_layer(layer *layer, int w, int h)
     fprintf(stderr, "Not implemented\n");
 }
 
-void forward_batchnorm_layer(layer l, network_state state)
+void forward_batchnorm_layer(layer l, network net) /* CPU forward pass: normalize l.output per channel, then apply l.scales and l.biases */
 {
-    if(l.type == BATCHNORM) copy_cpu(l.outputs*l.batch, state.input, 1, l.output, 1);
-    if(l.type == CONNECTED){
-        l.out_c = l.outputs;
-        l.out_h = l.out_w = 1;
-    }
-    if(state.train){
+    if(l.type == BATCHNORM) copy_cpu(l.outputs*l.batch, net.input, 1, l.output, 1); /* standalone layer: start from the network input; other layer types arrive with l.output already filled */
+    copy_cpu(l.outputs*l.batch, l.output, 1, l.x, 1); /* keep the un-normalized values in l.x (read by the backward pass); l.x must be allocated by whoever built the layer */
+    if(net.train){
         mean_cpu(l.output, l.batch, l.out_c, l.out_h*l.out_w, l.mean);
         variance_cpu(l.output, l.mean, l.batch, l.out_c, l.out_h*l.out_w, l.variance);
 
-        scal_cpu(l.out_c, .9, l.rolling_mean, 1);
-        axpy_cpu(l.out_c, .1, l.mean, 1, l.rolling_mean, 1);
-        scal_cpu(l.out_c, .9, l.rolling_variance, 1);
-        axpy_cpu(l.out_c, .1, l.variance, 1, l.rolling_variance, 1);
+        scal_cpu(l.out_c, .99, l.rolling_mean, 1); /* rolling_mean = .99*rolling_mean ... */
+        axpy_cpu(l.out_c, .01, l.mean, 1, l.rolling_mean, 1); /* ... + .01*batch mean */
+        scal_cpu(l.out_c, .99, l.rolling_variance, 1); /* rolling_variance = .99*rolling_variance ... */
+        axpy_cpu(l.out_c, .01, l.variance, 1, l.rolling_variance, 1); /* ... + .01*batch variance */
 
-        copy_cpu(l.outputs*l.batch, l.output, 1, l.x, 1);
         normalize_cpu(l.output, l.mean, l.variance, l.batch, l.out_c, l.out_h*l.out_w);   
         copy_cpu(l.outputs*l.batch, l.output, 1, l.x_norm, 1);
     } else {
         normalize_cpu(l.output, l.rolling_mean, l.rolling_variance, l.batch, l.out_c, l.out_h*l.out_w);
     }
     scale_bias(l.output, l.scales, l.batch, l.out_c, l.out_h*l.out_w);
+    add_bias(l.output, l.biases, l.batch, l.out_c, l.out_h*l.out_w); /* applied after scaling, in both the training and the inference branch */
 }
 
-void backward_batchnorm_layer(const layer l, network_state state)
+void backward_batchnorm_layer(layer l, network net)
 {
+    if(!net.train){
+        l.mean = l.rolling_mean;
+        l.variance = l.rolling_variance;
+    }
+    backward_bias(l.bias_updates, l.delta, l.batch, l.out_c, l.out_w*l.out_h);
     backward_scale_cpu(l.x_norm, l.delta, l.batch, l.out_c, l.out_w*l.out_h, l.scale_updates);
 
     scale_bias(l.delta, l.scales, l.batch, l.out_c, l.out_h*l.out_w);
@@ -153,7 +168,7 @@ void backward_batchnorm_layer(const layer l, network_state state)
     mean_delta_cpu(l.delta, l.variance, l.batch, l.out_c, l.out_w*l.out_h, l.mean_delta);
     variance_delta_cpu(l.x, l.delta, l.mean, l.variance, l.batch, l.out_c, l.out_w*l.out_h, l.variance_delta);
     normalize_delta_cpu(l.x, l.mean, l.variance, l.mean_delta, l.variance_delta, l.batch, l.out_c, l.out_w*l.out_h, l.delta);
-    if(l.type == BATCHNORM) copy_cpu(l.outputs*l.batch, l.delta, 1, state.delta, 1);
+    if(l.type == BATCHNORM) copy_cpu(l.outputs*l.batch, l.delta, 1, net.delta, 1);
 }
 
 #ifdef GPU
@@ -171,34 +186,86 @@ void push_batchnorm_layer(layer l)
     cuda_push_array(l.rolling_variance_gpu, l.rolling_variance, l.c);
 }
 
-void forward_batchnorm_layer_gpu(layer l, network_state state)
+void forward_batchnorm_layer_gpu(layer l, network net) /* GPU forward pass; training uses cuDNN when built with CUDNN, otherwise the hand-written kernels */
 {
-    if(l.type == BATCHNORM) copy_ongpu(l.outputs*l.batch, state.input, 1, l.output_gpu, 1);
-    if(l.type == CONNECTED){
-        l.out_c = l.outputs;
-        l.out_h = l.out_w = 1;
-    }
-    if (state.train) {
+    if(l.type == BATCHNORM) copy_gpu(l.outputs*l.batch, net.input_gpu, 1, l.output_gpu, 1); /* standalone layer: start from the network's device input */
+    copy_gpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1); /* keep the un-normalized values; the cuDNN call below reads them from l.x_gpu */
+    if (net.train) {
+#ifdef CUDNN
+        float one = 1;
+        float zero = 0;
+        cudnnBatchNormalizationForwardTraining(cudnn_handle(),
+                CUDNN_BATCHNORM_SPATIAL,
+                &one,
+                &zero,
+                l.dstTensorDesc,
+                l.x_gpu,
+                l.dstTensorDesc,
+                l.output_gpu,
+                l.normTensorDesc,
+                l.scales_gpu,
+                l.biases_gpu,
+                .01, /* NOTE(review): presumably the running-average factor, matching the .99/.01 update in the #else branch - confirm against the cuDNN signature */
+                l.rolling_mean_gpu,
+                l.rolling_variance_gpu,
+                .00001, /* NOTE(review): presumably the epsilon added to the variance - confirm against the cuDNN signature */
+                l.mean_gpu,
+                l.variance_gpu);
+#else
         fast_mean_gpu(l.output_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.mean_gpu);
         fast_variance_gpu(l.output_gpu, l.mean_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.variance_gpu);
 
-        scal_ongpu(l.out_c, .99, l.rolling_mean_gpu, 1);
-        axpy_ongpu(l.out_c, .01, l.mean_gpu, 1, l.rolling_mean_gpu, 1);
-        scal_ongpu(l.out_c, .99, l.rolling_variance_gpu, 1);
-        axpy_ongpu(l.out_c, .01, l.variance_gpu, 1, l.rolling_variance_gpu, 1);
+        scal_gpu(l.out_c, .99, l.rolling_mean_gpu, 1);
+        axpy_gpu(l.out_c, .01, l.mean_gpu, 1, l.rolling_mean_gpu, 1);
+        scal_gpu(l.out_c, .99, l.rolling_variance_gpu, 1);
+        axpy_gpu(l.out_c, .01, l.variance_gpu, 1, l.rolling_variance_gpu, 1);
 
-        copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1);
+        copy_gpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1); /* same copy as the one issued before the branch */
         normalize_gpu(l.output_gpu, l.mean_gpu, l.variance_gpu, l.batch, l.out_c, l.out_h*l.out_w);
-        copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_norm_gpu, 1);
+        copy_gpu(l.outputs*l.batch, l.output_gpu, 1, l.x_norm_gpu, 1);
+
+        scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
+        add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h);
+#endif
     } else {
         normalize_gpu(l.output_gpu, l.rolling_mean_gpu, l.rolling_variance_gpu, l.batch, l.out_c, l.out_h*l.out_w);
+        scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
+        add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h);
     }
 
-    scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
 }
 
-void backward_batchnorm_layer_gpu(const layer l, network_state state)
+void backward_batchnorm_layer_gpu(layer l, network net)
 {
+    if(!net.train){
+        l.mean_gpu = l.rolling_mean_gpu;
+        l.variance_gpu = l.rolling_variance_gpu;
+    }
+#ifdef CUDNN
+    float one = 1;
+    float zero = 0;
+    cudnnBatchNormalizationBackward(cudnn_handle(),
+            CUDNN_BATCHNORM_SPATIAL,
+            &one,
+            &zero,
+            &one,
+            &one,
+            l.dstTensorDesc,
+            l.x_gpu,
+            l.dstTensorDesc,
+            l.delta_gpu,
+            l.dstTensorDesc,
+            l.x_norm_gpu,
+            l.normTensorDesc,
+            l.scales_gpu,
+            l.scale_updates_gpu,
+            l.bias_updates_gpu,
+            .00001,
+            l.mean_gpu,
+            l.variance_gpu);
+    copy_gpu(l.outputs*l.batch, l.x_norm_gpu, 1, l.delta_gpu, 1);
+#else
+    backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.out_c, l.out_w*l.out_h);
     backward_scale_gpu(l.x_norm_gpu, l.delta_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.scale_updates_gpu);
 
     scale_bias_gpu(l.delta_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
@@ -206,6 +273,7 @@ void backward_batchnorm_layer_gpu(const layer l, network_state state)
     fast_mean_delta_gpu(l.delta_gpu, l.variance_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.mean_delta_gpu);
     fast_variance_delta_gpu(l.x_gpu, l.delta_gpu, l.mean_gpu, l.variance_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.variance_delta_gpu);
     normalize_delta_gpu(l.x_gpu, l.mean_gpu, l.variance_gpu, l.mean_delta_gpu, l.variance_delta_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.delta_gpu);
-    if(l.type == BATCHNORM) copy_ongpu(l.outputs*l.batch, l.delta_gpu, 1, state.delta, 1);
+#endif
+    if(l.type == BATCHNORM) copy_gpu(l.outputs*l.batch, l.delta_gpu, 1, net.delta_gpu, 1);
 }
 #endif
diff --git a/image.darknet/inst/include/darknet/src/batchnorm_layer.h b/image.darknet/inst/include/darknet/src/batchnorm_layer.h
index 99d1d0f..25a18a3 100644
--- a/image.darknet/inst/include/darknet/src/batchnorm_layer.h
+++ b/image.darknet/inst/include/darknet/src/batchnorm_layer.h
@@ -6,12 +6,12 @@
 #include "network.h"
 
 layer make_batchnorm_layer(int batch, int w, int h, int c);
-void forward_batchnorm_layer(layer l, network_state state);
-void backward_batchnorm_layer(layer l, network_state state);
+void forward_batchnorm_layer(layer l, network net);
+void backward_batchnorm_layer(layer l, network net);
 
 #ifdef GPU
-void forward_batchnorm_layer_gpu(layer l, network_state state);
-void backward_batchnorm_layer_gpu(layer l, network_state state);
+void forward_batchnorm_layer_gpu(layer l, network net);
+void backward_batchnorm_layer_gpu(layer l, network net);
 void pull_batchnorm_layer(layer l);
 void push_batchnorm_layer(layer l);
 #endif
diff --git a/image.darknet/inst/include/darknet/src/blas.c b/image.darknet/inst/include/darknet/src/blas.c
index 31bd86b..9e16044 100644
--- a/image.darknet/inst/include/darknet/src/blas.c
+++ b/image.darknet/inst/include/darknet/src/blas.c
@@ -1,5 +1,6 @@
 #include "blas.h"
-#include "math.h"
+
+#include <math.h>
 #include <assert.h>
 #include <float.h>
 #include <stdio.h>
@@ -54,7 +55,17 @@ void weighted_sum_cpu(float *a, float *b, float *s, int n, float *c)
     }
 }
 
-void shortcut_cpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float *out)
+void weighted_delta_cpu(float *a, float *b, float *s, float *da, float *db, float *ds, int n, float *dc)
+{
+    int left; /* per element: da += dc*s and db += dc*(1-s), each skipped when its pointer is NULL; ds += dc*(a-b) always */
+    for(left = n; left > 0; --left, ++a, ++b, ++s, ++dc, ++ds){
+        if(da) *da++ += *dc * *s;
+        if(db) *db++ += *dc * (1 - *s);
+        *ds += *dc * (*a - *b);
+    }
+}
+
+void shortcut_cpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float s1, float s2, float *out)
 {
     int stride = w1/w2;
     int sample = w2/w1;
@@ -73,7 +84,7 @@ void shortcut_cpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2,
                 for(i = 0; i < minw; ++i){
                     int out_index = i*sample + w2*(j*sample + h2*(k + c2*b));
                     int add_index = i*stride + w1*(j*stride + h1*(k + c1*b));
-                    out[out_index] += add[add_index];
+                    out[out_index] = s1*out[out_index] + s2*add[add_index];
                 }
             }
         }
@@ -112,6 +123,27 @@ void variance_cpu(float *x, float *mean, int batch, int filters, int spatial, fl
     }
 }
 
+/* For every batch item and spatial position, divide the `filters` values found there by their L2 norm; dx receives (1 - x)/norm of the scaled values. */
+void l2normalize_cpu(float *x, float *dx, int batch, int filters, int spatial)
+{
+    int b, f, i;
+    for(b = 0; b < batch; ++b){
+        for(i = 0; i < spatial; ++i){
+            /* the values of one position sit `spatial` floats apart, one per filter */
+            float *xv = x + b*filters*spatial + i;
+            float *dv = dx + b*filters*spatial + i;
+            float norm = 0;
+            for(f = 0; f < filters; ++f) norm += powf(xv[f*spatial], 2);
+            norm = sqrtf(norm);
+            for(f = 0; f < filters; ++f){
+                xv[f*spatial] /= norm;
+                dv[f*spatial] = (1 - xv[f*spatial]) / norm;
+            }
+        }
+    }
+}
+
+
 void normalize_cpu(float *x, float *mean, float *variance, int batch, int filters, int spatial)
 {
     int b, f, i;
@@ -161,12 +193,48 @@ void fill_cpu(int N, float ALPHA, float *X, int INCX)
     for(i = 0; i < N; ++i) X[i*INCX] = ALPHA;
 }
 
+/* OUT holds B records, each NX values followed by NY values; add every value onto the
+ * matching slot of X (first part) or Y (second part). A NULL X or Y is skipped, but its
+ * part of the record is still stepped over. */
+void deinter_cpu(int NX, float *X, int NY, float *Y, int B, float *OUT)
+{
+    int row, k;
+    for(row = 0; row < B; ++row){
+        for(k = 0; k < NX; ++k, ++OUT){
+            if(X) X[row*NX + k] += *OUT;
+        }
+        for(k = 0; k < NY; ++k, ++OUT){
+            if(Y) Y[row*NY + k] += *OUT;
+        }
+    }
+}
+
+/*
+ * Pack X and Y into OUT as B consecutive records: record j is the NX values
+ * starting at X[j*NX] followed by the NY values starting at Y[j*NY].
+ */
+void inter_cpu(int NX, float *X, int NY, float *Y, int B, float *OUT)
+{
+    int row, k;
+    float *dst = OUT;
+    for(row = 0; row < B; ++row){
+        for(k = 0; k < NX; ++k) *dst++ = X[row*NX + k];
+        for(k = 0; k < NY; ++k) *dst++ = Y[row*NY + k];
+    }
+}
+
 void copy_cpu(int N, float *X, int INCX, float *Y, int INCY)
 {
     int i;
     for(i = 0; i < N; ++i) Y[i*INCY] = X[i*INCX];
 }
 
+void mult_add_into_cpu(int N, float *X, float *Y, float *Z)
+{
+    int left; /* Z[i] += X[i]*Y[i] for the first N elements */
+    for(left = N; left > 0; --left) *Z++ += *X++ * *Y++;
+}
+
 void smooth_l1_cpu(int n, float *pred, float *truth, float *delta, float *error)
 {
     int i;
@@ -179,11 +247,43 @@ void smooth_l1_cpu(int n, float *pred, float *truth, float *delta, float *error)
         }
         else {
             error[i] = 2*abs_val - 1;
-            delta[i] = (diff < 0) ? -1 : 1;
+            delta[i] = (diff < 0) ? 1 : -1;
         }
     }
 }
 
+void l1_cpu(int n, float *pred, float *truth, float *delta, float *error)
+{
+    int k; /* error = |truth - pred|; delta = +1 where truth > pred, otherwise -1 (ties included) */
+    for(k = 0; k < n; ++k){
+        const float gap = truth[k] - pred[k];
+        error[k] = fabs(gap);
+        if(gap > 0) delta[k] = 1; else delta[k] = -1;
+    }
+}
+
+/* Cross-entropy on softmax outputs: error is -log(pred) wherever truth is non-zero and 0 elsewhere; delta is truth - pred. */
+void softmax_x_ent_cpu(int n, float *pred, float *truth, float *delta, float *error)
+{
+    int k;
+    for(k = 0; k < n; ++k){
+        const float want = truth[k], got = pred[k];
+        if(want != 0) error[k] = -log(got); else error[k] = 0;
+        delta[k] = want - got;
+    }
+}
+
+/* Binary cross-entropy: error = -t*log(p) - (1-t)*log(1-p), delta = t - p. A zero-weighted term is skipped so 0*log(0) cannot turn the error into NaN. */
+void logistic_x_ent_cpu(int n, float *pred, float *truth, float *delta, float *error)
+{
+    int i;
+    for(i = 0; i < n; ++i){
+        float t = truth[i], p = pred[i];
+        error[i] = ((t) ? -t*log(p) : 0) - ((1-t) ? (1-t)*log(1-p) : 0);
+        delta[i] = t-p;
+    }
+}
+
 void l2_cpu(int n, float *pred, float *truth, float *delta, float *error)
 {
     int i;
@@ -202,21 +302,50 @@ float dot_cpu(int N, float *X, int INCX, float *Y, int INCY)
     return dot;
 }
 
-void softmax(float *input, int n, float temp, float *output)
+void softmax(float *input, int n, float temp, int stride, float *output) /* softmax over n values taken every `stride` floats from input, each divided by temp; output is written with the same spacing */
 {
     int i;
     float sum = 0;
     float largest = -FLT_MAX;
     for(i = 0; i < n; ++i){
-        if(input[i] > largest) largest = input[i];
+        if(input[i*stride] > largest) largest = input[i*stride]; /* the maximum is subtracted below, so the largest exponent is 0 */
     }
     for(i = 0; i < n; ++i){
-        float e = exp(input[i]/temp - largest/temp);
+        float e = exp(input[i*stride]/temp - largest/temp);
         sum += e;
-        output[i] = e;
+        output[i*stride] = e; /* unnormalized for now */
     }
     for(i = 0; i < n; ++i){
-        output[i] /= sum;
+        output[i*stride] /= sum; /* normalize so the n outputs sum to 1 */
     }
 }
 
+
+void softmax_cpu(float *input, int n, int batch, int batch_offset, int groups, int group_offset, int stride, float temp, float *output)
+{
+    int b, g, at; /* one softmax() call per (batch item, group); `at` is that slice's offset in both buffers */
+    for(b = 0; b < batch; ++b){
+        for(g = 0, at = b*batch_offset; g < groups; ++g, at += group_offset){
+            softmax(input + at, n, temp, stride, output + at);
+        }
+    }
+}
+
+void upsample_cpu(float *in, int w, int h, int c, int batch, int stride, int forward, float scale, float *out)
+{
+    int b, k, y, x, ow = w*stride, oh = h*stride; /* each input pixel maps to a stride x stride patch of the ow x oh output plane */
+    for(b = 0; b < batch; ++b){
+        for(k = 0; k < c; ++k){
+            float *src = in + (b*c + k)*w*h;      /* input plane of batch item b, channel k */
+            float *dst = out + (b*c + k)*ow*oh;   /* matching output plane */
+            for(y = 0; y < oh; ++y){
+                for(x = 0; x < ow; ++x){
+                    if(forward) dst[y*ow + x] = scale*src[(y/stride)*w + x/stride];
+                    else src[(y/stride)*w + x/stride] += scale*dst[y*ow + x];
+                }
+            }
+        }
+    }
+}
+
+
diff --git a/image.darknet/inst/include/darknet/src/blas.h b/image.darknet/inst/include/darknet/src/blas.h
index 3d6ee7d..707291d 100644
--- a/image.darknet/inst/include/darknet/src/blas.h
+++ b/image.darknet/inst/include/darknet/src/blas.h
@@ -1,5 +1,7 @@
 #ifndef BLAS_H
 #define BLAS_H
+#include "darknet.h"
+
 void flatten(float *x, int size, int layers, int batch, int forward);
 void pm(int M, int N, float *A);
 float *random_matrix(int rows, int cols);
@@ -8,53 +10,60 @@ void reorg_cpu(float *x, int w, int h, int c, int batch, int stride, int forward
 
 void test_blas();
 
+void inter_cpu(int NX, float *X, int NY, float *Y, int B, float *OUT);
+void deinter_cpu(int NX, float *X, int NY, float *Y, int B, float *OUT);
+void mult_add_into_cpu(int N, float *X, float *Y, float *Z);
+
 void const_cpu(int N, float ALPHA, float *X, int INCX);
-void constrain_ongpu(int N, float ALPHA, float * X, int INCX);
+void constrain_gpu(int N, float ALPHA, float * X, int INCX);
 void pow_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY);
 void mul_cpu(int N, float *X, int INCX, float *Y, int INCY);
 
-void axpy_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY);
-void copy_cpu(int N, float *X, int INCX, float *Y, int INCY);
-void scal_cpu(int N, float ALPHA, float *X, int INCX);
-void fill_cpu(int N, float ALPHA, float * X, int INCX);
-float dot_cpu(int N, float *X, int INCX, float *Y, int INCY);
-void test_gpu_blas();
-void shortcut_cpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float *out);
+int test_gpu_blas();
+void shortcut_cpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float s1, float s2, float *out);
 
 void mean_cpu(float *x, int batch, int filters, int spatial, float *mean);
 void variance_cpu(float *x, float *mean, int batch, int filters, int spatial, float *variance);
-void normalize_cpu(float *x, float *mean, float *variance, int batch, int filters, int spatial);
 
 void scale_bias(float *output, float *scales, int batch, int n, int size);
 void backward_scale_cpu(float *x_norm, float *delta, int batch, int n, int size, float *scale_updates);
 void mean_delta_cpu(float *delta, float *variance, int batch, int filters, int spatial, float *mean_delta);
 void  variance_delta_cpu(float *x, float *delta, float *mean, float *variance, int batch, int filters, int spatial, float *variance_delta);
 void normalize_delta_cpu(float *x, float *mean, float *variance, float *mean_delta, float *variance_delta, int batch, int filters, int spatial, float *delta);
+void l2normalize_cpu(float *x, float *dx, int batch, int filters, int spatial);
 
 void smooth_l1_cpu(int n, float *pred, float *truth, float *delta, float *error);
 void l2_cpu(int n, float *pred, float *truth, float *delta, float *error);
+void l1_cpu(int n, float *pred, float *truth, float *delta, float *error);
+void logistic_x_ent_cpu(int n, float *pred, float *truth, float *delta, float *error);
+void softmax_x_ent_cpu(int n, float *pred, float *truth, float *delta, float *error);
 void weighted_sum_cpu(float *a, float *b, float *s, int num, float *c);
+void weighted_delta_cpu(float *a, float *b, float *s, float *da, float *db, float *ds, int n, float *dc);
 
-void softmax(float *input, int n, float temp, float *output);
+void softmax(float *input, int n, float temp, int stride, float *output);
+void softmax_cpu(float *input, int n, int batch, int batch_offset, int groups, int group_offset, int stride, float temp, float *output);
+void upsample_cpu(float *in, int w, int h, int c, int batch, int stride, int forward, float scale, float *out);
 
 #ifdef GPU
 #include "cuda.h"
-
-void axpy_ongpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY);
-void axpy_ongpu_offset(int N, float ALPHA, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY);
-void copy_ongpu(int N, float * X, int INCX, float * Y, int INCY);
-void copy_ongpu_offset(int N, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY);
-void scal_ongpu(int N, float ALPHA, float * X, int INCX);
-void supp_ongpu(int N, float ALPHA, float * X, int INCX);
-void mask_ongpu(int N, float * X, float mask_num, float * mask);
-void const_ongpu(int N, float ALPHA, float *X, int INCX);
-void pow_ongpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY);
-void mul_ongpu(int N, float *X, int INCX, float *Y, int INCY);
-void fill_ongpu(int N, float ALPHA, float * X, int INCX);
+#include "tree.h"
+
+void axpy_gpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY);
+void axpy_gpu_offset(int N, float ALPHA, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY);
+void copy_gpu(int N, float * X, int INCX, float * Y, int INCY);
+void copy_gpu_offset(int N, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY);
+void add_gpu(int N, float ALPHA, float * X, int INCX);
+void supp_gpu(int N, float ALPHA, float * X, int INCX);
+void mask_gpu(int N, float * X, float mask_num, float * mask, float val);
+void scale_mask_gpu(int N, float * X, float mask_num, float * mask, float scale);
+void const_gpu(int N, float ALPHA, float *X, int INCX);
+void pow_gpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY);
+void mul_gpu(int N, float *X, int INCX, float *Y, int INCY);
 
 void mean_gpu(float *x, int batch, int filters, int spatial, float *mean);
 void variance_gpu(float *x, float *mean, int batch, int filters, int spatial, float *variance);
 void normalize_gpu(float *x, float *mean, float *variance, int batch, int filters, int spatial);
+void l2normalize_gpu(float *x, float *dx, int batch, int filters, int spatial);
 
 void normalize_delta_gpu(float *x, float *mean, float *variance, float *mean_delta, float *variance_delta, int batch, int filters, int spatial, float *delta);
 
@@ -63,25 +72,34 @@ void fast_variance_delta_gpu(float *x, float *delta, float *mean, float *varianc
 
 void fast_variance_gpu(float *x, float *mean, int batch, int filters, int spatial, float *variance);
 void fast_mean_gpu(float *x, int batch, int filters, int spatial, float *mean);
-void shortcut_gpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float *out);
+void shortcut_gpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float s1, float s2, float *out);
 void scale_bias_gpu(float *output, float *biases, int batch, int n, int size);
 void backward_scale_gpu(float *x_norm, float *delta, int batch, int n, int size, float *scale_updates);
 void scale_bias_gpu(float *output, float *biases, int batch, int n, int size);
 void add_bias_gpu(float *output, float *biases, int batch, int n, int size);
 void backward_bias_gpu(float *bias_updates, float *delta, int batch, int n, int size);
 
+void logistic_x_ent_gpu(int n, float *pred, float *truth, float *delta, float *error);
+void softmax_x_ent_gpu(int n, float *pred, float *truth, float *delta, float *error);
 void smooth_l1_gpu(int n, float *pred, float *truth, float *delta, float *error);
 void l2_gpu(int n, float *pred, float *truth, float *delta, float *error);
+void l1_gpu(int n, float *pred, float *truth, float *delta, float *error);
+void wgan_gpu(int n, float *pred, float *truth, float *delta, float *error);
 void weighted_delta_gpu(float *a, float *b, float *s, float *da, float *db, float *ds, int num, float *dc);
 void weighted_sum_gpu(float *a, float *b, float *s, int num, float *c);
 void mult_add_into_gpu(int num, float *a, float *b, float *c);
+void inter_gpu(int NX, float *X, int NY, float *Y, int B, float *OUT);
+void deinter_gpu(int NX, float *X, int NY, float *Y, int B, float *OUT);
 
-void reorg_ongpu(float *x, int w, int h, int c, int batch, int stride, int forward, float *out);
+void reorg_gpu(float *x, int w, int h, int c, int batch, int stride, int forward, float *out);
 
-void softmax_gpu(float *input, int n, int offset, int groups, float temp, float *output);
+void softmax_gpu(float *input, int n, int batch, int batch_offset, int groups, int group_offset, int stride, float temp, float *output);
+void adam_update_gpu(float *w, float *d, float *m, float *v, float B1, float B2, float eps, float decay, float rate, int n, int batch, int t);
 void adam_gpu(int n, float *x, float *m, float *v, float B1, float B2, float rate, float eps, int t);
 
-void flatten_ongpu(float *x, int spatial, int layers, int batch, int forward, float *out);
+void flatten_gpu(float *x, int spatial, int layers, int batch, int forward, float *out);
+void softmax_tree(float *input, int spatial, int batch, int stride, float temp, float *output, tree hier);
+void upsample_gpu(float *in, int w, int h, int c, int batch, int stride, int forward, float scale, float *out);
 
 #endif
 #endif
diff --git a/image.darknet/inst/include/darknet/src/blas_kernels.cu b/image.darknet/inst/include/darknet/src/blas_kernels.cu
index d940176..47e8217 100644
--- a/image.darknet/inst/include/darknet/src/blas_kernels.cu
+++ b/image.darknet/inst/include/darknet/src/blas_kernels.cu
@@ -53,24 +53,40 @@ void backward_scale_gpu(float *x_norm, float *delta, int batch, int n, int size,
     check_error(cudaPeekAtLastError());
 }
 
-__global__ void add_bias_kernel(float *output, float *biases, int n, int size)
+__global__ void add_bias_kernel(float *output, float *biases, int batch, int n, int size)
 {
-    int offset = blockIdx.x * blockDim.x + threadIdx.x;
-    int filter = blockIdx.y;
-    int batch = blockIdx.z;
+    int index = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if (index >= n*size*batch) return;
+    int i = index % size;
+    index /= size;
+    int j = index % n;
+    index /= n;
+    int k = index;
 
-    if(offset < size) output[(batch*n+filter)*size + offset] += biases[filter];
+    output[(k*n+j)*size + i] += biases[j];
 }
 
 void add_bias_gpu(float *output, float *biases, int batch, int n, int size)
 {
-    dim3 dimGrid((size-1)/BLOCK + 1, n, batch);
-    dim3 dimBlock(BLOCK, 1, 1);
+    int num = n*size*batch;
 
-    add_bias_kernel<<<dimGrid, dimBlock>>>(output, biases, n, size);
+    add_bias_kernel<<<cuda_gridsize(num), BLOCK>>>(output, biases, batch, n, size);
     check_error(cudaPeekAtLastError());
 }
 
+__global__ void backward_bias_conn_kernel(float *bias_updates, float *delta, int batch, int n) // each thread handles one index in [0, n) and adds the batch-sum of delta to bias_updates
+{
+    int index = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; // flat thread id over a 2D grid of 1D blocks
+    if (index >= n) return; // surplus threads in the last block do nothing
+    int b;
+    float sum = 0;
+    for(b = 0; b < batch; ++b){
+        int i = b*n + index; // delta is read as [b][index] with row length n
+        sum += delta[i];
+    }
+    bias_updates[index] += sum; // accumulates (+=) rather than overwriting
+}
+
 __global__ void backward_bias_kernel(float *bias_updates, float *delta, int batch, int n, int size)
 {
     __shared__ float part[BLOCK];
@@ -91,6 +107,16 @@ __global__ void backward_bias_kernel(float *bias_updates, float *delta, int batc
     }
 }
 
+void backward_bias_gpu(float *bias_updates, float *delta, int batch, int n, int size) // picks the bias-update kernel based on size
+{
+    if(size == 1){ // one delta value per (batch, index): use the per-index kernel
+        backward_bias_conn_kernel<<<cuda_gridsize(n), BLOCK>>>(bias_updates, delta, batch, n);
+    }else{
+        backward_bias_kernel<<<n, BLOCK>>>(bias_updates, delta, batch, n, size); // launched with n blocks of BLOCK threads
+    }
+    check_error(cudaPeekAtLastError()); // hands the status of whichever launch ran to check_error
+}
+
 /*
 __global__ void dot_kernel(float *output, float scale, int batch, int n, int size, float *delta)
 {
@@ -133,20 +159,16 @@ void dot_error_gpu(layer l)
 }
 */
 
-void backward_bias_gpu(float *bias_updates, float *delta, int batch, int n, int size)
-{
-    backward_bias_kernel<<<n, BLOCK>>>(bias_updates, delta, batch, n, size);
-    check_error(cudaPeekAtLastError());
-}
-
 
 __global__ void adam_kernel(int N, float *x, float *m, float *v, float B1, float B2, float rate, float eps, int t)
 {
     int index = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
     if (index >= N) return;
+
+    float mhat = m[index] / (1.f - powf(B1, t));
+    float vhat = v[index] / (1.f - powf(B2, t));
     
-    x[index] = x[index] - (rate * sqrt(1.-pow(B2, t)) / (1.-pow(B1, t)) * m[index] / (sqrt(v[index]) + eps));
-    //if(index == 0) printf("%f %f %f %f\n", m[index], v[index], (rate * sqrt(1.-pow(B2, t)) / (1.-pow(B1, t)) * m[index] / (sqrt(v[index]) + eps)));
+    x[index] = x[index] + rate * mhat / (sqrtf(vhat) + eps);
 }
 
 extern "C" void adam_gpu(int n, float *x, float *m, float *v, float B1, float B2, float rate, float eps, int t)
@@ -155,13 +177,27 @@ extern "C" void adam_gpu(int n, float *x, float *m, float *v, float B1, float B2
     check_error(cudaPeekAtLastError());
 }
 
+extern "C" void adam_update_gpu(float *w, float *d, float *m, float *v, float B1, float B2, float eps, float decay, float rate, int n, int batch, int t) // updates m and v from d, applies the Adam step to w, then clears d
+{
+    scal_gpu(n, B1, m, 1); // m *= B1
+    scal_gpu(n, B2, v, 1); // v *= B2
+    axpy_gpu(n, -decay*batch, w, 1, d, 1); // presumably d += (-decay*batch) * w; axpy kernel body not shown here - confirm
+
+    axpy_gpu(n, (1-B1), d, 1, m, 1); // presumably m += (1-B1) * d - confirm against axpy kernel
+    mul_gpu(n, d, 1, d, 1); // same buffer passed as X and Y; presumably squares d in place - confirm against mul kernel
+    axpy_gpu(n, (1-B2), d, 1, v, 1); // presumably v += (1-B2) * d, with d now holding the squared values
+
+    adam_gpu(n, w, m, v, B1, B2, rate, eps, t); // must run after m and v are updated above
+    fill_gpu(n, 0, d, 1); // reset d to 0 for the next accumulation
+}
+
 __global__ void normalize_kernel(int N, float *x, float *mean, float *variance, int batch, int filters, int spatial)
 {
     int index = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
     if (index >= N) return;
     int f = (index/spatial)%filters;
     
-    x[index] = (x[index] - mean[f])/(sqrt(variance[f]) + .000001f);
+    x[index] = (x[index] - mean[f])/(sqrtf(variance[f] + .00001f));
 }
 
 __global__ void normalize_delta_kernel(int N, float *x, float *mean, float *variance, float *mean_delta, float *variance_delta, int batch, int filters, int spatial, float *delta)
@@ -170,7 +206,7 @@ __global__ void normalize_delta_kernel(int N, float *x, float *mean, float *vari
     if (index >= N) return;
     int f = (index/spatial)%filters;
     
-    delta[index] = delta[index] * 1./(sqrt(variance[f]) + .000001f) + variance_delta[f] * 2. * (x[index] - mean[f]) / (spatial * batch) + mean_delta[f]/(spatial*batch);
+    delta[index] = delta[index] * 1.f/(sqrtf(variance[f] + .00001f)) + variance_delta[f] * 2.f * (x[index] - mean[f]) / (spatial * batch) + mean_delta[f]/(spatial*batch);
 }
 
 extern "C" void normalize_delta_gpu(float *x, float *mean, float *variance, float *mean_delta, float *variance_delta, int batch, int filters, int spatial, float *delta)
@@ -192,7 +228,7 @@ __global__ void  variance_delta_kernel(float *x, float *delta, float *mean, floa
             variance_delta[i] += delta[index]*(x[index] - mean[i]);
         }
     }
-    variance_delta[i] *= -.5 * pow(variance[i] + .000001f, (float)(-3./2.));
+    variance_delta[i] *= -.5f * powf(variance[i] + .00001f, (float)(-3.f/2.f));
 }
 
 __global__ void accumulate_kernel(float *x, int n, int groups, float *sum)
@@ -224,12 +260,14 @@ __global__ void fast_mean_delta_kernel(float *delta, float *variance, int batch,
         }
     }
 
+    __syncthreads();
+
     if(id == 0){
         mean_delta[filter] = 0;
         for(i = 0; i < threads; ++i){
             mean_delta[filter] += local[i];
         }
-        mean_delta[filter] *= (-1./sqrt(variance[filter] + .000001f));
+        mean_delta[filter] *= (-1.f/sqrtf(variance[filter] + .00001f));
     }
 }
 
@@ -252,12 +290,14 @@ __global__ void  fast_variance_delta_kernel(float *x, float *delta, float *mean,
         }
     }
 
+    __syncthreads();
+
     if(id == 0){
         variance_delta[filter] = 0;
         for(i = 0; i < threads; ++i){
             variance_delta[filter] += local[i];
         }
-        variance_delta[filter] *= -.5 * pow(variance[filter] + .000001f, (float)(-3./2.));
+        variance_delta[filter] *= -.5f * powf(variance[filter] + .00001f, (float)(-3.f/2.f));
     }
 }
 
@@ -274,7 +314,7 @@ __global__ void mean_delta_kernel(float *delta, float *variance, int batch, int
             mean_delta[i] += delta[index];
         }
     }
-    mean_delta[i] *= (-1./sqrt(variance[i] + .000001f));
+    mean_delta[i] *= (-1.f/sqrtf(variance[i] + .00001f));
 }
 
 extern "C" void mean_delta_gpu(float *delta, float *variance, int batch, int filters, int spatial, float *mean_delta)
@@ -297,7 +337,7 @@ extern "C" void fast_variance_delta_gpu(float *x, float *delta, float *mean, flo
 
 __global__ void  mean_kernel(float *x, int batch, int filters, int spatial, float *mean)
 {
-    float scale = 1./(batch * spatial);
+    float scale = 1.f/(batch * spatial);
     int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
     if (i >= filters) return;
     int j,k;
@@ -313,7 +353,7 @@ __global__ void  mean_kernel(float *x, int batch, int filters, int spatial, floa
 
 __global__ void variance_kernel(float *x, float *mean, int batch, int filters, int spatial, float *variance)
 {
-    float scale = 1./(batch * spatial - 1);
+    float scale = 1.f/(batch * spatial - 1);
     int j,k;
     int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
     if (i >= filters) return;
@@ -321,7 +361,7 @@ __global__ void variance_kernel(float *x, float *mean, int batch, int filters, i
     for(j = 0; j < batch; ++j){
         for(k = 0; k < spatial; ++k){
             int index = j*filters*spatial + i*spatial + k;
-            variance[i] += pow((x[index] - mean[i]), 2);
+            variance[i] += powf((x[index] - mean[i]), 2);
         }
     }
     variance[i] *= scale;
@@ -391,22 +431,22 @@ __global__ void supp_kernel(int N, float ALPHA, float *X, int INCX)
     }
 }
 
-__global__ void scal_kernel(int N, float ALPHA, float *X, int INCX)
+__global__ void add_kernel(int N, float ALPHA, float *X, int INCX)
 {
     int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
-    if(i < N) X[i*INCX] *= ALPHA;
+    if(i < N) X[i*INCX] += ALPHA;
 }
 
-__global__ void fill_kernel(int N, float ALPHA, float *X, int INCX)
+__global__ void scal_kernel(int N, float ALPHA, float *X, int INCX)
 {
     int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
-    if(i < N) X[i*INCX] = ALPHA;
+    if(i < N) X[i*INCX] *= ALPHA;
 }
 
-__global__ void mask_kernel(int n,  float *x, float mask_num, float *mask)
+__global__ void fill_kernel(int N, float ALPHA, float *X, int INCX)
 {
     int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
-    if(i < n && mask[i] == mask_num) x[i] = mask_num;
+    if(i < N) X[i*INCX] = ALPHA;
 }
 
 __global__ void copy_kernel(int N,  float *X, int OFFX, int INCX, float *Y, int OFFY, int INCY)
@@ -429,6 +469,35 @@ extern "C" void normalize_gpu(float *x, float *mean, float *variance, int batch,
     check_error(cudaPeekAtLastError());
 }
 
+__global__ void l2norm_kernel(int N, float *x, float *dx, int batch, int filters, int spatial) // each thread normalizes the `filters` values at one (batch, spatial) position; N bounds the thread ids
+{
+    int index = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if (index >= N) return;
+    int b = index / spatial; // batch entry
+    int i = index % spatial; // position inside the spatial extent
+    int f;
+    float sum = 0;
+    for(f = 0; f < filters; ++f){
+        int index = b*filters*spatial + f*spatial + i; // shadows the outer index; x is read as [b][f][i]
+        sum += powf(x[index], 2); // sum of squares across filters
+    }
+    sum = sqrtf(sum); // sum now holds the L2 norm over the filter axis
+    if(sum == 0) sum = 1; // avoids dividing by zero when every value is 0
+    //printf("%f\n", sum);
+    for(f = 0; f < filters; ++f){
+        int index = b*filters*spatial + f*spatial + i;
+        x[index] /= sum; // x is normalized in place
+        dx[index] = (1 - x[index]) / sum; // uses the already-normalized x[index] from the line above
+    }
+}
+
+extern "C" void l2normalize_gpu(float *x, float *dx, int batch, int filters, int spatial)
+{
+    size_t N = batch*spatial;
+    l2norm_kernel<<<cuda_gridsize(N), BLOCK>>>(N, x, dx, batch, filters, spatial);
+    check_error(cudaPeekAtLastError());
+}
+
 __global__ void  fast_mean_kernel(float *x, int batch, int filters, int spatial, float *mean)
 {
     const int threads = BLOCK;
@@ -447,6 +516,8 @@ __global__ void  fast_mean_kernel(float *x, int batch, int filters, int spatial,
         }
     }
 
+    __syncthreads();
+
     if(id == 0){
         mean[filter] = 0;
         for(i = 0; i < threads; ++i){
@@ -471,10 +542,12 @@ __global__ void  fast_variance_kernel(float *x, float *mean, int batch, int filt
         for(i = 0; i < spatial; i += threads){
             int index = j*spatial*filters + filter*spatial + i + id;
 
-            local[id] += (i+id < spatial) ? pow((x[index] - mean[filter]), 2) : 0;
+            local[id] += (i+id < spatial) ? powf((x[index] - mean[filter]), 2) : 0;
         }
     }
 
+    __syncthreads();
+
     if(id == 0){
         variance[filter] = 0;
         for(i = 0; i < threads; ++i){
@@ -509,35 +582,35 @@ extern "C" void variance_gpu(float *x, float *mean, int batch, int filters, int
     check_error(cudaPeekAtLastError());
 }
 
-extern "C" void axpy_ongpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY)
+extern "C" void axpy_gpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY)
 {
-    axpy_ongpu_offset(N, ALPHA, X, 0, INCX, Y, 0, INCY);
+    axpy_gpu_offset(N, ALPHA, X, 0, INCX, Y, 0, INCY);
 }
 
-extern "C" void pow_ongpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY)
+extern "C" void pow_gpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY)
 {
     pow_kernel<<<cuda_gridsize(N), BLOCK>>>(N, ALPHA, X, INCX, Y, INCY);
     check_error(cudaPeekAtLastError());
 }
 
-extern "C" void axpy_ongpu_offset(int N, float ALPHA, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY)
+extern "C" void axpy_gpu_offset(int N, float ALPHA, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY)
 {
     axpy_kernel<<<cuda_gridsize(N), BLOCK>>>(N, ALPHA, X, OFFX, INCX, Y, OFFY, INCY);
     check_error(cudaPeekAtLastError());
 }
 
-extern "C" void copy_ongpu(int N, float * X, int INCX, float * Y, int INCY)
+extern "C" void copy_gpu(int N, float * X, int INCX, float * Y, int INCY)
 {
-    copy_ongpu_offset(N, X, 0, INCX, Y, 0, INCY);
+    copy_gpu_offset(N, X, 0, INCX, Y, 0, INCY);
 }
 
-extern "C" void mul_ongpu(int N, float * X, int INCX, float * Y, int INCY)
+extern "C" void mul_gpu(int N, float * X, int INCX, float * Y, int INCY)
 {
     mul_kernel<<<cuda_gridsize(N), BLOCK>>>(N, X, INCX, Y, INCY);
     check_error(cudaPeekAtLastError());
 }
 
-extern "C" void copy_ongpu_offset(int N, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY)
+extern "C" void copy_gpu_offset(int N, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY)
 {
     copy_kernel<<<cuda_gridsize(N), BLOCK>>>(N, X, OFFX, INCX, Y, OFFY, INCY);
     check_error(cudaPeekAtLastError());
@@ -560,58 +633,82 @@ __global__ void flatten_kernel(int N, float *x, int spatial, int layers, int bat
     else out[i1] = x[i2];
 }
 
-extern "C" void flatten_ongpu(float *x, int spatial, int layers, int batch, int forward, float *out)
+extern "C" void flatten_gpu(float *x, int spatial, int layers, int batch, int forward, float *out)
 {
     int size = spatial*batch*layers;
     flatten_kernel<<<cuda_gridsize(size), BLOCK>>>(size, x, spatial, layers, batch, forward, out);
     check_error(cudaPeekAtLastError());
 }
 
-extern "C" void reorg_ongpu(float *x, int w, int h, int c, int batch, int stride, int forward, float *out)
+extern "C" void reorg_gpu(float *x, int w, int h, int c, int batch, int stride, int forward, float *out)
 {
     int size = w*h*c*batch;
     reorg_kernel<<<cuda_gridsize(size), BLOCK>>>(size, x, w, h, c, batch, stride, forward, out);
     check_error(cudaPeekAtLastError());
 }
 
-extern "C" void mask_ongpu(int N, float * X, float mask_num, float * mask)
+__global__ void mask_kernel(int n,  float *x, float mask_num, float *mask, float val)
 {
-    mask_kernel<<<cuda_gridsize(N), BLOCK>>>(N, X, mask_num, mask);
+    int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if(i < n && mask[i] == mask_num) x[i] = val;
+}
+
+extern "C" void mask_gpu(int N, float * X, float mask_num, float * mask, float val)
+{
+    mask_kernel<<<cuda_gridsize(N), BLOCK>>>(N, X, mask_num, mask, val);
+    check_error(cudaPeekAtLastError());
+}
+
+__global__ void scale_mask_kernel(int n,  float *x, float mask_num, float *mask, float scale)
+{
+    int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if(i < n && mask[i] == mask_num) x[i] *= scale;
+}
+
+extern "C" void scale_mask_gpu(int N, float * X, float mask_num, float * mask, float scale)
+{
+    scale_mask_kernel<<<cuda_gridsize(N), BLOCK>>>(N, X, mask_num, mask, scale);
     check_error(cudaPeekAtLastError());
 }
 
-extern "C" void const_ongpu(int N, float ALPHA, float * X, int INCX)
+extern "C" void const_gpu(int N, float ALPHA, float * X, int INCX)
 {
     const_kernel<<<cuda_gridsize(N), BLOCK>>>(N, ALPHA, X, INCX);
     check_error(cudaPeekAtLastError());
 }
 
-extern "C" void constrain_ongpu(int N, float ALPHA, float * X, int INCX)
+extern "C" void constrain_gpu(int N, float ALPHA, float * X, int INCX)
 {
     constrain_kernel<<<cuda_gridsize(N), BLOCK>>>(N, ALPHA, X, INCX);
     check_error(cudaPeekAtLastError());
 }
 
 
-extern "C" void scal_ongpu(int N, float ALPHA, float * X, int INCX)
+extern "C" void add_gpu(int N, float ALPHA, float * X, int INCX)
+{
+    add_kernel<<<cuda_gridsize(N), BLOCK>>>(N, ALPHA, X, INCX);
+    check_error(cudaPeekAtLastError());
+}
+
+extern "C" void scal_gpu(int N, float ALPHA, float * X, int INCX)
 {
     scal_kernel<<<cuda_gridsize(N), BLOCK>>>(N, ALPHA, X, INCX);
     check_error(cudaPeekAtLastError());
 }
 
-extern "C" void supp_ongpu(int N, float ALPHA, float * X, int INCX)
+extern "C" void supp_gpu(int N, float ALPHA, float * X, int INCX)
 {
     supp_kernel<<<cuda_gridsize(N), BLOCK>>>(N, ALPHA, X, INCX);
     check_error(cudaPeekAtLastError());
 }
 
-extern "C" void fill_ongpu(int N, float ALPHA, float * X, int INCX)
+extern "C" void fill_gpu(int N, float ALPHA, float * X, int INCX)
 {
     fill_kernel<<<cuda_gridsize(N), BLOCK>>>(N, ALPHA, X, INCX);
     check_error(cudaPeekAtLastError());
 }
 
-__global__ void shortcut_kernel(int size, int minw, int minh, int minc, int stride, int sample, int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float *out)
+__global__ void shortcut_kernel(int size, int minw, int minh, int minc, int stride, int sample, int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float s1, float s2, float *out)
 {
     int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
     if (id >= size) return;
@@ -625,10 +722,11 @@ __global__ void shortcut_kernel(int size, int minw, int minh, int minc, int stri
 
     int out_index = i*sample + w2*(j*sample + h2*(k + c2*b));
     int add_index = i*stride + w1*(j*stride + h1*(k + c1*b));
-    out[out_index] += add[add_index];
+    out[out_index] = s1*out[out_index] + s2*add[add_index];
+    //out[out_index] += add[add_index];
 }
 
-extern "C" void shortcut_gpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float *out)
+extern "C" void shortcut_gpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float s1, float s2, float *out)
 {
     int minw = (w1 < w2) ? w1 : w2;
     int minh = (h1 < h2) ? h1 : h2;
@@ -642,7 +740,7 @@ extern "C" void shortcut_gpu(int batch, int w1, int h1, int c1, float *add, int
     if(sample < 1) sample = 1;
 
     int size = batch * minw * minh * minc;
-    shortcut_kernel<<<cuda_gridsize(size), BLOCK>>>(size, minw, minh, minc, stride, sample, batch, w1, h1, c1, add, w2, h2, c2, out);
+    shortcut_kernel<<<cuda_gridsize(size), BLOCK>>>(size, minw, minh, minc, stride, sample, batch, w1, h1, c1, add, w2, h2, c2, s1, s2, out);
     check_error(cudaPeekAtLastError());
 }
 
@@ -651,14 +749,14 @@ __global__ void smooth_l1_kernel(int n, float *pred, float *truth, float *delta,
     int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
     if(i < n){
         float diff = truth[i] - pred[i];
-        float abs_val = abs(diff);
+        float abs_val = fabsf(diff);
         if(abs_val < 1) {
             error[i] = diff * diff;
             delta[i] = diff;
         }
         else {
             error[i] = 2*abs_val - 1;
-            delta[i] = (diff < 0) ? -1 : 1;
+            delta[i] = (diff > 0) ? 1 : -1;
         }
     }
 }
@@ -669,6 +767,40 @@ extern "C" void smooth_l1_gpu(int n, float *pred, float *truth, float *delta, fl
     check_error(cudaPeekAtLastError());
 }
 
+__global__ void softmax_x_ent_kernel(int n, float *pred, float *truth, float *delta, float *error) // element-wise: error = -log(pred) where truth is nonzero, delta = truth - pred
+{
+    int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if(i < n){
+        float t = truth[i];
+        float p = pred[i];
+        error[i] = (t) ? -log(p) : 0; // no epsilon is added to p here, so p == 0 with nonzero t gives a non-finite error
+        delta[i] = t-p; // written for every element, including those with t == 0
+    }
+}
+
+extern "C" void softmax_x_ent_gpu(int n, float *pred, float *truth, float *delta, float *error)
+{
+    softmax_x_ent_kernel<<<cuda_gridsize(n), BLOCK>>>(n, pred, truth, delta, error);
+    check_error(cudaPeekAtLastError());
+}
+
+__global__ void logistic_x_ent_kernel(int n, float *pred, float *truth, float *delta, float *error) // element-wise binary cross-entropy: error = -t*log(p) - (1-t)*log(1-p), delta = t - p
+{
+    int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if(i < n){
+        float t = truth[i];
+        float p = pred[i];
+        error[i] = -t*logf(p+.0000001f) - (1-t)*logf(1-p+.0000001f); // float literals + logf keep this in single precision; the epsilon keeps log() away from 0
+        delta[i] = t-p;
+    }
+}
+
+extern "C" void logistic_x_ent_gpu(int n, float *pred, float *truth, float *delta, float *error)
+{
+    logistic_x_ent_kernel<<<cuda_gridsize(n), BLOCK>>>(n, pred, truth, delta, error);
+    check_error(cudaPeekAtLastError());
+}
+
 __global__ void l2_kernel(int n, float *pred, float *truth, float *delta, float *error)
 {
     int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
@@ -685,6 +817,38 @@ extern "C" void l2_gpu(int n, float *pred, float *truth, float *delta, float *er
     check_error(cudaPeekAtLastError());
 }
 
+__global__ void l1_kernel(int n, float *pred, float *truth, float *delta, float *error) // element-wise L1 loss: error = |truth - pred|, delta = sign of (truth - pred)
+{
+    int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if(i < n){
+        float diff = truth[i] - pred[i];
+        error[i] = fabsf(diff); // fabsf, as in smooth_l1_kernel: always the float absolute value, never an integer abs
+        delta[i] = (diff > 0) ? 1 : -1; // diff == 0 yields -1
+    }
+}
+
+extern "C" void l1_gpu(int n, float *pred, float *truth, float *delta, float *error)
+{
+    l1_kernel<<<cuda_gridsize(n), BLOCK>>>(n, pred, truth, delta, error);
+    check_error(cudaPeekAtLastError());
+}
+
+__global__ void wgan_kernel(int n, float *pred, float *truth, float *delta, float *error) // element-wise: error is -pred or +pred, delta is +1 or -1, both selected by truth
+{
+    int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if(i < n){
+        error[i] = truth[i] ? -pred[i] : pred[i]; // any nonzero truth selects -pred
+        delta[i] = (truth[i] > 0) ? 1 : -1; // tests > 0, not != 0: a negative truth gets error -pred but delta -1
+    }
+}
+
+extern "C" void wgan_gpu(int n, float *pred, float *truth, float *delta, float *error)
+{
+    wgan_kernel<<<cuda_gridsize(n), BLOCK>>>(n, pred, truth, delta, error);
+    check_error(cudaPeekAtLastError());
+}
+
+
 
 
 __global__ void weighted_sum_kernel(int n, float *a, float *b, float *s, float *c)
@@ -695,6 +859,46 @@ __global__ void weighted_sum_kernel(int n, float *a, float *b, float *s, float *
     }
 }
 
+__global__ void deinter_kernel(int NX, float *X, int NY, float *Y, int B, float *OUT) // adds each of the B rows of OUT (width NX+NY) back into X (first NX columns) and Y (last NY columns)
+{
+    int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if(i < (NX+NY)*B){
+        int b = i / (NX+NY); // row of OUT
+        int j = i % (NX+NY); // column inside that row
+        if (j < NX){
+            if(X) X[b*NX + j] += OUT[i]; // a NULL X is allowed: its share is dropped
+        } else {
+            if(Y) Y[b*NY + j - NX] += OUT[i]; // a NULL Y is allowed: its share is dropped
+        }
+    }
+}
+
+extern "C" void deinter_gpu(int NX, float *X, int NY, float *Y, int B, float *OUT)
+{
+    deinter_kernel<<<cuda_gridsize((NX+NY)*B), BLOCK>>>(NX, X, NY, Y, B, OUT);
+    check_error(cudaPeekAtLastError());
+}
+
+__global__ void inter_kernel(int NX, float *X, int NY, float *Y, int B, float *OUT) // fills OUT with B rows of width NX+NY: row b is row b of X followed by row b of Y
+{
+    int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if(i < (NX+NY)*B){
+        int b = i / (NX+NY); // row of OUT
+        int j = i % (NX+NY); // column inside that row
+        if (j < NX){
+            OUT[i] = X[b*NX + j]; // X is read without a NULL check, unlike deinter_kernel
+        } else {
+            OUT[i] = Y[b*NY + j - NX]; // Y is read without a NULL check, unlike deinter_kernel
+        }
+    }
+}
+
+extern "C" void inter_gpu(int NX, float *X, int NY, float *Y, int B, float *OUT)
+{
+    inter_kernel<<<cuda_gridsize((NX+NY)*B), BLOCK>>>(NX, X, NY, Y, B, OUT);
+    check_error(cudaPeekAtLastError());
+}
+
 extern "C" void weighted_sum_gpu(float *a, float *b, float *s, int num, float *c)
 {
     weighted_sum_kernel<<<cuda_gridsize(num), BLOCK>>>(num, a, b, s, c);
@@ -706,8 +910,8 @@ __global__ void weighted_delta_kernel(int n, float *a, float *b, float *s, float
     int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
     if(i < n){
         if(da) da[i] += dc[i] * s[i];
-        db[i] += dc[i] * (1-s[i]);
-        ds[i] += dc[i] * a[i] + dc[i] * -b[i];
+        if(db) db[i] += dc[i] * (1-s[i]);
+        ds[i] += dc[i] * (a[i] - b[i]);
     }
 }
 
@@ -732,36 +936,100 @@ extern "C" void mult_add_into_gpu(int num, float *a, float *b, float *c)
 }
 
 
-__device__ void softmax_device(int n, float *input, float temp, float *output)
+__device__ void softmax_device(float *input, int n, float temp, int stride, float *output)  // softmax with temperature over n elements spaced `stride` apart
 {
     int i;
     float sum = 0;
     float largest = -INFINITY;
     for(i = 0; i < n; ++i){
-        int val = input[i];
+        float val = input[i*stride];  // must stay float: an int here truncated the value before the max search
         largest = (val>largest) ? val : largest;
     }
     for(i = 0; i < n; ++i){
-        float e = exp(input[i]/temp - largest/temp);
+        float e = expf(input[i*stride]/temp - largest/temp);  // the maximum is subtracted so the exponent is never positive
         sum += e;
-        output[i] = e;
+        output[i*stride] = e;
     }
     for(i = 0; i < n; ++i){
-        output[i] /= sum;
+        output[i*stride] /= sum;  // normalise so the n strided outputs sum to 1
     }
 }
 
-__global__ void softmax_kernel(int n, int offset, int batch, float *input, float temp, float *output)
+
+__global__ void softmax_tree_kernel(float *input, int spatial, int batch, int stride, float temp, float *output, int groups, int *group_size, int *group_offset)
 {
-    int b = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
-    if(b >= batch) return;
-    softmax_device(n, input + b*offset, temp, output + b*offset);
+    const int tid = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if (tid >= spatial*batch*groups) return;
+    const int cell = tid % spatial;                   // fastest-varying part of the thread index
+    const int rest = tid / spatial;
+    const int grp = rest % groups;
+    const int item = rest / groups;
+    const int goff = group_offset[grp]*spatial;       // start of this group's values
+    const int boff = item*stride;                     // start of this batch item
+    softmax_device(input + goff + boff + cell, group_size[grp], temp, spatial, output + goff + boff + cell);
+}
+
+extern "C" void softmax_tree(float *input, int spatial, int batch, int stride, float temp, float *output, tree hier)  // launches softmax_tree_kernel over every group described by hier
+{
+    int *tree_groups_size = cuda_make_int_array(hier.group_size, hier.groups);  // presumably a device-side copy of the group sizes (helper not shown) - confirm
+    int *tree_groups_offset = cuda_make_int_array(hier.group_offset, hier.groups);  // likewise for the group offsets
+    /* Disabled alternative kept for reference: build the two arrays once and keep them in statics.
+       static int *tree_groups_size = 0;
+       static int *tree_groups_offset = 0;
+       if(!tree_groups_size){
+       tree_groups_size = cuda_make_int_array(hier.group_size, hier.groups);
+       tree_groups_offset = cuda_make_int_array(hier.group_offset, hier.groups);
+       }
+     */
+    int num = spatial*batch*hier.groups;  // one kernel thread per (batch item, group, spatial position)
+    softmax_tree_kernel<<<cuda_gridsize(num), BLOCK>>>(input, spatial, batch, stride, temp, output, hier.groups, tree_groups_size, tree_groups_offset);
+    check_error(cudaPeekAtLastError());
+    cuda_free((float *)tree_groups_size);  // both arrays are released again at the end of every call
+    cuda_free((float *)tree_groups_offset);
+}
+
+__global__ void softmax_kernel(float *input, int n, int batch, int batch_offset, int groups, int group_offset, int stride, float temp, float *output)
+{
+    // One thread per (batch item, group) pair; each thread runs a whole n-element softmax.
+    const int tid = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if (tid >= batch*groups) return;
+    const int item = tid / groups, grp = tid % groups;
+    softmax_device(input + item*batch_offset + grp*group_offset, n, temp, stride, output + item*batch_offset + grp*group_offset);
 }
 
-extern "C" void softmax_gpu(float *input, int n, int offset, int groups, float temp, float *output)
+extern "C" void softmax_gpu(float *input, int n, int batch, int batch_offset, int groups, int group_offset, int stride, float temp, float *output)  // host-side launcher for softmax_kernel
+{
+    softmax_kernel<<<cuda_gridsize(batch*groups), BLOCK>>>(input, n, batch, batch_offset, groups, group_offset, stride, temp, output);  // batch*groups independent softmaxes, one per thread
+    check_error(cudaPeekAtLastError());  // hand the launch status to check_error
+}
+
+
+__global__ void upsample_kernel(size_t N, float *x, int w, int h, int c, int batch, int stride, int forward, float scale, float *out)
+{
+    size_t i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if(i >= N) return;
+    size_t out_index = i;  // kept as size_t: narrowing to int would wrap for offsets of 2^31 and above
+    int out_w = i%(w*stride);
+    i = i/(w*stride);
+    int out_h = i%(h*stride);
+    i = i/(h*stride);
+    int out_c = i%c;
+    i = i/c;
+    int b = i%batch;
+    // Source element in x (w by h per channel): integer division maps each stride x stride patch of out to one input pixel.
+    int in_w = out_w / stride;
+    int in_h = out_h / stride;
+    int in_c = out_c;
+    // Offset into x is computed in size_t so the batch term cannot overflow int.
+    size_t in_index = (size_t)b*w*h*c + in_c*w*h + in_h*w + in_w;
+    // forward: accumulate the scaled input value into out (each thread owns its own out element).
+    // otherwise: accumulate scaled out into x; several threads share one x element, hence atomicAdd.
+    if(forward) out[out_index] += scale * x[in_index];
+    else atomicAdd(x+in_index, scale * out[out_index]);
+}
+extern "C" void upsample_gpu(float *in, int w, int h, int c, int batch, int stride, int forward, float scale, float *out)  // host-side launcher for upsample_kernel
 {
-    int inputs = n;
-    int batch = groups;
-    softmax_kernel<<<cuda_gridsize(batch), BLOCK>>>(inputs, offset, batch, input, temp, output);
+    size_t size = (size_t)w*h*c*batch*stride*stride;  // widen first: the all-int product could overflow before being stored in size_t
+    upsample_kernel<<<cuda_gridsize(size), BLOCK>>>(size, in, w, h, c, batch, stride, forward, scale, out);
     check_error(cudaPeekAtLastError());
 }
diff --git a/image.darknet/inst/include/darknet/src/box.c b/image.darknet/inst/include/darknet/src/box.c
index 39dea06..8a1772c 100644
--- a/image.darknet/inst/include/darknet/src/box.c
+++ b/image.darknet/inst/include/darknet/src/box.c
@@ -3,13 +3,98 @@
 #include <math.h>
 #include <stdlib.h>
 
-box float_to_box(float *f)
+int nms_comparator(const void *pa, const void *pb)
 {
-    box b;
+    const detection *lhs = (const detection *)pa;
+    const detection *rhs = (const detection *)pb;
+    const int key = rhs->sort_class;   // the ranking key is read from the second element only
+    // key >= 0 compares that class's probability; otherwise objectness is compared.
+    const float diff = (key >= 0)
+        ? lhs->prob[key] - rhs->prob[key]
+        : lhs->objectness - rhs->objectness;
+    // The sign is inverted so that qsort places larger scores first.
+    if(diff > 0) return -1;
+    if(diff < 0) return 1;
+    return 0;
+}
+
+void do_nms_obj(detection *dets, int total, int classes, float thresh)  // in-place suppression of overlapping detections, ranked by objectness
+{
+    int i, j, k;
+    k = total-1;
+    for(i = 0; i <= k; ++i){  // move every entry whose objectness is 0 to the tail of the array
+        if(dets[i].objectness == 0){
+            detection swap = dets[i];
+            dets[i] = dets[k];
+            dets[k] = swap;
+            --k;
+            --i;  // look at slot i again: it now holds the entry swapped in from the tail
+        }
+    }
+    total = k+1;  // from here on only the entries with non-zero objectness are considered
+    // A negative sort_class makes nms_comparator compare objectness instead of a class probability.
+    for(i = 0; i < total; ++i){
+        dets[i].sort_class = -1;
+    }
+    // Sort by descending objectness, then let each surviving entry suppress the later ones it overlaps.
+    qsort(dets, total, sizeof(detection), nms_comparator);
+    for(i = 0; i < total; ++i){
+        if(dets[i].objectness == 0) continue;  // already suppressed by an earlier entry
+        box a = dets[i].bbox;
+        for(j = i+1; j < total; ++j){
+            if(dets[j].objectness == 0) continue;
+            box b = dets[j].bbox;
+            if (box_iou(a, b) > thresh){  // box_iou result above thresh: drop the lower-ranked entry
+                dets[j].objectness = 0;
+                for(k = 0; k < classes; ++k){
+                    dets[j].prob[k] = 0;  // clear all of its class probabilities as well
+                }
+            }
+        }
+    }
+}
+
+
+void do_nms_sort(detection *dets, int total, int classes, float thresh)  // in-place suppression of overlapping detections, done separately for each class
+{
+    int i, j, k;
+    k = total-1;
+    for(i = 0; i <= k; ++i){  // move every entry whose objectness is 0 to the tail of the array
+        if(dets[i].objectness == 0){
+            detection swap = dets[i];
+            dets[i] = dets[k];
+            dets[k] = swap;
+            --k;
+            --i;  // look at slot i again: it now holds the entry swapped in from the tail
+        }
+    }
+    total = k+1;  // from here on only the entries with non-zero objectness are considered
+    // k is reused below as the class index.
+    for(k = 0; k < classes; ++k){
+        for(i = 0; i < total; ++i){
+            dets[i].sort_class = k;  // tells nms_comparator to compare prob[k]
+        }
+        qsort(dets, total, sizeof(detection), nms_comparator);  // descending prob[k]
+        for(i = 0; i < total; ++i){
+            if(dets[i].prob[k] == 0) continue;  // nothing to suppress with for this class
+            box a = dets[i].bbox;
+            for(j = i+1; j < total; ++j){
+                box b = dets[j].bbox;
+                if (box_iou(a, b) > thresh){
+                    dets[j].prob[k] = 0;  // only class k is cleared; other classes and objectness are left alone
+                }
+            }
+        }
+    }
+}
+
+box float_to_box(float *f, int stride)  // builds a box from four floats read at f[0], f[stride], f[2*stride], f[3*stride]
+{
+    box b = {0};  // zero-initialise the struct before the four fields are filled in
     b.x = f[0];
-    b.y = f[1];
-    b.w = f[2];
-    b.h = f[3];
+    b.y = f[1*stride];  // y, w and h are each `stride` floats further on than the previous field
+    b.w = f[2*stride];
+    b.h = f[3*stride];
     return b;
 }
 
@@ -230,79 +315,6 @@ dbox diou(box a, box b)
     return dd;
 }
 
-typedef struct{
-    int index;
-    int class;
-    float **probs;
-} sortable_bbox;
-
-int nms_comparator(const void *pa, const void *pb)
-{
-    sortable_bbox a = *(sortable_bbox *)pa;
-    sortable_bbox b = *(sortable_bbox *)pb;
-    float diff = a.probs[a.index][b.class] - b.probs[b.index][b.class];
-    if(diff < 0) return 1;
-    else if(diff > 0) return -1;
-    return 0;
-}
-
-void do_nms_obj(box *boxes, float **probs, int total, int classes, float thresh)
-{
-    int i, j, k;
-    sortable_bbox *s = calloc(total, sizeof(sortable_bbox));
-
-    for(i = 0; i < total; ++i){
-        s[i].index = i;       
-        s[i].class = classes;
-        s[i].probs = probs;
-    }
-
-    qsort(s, total, sizeof(sortable_bbox), nms_comparator);
-    for(i = 0; i < total; ++i){
-        if(probs[s[i].index][classes] == 0) continue;
-        box a = boxes[s[i].index];
-        for(j = i+1; j < total; ++j){
-            box b = boxes[s[j].index];
-            if (box_iou(a, b) > thresh){
-                for(k = 0; k < classes+1; ++k){
-                    probs[s[j].index][k] = 0;
-                }
-            }
-        }
-    }
-    free(s);
-}
-
-
-void do_nms_sort(box *boxes, float **probs, int total, int classes, float thresh)
-{
-    int i, j, k;
-    sortable_bbox *s = calloc(total, sizeof(sortable_bbox));
-
-    for(i = 0; i < total; ++i){
-        s[i].index = i;       
-        s[i].class = 0;
-        s[i].probs = probs;
-    }
-
-    for(k = 0; k < classes; ++k){
-        for(i = 0; i < total; ++i){
-            s[i].class = k;
-        }
-        qsort(s, total, sizeof(sortable_bbox), nms_comparator);
-        for(i = 0; i < total; ++i){
-            if(probs[s[i].index][k] == 0) continue;
-            box a = boxes[s[i].index];
-            for(j = i+1; j < total; ++j){
-                box b = boxes[s[j].index];
-                if (box_iou(a, b) > thresh){
-                    probs[s[j].index][k] = 0;
-                }
-            }
-        }
-    }
-    free(s);
-}
 
 void do_nms(box *boxes, float **probs, int total, int classes, float thresh)
 {
diff --git a/image.darknet/inst/include/darknet/src/box.h b/image.darknet/inst/include/darknet/src/box.h
index c65589b..dda3e59 100644
--- a/image.darknet/inst/include/darknet/src/box.h
+++ b/image.darknet/inst/include/darknet/src/box.h
@@ -1,21 +1,13 @@
 #ifndef BOX_H
 #define BOX_H
-
-typedef struct{
-    float x, y, w, h;
-} box;
+#include "darknet.h"
 
 typedef struct{
     float dx, dy, dw, dh;
 } dbox;
 
-box float_to_box(float *f);
-float box_iou(box a, box b);
 float box_rmse(box a, box b);
 dbox diou(box a, box b);
-void do_nms(box *boxes, float **probs, int total, int classes, float thresh);
-void do_nms_sort(box *boxes, float **probs, int total, int classes, float thresh);
-void do_nms_obj(box *boxes, float **probs, int total, int classes, float thresh);
 box decode_box(box b, box anchor);
 box encode_box(box b, box anchor);
 
diff --git a/image.darknet/inst/include/darknet/src/cifar.c b/image.darknet/inst/include/darknet/src/cifar.c
deleted file mode 100644
index d0ac459..0000000
--- a/image.darknet/inst/include/darknet/src/cifar.c
+++ /dev/null
@@ -1,277 +0,0 @@
-#include "network.h"
-#include "utils.h"
-#include "parser.h"
-#include "option_list.h"
-#include "blas.h"
-
-#ifdef OPENCV
-#include "opencv2/highgui/highgui_c.h"
-#endif
-
-void train_cifar(char *cfgfile, char *weightfile)
-{
-    srand(time(0));
-    float avg_loss = -1;
-    char *base = basecfg(cfgfile);
-    printf("%s\n", base);
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
-
-    char *backup_directory = "/home/pjreddie/backup/";
-    int classes = 10;
-    int N = 50000;
-
-    char **labels = get_labels("data/cifar/labels.txt");
-    int epoch = (*net.seen)/N;
-    data train = load_all_cifar10();
-    while(get_current_batch(net) < net.max_batches || net.max_batches == 0){
-        clock_t time=clock();
-
-        float loss = train_network_sgd(net, train, 1);
-        if(avg_loss == -1) avg_loss = loss;
-        avg_loss = avg_loss*.95 + loss*.05;
-        printf("%d, %.3f: %f, %f avg, %f rate, %lf seconds, %d images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen);
-        if(*net.seen/N > epoch){
-            epoch = *net.seen/N;
-            char buff[256];
-            sprintf(buff, "%s/%s_%d.weights",backup_directory,base, epoch);
-            save_weights(net, buff);
-        }
-        if(get_current_batch(net)%100 == 0){
-            char buff[256];
-            sprintf(buff, "%s/%s.backup",backup_directory,base);
-            save_weights(net, buff);
-        }
-    }
-    char buff[256];
-    sprintf(buff, "%s/%s.weights", backup_directory, base);
-    save_weights(net, buff);
-
-    free_network(net);
-    free_ptrs((void**)labels, classes);
-    free(base);
-    free_data(train);
-}
-
-void train_cifar_distill(char *cfgfile, char *weightfile)
-{
-    srand(time(0));
-    float avg_loss = -1;
-    char *base = basecfg(cfgfile);
-    printf("%s\n", base);
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
-
-    char *backup_directory = "/home/pjreddie/backup/";
-    int classes = 10;
-    int N = 50000;
-
-    char **labels = get_labels("data/cifar/labels.txt");
-    int epoch = (*net.seen)/N;
-
-    data train = load_all_cifar10();
-    matrix soft = csv_to_matrix("results/ensemble.csv");
-
-    float weight = .9;
-    scale_matrix(soft, weight);
-    scale_matrix(train.y, 1. - weight);
-    matrix_add_matrix(soft, train.y);
-
-    while(get_current_batch(net) < net.max_batches || net.max_batches == 0){
-        clock_t time=clock();
-
-        float loss = train_network_sgd(net, train, 1);
-        if(avg_loss == -1) avg_loss = loss;
-        avg_loss = avg_loss*.95 + loss*.05;
-        printf("%d, %.3f: %f, %f avg, %f rate, %lf seconds, %d images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen);
-        if(*net.seen/N > epoch){
-            epoch = *net.seen/N;
-            char buff[256];
-            sprintf(buff, "%s/%s_%d.weights",backup_directory,base, epoch);
-            save_weights(net, buff);
-        }
-        if(get_current_batch(net)%100 == 0){
-            char buff[256];
-            sprintf(buff, "%s/%s.backup",backup_directory,base);
-            save_weights(net, buff);
-        }
-    }
-    char buff[256];
-    sprintf(buff, "%s/%s.weights", backup_directory, base);
-    save_weights(net, buff);
-
-    free_network(net);
-    free_ptrs((void**)labels, classes);
-    free(base);
-    free_data(train);
-}
-
-void test_cifar_multi(char *filename, char *weightfile)
-{
-    network net = parse_network_cfg(filename);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    set_batch_network(&net, 1);
-    srand(time(0));
-
-    float avg_acc = 0;
-    data test = load_cifar10_data("data/cifar/cifar-10-batches-bin/test_batch.bin");
-
-    int i;
-    for(i = 0; i < test.X.rows; ++i){
-        image im = float_to_image(32, 32, 3, test.X.vals[i]);
-
-        float pred[10] = {0};
-
-        float *p = network_predict(net, im.data);
-        axpy_cpu(10, 1, p, 1, pred, 1);
-        flip_image(im);
-        p = network_predict(net, im.data);
-        axpy_cpu(10, 1, p, 1, pred, 1);
-
-        int index = max_index(pred, 10);
-        int class = max_index(test.y.vals[i], 10);
-        if(index == class) avg_acc += 1;
-        free_image(im);
-        printf("%4d: %.2f%%\n", i, 100.*avg_acc/(i+1));
-    }
-}
-
-void test_cifar(char *filename, char *weightfile)
-{
-    network net = parse_network_cfg(filename);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    srand(time(0));
-
-    clock_t time;
-    float avg_acc = 0;
-    float avg_top5 = 0;
-    data test = load_cifar10_data("data/cifar/cifar-10-batches-bin/test_batch.bin");
-
-    time=clock();
-
-    float *acc = network_accuracies(net, test, 2);
-    avg_acc += acc[0];
-    avg_top5 += acc[1];
-    printf("top1: %f, %lf seconds, %d images\n", avg_acc, sec(clock()-time), test.X.rows);
-    free_data(test);
-}
-
-void extract_cifar()
-{
-char *labels[] = {"airplane","automobile","bird","cat","deer","dog","frog","horse","ship","truck"};
-    int i;
-    data train = load_all_cifar10();
-    data test = load_cifar10_data("data/cifar/cifar-10-batches-bin/test_batch.bin");
-    for(i = 0; i < train.X.rows; ++i){
-        image im = float_to_image(32, 32, 3, train.X.vals[i]);
-        int class = max_index(train.y.vals[i], 10);
-        char buff[256];
-        sprintf(buff, "data/cifar/train/%d_%s",i,labels[class]);
-        save_image_png(im, buff);
-    }
-    for(i = 0; i < test.X.rows; ++i){
-        image im = float_to_image(32, 32, 3, test.X.vals[i]);
-        int class = max_index(test.y.vals[i], 10);
-        char buff[256];
-        sprintf(buff, "data/cifar/test/%d_%s",i,labels[class]);
-        save_image_png(im, buff);
-    }
-}
-
-void test_cifar_csv(char *filename, char *weightfile)
-{
-    network net = parse_network_cfg(filename);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    srand(time(0));
-
-    data test = load_cifar10_data("data/cifar/cifar-10-batches-bin/test_batch.bin");
-
-    matrix pred = network_predict_data(net, test);
-
-    int i;
-    for(i = 0; i < test.X.rows; ++i){
-        image im = float_to_image(32, 32, 3, test.X.vals[i]);
-        flip_image(im);
-    }
-    matrix pred2 = network_predict_data(net, test);
-    scale_matrix(pred, .5);
-    scale_matrix(pred2, .5);
-    matrix_add_matrix(pred2, pred);
-
-    matrix_to_csv(pred);
-    fprintf(stderr, "Accuracy: %f\n", matrix_topk_accuracy(test.y, pred, 1));
-    free_data(test);
-}
-
-void test_cifar_csvtrain(char *filename, char *weightfile)
-{
-    network net = parse_network_cfg(filename);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    srand(time(0));
-
-    data test = load_all_cifar10();
-
-    matrix pred = network_predict_data(net, test);
-
-    int i;
-    for(i = 0; i < test.X.rows; ++i){
-        image im = float_to_image(32, 32, 3, test.X.vals[i]);
-        flip_image(im);
-    }
-    matrix pred2 = network_predict_data(net, test);
-    scale_matrix(pred, .5);
-    scale_matrix(pred2, .5);
-    matrix_add_matrix(pred2, pred);
-
-    matrix_to_csv(pred);
-    fprintf(stderr, "Accuracy: %f\n", matrix_topk_accuracy(test.y, pred, 1));
-    free_data(test);
-}
-
-void eval_cifar_csv()
-{
-    data test = load_cifar10_data("data/cifar/cifar-10-batches-bin/test_batch.bin");
-
-    matrix pred = csv_to_matrix("results/combined.csv");
-    fprintf(stderr, "%d %d\n", pred.rows, pred.cols);
-
-    fprintf(stderr, "Accuracy: %f\n", matrix_topk_accuracy(test.y, pred, 1));
-    free_data(test);
-    free_matrix(pred);
-}
-
-
-void run_cifar(int argc, char **argv)
-{
-    if(argc < 4){
-        fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]);
-        return;
-    }
-
-    char *cfg = argv[3];
-    char *weights = (argc > 4) ? argv[4] : 0;
-    if(0==strcmp(argv[2], "train")) train_cifar(cfg, weights);
-    else if(0==strcmp(argv[2], "extract")) extract_cifar();
-    else if(0==strcmp(argv[2], "distill")) train_cifar_distill(cfg, weights);
-    else if(0==strcmp(argv[2], "test")) test_cifar(cfg, weights);
-    else if(0==strcmp(argv[2], "multi")) test_cifar_multi(cfg, weights);
-    else if(0==strcmp(argv[2], "csv")) test_cifar_csv(cfg, weights);
-    else if(0==strcmp(argv[2], "csvtrain")) test_cifar_csvtrain(cfg, weights);
-    else if(0==strcmp(argv[2], "eval")) eval_cifar_csv();
-}
-
-
diff --git a/image.darknet/inst/include/darknet/src/classifier.h b/image.darknet/inst/include/darknet/src/classifier.h
index 3c89f49..8b13789 100644
--- a/image.darknet/inst/include/darknet/src/classifier.h
+++ b/image.darknet/inst/include/darknet/src/classifier.h
@@ -1,2 +1 @@
 
-list *read_data_cfg(char *filename);
diff --git a/image.darknet/inst/include/darknet/src/coco.c b/image.darknet/inst/include/darknet/src/coco.c
deleted file mode 100644
index 8f3c968..0000000
--- a/image.darknet/inst/include/darknet/src/coco.c
+++ /dev/null
@@ -1,388 +0,0 @@
-#include <stdio.h>
-
-#include "network.h"
-#include "detection_layer.h"
-#include "cost_layer.h"
-#include "utils.h"
-#include "parser.h"
-#include "box.h"
-#include "demo.h"
-
-#ifdef OPENCV
-#include "opencv2/highgui/highgui_c.h"
-#endif
-
-char *coco_classes[] = {"person","bicycle","car","motorcycle","airplane","bus","train","truck","boat","traffic light","fire hydrant","stop sign","parking meter","bench","bird","cat","dog","horse","sheep","cow","elephant","bear","zebra","giraffe","backpack","umbrella","handbag","tie","suitcase","frisbee","skis","snowboard","sports ball","kite","baseball bat","baseball glove","skateboard","surfboard","tennis racket","bottle","wine glass","cup","fork","knife","spoon","bowl","banana","apple","sandwich","orange","broccoli","carrot","hot dog","pizza","donut","cake","chair","couch","potted plant","bed","dining table","toilet","tv","laptop","mouse","remote","keyboard","cell phone","microwave","oven","toaster","sink","refrigerator","book","clock","vase","scissors","teddy bear","hair drier","toothbrush"};
-
-int coco_ids[] = {1,2,3,4,5,6,7,8,9,10,11,13,14,15,16,17,18,19,20,21,22,23,24,25,27,28,31,32,33,34,35,36,37,38,39,40,41,42,43,44,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,67,70,72,73,74,75,76,77,78,79,80,81,82,84,85,86,87,88,89,90};
-
-void train_coco(char *cfgfile, char *weightfile)
-{
-    //char *train_images = "/home/pjreddie/data/voc/test/train.txt";
-    //char *train_images = "/home/pjreddie/data/coco/train.txt";
-    char *train_images = "data/coco.trainval.txt";
-    //char *train_images = "data/bags.train.list";
-    char *backup_directory = "/home/pjreddie/backup/";
-    srand(time(0));
-    char *base = basecfg(cfgfile);
-    printf("%s\n", base);
-    float avg_loss = -1;
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
-    int imgs = net.batch*net.subdivisions;
-    int i = *net.seen/imgs;
-    data train, buffer;
-
-
-    layer l = net.layers[net.n - 1];
-
-    int side = l.side;
-    int classes = l.classes;
-    float jitter = l.jitter;
-
-    list *plist = get_paths(train_images);
-    //int N = plist->size;
-    char **paths = (char **)list_to_array(plist);
-
-    load_args args = {0};
-    args.w = net.w;
-    args.h = net.h;
-    args.paths = paths;
-    args.n = imgs;
-    args.m = plist->size;
-    args.classes = classes;
-    args.jitter = jitter;
-    args.num_boxes = side;
-    args.d = &buffer;
-    args.type = REGION_DATA;
-
-    args.angle = net.angle;
-    args.exposure = net.exposure;
-    args.saturation = net.saturation;
-    args.hue = net.hue;
-
-    pthread_t load_thread = load_data_in_thread(args);
-    clock_t time;
-    //while(i*imgs < N*120){
-    while(get_current_batch(net) < net.max_batches){
-        i += 1;
-        time=clock();
-        pthread_join(load_thread, 0);
-        train = buffer;
-        load_thread = load_data_in_thread(args);
-
-        printf("Loaded: %lf seconds\n", sec(clock()-time));
-
-        /*
-           image im = float_to_image(net.w, net.h, 3, train.X.vals[113]);
-           image copy = copy_image(im);
-           draw_coco(copy, train.y.vals[113], 7, "truth");
-           cvWaitKey(0);
-           free_image(copy);
-         */
-
-        time=clock();
-        float loss = train_network(net, train);
-        if (avg_loss < 0) avg_loss = loss;
-        avg_loss = avg_loss*.9 + loss*.1;
-
-        printf("%d: %f, %f avg, %f rate, %lf seconds, %d images\n", i, loss, avg_loss, get_current_rate(net), sec(clock()-time), i*imgs);
-        if(i%1000==0 || (i < 1000 && i%100 == 0)){
-            char buff[256];
-            sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i);
-            save_weights(net, buff);
-        }
-        if(i%100==0){
-            char buff[256];
-            sprintf(buff, "%s/%s.backup", backup_directory, base);
-            save_weights(net, buff);
-        }
-        free_data(train);
-    }
-    char buff[256];
-    sprintf(buff, "%s/%s_final.weights", backup_directory, base);
-    save_weights(net, buff);
-}
-
-void print_cocos(FILE *fp, int image_id, box *boxes, float **probs, int num_boxes, int classes, int w, int h)
-{
-    int i, j;
-    for(i = 0; i < num_boxes; ++i){
-        float xmin = boxes[i].x - boxes[i].w/2.;
-        float xmax = boxes[i].x + boxes[i].w/2.;
-        float ymin = boxes[i].y - boxes[i].h/2.;
-        float ymax = boxes[i].y + boxes[i].h/2.;
-
-        if (xmin < 0) xmin = 0;
-        if (ymin < 0) ymin = 0;
-        if (xmax > w) xmax = w;
-        if (ymax > h) ymax = h;
-
-        float bx = xmin;
-        float by = ymin;
-        float bw = xmax - xmin;
-        float bh = ymax - ymin;
-
-        for(j = 0; j < classes; ++j){
-            if (probs[i][j]) fprintf(fp, "{\"image_id\":%d, \"category_id\":%d, \"bbox\":[%f, %f, %f, %f], \"score\":%f},\n", image_id, coco_ids[j], bx, by, bw, bh, probs[i][j]);
-        }
-    }
-}
-
-int get_coco_image_id(char *filename)
-{
-    char *p = strrchr(filename, '_');
-    return atoi(p+1);
-}
-
-void validate_coco(char *cfgfile, char *weightfile)
-{
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    set_batch_network(&net, 1);
-    fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
-    srand(time(0));
-
-    char *base = "results/";
-    list *plist = get_paths("data/coco_val_5k.list");
-    //list *plist = get_paths("/home/pjreddie/data/people-art/test.txt");
-    //list *plist = get_paths("/home/pjreddie/data/voc/test/2007_test.txt");
-    char **paths = (char **)list_to_array(plist);
-
-    layer l = net.layers[net.n-1];
-    int classes = l.classes;
-    int side = l.side;
-
-    int j;
-    char buff[1024];
-    snprintf(buff, 1024, "%s/coco_results.json", base);
-    FILE *fp = fopen(buff, "w");
-    fprintf(fp, "[\n");
-
-    box *boxes = calloc(side*side*l.n, sizeof(box));
-    float **probs = calloc(side*side*l.n, sizeof(float *));
-    for(j = 0; j < side*side*l.n; ++j) probs[j] = calloc(classes, sizeof(float *));
-
-    int m = plist->size;
-    int i=0;
-    int t;
-
-    float thresh = .01;
-    int nms = 1;
-    float iou_thresh = .5;
-
-    int nthreads = 8;
-    image *val = calloc(nthreads, sizeof(image));
-    image *val_resized = calloc(nthreads, sizeof(image));
-    image *buf = calloc(nthreads, sizeof(image));
-    image *buf_resized = calloc(nthreads, sizeof(image));
-    pthread_t *thr = calloc(nthreads, sizeof(pthread_t));
-
-    load_args args = {0};
-    args.w = net.w;
-    args.h = net.h;
-    args.type = IMAGE_DATA;
-
-    for(t = 0; t < nthreads; ++t){
-        args.path = paths[i+t];
-        args.im = &buf[t];
-        args.resized = &buf_resized[t];
-        thr[t] = load_data_in_thread(args);
-    }
-    time_t start = time(0);
-    for(i = nthreads; i < m+nthreads; i += nthreads){
-        fprintf(stderr, "%d\n", i);
-        for(t = 0; t < nthreads && i+t-nthreads < m; ++t){
-            pthread_join(thr[t], 0);
-            val[t] = buf[t];
-            val_resized[t] = buf_resized[t];
-        }
-        for(t = 0; t < nthreads && i+t < m; ++t){
-            args.path = paths[i+t];
-            args.im = &buf[t];
-            args.resized = &buf_resized[t];
-            thr[t] = load_data_in_thread(args);
-        }
-        for(t = 0; t < nthreads && i+t-nthreads < m; ++t){
-            char *path = paths[i+t-nthreads];
-            int image_id = get_coco_image_id(path);
-            float *X = val_resized[t].data;
-            network_predict(net, X);
-            int w = val[t].w;
-            int h = val[t].h;
-            get_detection_boxes(l, w, h, thresh, probs, boxes, 0);
-            if (nms) do_nms_sort(boxes, probs, side*side*l.n, classes, iou_thresh);
-            print_cocos(fp, image_id, boxes, probs, side*side*l.n, classes, w, h);
-            free_image(val[t]);
-            free_image(val_resized[t]);
-        }
-    }
-    fseek(fp, -2, SEEK_CUR); 
-    fprintf(fp, "\n]\n");
-    fclose(fp);
-
-    fprintf(stderr, "Total Detection Time: %f Seconds\n", (double)(time(0) - start));
-}
-
-void validate_coco_recall(char *cfgfile, char *weightfile)
-{
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    set_batch_network(&net, 1);
-    fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
-    srand(time(0));
-
-    char *base = "results/comp4_det_test_";
-    list *plist = get_paths("/home/pjreddie/data/voc/test/2007_test.txt");
-    char **paths = (char **)list_to_array(plist);
-
-    layer l = net.layers[net.n-1];
-    int classes = l.classes;
-    int side = l.side;
-
-    int j, k;
-    FILE **fps = calloc(classes, sizeof(FILE *));
-    for(j = 0; j < classes; ++j){
-        char buff[1024];
-        snprintf(buff, 1024, "%s%s.txt", base, coco_classes[j]);
-        fps[j] = fopen(buff, "w");
-    }
-    box *boxes = calloc(side*side*l.n, sizeof(box));
-    float **probs = calloc(side*side*l.n, sizeof(float *));
-    for(j = 0; j < side*side*l.n; ++j) probs[j] = calloc(classes, sizeof(float *));
-
-    int m = plist->size;
-    int i=0;
-
-    float thresh = .001;
-    int nms = 0;
-    float iou_thresh = .5;
-    float nms_thresh = .5;
-
-    int total = 0;
-    int correct = 0;
-    int proposals = 0;
-    float avg_iou = 0;
-
-    for(i = 0; i < m; ++i){
-        char *path = paths[i];
-        image orig = load_image_color(path, 0, 0);
-        image sized = resize_image(orig, net.w, net.h);
-        char *id = basecfg(path);
-        network_predict(net, sized.data);
-        get_detection_boxes(l, 1, 1, thresh, probs, boxes, 1);
-        if (nms) do_nms(boxes, probs, side*side*l.n, 1, nms_thresh);
-
-        char labelpath[4096];
-        find_replace(path, "images", "labels", labelpath);
-        find_replace(labelpath, "JPEGImages", "labels", labelpath);
-        find_replace(labelpath, ".jpg", ".txt", labelpath);
-        find_replace(labelpath, ".JPEG", ".txt", labelpath);
-
-        int num_labels = 0;
-        box_label *truth = read_boxes(labelpath, &num_labels);
-        for(k = 0; k < side*side*l.n; ++k){
-            if(probs[k][0] > thresh){
-                ++proposals;
-            }
-        }
-        for (j = 0; j < num_labels; ++j) {
-            ++total;
-            box t = {truth[j].x, truth[j].y, truth[j].w, truth[j].h};
-            float best_iou = 0;
-            for(k = 0; k < side*side*l.n; ++k){
-                float iou = box_iou(boxes[k], t);
-                if(probs[k][0] > thresh && iou > best_iou){
-                    best_iou = iou;
-                }
-            }
-            avg_iou += best_iou;
-            if(best_iou > iou_thresh){
-                ++correct;
-            }
-        }
-
-        fprintf(stderr, "%5d %5d %5d\tRPs/Img: %.2f\tIOU: %.2f%%\tRecall:%.2f%%\n", i, correct, total, (float)proposals/(i+1), avg_iou*100/total, 100.*correct/total);
-        free(id);
-        free_image(orig);
-        free_image(sized);
-    }
-}
-
-void test_coco(char *cfgfile, char *weightfile, char *filename, float thresh)
-{
-    image **alphabet = load_alphabet();
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    detection_layer l = net.layers[net.n-1];
-    set_batch_network(&net, 1);
-    srand(2222222);
-    float nms = .4;
-    clock_t time;
-    char buff[256];
-    char *input = buff;
-    int j;
-    box *boxes = calloc(l.side*l.side*l.n, sizeof(box));
-    float **probs = calloc(l.side*l.side*l.n, sizeof(float *));
-    for(j = 0; j < l.side*l.side*l.n; ++j) probs[j] = calloc(l.classes, sizeof(float *));
-    while(1){
-        if(filename){
-            strncpy(input, filename, 256);
-        } else {
-            printf("Enter Image Path: ");
-            fflush(stdout);
-            input = fgets(input, 256, stdin);
-            if(!input) return;
-            strtok(input, "\n");
-        }
-        image im = load_image_color(input,0,0);
-        image sized = resize_image(im, net.w, net.h);
-        float *X = sized.data;
-        time=clock();
-        network_predict(net, X);
-        printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time));
-        get_detection_boxes(l, 1, 1, thresh, probs, boxes, 0);
-        if (nms) do_nms_sort(boxes, probs, l.side*l.side*l.n, l.classes, nms);
-        draw_detections(im, l.side*l.side*l.n, thresh, boxes, probs, coco_classes, alphabet, 80);
-        save_image(im, "prediction");
-        show_image(im, "predictions");
-        free_image(im);
-        free_image(sized);
-#ifdef OPENCV
-        cvWaitKey(0);
-        cvDestroyAllWindows();
-#endif
-        if (filename) break;
-    }
-}
-
-void run_coco(int argc, char **argv)
-{
-    char *prefix = find_char_arg(argc, argv, "-prefix", 0);
-    float thresh = find_float_arg(argc, argv, "-thresh", .2);
-    int cam_index = find_int_arg(argc, argv, "-c", 0);
-    int frame_skip = find_int_arg(argc, argv, "-s", 0);
-
-    if(argc < 4){
-        fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]);
-        return;
-    }
-
-    char *cfg = argv[3];
-    char *weights = (argc > 4) ? argv[4] : 0;
-    char *filename = (argc > 5) ? argv[5]: 0;
-    if(0==strcmp(argv[2], "test")) test_coco(cfg, weights, filename, thresh);
-    else if(0==strcmp(argv[2], "train")) train_coco(cfg, weights);
-    else if(0==strcmp(argv[2], "valid")) validate_coco(cfg, weights);
-    else if(0==strcmp(argv[2], "recall")) validate_coco_recall(cfg, weights);
-    else if(0==strcmp(argv[2], "demo")) demo(cfg, weights, thresh, cam_index, filename, coco_classes, 80, frame_skip, prefix, .5);
-}
diff --git a/image.darknet/inst/include/darknet/src/col2im.h b/image.darknet/inst/include/darknet/src/col2im.h
index 0237497..3fbe053 100644
--- a/image.darknet/inst/include/darknet/src/col2im.h
+++ b/image.darknet/inst/include/darknet/src/col2im.h
@@ -6,7 +6,7 @@ void col2im_cpu(float* data_col,
         int ksize, int stride, int pad, float* data_im);
 
 #ifdef GPU
-void col2im_ongpu(float *data_col,
+void col2im_gpu(float *data_col,
         int channels, int height, int width,
         int ksize, int stride, int pad, float *data_im);
 #endif
diff --git a/image.darknet/inst/include/darknet/src/col2im_kernels.cu b/image.darknet/inst/include/darknet/src/col2im_kernels.cu
index aed2df9..ba45e0f 100644
--- a/image.darknet/inst/include/darknet/src/col2im_kernels.cu
+++ b/image.darknet/inst/include/darknet/src/col2im_kernels.cu
@@ -41,7 +41,7 @@ __global__ void col2im_gpu_kernel(const int n, const float* data_col,
     }
 }
 
-void col2im_ongpu(float *data_col,
+void col2im_gpu(float *data_col,
         int channels, int height, int width,
         int ksize, int stride, int pad, float *data_im){
     // We are going to launch channels * height_col * width_col kernels, each
diff --git a/image.darknet/inst/include/darknet/src/compare.c b/image.darknet/inst/include/darknet/src/compare.c
index 4fd266c..d2d2b3b 100644
--- a/image.darknet/inst/include/darknet/src/compare.c
+++ b/image.darknet/inst/include/darknet/src/compare.c
@@ -54,7 +54,7 @@ void train_compare(char *cfgfile, char *weightfile)
         float loss = train_network(net, train);
         if(avg_loss == -1) avg_loss = loss;
         avg_loss = avg_loss*.9 + loss*.1;
-        printf("%.3f: %f, %f avg, %lf seconds, %d images\n", (float)*net.seen/N, loss, avg_loss, sec(clock()-time), *net.seen);
+        printf("%.3f: %f, %f avg, %lf seconds, %ld images\n", (float)*net.seen/N, loss, avg_loss, sec(clock()-time), *net.seen);
         free_data(train);
         if(i%100 == 0){
             char buff[256];
diff --git a/image.darknet/inst/include/darknet/src/connected_layer.c b/image.darknet/inst/include/darknet/src/connected_layer.c
index b678ed0..353f4e5 100644
--- a/image.darknet/inst/include/darknet/src/connected_layer.c
+++ b/image.darknet/inst/include/darknet/src/connected_layer.c
@@ -1,4 +1,5 @@
 #include "connected_layer.h"
+#include "convolutional_layer.h"
 #include "batchnorm_layer.h"
 #include "utils.h"
 #include "cuda.h"
@@ -10,10 +11,11 @@
 #include <stdlib.h>
 #include <string.h>
 
-connected_layer make_connected_layer(int batch, int inputs, int outputs, ACTIVATION activation, int batch_normalize)
+layer make_connected_layer(int batch, int inputs, int outputs, ACTIVATION activation, int batch_normalize, int adam)
 {
     int i;
-    connected_layer l = {0};
+    layer l = {0};
+    l.learning_rate_scale = 1;
     l.type = CONNECTED;
 
     l.inputs = inputs;
@@ -50,6 +52,14 @@ connected_layer make_connected_layer(int batch, int inputs, int outputs, ACTIVAT
         l.biases[i] = 0;
     }
 
+    if(adam){
+        l.m = calloc(l.inputs*l.outputs, sizeof(float));
+        l.v = calloc(l.inputs*l.outputs, sizeof(float));
+        l.bias_m = calloc(l.outputs, sizeof(float));
+        l.scale_m = calloc(l.outputs, sizeof(float));
+        l.bias_v = calloc(l.outputs, sizeof(float));
+        l.scale_v = calloc(l.outputs, sizeof(float));
+    }
     if(batch_normalize){
         l.scales = calloc(outputs, sizeof(float));
         l.scale_updates = calloc(outputs, sizeof(float));
@@ -82,10 +92,16 @@ connected_layer make_connected_layer(int batch, int inputs, int outputs, ACTIVAT
 
     l.output_gpu = cuda_make_array(l.output, outputs*batch);
     l.delta_gpu = cuda_make_array(l.delta, outputs*batch);
-    if(batch_normalize){
-        l.scales_gpu = cuda_make_array(l.scales, outputs);
-        l.scale_updates_gpu = cuda_make_array(l.scale_updates, outputs);
+    if (adam) {
+        l.m_gpu =       cuda_make_array(0, inputs*outputs);
+        l.v_gpu =       cuda_make_array(0, inputs*outputs);
+        l.bias_m_gpu =  cuda_make_array(0, outputs);
+        l.bias_v_gpu =  cuda_make_array(0, outputs);
+        l.scale_m_gpu = cuda_make_array(0, outputs);
+        l.scale_v_gpu = cuda_make_array(0, outputs);
+    }
 
+    if(batch_normalize){
         l.mean_gpu = cuda_make_array(l.mean, outputs);
         l.variance_gpu = cuda_make_array(l.variance, outputs);
 
@@ -95,8 +111,17 @@ connected_layer make_connected_layer(int batch, int inputs, int outputs, ACTIVAT
         l.mean_delta_gpu = cuda_make_array(l.mean, outputs);
         l.variance_delta_gpu = cuda_make_array(l.variance, outputs);
 
+        l.scales_gpu = cuda_make_array(l.scales, outputs);
+        l.scale_updates_gpu = cuda_make_array(l.scale_updates, outputs);
+
         l.x_gpu = cuda_make_array(l.output, l.batch*outputs);
         l.x_norm_gpu = cuda_make_array(l.output, l.batch*outputs);
+#ifdef CUDNN
+        cudnnCreateTensorDescriptor(&l.normTensorDesc);
+        cudnnCreateTensorDescriptor(&l.dstTensorDesc);
+        cudnnSetTensor4dDescriptor(l.dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l.batch, l.out_c, l.out_h, l.out_w); 
+        cudnnSetTensor4dDescriptor(l.normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l.out_c, 1, 1); 
+#endif
     }
 #endif
     l.activation = activation;
@@ -104,8 +129,12 @@ connected_layer make_connected_layer(int batch, int inputs, int outputs, ACTIVAT
     return l;
 }
 
-void update_connected_layer(connected_layer l, int batch, float learning_rate, float momentum, float decay)
+void update_connected_layer(layer l, update_args a)
 {
+    float learning_rate = a.learning_rate*l.learning_rate_scale;
+    float momentum = a.momentum;
+    float decay = a.decay;
+    int batch = a.batch;
     axpy_cpu(l.outputs, learning_rate/batch, l.bias_updates, 1, l.biases, 1);
     scal_cpu(l.outputs, momentum, l.bias_updates, 1);
 
@@ -119,63 +148,39 @@ void update_connected_layer(connected_layer l, int batch, float learning_rate, f
     scal_cpu(l.inputs*l.outputs, momentum, l.weight_updates, 1);
 }
 
-void forward_connected_layer(connected_layer l, network_state state)
+void forward_connected_layer(layer l, network net)
 {
-    int i;
     fill_cpu(l.outputs*l.batch, 0, l.output, 1);
     int m = l.batch;
     int k = l.inputs;
     int n = l.outputs;
-    float *a = state.input;
+    float *a = net.input;
     float *b = l.weights;
     float *c = l.output;
     gemm(0,1,m,n,k,1,a,k,b,k,1,c,n);
     if(l.batch_normalize){
-        if(state.train){
-            mean_cpu(l.output, l.batch, l.outputs, 1, l.mean);
-            variance_cpu(l.output, l.mean, l.batch, l.outputs, 1, l.variance);
-
-            scal_cpu(l.outputs, .95, l.rolling_mean, 1);
-            axpy_cpu(l.outputs, .05, l.mean, 1, l.rolling_mean, 1);
-            scal_cpu(l.outputs, .95, l.rolling_variance, 1);
-            axpy_cpu(l.outputs, .05, l.variance, 1, l.rolling_variance, 1);
-
-            copy_cpu(l.outputs*l.batch, l.output, 1, l.x, 1);
-            normalize_cpu(l.output, l.mean, l.variance, l.batch, l.outputs, 1);   
-            copy_cpu(l.outputs*l.batch, l.output, 1, l.x_norm, 1);
-        } else {
-            normalize_cpu(l.output, l.rolling_mean, l.rolling_variance, l.batch, l.outputs, 1);
-        }
-        scale_bias(l.output, l.scales, l.batch, l.outputs, 1);
-    }
-    for(i = 0; i < l.batch; ++i){
-        axpy_cpu(l.outputs, 1, l.biases, 1, l.output + i*l.outputs, 1);
+        forward_batchnorm_layer(l, net);
+    } else {
+        add_bias(l.output, l.biases, l.batch, l.outputs, 1);
     }
     activate_array(l.output, l.outputs*l.batch, l.activation);
 }
 
-void backward_connected_layer(connected_layer l, network_state state)
+void backward_connected_layer(layer l, network net)
 {
-    int i;
     gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);
-    for(i = 0; i < l.batch; ++i){
-        axpy_cpu(l.outputs, 1, l.delta + i*l.outputs, 1, l.bias_updates, 1);
-    }
-    if(l.batch_normalize){
-        backward_scale_cpu(l.x_norm, l.delta, l.batch, l.outputs, 1, l.scale_updates);
-
-        scale_bias(l.delta, l.scales, l.batch, l.outputs, 1);
 
-        mean_delta_cpu(l.delta, l.variance, l.batch, l.outputs, 1, l.mean_delta);
-        variance_delta_cpu(l.x, l.delta, l.mean, l.variance, l.batch, l.outputs, 1, l.variance_delta);
-        normalize_delta_cpu(l.x, l.mean, l.variance, l.mean_delta, l.variance_delta, l.batch, l.outputs, 1, l.delta);
+    if(l.batch_normalize){
+        backward_batchnorm_layer(l, net);
+    } else {
+        backward_bias(l.bias_updates, l.delta, l.batch, l.outputs, 1);
     }
 
     int m = l.outputs;
     int k = l.batch;
     int n = l.inputs;
     float *a = l.delta;
-    float *b = state.input;
+    float *b = net.input;
     float *c = l.weight_updates;
     gemm(1,0,m,n,k,1,a,m,b,n,1,c,n);
 
@@ -185,7 +190,7 @@ void backward_connected_layer(connected_layer l, network_state state)
 
     a = l.delta;
     b = l.weights;
-    c = state.delta;
+    c = net.delta;
 
     if(c) gemm(0,0,m,n,k,1,a,k,b,n,1,c,n);
 }
@@ -213,11 +218,11 @@ void statistics_connected_layer(layer l)
         printf("Scales ");
         print_statistics(l.scales, l.outputs);
         /*
-        printf("Rolling Mean ");
-        print_statistics(l.rolling_mean, l.outputs);
-        printf("Rolling Variance ");
-        print_statistics(l.rolling_variance, l.outputs);
-        */
+           printf("Rolling Mean ");
+           print_statistics(l.rolling_mean, l.outputs);
+           printf("Rolling Variance ");
+           print_statistics(l.rolling_variance, l.outputs);
+         */
     }
     printf("Biases ");
     print_statistics(l.biases, l.outputs);
@@ -227,7 +232,7 @@ void statistics_connected_layer(layer l)
 
 #ifdef GPU
 
-void pull_connected_layer(connected_layer l)
+void pull_connected_layer(layer l)
 {
     cuda_pull_array(l.weights_gpu, l.weights, l.inputs*l.outputs);
     cuda_pull_array(l.biases_gpu, l.biases, l.outputs);
@@ -240,7 +245,7 @@ void pull_connected_layer(connected_layer l)
     }
 }
 
-void push_connected_layer(connected_layer l)
+void push_connected_layer(layer l)
 {
     cuda_push_array(l.weights_gpu, l.weights, l.inputs*l.outputs);
     cuda_push_array(l.biases_gpu, l.biases, l.outputs);
@@ -253,62 +258,70 @@ void push_connected_layer(connected_layer l)
     }
 }
 
-void update_connected_layer_gpu(connected_layer l, int batch, float learning_rate, float momentum, float decay)
+void update_connected_layer_gpu(layer l, update_args a)
 {
-    axpy_ongpu(l.outputs, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);
-    scal_ongpu(l.outputs, momentum, l.bias_updates_gpu, 1);
+    float learning_rate = a.learning_rate*l.learning_rate_scale;
+    float momentum = a.momentum;
+    float decay = a.decay;
+    int batch = a.batch;
+    if(a.adam){
+        adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.inputs*l.outputs, batch, a.t);
+        adam_update_gpu(l.biases_gpu, l.bias_updates_gpu, l.bias_m_gpu, l.bias_v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.outputs, batch, a.t);
+        if(l.scales_gpu){
+            adam_update_gpu(l.scales_gpu, l.scale_updates_gpu, l.scale_m_gpu, l.scale_v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.outputs, batch, a.t);
+        }
+    }else{
+        axpy_gpu(l.outputs, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);
+        scal_gpu(l.outputs, momentum, l.bias_updates_gpu, 1);
 
-    if(l.batch_normalize){
-        axpy_ongpu(l.outputs, learning_rate/batch, l.scale_updates_gpu, 1, l.scales_gpu, 1);
-        scal_ongpu(l.outputs, momentum, l.scale_updates_gpu, 1);
-    }
+        if(l.batch_normalize){
+            axpy_gpu(l.outputs, learning_rate/batch, l.scale_updates_gpu, 1, l.scales_gpu, 1);
+            scal_gpu(l.outputs, momentum, l.scale_updates_gpu, 1);
+        }
 
-    axpy_ongpu(l.inputs*l.outputs, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
-    axpy_ongpu(l.inputs*l.outputs, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
-    scal_ongpu(l.inputs*l.outputs, momentum, l.weight_updates_gpu, 1);
+        axpy_gpu(l.inputs*l.outputs, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
+        axpy_gpu(l.inputs*l.outputs, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
+        scal_gpu(l.inputs*l.outputs, momentum, l.weight_updates_gpu, 1);
+    }
 }
 
-void forward_connected_layer_gpu(connected_layer l, network_state state)
+void forward_connected_layer_gpu(layer l, network net)
 {
-    int i;
-    fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1);
+    fill_gpu(l.outputs*l.batch, 0, l.output_gpu, 1);
 
     int m = l.batch;
     int k = l.inputs;
     int n = l.outputs;
-    float * a = state.input;
+    float * a = net.input_gpu;
     float * b = l.weights_gpu;
     float * c = l.output_gpu;
-    gemm_ongpu(0,1,m,n,k,1,a,k,b,k,1,c,n);
-    if(l.batch_normalize){
-        forward_batchnorm_layer_gpu(l, state);
-    }
-    for(i = 0; i < l.batch; ++i){
-        axpy_ongpu(l.outputs, 1, l.biases_gpu, 1, l.output_gpu + i*l.outputs, 1);
+    gemm_gpu(0,1,m,n,k,1,a,k,b,k,1,c,n);
+
+    if (l.batch_normalize) {
+        forward_batchnorm_layer_gpu(l, net);
+    } else {
+        add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.outputs, 1);
     }
-    activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
+    activate_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation);
 }
 
-void backward_connected_layer_gpu(connected_layer l, network_state state)
+void backward_connected_layer_gpu(layer l, network net)
 {
-    int i;
-    constrain_ongpu(l.outputs*l.batch, 1, l.delta_gpu, 1);
-    gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
-    for(i = 0; i < l.batch; ++i){
-        axpy_ongpu(l.outputs, 1, l.delta_gpu + i*l.outputs, 1, l.bias_updates_gpu, 1);
-    }
-
+    constrain_gpu(l.outputs*l.batch, 1, l.delta_gpu, 1);
+    gradient_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
     if(l.batch_normalize){
-        backward_batchnorm_layer_gpu(l, state);
+        backward_batchnorm_layer_gpu(l, net);
+    } else {
+        backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.outputs, 1);
     }
 
     int m = l.outputs;
     int k = l.batch;
     int n = l.inputs;
     float * a = l.delta_gpu;
-    float * b = state.input;
+    float * b = net.input_gpu;
     float * c = l.weight_updates_gpu;
-    gemm_ongpu(1,0,m,n,k,1,a,m,b,n,1,c,n);
+    gemm_gpu(1,0,m,n,k,1,a,m,b,n,1,c,n);
 
     m = l.batch;
     k = l.outputs;
@@ -316,8 +329,8 @@ void backward_connected_layer_gpu(connected_layer l, network_state state)
 
     a = l.delta_gpu;
     b = l.weights_gpu;
-    c = state.delta;
+    c = net.delta_gpu;
 
-    if(c) gemm_ongpu(0,0,m,n,k,1,a,k,b,n,1,c,n);
+    if(c) gemm_gpu(0,0,m,n,k,1,a,k,b,n,1,c,n);
 }
 #endif
diff --git a/image.darknet/inst/include/darknet/src/connected_layer.h b/image.darknet/inst/include/darknet/src/connected_layer.h
index 23797b1..6727a96 100644
--- a/image.darknet/inst/include/darknet/src/connected_layer.h
+++ b/image.darknet/inst/include/darknet/src/connected_layer.h
@@ -5,22 +5,18 @@
 #include "layer.h"
 #include "network.h"
 
-typedef layer connected_layer;
+layer make_connected_layer(int batch, int inputs, int outputs, ACTIVATION activation, int batch_normalize, int adam);
 
-connected_layer make_connected_layer(int batch, int inputs, int outputs, ACTIVATION activation, int batch_normalize);
-
-void forward_connected_layer(connected_layer layer, network_state state);
-void backward_connected_layer(connected_layer layer, network_state state);
-void update_connected_layer(connected_layer layer, int batch, float learning_rate, float momentum, float decay);
-void denormalize_connected_layer(layer l);
-void statistics_connected_layer(layer l);
+void forward_connected_layer(layer l, network net);
+void backward_connected_layer(layer l, network net);
+void update_connected_layer(layer l, update_args a);
 
 #ifdef GPU
-void forward_connected_layer_gpu(connected_layer layer, network_state state);
-void backward_connected_layer_gpu(connected_layer layer, network_state state);
-void update_connected_layer_gpu(connected_layer layer, int batch, float learning_rate, float momentum, float decay);
-void push_connected_layer(connected_layer layer);
-void pull_connected_layer(connected_layer layer);
+void forward_connected_layer_gpu(layer l, network net);
+void backward_connected_layer_gpu(layer l, network net);
+void update_connected_layer_gpu(layer l, update_args a);
+void push_connected_layer(layer l);
+void pull_connected_layer(layer l);
 #endif
 
 #endif
diff --git a/image.darknet/inst/include/darknet/src/convolutional_kernels.cu b/image.darknet/inst/include/darknet/src/convolutional_kernels.cu
index fcaea03..4a1047b 100644
--- a/image.darknet/inst/include/darknet/src/convolutional_kernels.cu
+++ b/image.darknet/inst/include/darknet/src/convolutional_kernels.cu
@@ -33,7 +33,7 @@ __global__ void binarize_input_kernel(float *input, int n, int size, float *bina
     int i = 0;
     float mean = 0;
     for(i = 0; i < n; ++i){
-        mean += abs(input[i*size + s]);
+        mean += fabsf(input[i*size + s]);
     }
     mean = mean / n;
     for(i = 0; i < n; ++i){
@@ -55,7 +55,7 @@ __global__ void binarize_weights_kernel(float *weights, int n, int size, float *
     int i = 0;
     float mean = 0;
     for(i = 0; i < size; ++i){
-        mean += abs(weights[f*size + i]);
+        mean += fabsf(weights[f*size + i]);
     }
     mean = mean / size;
     for(i = 0; i < size; ++i){
@@ -70,19 +70,19 @@ void binarize_weights_gpu(float *weights, int n, int size, float *binary)
     check_error(cudaPeekAtLastError());
 }
 
-void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)
+void forward_convolutional_layer_gpu(convolutional_layer l, network net)
 {
-    fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1);
+    fill_gpu(l.outputs*l.batch, 0, l.output_gpu, 1);
     if(l.binary){
-        binarize_weights_gpu(l.weights_gpu, l.n, l.c*l.size*l.size, l.binary_weights_gpu);
+        binarize_weights_gpu(l.weights_gpu, l.n, l.c/l.groups*l.size*l.size, l.binary_weights_gpu);
         swap_binary(&l);
     }
 
     if(l.xnor){
-        binarize_weights_gpu(l.weights_gpu, l.n, l.c*l.size*l.size, l.binary_weights_gpu);
+        binarize_weights_gpu(l.weights_gpu, l.n, l.c/l.groups*l.size*l.size, l.binary_weights_gpu);
         swap_binary(&l);
-        binarize_gpu(state.input, l.c*l.h*l.w*l.batch, l.binary_input_gpu);
-        state.input = l.binary_input_gpu;
+        binarize_gpu(net.input_gpu, l.c*l.h*l.w*l.batch, l.binary_input_gpu);
+        net.input_gpu = l.binary_input_gpu;
     }
 
 #ifdef CUDNN
@@ -90,74 +90,126 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)
     cudnnConvolutionForward(cudnn_handle(),
                 &one,
                 l.srcTensorDesc,
-                state.input,
+                net.input_gpu,
                 l.weightDesc,
                 l.weights_gpu,
                 l.convDesc,
                 l.fw_algo,
-                state.workspace,
+                net.workspace,
                 l.workspace_size,
                 &one,
                 l.dstTensorDesc,
                 l.output_gpu);
 
 #else
-    int i;
-    int m = l.n;
-    int k = l.size*l.size*l.c;
+    int i, j;
+    int m = l.n/l.groups;
+    int k = l.size*l.size*l.c/l.groups;
     int n = l.out_w*l.out_h;
     for(i = 0; i < l.batch; ++i){
-        im2col_ongpu(state.input + i*l.c*l.h*l.w, l.c,  l.h,  l.w,  l.size,  l.stride, l.pad, state.workspace);
-        float * a = l.weights_gpu;
-        float * b = state.workspace;
-        float * c = l.output_gpu;
-        gemm_ongpu(0,0,m,n,k,1.,a,k,b,n,1.,c+i*m*n,n);
+        for(j = 0; j < l.groups; ++j){
+            float *a = l.weights_gpu + j*l.nweights/l.groups;
+            float *b = net.workspace;
+            float *c = l.output_gpu + (i*l.groups + j)*n*m;
+            float *im = net.input_gpu + (i*l.groups + j)*l.c/l.groups*l.h*l.w;
+
+            if (l.size == 1){
+                b = im;
+            } else {
+                im2col_gpu(im, l.c/l.groups, l.h, l.w, l.size, l.stride, l.pad, b);
+            }
+            gemm_gpu(0,0,m,n,k,1,a,k,b,n,1,c,n);
+        }
     }
 #endif
 
     if (l.batch_normalize) {
-        forward_batchnorm_layer_gpu(l, state);
+        forward_batchnorm_layer_gpu(l, net);
+    } else {
+        add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h);
     }
-    add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h);
 
-    activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
+    activate_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation);
     //if(l.dot > 0) dot_error_gpu(l);
     if(l.binary || l.xnor) swap_binary(&l);
 }
 
-void backward_convolutional_layer_gpu(convolutional_layer l, network_state state)
+__global__ void smooth_kernel(float *x, int n, int w, int h, int c, int size, float rate, float *delta)
+{
+    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if(id >= n) return;
+
+    int j = id % w;
+    id /= w;
+    int i = id % h;
+    id /= h;
+    int k = id % c;
+    id /= c;
+    int b = id;
+
+    int w_offset = -(size/2.f);
+    int h_offset = -(size/2.f);
+
+    int out_index = j + w*(i + h*(k + c*b));
+    int l, m;
+    for(l = 0; l < size; ++l){
+        for(m = 0; m < size; ++m){
+            int cur_h = h_offset + i + l;
+            int cur_w = w_offset + j + m;
+            int index = cur_w + w*(cur_h + h*(k + b*c));
+            int valid = (cur_h >= 0 && cur_h < h &&
+                    cur_w >= 0 && cur_w < w);
+            delta[out_index] += valid ? rate*(x[index] - x[out_index]) : 0;
+        }
+    }
+}
+
+extern "C" void smooth_layer(layer l, int size, float rate) // launches smooth_kernel over l.output_gpu, accumulating into l.delta_gpu
 {
-    //constrain_ongpu(l.outputs*l.batch, 1, l.delta_gpu, 1);
-    gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
+    int h = l.out_h;
+    int w = l.out_w;
+    int c = l.out_c;
+
+    size_t n = h*w*c*l.batch; // one kernel thread per output element
+
+    smooth_kernel<<<cuda_gridsize(n), BLOCK>>>(l.output_gpu, n, w, h, c, size, rate, l.delta_gpu); // pass the output dims that n was sized from, not the input dims l.w/l.h/l.c
+    check_error(cudaPeekAtLastError());
+}
+
+void backward_convolutional_layer_gpu(convolutional_layer l, network net)
+{
+    if(l.smooth){
+        smooth_layer(l, 5, l.smooth);
+    }
+    //constrain_gpu(l.outputs*l.batch, 1, l.delta_gpu, 1);
+    gradient_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
 
-    backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.n, l.out_w*l.out_h);
 
     if(l.batch_normalize){
-        backward_batchnorm_layer_gpu(l, state);
-        //axpy_ongpu(l.outputs*l.batch, -state.net.decay, l.x_gpu, 1, l.delta_gpu, 1);
+        backward_batchnorm_layer_gpu(l, net);
     } else {
-        //axpy_ongpu(l.outputs*l.batch, -state.net.decay, l.output_gpu, 1, l.delta_gpu, 1);
+        backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.n, l.out_w*l.out_h);
     }
-    float *original_input = state.input;
+    float *original_input = net.input_gpu;
 
-    if(l.xnor) state.input = l.binary_input_gpu;
+    if(l.xnor) net.input_gpu = l.binary_input_gpu;
 #ifdef CUDNN
     float one = 1;
     cudnnConvolutionBackwardFilter(cudnn_handle(),
             &one,
             l.srcTensorDesc,
-            state.input,
+            net.input_gpu,
             l.ddstTensorDesc,
             l.delta_gpu,
             l.convDesc,
             l.bf_algo,
-            state.workspace,
+            net.workspace,
             l.workspace_size,
             &one,
             l.dweightDesc,
             l.weight_updates_gpu);
 
-    if(state.delta){
+    if(net.delta_gpu){
         if(l.binary || l.xnor) swap_binary(&l);
         cudnnConvolutionBackwardData(cudnn_handle(),
                 &one,
@@ -167,108 +219,111 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state
                 l.delta_gpu,
                 l.convDesc,
                 l.bd_algo,
-                state.workspace,
+                net.workspace,
                 l.workspace_size,
                 &one,
                 l.dsrcTensorDesc,
-                state.delta);
+                net.delta_gpu);
         if(l.binary || l.xnor) swap_binary(&l);
-        if(l.xnor) gradient_array_ongpu(original_input, l.batch*l.c*l.h*l.w, HARDTAN, state.delta);
+        if(l.xnor) gradient_array_gpu(original_input, l.batch*l.c*l.h*l.w, HARDTAN, net.delta_gpu);
     }
 
 #else
-    int m = l.n;
-    int n = l.size*l.size*l.c;
+    int m = l.n/l.groups;
+    int n = l.size*l.size*l.c/l.groups;
     int k = l.out_w*l.out_h;
 
-    int i;
+    int i, j;
     for(i = 0; i < l.batch; ++i){
-        float * a = l.delta_gpu;
-        float * b = state.workspace;
-        float * c = l.weight_updates_gpu;
-
-        im2col_ongpu(state.input + i*l.c*l.h*l.w, l.c,  l.h,  l.w,  l.size,  l.stride, l.pad, state.workspace);
-        gemm_ongpu(0,1,m,n,k,1,a + i*m*k,k,b,k,1,c,n);
-
-        if(state.delta){
-            if(l.binary || l.xnor) swap_binary(&l);
-            float * a = l.weights_gpu;
-            float * b = l.delta_gpu;
-            float * c = state.workspace;
-
-            gemm_ongpu(1,0,n,k,m,1,a,n,b + i*k*m,k,0,c,k);
-
-            col2im_ongpu(state.workspace, l.c,  l.h,  l.w,  l.size,  l.stride, l.pad, state.delta + i*l.c*l.h*l.w);
-            if(l.binary || l.xnor) {
-                swap_binary(&l);
+        for(j = 0; j < l.groups; ++j){
+            float *a = l.delta_gpu + (i*l.groups + j)*m*k;
+            float *b = net.workspace;
+            float *c = l.weight_updates_gpu + j*l.nweights/l.groups;
+
+            float *im  = net.input_gpu+(i*l.groups + j)*l.c/l.groups*l.h*l.w;
+            float *imd = net.delta_gpu+(i*l.groups + j)*l.c/l.groups*l.h*l.w;
+
+            im2col_gpu(im, l.c/l.groups, l.h, l.w, l.size, l.stride, l.pad, b);
+            gemm_gpu(0,1,m,n,k,1,a,k,b,k,1,c,n);
+
+            if (net.delta_gpu) {
+                if (l.binary || l.xnor) swap_binary(&l);
+                a = l.weights_gpu + j*l.nweights/l.groups;
+                b = l.delta_gpu + (i*l.groups + j)*m*k;
+                c = net.workspace;
+                if (l.size == 1) {
+                    c = imd;
+                }
+
+                gemm_gpu(1,0,n,k,m,1,a,n,b,k,0,c,k);
+
+                if (l.size != 1) {
+                    col2im_gpu(net.workspace, l.c/l.groups, l.h, l.w, l.size, l.stride, l.pad, imd);
+                }
+                if(l.binary || l.xnor) {
+                    swap_binary(&l);
+                }
             }
-            if(l.xnor) gradient_array_ongpu(original_input + i*l.c*l.h*l.w, l.c*l.h*l.w, HARDTAN, state.delta + i*l.c*l.h*l.w);
+            if(l.xnor && net.delta_gpu) gradient_array_gpu(original_input + i*l.c*l.h*l.w, l.c*l.h*l.w, HARDTAN, net.delta_gpu + i*l.c*l.h*l.w); // only touch net.delta_gpu when it exists, matching the CUDNN branch
         }
     }
 #endif
 }
 
-void pull_convolutional_layer(convolutional_layer layer)
+void pull_convolutional_layer(layer l)
 {
-    cuda_pull_array(layer.weights_gpu, layer.weights, layer.c*layer.n*layer.size*layer.size);
-    cuda_pull_array(layer.biases_gpu, layer.biases, layer.n);
-    cuda_pull_array(layer.weight_updates_gpu, layer.weight_updates, layer.c*layer.n*layer.size*layer.size);
-    cuda_pull_array(layer.bias_updates_gpu, layer.bias_updates, layer.n);
-    if (layer.batch_normalize){
-        cuda_pull_array(layer.scales_gpu, layer.scales, layer.n);
-        cuda_pull_array(layer.rolling_mean_gpu, layer.rolling_mean, layer.n);
-        cuda_pull_array(layer.rolling_variance_gpu, layer.rolling_variance, layer.n);
-    }
-    if (layer.adam){
-        cuda_pull_array(layer.m_gpu, layer.m, layer.c*layer.n*layer.size*layer.size);
-        cuda_pull_array(layer.v_gpu, layer.v, layer.c*layer.n*layer.size*layer.size);
+    cuda_pull_array(l.weights_gpu, l.weights, l.nweights);
+    cuda_pull_array(l.biases_gpu, l.biases, l.n);
+    cuda_pull_array(l.weight_updates_gpu, l.weight_updates, l.nweights);
+    cuda_pull_array(l.bias_updates_gpu, l.bias_updates, l.n);
+    if (l.batch_normalize){
+        cuda_pull_array(l.scales_gpu, l.scales, l.n);
+        cuda_pull_array(l.rolling_mean_gpu, l.rolling_mean, l.n);
+        cuda_pull_array(l.rolling_variance_gpu, l.rolling_variance, l.n);
     }
 }
 
-void push_convolutional_layer(convolutional_layer layer)
+void push_convolutional_layer(layer l)
 {
-    cuda_push_array(layer.weights_gpu, layer.weights, layer.c*layer.n*layer.size*layer.size);
-    cuda_push_array(layer.biases_gpu, layer.biases, layer.n);
-    cuda_push_array(layer.weight_updates_gpu, layer.weight_updates, layer.c*layer.n*layer.size*layer.size);
-    cuda_push_array(layer.bias_updates_gpu, layer.bias_updates, layer.n);
-    if (layer.batch_normalize){
-        cuda_push_array(layer.scales_gpu, layer.scales, layer.n);
-        cuda_push_array(layer.rolling_mean_gpu, layer.rolling_mean, layer.n);
-        cuda_push_array(layer.rolling_variance_gpu, layer.rolling_variance, layer.n);
-    }
-    if (layer.adam){
-        cuda_push_array(layer.m_gpu, layer.m, layer.c*layer.n*layer.size*layer.size);
-        cuda_push_array(layer.v_gpu, layer.v, layer.c*layer.n*layer.size*layer.size);
+    cuda_push_array(l.weights_gpu, l.weights, l.nweights);
+    cuda_push_array(l.biases_gpu, l.biases, l.n);
+    cuda_push_array(l.weight_updates_gpu, l.weight_updates, l.nweights);
+    cuda_push_array(l.bias_updates_gpu, l.bias_updates, l.n);
+    if (l.batch_normalize){
+        cuda_push_array(l.scales_gpu, l.scales, l.n);
+        cuda_push_array(l.rolling_mean_gpu, l.rolling_mean, l.n);
+        cuda_push_array(l.rolling_variance_gpu, l.rolling_variance, l.n);
     }
 }
 
-void update_convolutional_layer_gpu(convolutional_layer layer, int batch, float learning_rate, float momentum, float decay)
+void update_convolutional_layer_gpu(layer l, update_args a)
 {
-    int size = layer.size*layer.size*layer.c*layer.n;
-    axpy_ongpu(layer.n, learning_rate/batch, layer.bias_updates_gpu, 1, layer.biases_gpu, 1);
-    scal_ongpu(layer.n, momentum, layer.bias_updates_gpu, 1);
-
-    if(layer.scales_gpu){
-        axpy_ongpu(layer.n, learning_rate/batch, layer.scale_updates_gpu, 1, layer.scales_gpu, 1);
-        scal_ongpu(layer.n, momentum, layer.scale_updates_gpu, 1);
-    }
-
-    if(layer.adam){
-        scal_ongpu(size, layer.B1, layer.m_gpu, 1);
-        scal_ongpu(size, layer.B2, layer.v_gpu, 1);
-
-        axpy_ongpu(size, -decay*batch, layer.weights_gpu, 1, layer.weight_updates_gpu, 1);
+    float learning_rate = a.learning_rate*l.learning_rate_scale;
+    float momentum = a.momentum;
+    float decay = a.decay;
+    int batch = a.batch;
+
+    if(a.adam){
+        adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.nweights, batch, a.t);
+        adam_update_gpu(l.biases_gpu, l.bias_updates_gpu, l.bias_m_gpu, l.bias_v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.n, batch, a.t);
+        if(l.scales_gpu){
+            adam_update_gpu(l.scales_gpu, l.scale_updates_gpu, l.scale_m_gpu, l.scale_v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.n, batch, a.t);
+        }
+    }else{
+        axpy_gpu(l.nweights, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
+        axpy_gpu(l.nweights, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
+        scal_gpu(l.nweights, momentum, l.weight_updates_gpu, 1);
 
-        axpy_ongpu(size, -(1-layer.B1), layer.weight_updates_gpu, 1, layer.m_gpu, 1);
-        mul_ongpu(size, layer.weight_updates_gpu, 1, layer.weight_updates_gpu, 1);
-        axpy_ongpu(size, (1-layer.B2), layer.weight_updates_gpu, 1, layer.v_gpu, 1);
+        axpy_gpu(l.n, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);
+        scal_gpu(l.n, momentum, l.bias_updates_gpu, 1);
 
-        adam_gpu(size, layer.weights_gpu, layer.m_gpu, layer.v_gpu, layer.B1, layer.B2, learning_rate/batch, layer.eps, layer.t+1);
-        fill_ongpu(size, 0, layer.weight_updates_gpu, 1);
-    }else{
-        axpy_ongpu(size, -decay*batch, layer.weights_gpu, 1, layer.weight_updates_gpu, 1);
-        axpy_ongpu(size, learning_rate/batch, layer.weight_updates_gpu, 1, layer.weights_gpu, 1);
-        scal_ongpu(size, momentum, layer.weight_updates_gpu, 1);
+        if(l.scales_gpu){
+            axpy_gpu(l.n, learning_rate/batch, l.scale_updates_gpu, 1, l.scales_gpu, 1);
+            scal_gpu(l.n, momentum, l.scale_updates_gpu, 1);
+        }
+    }
+    if(l.clip){
+        constrain_gpu(l.nweights, l.clip, l.weights_gpu, 1);
     }
 }
 
diff --git a/image.darknet/inst/include/darknet/src/convolutional_layer.c b/image.darknet/inst/include/darknet/src/convolutional_layer.c
index 37211ab..1fb58b0 100644
--- a/image.darknet/inst/include/darknet/src/convolutional_layer.c
+++ b/image.darknet/inst/include/darknet/src/convolutional_layer.c
@@ -12,22 +12,17 @@
 #include "xnor_layer.h"
 #endif
 
-#ifndef AI2
-#define AI2 0
-void forward_xnor_layer(layer l, network_state state);
-#endif
-
 void swap_binary(convolutional_layer *l)
 {
     float *swap = l->weights;
     l->weights = l->binary_weights;
     l->binary_weights = swap;
 
-    #ifdef GPU
+#ifdef GPU
     swap = l->weights_gpu;
     l->weights_gpu = l->binary_weights_gpu;
     l->binary_weights_gpu = swap;
-    #endif
+#endif
 }
 
 void binarize_weights(float *weights, int n, int size, float *binary)
@@ -80,23 +75,15 @@ int convolutional_out_width(convolutional_layer l)
 
 image get_convolutional_image(convolutional_layer l)
 {
-    int h,w,c;
-    h = convolutional_out_height(l);
-    w = convolutional_out_width(l);
-    c = l.n;
-    return float_to_image(w,h,c,l.output);
+    return float_to_image(l.out_w,l.out_h,l.out_c,l.output);
 }
 
 image get_convolutional_delta(convolutional_layer l)
 {
-    int h,w,c;
-    h = convolutional_out_height(l);
-    w = convolutional_out_width(l);
-    c = l.n;
-    return float_to_image(w,h,c,l.delta);
+    return float_to_image(l.out_w,l.out_h,l.out_c,l.delta);
 }
 
-size_t get_workspace_size(layer l){
+static size_t get_workspace_size(layer l){
 #ifdef CUDNN
     if(gpu_index >= 0){
         size_t most = 0;
@@ -127,8 +114,8 @@ size_t get_workspace_size(layer l){
         if (s > most) most = s;
         return most;
     }
-    #endif
-    return (size_t)l.out_h*l.out_w*l.size*l.size*l.c*sizeof(float);
+#endif
+    return (size_t)l.out_h*l.out_w*l.size*l.size*l.c/l.groups*sizeof(float);
 }
 
 #ifdef GPU
@@ -137,46 +124,62 @@ void cudnn_convolutional_setup(layer *l)
 {
     cudnnSetTensor4dDescriptor(l->dsrcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w); 
     cudnnSetTensor4dDescriptor(l->ddstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w); 
-    cudnnSetFilter4dDescriptor(l->dweightDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size); 
 
     cudnnSetTensor4dDescriptor(l->srcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w); 
     cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w); 
-    cudnnSetFilter4dDescriptor(l->weightDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size); 
+    cudnnSetTensor4dDescriptor(l->normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l->out_c, 1, 1); 
+
+    cudnnSetFilter4dDescriptor(l->dweightDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c/l->groups, l->size, l->size); 
+    cudnnSetFilter4dDescriptor(l->weightDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c/l->groups, l->size, l->size); 
+    #if CUDNN_MAJOR >= 6
+    cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT);
+    #else
     cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION);
+    #endif
+
+    #if CUDNN_MAJOR >= 7
+    cudnnSetConvolutionGroupCount(l->convDesc, l->groups);
+    #else
+    if(l->groups > 1){
+        error("CUDNN < 7 doesn't support groups, please upgrade!");
+    }
+    #endif
+
     cudnnGetConvolutionForwardAlgorithm(cudnn_handle(),
             l->srcTensorDesc,
             l->weightDesc,
             l->convDesc,
             l->dstTensorDesc,
-            CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
-            0,
+            CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+            2000000000,
             &l->fw_algo);
     cudnnGetConvolutionBackwardDataAlgorithm(cudnn_handle(),
             l->weightDesc,
             l->ddstTensorDesc,
             l->convDesc,
             l->dsrcTensorDesc,
-            CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST,
-            0,
+            CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
+            2000000000,
             &l->bd_algo);
     cudnnGetConvolutionBackwardFilterAlgorithm(cudnn_handle(),
             l->srcTensorDesc,
             l->ddstTensorDesc,
             l->convDesc,
             l->dweightDesc,
-            CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST,
-            0,
+            CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
+            2000000000,
             &l->bf_algo);
 }
 #endif
 #endif
 
-convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam)
+convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int n, int groups, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam)
 {
     int i;
     convolutional_layer l = {0};
     l.type = CONVOLUTIONAL;
 
+    l.groups = groups;
     l.h = h;
     l.w = w;
     l.c = c;
@@ -189,17 +192,23 @@ convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int
     l.pad = padding;
     l.batch_normalize = batch_normalize;
 
-    l.weights = calloc(c*n*size*size, sizeof(float));
-    l.weight_updates = calloc(c*n*size*size, sizeof(float));
+    l.weights = calloc(c/groups*n*size*size, sizeof(float));
+    l.weight_updates = calloc(c/groups*n*size*size, sizeof(float));
 
     l.biases = calloc(n, sizeof(float));
     l.bias_updates = calloc(n, sizeof(float));
 
+    l.nweights = c/groups*n*size*size;
+    l.nbiases = n;
+
     // float scale = 1./sqrt(size*size*c);
-    float scale = sqrt(2./(size*size*c));
-    for(i = 0; i < c*n*size*size; ++i) l.weights[i] = scale*rand_uniform(-1, 1);
-    int out_h = convolutional_out_height(l);
+    float scale = sqrt(2./(size*size*c/l.groups));
+    //printf("convscale %f\n", scale);
+    //scale = .02;
+    //for(i = 0; i < c*n*size*size; ++i) l.weights[i] = scale*rand_uniform(-1, 1);
+    for(i = 0; i < l.nweights; ++i) l.weights[i] = scale*rand_normal();
     int out_w = convolutional_out_width(l);
+    int out_h = convolutional_out_height(l);
     l.out_h = out_h;
     l.out_w = out_w;
     l.out_c = n;
@@ -213,12 +222,12 @@ convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int
     l.backward = backward_convolutional_layer;
     l.update = update_convolutional_layer;
     if(binary){
-        l.binary_weights = calloc(c*n*size*size, sizeof(float));
-        l.cweights = calloc(c*n*size*size, sizeof(char));
+        l.binary_weights = calloc(l.nweights, sizeof(float));
+        l.cweights = calloc(l.nweights, sizeof(char));
         l.scales = calloc(n, sizeof(float));
     }
     if(xnor){
-        l.binary_weights = calloc(c*n*size*size, sizeof(float));
+        l.binary_weights = calloc(l.nweights, sizeof(float));
         l.binary_input = calloc(l.inputs*l.batch, sizeof(float));
     }
 
@@ -241,9 +250,12 @@ convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int
         l.x_norm = calloc(l.batch*l.outputs, sizeof(float));
     }
     if(adam){
-        l.adam = 1;
-        l.m = calloc(c*n*size*size, sizeof(float));
-        l.v = calloc(c*n*size*size, sizeof(float));
+        l.m = calloc(l.nweights, sizeof(float));
+        l.v = calloc(l.nweights, sizeof(float));
+        l.bias_m = calloc(n, sizeof(float));
+        l.scale_m = calloc(n, sizeof(float));
+        l.bias_v = calloc(n, sizeof(float));
+        l.scale_v = calloc(n, sizeof(float));
     }
 
 #ifdef GPU
@@ -253,12 +265,16 @@ convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int
 
     if(gpu_index >= 0){
         if (adam) {
-            l.m_gpu = cuda_make_array(l.m, c*n*size*size);
-            l.v_gpu = cuda_make_array(l.v, c*n*size*size);
+            l.m_gpu = cuda_make_array(l.m, l.nweights);
+            l.v_gpu = cuda_make_array(l.v, l.nweights);
+            l.bias_m_gpu = cuda_make_array(l.bias_m, n);
+            l.bias_v_gpu = cuda_make_array(l.bias_v, n);
+            l.scale_m_gpu = cuda_make_array(l.scale_m, n);
+            l.scale_v_gpu = cuda_make_array(l.scale_v, n);
         }
 
-        l.weights_gpu = cuda_make_array(l.weights, c*n*size*size);
-        l.weight_updates_gpu = cuda_make_array(l.weight_updates, c*n*size*size);
+        l.weights_gpu = cuda_make_array(l.weights, l.nweights);
+        l.weight_updates_gpu = cuda_make_array(l.weight_updates, l.nweights);
 
         l.biases_gpu = cuda_make_array(l.biases, n);
         l.bias_updates_gpu = cuda_make_array(l.bias_updates, n);
@@ -267,10 +283,10 @@ convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int
         l.output_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);
 
         if(binary){
-            l.binary_weights_gpu = cuda_make_array(l.weights, c*n*size*size);
+            l.binary_weights_gpu = cuda_make_array(l.weights, l.nweights);
         }
         if(xnor){
-            l.binary_weights_gpu = cuda_make_array(l.weights, c*n*size*size);
+            l.binary_weights_gpu = cuda_make_array(l.weights, l.nweights);
             l.binary_input_gpu = cuda_make_array(0, l.inputs*l.batch);
         }
 
@@ -291,6 +307,7 @@ convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int
             l.x_norm_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);
         }
 #ifdef CUDNN
+        cudnnCreateTensorDescriptor(&l.normTensorDesc);
         cudnnCreateTensorDescriptor(&l.srcTensorDesc);
         cudnnCreateTensorDescriptor(&l.dstTensorDesc);
         cudnnCreateFilterDescriptor(&l.weightDesc);
@@ -305,7 +322,7 @@ convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int
     l.workspace_size = get_workspace_size(l);
     l.activation = activation;
 
-    fprintf(stderr, "conv  %5d %2d x%2d /%2d  %4d x%4d x%4d   ->  %4d x%4d x%4d\n", n, size, size, stride, w, h, c, l.out_w, l.out_h, l.out_c);
+    fprintf(stderr, "conv  %5d %2d x%2d /%2d  %4d x%4d x%4d   ->  %4d x%4d x%4d  %5.3f BFLOPs\n", n, size, size, stride, w, h, c, l.out_w, l.out_h, l.out_c, (2.0 * l.n * l.size*l.size*l.c/l.groups * l.out_h*l.out_w)/1000000000.);
 
     return l;
 }
@@ -315,8 +332,8 @@ void denormalize_convolutional_layer(convolutional_layer l)
     int i, j;
     for(i = 0; i < l.n; ++i){
         float scale = l.scales[i]/sqrt(l.rolling_variance[i] + .00001);
-        for(j = 0; j < l.c*l.size*l.size; ++j){
-            l.weights[i*l.c*l.size*l.size + j] *= scale;
+        for(j = 0; j < l.c/l.groups*l.size*l.size; ++j){
+            l.weights[i*l.c/l.groups*l.size*l.size + j] *= scale;
         }
         l.biases[i] -= l.rolling_mean[i] * scale;
         l.scales[i] = 1;
@@ -325,6 +342,7 @@ void denormalize_convolutional_layer(convolutional_layer l)
     }
 }
 
+/*
 void test_convolutional_layer()
 {
     convolutional_layer l = make_convolutional_layer(1, 5, 5, 3, 2, 5, 2, 1, LEAKY, 1, 0, 0, 0);
@@ -344,10 +362,10 @@ void test_convolutional_layer()
         3,3,3,3,3,
         3,3,3,3,3,
         3,3,3,3,3};
-    network_state state = {0};
-    state.input = data;
-    forward_convolutional_layer(l, state);
+    //net.input = data;
+    //forward_convolutional_layer(l);
 }
+*/
 
 void resize_convolutional_layer(convolutional_layer *l, int w, int h)
 {
@@ -424,88 +442,106 @@ void backward_bias(float *bias_updates, float *delta, int batch, int n, int size
     }
 }
 
-void forward_convolutional_layer(convolutional_layer l, network_state state)
+void forward_convolutional_layer(convolutional_layer l, network net)
 {
-    int out_h = convolutional_out_height(l);
-    int out_w = convolutional_out_width(l);
-    int i;
+    int i, j;
 
     fill_cpu(l.outputs*l.batch, 0, l.output, 1);
 
     if(l.xnor){
-        binarize_weights(l.weights, l.n, l.c*l.size*l.size, l.binary_weights);
+        binarize_weights(l.weights, l.n, l.c/l.groups*l.size*l.size, l.binary_weights);
         swap_binary(&l);
-        binarize_cpu(state.input, l.c*l.h*l.w*l.batch, l.binary_input);
-        state.input = l.binary_input;
+        binarize_cpu(net.input, l.c*l.h*l.w*l.batch, l.binary_input);
+        net.input = l.binary_input;
     }
 
-    int m = l.n;
-    int k = l.size*l.size*l.c;
-    int n = out_h*out_w;
-
-
-    float *a = l.weights;
-    float *b = state.workspace;
-    float *c = l.output;
-
+    int m = l.n/l.groups;
+    int k = l.size*l.size*l.c/l.groups;
+    int n = l.out_w*l.out_h;
     for(i = 0; i < l.batch; ++i){
-        im2col_cpu(state.input, l.c, l.h, l.w, 
-                l.size, l.stride, l.pad, b);
-        gemm(0,0,m,n,k,1,a,k,b,n,1,c,n);
-        c += n*m;
-        state.input += l.c*l.h*l.w;
+        for(j = 0; j < l.groups; ++j){
+            float *a = l.weights + j*l.nweights/l.groups;
+            float *b = net.workspace;
+            float *c = l.output + (i*l.groups + j)*n*m;
+            float *im =  net.input + (i*l.groups + j)*l.c/l.groups*l.h*l.w;
+
+            if (l.size == 1) {
+                b = im;
+            } else {
+                im2col_cpu(im, l.c/l.groups, l.h, l.w, l.size, l.stride, l.pad, b);
+            }
+            gemm(0,0,m,n,k,1,a,k,b,n,1,c,n);
+        }
     }
 
     if(l.batch_normalize){
-        forward_batchnorm_layer(l, state);
+        forward_batchnorm_layer(l, net);
+    } else {
+        add_bias(l.output, l.biases, l.batch, l.n, l.out_h*l.out_w);
     }
-    add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w);
 
-    activate_array(l.output, m*n*l.batch, l.activation);
+    activate_array(l.output, l.outputs*l.batch, l.activation);
     if(l.binary || l.xnor) swap_binary(&l);
 }
 
-void backward_convolutional_layer(convolutional_layer l, network_state state)
+void backward_convolutional_layer(convolutional_layer l, network net)
 {
-    int i;
-    int m = l.n;
-    int n = l.size*l.size*l.c;
-    int k = convolutional_out_height(l)*
-        convolutional_out_width(l);
+    int i, j;
+    int m = l.n/l.groups;
+    int n = l.size*l.size*l.c/l.groups;
+    int k = l.out_w*l.out_h;
 
-    gradient_array(l.output, m*k*l.batch, l.activation, l.delta);
-    backward_bias(l.bias_updates, l.delta, l.batch, l.n, k);
+    gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);
 
     if(l.batch_normalize){
-        backward_batchnorm_layer(l, state);
+        backward_batchnorm_layer(l, net);
+    } else {
+        backward_bias(l.bias_updates, l.delta, l.batch, l.n, k);
     }
 
     for(i = 0; i < l.batch; ++i){
-        float *a = l.delta + i*m*k;
-        float *b = state.workspace;
-        float *c = l.weight_updates;
-
-        float *im = state.input+i*l.c*l.h*l.w;
+        for(j = 0; j < l.groups; ++j){
+            float *a = l.delta + (i*l.groups + j)*m*k;
+            float *b = net.workspace;
+            float *c = l.weight_updates + j*l.nweights/l.groups;
+
+            float *im  = net.input + (i*l.groups + j)*l.c/l.groups*l.h*l.w;
+            float *imd = net.delta + (i*l.groups + j)*l.c/l.groups*l.h*l.w;
+
+            if(l.size == 1){
+                b = im;
+            } else {
+                im2col_cpu(im, l.c/l.groups, l.h, l.w, 
+                        l.size, l.stride, l.pad, b);
+            }
 
-        im2col_cpu(im, l.c, l.h, l.w, 
-                l.size, l.stride, l.pad, b);
-        gemm(0,1,m,n,k,1,a,k,b,k,1,c,n);
+            gemm(0,1,m,n,k,1,a,k,b,k,1,c,n);
 
-        if(state.delta){
-            a = l.weights;
-            b = l.delta + i*m*k;
-            c = state.workspace;
+            if (net.delta) {
+                a = l.weights + j*l.nweights/l.groups;
+                b = l.delta + (i*l.groups + j)*m*k;
+                c = net.workspace;
+                if (l.size == 1) {
+                    c = imd;
+                }
 
-            gemm(1,0,n,k,m,1,a,n,b,k,0,c,k);
+                gemm(1,0,n,k,m,1,a,n,b,k,0,c,k);
 
-            col2im_cpu(state.workspace, l.c,  l.h,  l.w,  l.size,  l.stride, l.pad, state.delta+i*l.c*l.h*l.w);
+                if (l.size != 1) {
+                    col2im_cpu(net.workspace, l.c/l.groups, l.h, l.w, l.size, l.stride, l.pad, imd);
+                }
+            }
         }
     }
 }
 
-void update_convolutional_layer(convolutional_layer l, int batch, float learning_rate, float momentum, float decay)
+void update_convolutional_layer(convolutional_layer l, update_args a)
 {
-    int size = l.size*l.size*l.c*l.n;
+    float learning_rate = a.learning_rate*l.learning_rate_scale;
+    float momentum = a.momentum;
+    float decay = a.decay;
+    int batch = a.batch;
+
     axpy_cpu(l.n, learning_rate/batch, l.bias_updates, 1, l.biases, 1);
     scal_cpu(l.n, momentum, l.bias_updates, 1);
 
@@ -514,9 +550,9 @@ void update_convolutional_layer(convolutional_layer l, int batch, float learning
         scal_cpu(l.n, momentum, l.scale_updates, 1);
     }
 
-    axpy_cpu(size, -decay*batch, l.weights, 1, l.weight_updates, 1);
-    axpy_cpu(size, learning_rate/batch, l.weight_updates, 1, l.weights, 1);
-    scal_cpu(size, momentum, l.weight_updates, 1);
+    axpy_cpu(l.nweights, -decay*batch, l.weights, 1, l.weight_updates, 1);
+    axpy_cpu(l.nweights, learning_rate/batch, l.weight_updates, 1, l.weights, 1);
+    scal_cpu(l.nweights, momentum, l.weight_updates, 1);
 }
 
 
@@ -524,7 +560,7 @@ image get_convolutional_weight(convolutional_layer l, int i)
 {
     int h = l.size;
     int w = l.size;
-    int c = l.c;
+    int c = l.c/l.groups;
     return float_to_image(w,h,c,l.weights+i*h*w*c);
 }
 
@@ -558,8 +594,14 @@ image *get_weights(convolutional_layer l)
     int i;
     for(i = 0; i < l.n; ++i){
         weights[i] = copy_image(get_convolutional_weight(l, i));
-        //normalize_image(weights[i]);
+        normalize_image(weights[i]);
+        /*
+           char buff[256];
+           sprintf(buff, "filter%d", i);
+           save_image(weights[i], buff);
+         */
     }
+    //error("hey");
     return weights;
 }
 
diff --git a/image.darknet/inst/include/darknet/src/convolutional_layer.h b/image.darknet/inst/include/darknet/src/convolutional_layer.h
index 970aa10..6c261f5 100644
--- a/image.darknet/inst/include/darknet/src/convolutional_layer.h
+++ b/image.darknet/inst/include/darknet/src/convolutional_layer.h
@@ -10,31 +10,31 @@
 typedef layer convolutional_layer;
 
 #ifdef GPU
-void forward_convolutional_layer_gpu(convolutional_layer layer, network_state state);
-void backward_convolutional_layer_gpu(convolutional_layer layer, network_state state);
-void update_convolutional_layer_gpu(convolutional_layer layer, int batch, float learning_rate, float momentum, float decay);
+void forward_convolutional_layer_gpu(convolutional_layer layer, network net);
+void backward_convolutional_layer_gpu(convolutional_layer layer, network net);
+void update_convolutional_layer_gpu(convolutional_layer layer, update_args a);
 
 void push_convolutional_layer(convolutional_layer layer);
 void pull_convolutional_layer(convolutional_layer layer);
 
 void add_bias_gpu(float *output, float *biases, int batch, int n, int size);
 void backward_bias_gpu(float *bias_updates, float *delta, int batch, int n, int size);
+void adam_update_gpu(float *w, float *d, float *m, float *v, float B1, float B2, float eps, float decay, float rate, int n, int batch, int t);
 #ifdef CUDNN
 void cudnn_convolutional_setup(layer *l);
 #endif
 #endif
 
-convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam);
-void denormalize_convolutional_layer(convolutional_layer l);
+convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int n, int groups, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam);
 void resize_convolutional_layer(convolutional_layer *layer, int w, int h);
-void forward_convolutional_layer(const convolutional_layer layer, network_state state);
-void update_convolutional_layer(convolutional_layer layer, int batch, float learning_rate, float momentum, float decay);
+void forward_convolutional_layer(const convolutional_layer layer, network net);
+void update_convolutional_layer(convolutional_layer layer, update_args a);
 image *visualize_convolutional_layer(convolutional_layer layer, char *window, image *prev_weights);
 void binarize_weights(float *weights, int n, int size, float *binary);
 void swap_binary(convolutional_layer *l);
 void binarize_weights2(float *weights, int n, int size, char *binary, float *scales);
 
-void backward_convolutional_layer(convolutional_layer layer, network_state state);
+void backward_convolutional_layer(convolutional_layer layer, network net);
 
 void add_bias(float *output, float *biases, int batch, int n, int size);
 void backward_bias(float *bias_updates, float *delta, int batch, int n, int size);
@@ -45,8 +45,6 @@ image get_convolutional_weight(convolutional_layer layer, int i);
 
 int convolutional_out_height(convolutional_layer layer);
 int convolutional_out_width(convolutional_layer layer);
-void rescale_weights(convolutional_layer l, float scale, float trans);
-void rgbgr_weights(convolutional_layer l);
 
 #endif
 
diff --git a/image.darknet/inst/include/darknet/src/cost_layer.c b/image.darknet/inst/include/darknet/src/cost_layer.c
index 39d2398..2138ff2 100644
--- a/image.darknet/inst/include/darknet/src/cost_layer.c
+++ b/image.darknet/inst/include/darknet/src/cost_layer.c
@@ -9,9 +9,12 @@
 
 COST_TYPE get_cost_type(char *s)
 {
+    if (strcmp(s, "seg")==0) return SEG;
     if (strcmp(s, "sse")==0) return SSE;
     if (strcmp(s, "masked")==0) return MASKED;
     if (strcmp(s, "smooth")==0) return SMOOTH;
+    if (strcmp(s, "L1")==0) return L1;
+    if (strcmp(s, "wgan")==0) return WGAN;
     fprintf(stderr, "Couldn't find cost type %s, going with SSE\n", s);
     return SSE;
 }
@@ -19,12 +22,18 @@ COST_TYPE get_cost_type(char *s)
 char *get_cost_string(COST_TYPE a)
 {
     switch(a){
+        case SEG:
+            return "seg";
         case SSE:
             return "sse";
         case MASKED:
             return "masked";
         case SMOOTH:
             return "smooth";
+        case L1:
+            return "L1";
+        case WGAN:
+            return "wgan";
     }
     return "sse";
 }
@@ -70,26 +79,28 @@ void resize_cost_layer(cost_layer *l, int inputs)
 #endif
 }
 
-void forward_cost_layer(cost_layer l, network_state state)
+void forward_cost_layer(cost_layer l, network net)
 {
-    if (!state.truth) return;
+    if (!net.truth) return;
     if(l.cost_type == MASKED){
         int i;
         for(i = 0; i < l.batch*l.inputs; ++i){
-            if(state.truth[i] == SECRET_NUM) state.input[i] = SECRET_NUM;
+            if(net.truth[i] == SECRET_NUM) net.input[i] = SECRET_NUM;
         }
     }
     if(l.cost_type == SMOOTH){
-        smooth_l1_cpu(l.batch*l.inputs, state.input, state.truth, l.delta, l.output);
+        smooth_l1_cpu(l.batch*l.inputs, net.input, net.truth, l.delta, l.output);
+    }else if(l.cost_type == L1){
+        l1_cpu(l.batch*l.inputs, net.input, net.truth, l.delta, l.output);
     } else {
-        l2_cpu(l.batch*l.inputs, state.input, state.truth, l.delta, l.output);
+        l2_cpu(l.batch*l.inputs, net.input, net.truth, l.delta, l.output);
     }
     l.cost[0] = sum_array(l.output, l.batch*l.inputs);
 }
 
-void backward_cost_layer(const cost_layer l, network_state state)
+void backward_cost_layer(const cost_layer l, network net)
 {
-    axpy_cpu(l.batch*l.inputs, l.scale, l.delta, 1, state.delta, 1);
+    axpy_cpu(l.batch*l.inputs, l.scale, l.delta, 1, net.delta, 1);
 }
 
 #ifdef GPU
@@ -113,17 +124,30 @@ int float_abs_compare (const void * a, const void * b)
     return (fa > fb) - (fa < fb);
 }
 
-void forward_cost_layer_gpu(cost_layer l, network_state state)
+void forward_cost_layer_gpu(cost_layer l, network net)
 {
-    if (!state.truth) return;
-    if (l.cost_type == MASKED) {
-        mask_ongpu(l.batch*l.inputs, state.input, SECRET_NUM, state.truth);
+    if (!net.truth) return;
+    if(l.smooth){
+        scal_gpu(l.batch*l.inputs, (1-l.smooth), net.truth_gpu, 1);
+        add_gpu(l.batch*l.inputs, l.smooth * 1./l.inputs, net.truth_gpu, 1);
     }
 
     if(l.cost_type == SMOOTH){
-        smooth_l1_gpu(l.batch*l.inputs, state.input, state.truth, l.delta_gpu, l.output_gpu);
+        smooth_l1_gpu(l.batch*l.inputs, net.input_gpu, net.truth_gpu, l.delta_gpu, l.output_gpu);
+    } else if (l.cost_type == L1){
+        l1_gpu(l.batch*l.inputs, net.input_gpu, net.truth_gpu, l.delta_gpu, l.output_gpu);
+    } else if (l.cost_type == WGAN){
+        wgan_gpu(l.batch*l.inputs, net.input_gpu, net.truth_gpu, l.delta_gpu, l.output_gpu);
     } else {
-        l2_gpu(l.batch*l.inputs, state.input, state.truth, l.delta_gpu, l.output_gpu);
+        l2_gpu(l.batch*l.inputs, net.input_gpu, net.truth_gpu, l.delta_gpu, l.output_gpu);
+    }
+
+    if (l.cost_type == SEG && l.noobject_scale != 1) {
+        scale_mask_gpu(l.batch*l.inputs, l.delta_gpu, 0, net.truth_gpu, l.noobject_scale);
+        scale_mask_gpu(l.batch*l.inputs, l.output_gpu, 0, net.truth_gpu, l.noobject_scale);
+    }
+    if (l.cost_type == MASKED) {
+        mask_gpu(l.batch*l.inputs, net.delta_gpu, SECRET_NUM, net.truth_gpu, 0);
     }
 
     if(l.ratio){
@@ -133,16 +157,20 @@ void forward_cost_layer_gpu(cost_layer l, network_state state)
         float thresh = l.delta[n];
         thresh = 0;
         printf("%f\n", thresh);
-        supp_ongpu(l.batch*l.inputs, thresh, l.delta_gpu, 1);
+        supp_gpu(l.batch*l.inputs, thresh, l.delta_gpu, 1);
+    }
+
+    if(l.thresh){
+        supp_gpu(l.batch*l.inputs, l.thresh*1./l.inputs, l.delta_gpu, 1);
     }
 
     cuda_pull_array(l.output_gpu, l.output, l.batch*l.inputs);
     l.cost[0] = sum_array(l.output, l.batch*l.inputs);
 }
 
-void backward_cost_layer_gpu(const cost_layer l, network_state state)
+void backward_cost_layer_gpu(const cost_layer l, network net)
 {
-    axpy_ongpu(l.batch*l.inputs, l.scale, l.delta_gpu, 1, state.delta, 1);
+    axpy_gpu(l.batch*l.inputs, l.scale, l.delta_gpu, 1, net.delta_gpu, 1);
 }
 #endif
 
diff --git a/image.darknet/inst/include/darknet/src/cost_layer.h b/image.darknet/inst/include/darknet/src/cost_layer.h
index a692831..ceb64de 100644
--- a/image.darknet/inst/include/darknet/src/cost_layer.h
+++ b/image.darknet/inst/include/darknet/src/cost_layer.h
@@ -8,13 +8,13 @@ typedef layer cost_layer;
 COST_TYPE get_cost_type(char *s);
 char *get_cost_string(COST_TYPE a);
 cost_layer make_cost_layer(int batch, int inputs, COST_TYPE type, float scale);
-void forward_cost_layer(const cost_layer l, network_state state);
-void backward_cost_layer(const cost_layer l, network_state state);
+void forward_cost_layer(const cost_layer l, network net);
+void backward_cost_layer(const cost_layer l, network net);
 void resize_cost_layer(cost_layer *l, int inputs);
 
 #ifdef GPU
-void forward_cost_layer_gpu(cost_layer l, network_state state);
-void backward_cost_layer_gpu(const cost_layer l, network_state state);
+void forward_cost_layer_gpu(cost_layer l, network net);
+void backward_cost_layer_gpu(const cost_layer l, network net);
 #endif
 
 #endif
diff --git a/image.darknet/inst/include/darknet/src/crnn_layer.c b/image.darknet/inst/include/darknet/src/crnn_layer.c
index 5495880..7dd29f6 100644
--- a/image.darknet/inst/include/darknet/src/crnn_layer.c
+++ b/image.darknet/inst/include/darknet/src/crnn_layer.c
@@ -48,17 +48,17 @@ layer make_crnn_layer(int batch, int h, int w, int c, int hidden_filters, int ou
 
     l.input_layer = malloc(sizeof(layer));
     fprintf(stderr, "\t\t");
-    *(l.input_layer) = make_convolutional_layer(batch*steps, h, w, c, hidden_filters, 3, 1, 1,  activation, batch_normalize, 0, 0, 0);
+    *(l.input_layer) = make_convolutional_layer(batch*steps, h, w, c, hidden_filters, 1, 3, 1, 1,  activation, batch_normalize, 0, 0, 0);
     l.input_layer->batch = batch;
 
     l.self_layer = malloc(sizeof(layer));
     fprintf(stderr, "\t\t");
-    *(l.self_layer) = make_convolutional_layer(batch*steps, h, w, hidden_filters, hidden_filters, 3, 1, 1,  activation, batch_normalize, 0, 0, 0);
+    *(l.self_layer) = make_convolutional_layer(batch*steps, h, w, hidden_filters, hidden_filters, 1, 3, 1, 1,  activation, batch_normalize, 0, 0, 0);
     l.self_layer->batch = batch;
 
     l.output_layer = malloc(sizeof(layer));
     fprintf(stderr, "\t\t");
-    *(l.output_layer) = make_convolutional_layer(batch*steps, h, w, hidden_filters, output_filters, 3, 1, 1,  activation, batch_normalize, 0, 0, 0);
+    *(l.output_layer) = make_convolutional_layer(batch*steps, h, w, hidden_filters, output_filters, 1, 3, 1, 1,  activation, batch_normalize, 0, 0, 0);
     l.output_layer->batch = batch;
 
     l.output = l.output_layer->output;
@@ -81,17 +81,17 @@ layer make_crnn_layer(int batch, int h, int w, int c, int hidden_filters, int ou
     return l;
 }
 
-void update_crnn_layer(layer l, int batch, float learning_rate, float momentum, float decay)
+void update_crnn_layer(layer l, update_args a)
 {
-    update_convolutional_layer(*(l.input_layer), batch, learning_rate, momentum, decay);
-    update_convolutional_layer(*(l.self_layer), batch, learning_rate, momentum, decay);
-    update_convolutional_layer(*(l.output_layer), batch, learning_rate, momentum, decay);
+    update_convolutional_layer(*(l.input_layer),  a);
+    update_convolutional_layer(*(l.self_layer),   a);
+    update_convolutional_layer(*(l.output_layer), a);
 }
 
-void forward_crnn_layer(layer l, network_state state)
+void forward_crnn_layer(layer l, network net)
 {
-    network_state s = {0};
-    s.train = state.train;
+    network s = net;
+    s.train = net.train;
     int i;
     layer input_layer = *(l.input_layer);
     layer self_layer = *(l.self_layer);
@@ -100,17 +100,17 @@ void forward_crnn_layer(layer l, network_state state)
     fill_cpu(l.outputs * l.batch * l.steps, 0, output_layer.delta, 1);
     fill_cpu(l.hidden * l.batch * l.steps, 0, self_layer.delta, 1);
     fill_cpu(l.hidden * l.batch * l.steps, 0, input_layer.delta, 1);
-    if(state.train) fill_cpu(l.hidden * l.batch, 0, l.state, 1);
+    if(net.train) fill_cpu(l.hidden * l.batch, 0, l.state, 1);
 
     for (i = 0; i < l.steps; ++i) {
-        s.input = state.input;
+        s.input = net.input;
         forward_convolutional_layer(input_layer, s);
 
         s.input = l.state;
         forward_convolutional_layer(self_layer, s);
 
         float *old_state = l.state;
-        if(state.train) l.state += l.hidden*l.batch;
+        if(net.train) l.state += l.hidden*l.batch;
         if(l.shortcut){
             copy_cpu(l.hidden * l.batch, old_state, 1, l.state, 1);
         }else{
@@ -122,17 +122,16 @@ void forward_crnn_layer(layer l, network_state state)
         s.input = l.state;
         forward_convolutional_layer(output_layer, s);
 
-        state.input += l.inputs*l.batch;
+        net.input += l.inputs*l.batch;
         increment_layer(&input_layer, 1);
         increment_layer(&self_layer, 1);
         increment_layer(&output_layer, 1);
     }
 }
 
-void backward_crnn_layer(layer l, network_state state)
+void backward_crnn_layer(layer l, network net)
 {
-    network_state s = {0};
-    s.train = state.train;
+    network s = net;
     int i;
     layer input_layer = *(l.input_layer);
     layer self_layer = *(l.self_layer);
@@ -168,8 +167,8 @@ void backward_crnn_layer(layer l, network_state state)
 
         copy_cpu(l.hidden*l.batch, self_layer.delta, 1, input_layer.delta, 1);
         if (i > 0 && l.shortcut) axpy_cpu(l.hidden*l.batch, 1, self_layer.delta, 1, self_layer.delta - l.hidden*l.batch, 1);
-        s.input = state.input + i*l.inputs*l.batch;
-        if(state.delta) s.delta = state.delta + i*l.inputs*l.batch;
+        s.input = net.input + i*l.inputs*l.batch;
+        if(net.delta) s.delta = net.delta + i*l.inputs*l.batch;
         else s.delta = 0;
         backward_convolutional_layer(input_layer, s);
 
@@ -195,58 +194,57 @@ void push_crnn_layer(layer l)
     push_convolutional_layer(*(l.output_layer));
 }
 
-void update_crnn_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay)
+void update_crnn_layer_gpu(layer l, update_args a)
 {
-    update_convolutional_layer_gpu(*(l.input_layer), batch, learning_rate, momentum, decay);
-    update_convolutional_layer_gpu(*(l.self_layer), batch, learning_rate, momentum, decay);
-    update_convolutional_layer_gpu(*(l.output_layer), batch, learning_rate, momentum, decay);
+    update_convolutional_layer_gpu(*(l.input_layer),  a);
+    update_convolutional_layer_gpu(*(l.self_layer),   a);
+    update_convolutional_layer_gpu(*(l.output_layer), a);
 }
 
-void forward_crnn_layer_gpu(layer l, network_state state)
+void forward_crnn_layer_gpu(layer l, network net)
 {
-    network_state s = {0};
-    s.train = state.train;
+    network s = net;
     int i;
     layer input_layer = *(l.input_layer);
     layer self_layer = *(l.self_layer);
     layer output_layer = *(l.output_layer);
 
-    fill_ongpu(l.outputs * l.batch * l.steps, 0, output_layer.delta_gpu, 1);
-    fill_ongpu(l.hidden * l.batch * l.steps, 0, self_layer.delta_gpu, 1);
-    fill_ongpu(l.hidden * l.batch * l.steps, 0, input_layer.delta_gpu, 1);
-    if(state.train) fill_ongpu(l.hidden * l.batch, 0, l.state_gpu, 1);
+    fill_gpu(l.outputs * l.batch * l.steps, 0, output_layer.delta_gpu, 1);
+    fill_gpu(l.hidden * l.batch * l.steps, 0, self_layer.delta_gpu, 1);
+    fill_gpu(l.hidden * l.batch * l.steps, 0, input_layer.delta_gpu, 1);
+    if(net.train) fill_gpu(l.hidden * l.batch, 0, l.state_gpu, 1);
 
     for (i = 0; i < l.steps; ++i) {
-        s.input = state.input;
+        s.input_gpu = net.input_gpu;
         forward_convolutional_layer_gpu(input_layer, s);
 
-        s.input = l.state_gpu;
+        s.input_gpu = l.state_gpu;
         forward_convolutional_layer_gpu(self_layer, s);
 
         float *old_state = l.state_gpu;
-        if(state.train) l.state_gpu += l.hidden*l.batch;
+        if(net.train) l.state_gpu += l.hidden*l.batch;
         if(l.shortcut){
-            copy_ongpu(l.hidden * l.batch, old_state, 1, l.state_gpu, 1);
+            copy_gpu(l.hidden * l.batch, old_state, 1, l.state_gpu, 1);
         }else{
-            fill_ongpu(l.hidden * l.batch, 0, l.state_gpu, 1);
+            fill_gpu(l.hidden * l.batch, 0, l.state_gpu, 1);
         }
-        axpy_ongpu(l.hidden * l.batch, 1, input_layer.output_gpu, 1, l.state_gpu, 1);
-        axpy_ongpu(l.hidden * l.batch, 1, self_layer.output_gpu, 1, l.state_gpu, 1);
+        axpy_gpu(l.hidden * l.batch, 1, input_layer.output_gpu, 1, l.state_gpu, 1);
+        axpy_gpu(l.hidden * l.batch, 1, self_layer.output_gpu, 1, l.state_gpu, 1);
 
-        s.input = l.state_gpu;
+        s.input_gpu = l.state_gpu;
         forward_convolutional_layer_gpu(output_layer, s);
 
-        state.input += l.inputs*l.batch;
+        net.input_gpu += l.inputs*l.batch;
         increment_layer(&input_layer, 1);
         increment_layer(&self_layer, 1);
         increment_layer(&output_layer, 1);
     }
 }
 
-void backward_crnn_layer_gpu(layer l, network_state state)
+void backward_crnn_layer_gpu(layer l, network net)
 {
-    network_state s = {0};
-    s.train = state.train;
+    network s = net;
+    s.train = net.train;
     int i;
     layer input_layer = *(l.input_layer);
     layer self_layer = *(l.self_layer);
@@ -256,25 +254,25 @@ void backward_crnn_layer_gpu(layer l, network_state state)
     increment_layer(&output_layer, l.steps - 1);
     l.state_gpu += l.hidden*l.batch*l.steps;
     for (i = l.steps-1; i >= 0; --i) {
-        copy_ongpu(l.hidden * l.batch, input_layer.output_gpu, 1, l.state_gpu, 1);
-        axpy_ongpu(l.hidden * l.batch, 1, self_layer.output_gpu, 1, l.state_gpu, 1);
+        copy_gpu(l.hidden * l.batch, input_layer.output_gpu, 1, l.state_gpu, 1);
+        axpy_gpu(l.hidden * l.batch, 1, self_layer.output_gpu, 1, l.state_gpu, 1);
 
-        s.input = l.state_gpu;
-        s.delta = self_layer.delta_gpu;
+        s.input_gpu = l.state_gpu;
+        s.delta_gpu = self_layer.delta_gpu;
         backward_convolutional_layer_gpu(output_layer, s);
 
         l.state_gpu -= l.hidden*l.batch;
 
-        s.input = l.state_gpu;
-        s.delta = self_layer.delta_gpu - l.hidden*l.batch;
-        if (i == 0) s.delta = 0;
+        s.input_gpu = l.state_gpu;
+        s.delta_gpu = self_layer.delta_gpu - l.hidden*l.batch;
+        if (i == 0) s.delta_gpu = 0;
         backward_convolutional_layer_gpu(self_layer, s);
 
-        copy_ongpu(l.hidden*l.batch, self_layer.delta_gpu, 1, input_layer.delta_gpu, 1);
-        if (i > 0 && l.shortcut) axpy_ongpu(l.hidden*l.batch, 1, self_layer.delta_gpu, 1, self_layer.delta_gpu - l.hidden*l.batch, 1);
-        s.input = state.input + i*l.inputs*l.batch;
-        if(state.delta) s.delta = state.delta + i*l.inputs*l.batch;
-        else s.delta = 0;
+        copy_gpu(l.hidden*l.batch, self_layer.delta_gpu, 1, input_layer.delta_gpu, 1);
+        if (i > 0 && l.shortcut) axpy_gpu(l.hidden*l.batch, 1, self_layer.delta_gpu, 1, self_layer.delta_gpu - l.hidden*l.batch, 1);
+        s.input_gpu = net.input_gpu + i*l.inputs*l.batch;
+        if(net.delta_gpu) s.delta_gpu = net.delta_gpu + i*l.inputs*l.batch;
+        else s.delta_gpu = 0;
         backward_convolutional_layer_gpu(input_layer, s);
 
         increment_layer(&input_layer,  -1);
diff --git a/image.darknet/inst/include/darknet/src/crnn_layer.h b/image.darknet/inst/include/darknet/src/crnn_layer.h
index 0da942e..515f378 100644
--- a/image.darknet/inst/include/darknet/src/crnn_layer.h
+++ b/image.darknet/inst/include/darknet/src/crnn_layer.h
@@ -8,14 +8,14 @@
 
 layer make_crnn_layer(int batch, int h, int w, int c, int hidden_filters, int output_filters, int steps, ACTIVATION activation, int batch_normalize);
 
-void forward_crnn_layer(layer l, network_state state);
-void backward_crnn_layer(layer l, network_state state);
-void update_crnn_layer(layer l, int batch, float learning_rate, float momentum, float decay);
+void forward_crnn_layer(layer l, network net);
+void backward_crnn_layer(layer l, network net);
+void update_crnn_layer(layer l, update_args a);
 
 #ifdef GPU
-void forward_crnn_layer_gpu(layer l, network_state state);
-void backward_crnn_layer_gpu(layer l, network_state state);
-void update_crnn_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay);
+void forward_crnn_layer_gpu(layer l, network net);
+void backward_crnn_layer_gpu(layer l, network net);
+void update_crnn_layer_gpu(layer l, update_args a);
 void push_crnn_layer(layer l);
 void pull_crnn_layer(layer l);
 #endif
diff --git a/image.darknet/inst/include/darknet/src/crop_layer.c b/image.darknet/inst/include/darknet/src/crop_layer.c
index 11c59b4..3b91852 100644
--- a/image.darknet/inst/include/darknet/src/crop_layer.c
+++ b/image.darknet/inst/include/darknet/src/crop_layer.c
@@ -10,8 +10,8 @@ image get_crop_image(crop_layer l)
     return float_to_image(w,h,c,l.output);
 }
 
-void backward_crop_layer(const crop_layer l, network_state state){}
-void backward_crop_layer_gpu(const crop_layer l, network_state state){}
+void backward_crop_layer(const crop_layer l, network net){}
+void backward_crop_layer_gpu(const crop_layer l, network net){}
 
 crop_layer make_crop_layer(int batch, int h, int w, int c, int crop_height, int crop_width, int flip, float angle, float saturation, float exposure)
 {
@@ -64,7 +64,7 @@ void resize_crop_layer(layer *l, int w, int h)
 }
 
 
-void forward_crop_layer(const crop_layer l, network_state state)
+void forward_crop_layer(const crop_layer l, network net)
 {
     int i,j,c,b,row,col;
     int index;
@@ -78,7 +78,7 @@ void forward_crop_layer(const crop_layer l, network_state state)
         scale = 1;
         trans = 0;
     }
-    if(!state.train){
+    if(!net.train){
         flip = 0;
         dh = (l.h - l.out_h)/2;
         dw = (l.w - l.out_w)/2;
@@ -94,7 +94,7 @@ void forward_crop_layer(const crop_layer l, network_state state)
                     }
                     row = i + dh;
                     index = col+l.w*(row+l.h*(c + l.c*b)); 
-                    l.output[count++] = state.input[index]*scale + trans;
+                    l.output[count++] = net.input[index]*scale + trans;
                 }
             }
         }
diff --git a/image.darknet/inst/include/darknet/src/crop_layer.h b/image.darknet/inst/include/darknet/src/crop_layer.h
index 3aa2d3d..3b5883c 100644
--- a/image.darknet/inst/include/darknet/src/crop_layer.h
+++ b/image.darknet/inst/include/darknet/src/crop_layer.h
@@ -9,11 +9,11 @@ typedef layer crop_layer;
 
 image get_crop_image(crop_layer l);
 crop_layer make_crop_layer(int batch, int h, int w, int c, int crop_height, int crop_width, int flip, float angle, float saturation, float exposure);
-void forward_crop_layer(const crop_layer l, network_state state);
+void forward_crop_layer(const crop_layer l, network net);
 void resize_crop_layer(layer *l, int w, int h);
 
 #ifdef GPU
-void forward_crop_layer_gpu(crop_layer l, network_state state);
+void forward_crop_layer_gpu(crop_layer l, network net);
 #endif
 
 #endif
diff --git a/image.darknet/inst/include/darknet/src/crop_layer_kernels.cu b/image.darknet/inst/include/darknet/src/crop_layer_kernels.cu
index 8a08630..b5b9f55 100644
--- a/image.darknet/inst/include/darknet/src/crop_layer_kernels.cu
+++ b/image.darknet/inst/include/darknet/src/crop_layer_kernels.cu
@@ -113,9 +113,9 @@ __global__ void levels_image_kernel(float *image, float *rand, int batch, int w,
     float r3 = rand[8*id + 3];
 
     saturation = r0*(saturation - 1) + 1;
-    saturation = (r1 > .5) ? 1./saturation : saturation;
+    saturation = (r1 > .5f) ? 1.f/saturation : saturation;
     exposure = r2*(exposure - 1) + 1;
-    exposure = (r3 > .5) ? 1./exposure : exposure;
+    exposure = (r3 > .5f) ? 1.f/exposure : exposure;
 
     size_t offset = id * h * w * 3;
     image += offset;
@@ -131,9 +131,9 @@ __global__ void levels_image_kernel(float *image, float *rand, int batch, int w,
     } else {
         shift = 0;
     }
-    image[x + w*(y + h*0)] = rgb.x*scale + translate + (rshift - .5)*shift;
-    image[x + w*(y + h*1)] = rgb.y*scale + translate + (gshift - .5)*shift;
-    image[x + w*(y + h*2)] = rgb.z*scale + translate + (bshift - .5)*shift;
+    image[x + w*(y + h*0)] = rgb.x*scale + translate + (rshift - .5f)*shift;
+    image[x + w*(y + h*1)] = rgb.y*scale + translate + (gshift - .5f)*shift;
+    image[x + w*(y + h*2)] = rgb.z*scale + translate + (bshift - .5f)*shift;
 }
 
 __global__ void forward_crop_layer_kernel(float *input, float *rand, int size, int c, int h, int w, int crop_height, int crop_width, int train, int flip, float angle, float *output)
@@ -141,8 +141,8 @@ __global__ void forward_crop_layer_kernel(float *input, float *rand, int size, i
     int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
     if(id >= size) return;
 
-    float cx = w/2.;
-    float cy = h/2.;
+    float cx = w/2.f;
+    float cy = h/2.f;
 
     int count = id;
     int j = id % crop_width;
@@ -160,11 +160,11 @@ __global__ void forward_crop_layer_kernel(float *input, float *rand, int size, i
 
     float dw = (w - crop_width)*r4;
     float dh = (h - crop_height)*r5;
-    flip = (flip && (r6 > .5));
+    flip = (flip && (r6 > .5f));
     angle = 2*angle*r7 - angle;
     if(!train){
-        dw = (w - crop_width)/2.;
-        dh = (h - crop_height)/2.;
+        dw = (w - crop_width)/2.f;
+        dh = (h - crop_height)/2.f;
         flip = 0;
         angle = 0;
     }
@@ -174,17 +174,17 @@ __global__ void forward_crop_layer_kernel(float *input, float *rand, int size, i
     float x = (flip) ? w - dw - j - 1 : j + dw;    
     float y = i + dh;
 
-    float rx = cos(angle)*(x-cx) - sin(angle)*(y-cy) + cx;
-    float ry = sin(angle)*(x-cx) + cos(angle)*(y-cy) + cy;
+    float rx = cosf(angle)*(x-cx) - sinf(angle)*(y-cy) + cx;
+    float ry = sinf(angle)*(x-cx) + cosf(angle)*(y-cy) + cy;
 
     output[count] = bilinear_interpolate_kernel(input, w, h, rx, ry, k);
 }
 
-extern "C" void forward_crop_layer_gpu(crop_layer layer, network_state state)
+extern "C" void forward_crop_layer_gpu(crop_layer layer, network net)
 {
     cuda_random(layer.rand_gpu, layer.batch*8);
 
-    float radians = layer.angle*3.14159265/180.;
+    float radians = layer.angle*3.14159265f/180.f;
 
     float scale = 2;
     float translate = -1;
@@ -195,12 +195,12 @@ extern "C" void forward_crop_layer_gpu(crop_layer layer, network_state state)
 
     int size = layer.batch * layer.w * layer.h;
 
-    levels_image_kernel<<<cuda_gridsize(size), BLOCK>>>(state.input, layer.rand_gpu, layer.batch, layer.w, layer.h, state.train, layer.saturation, layer.exposure, translate, scale, layer.shift);
+    levels_image_kernel<<<cuda_gridsize(size), BLOCK>>>(net.input_gpu, layer.rand_gpu, layer.batch, layer.w, layer.h, net.train, layer.saturation, layer.exposure, translate, scale, layer.shift);
     check_error(cudaPeekAtLastError());
 
     size = layer.batch*layer.c*layer.out_w*layer.out_h;
 
-    forward_crop_layer_kernel<<<cuda_gridsize(size), BLOCK>>>(state.input, layer.rand_gpu, size, layer.c, layer.h, layer.w, layer.out_h, layer.out_w, state.train, layer.flip, radians, layer.output_gpu);
+    forward_crop_layer_kernel<<<cuda_gridsize(size), BLOCK>>>(net.input_gpu, layer.rand_gpu, size, layer.c, layer.h, layer.w, layer.out_h, layer.out_w, net.train, layer.flip, radians, layer.output_gpu);
     check_error(cudaPeekAtLastError());
 
 /*
diff --git a/image.darknet/inst/include/darknet/src/cuda.c b/image.darknet/inst/include/darknet/src/cuda.c
index 1b51271..48aba6e 100644
--- a/image.darknet/inst/include/darknet/src/cuda.c
+++ b/image.darknet/inst/include/darknet/src/cuda.c
@@ -5,7 +5,7 @@ int gpu_index = 0;
 #include "cuda.h"
 #include "utils.h"
 #include "blas.h"
-#include "assert.h"
+#include <assert.h>
 #include <stdlib.h>
 #include <time.h>
 
@@ -96,6 +96,8 @@ float *cuda_make_array(float *x, size_t n)
     if(x){
         status = cudaMemcpy(x_gpu, x, size, cudaMemcpyHostToDevice);
         check_error(status);
+    } else {
+        fill_gpu(n, 0, x_gpu, 1);
     }
     if(!x_gpu) error("Cuda malloc failed\n");
     return x_gpu;
@@ -128,12 +130,17 @@ float cuda_compare(float *x_gpu, float *x, size_t n, char *s)
     return err;
 }
 
-int *cuda_make_int_array(size_t n)
+int *cuda_make_int_array(int *x, size_t n)
 {
     int *x_gpu;
     size_t size = sizeof(int)*n;
     cudaError_t status = cudaMalloc((void **)&x_gpu, size);
     check_error(status);
+    if(x){
+        status = cudaMemcpy(x_gpu, x, size, cudaMemcpyHostToDevice);
+        check_error(status);
+    }
+    if(!x_gpu) error("Cuda malloc failed\n");
     return x_gpu;
 }
 
@@ -157,4 +164,15 @@ void cuda_pull_array(float *x_gpu, float *x, size_t n)
     check_error(status);
 }
 
+float cuda_mag_array(float *x_gpu, size_t n) /* returns mag_array() of the n floats pulled from x_gpu */
+{
+    float *temp = calloc(n, sizeof(float)); /* scratch host buffer; NOTE(review): allocation result is not checked */
+    cuda_pull_array(x_gpu, temp, n);
+    float m = mag_array(temp, n);
+    free(temp);
+    return m;
+}
+#else
+void cuda_set_device(int n){}
+
 #endif
diff --git a/image.darknet/inst/include/darknet/src/cuda.h b/image.darknet/inst/include/darknet/src/cuda.h
index 29b1eef..a1bc216 100644
--- a/image.darknet/inst/include/darknet/src/cuda.h
+++ b/image.darknet/inst/include/darknet/src/cuda.h
@@ -1,28 +1,13 @@
 #ifndef CUDA_H
 #define CUDA_H
 
-extern int gpu_index;
+#include "darknet.h"
 
 #ifdef GPU
 
-#define BLOCK 512
-
-#include "cuda_runtime.h"
-#include "curand.h"
-#include "cublas_v2.h"
-
-#ifdef CUDNN
-#include "cudnn.h"
-#endif
-
 void check_error(cudaError_t status);
 cublasHandle_t blas_handle();
-float *cuda_make_array(float *x, size_t n);
-int *cuda_make_int_array(size_t n);
-void cuda_push_array(float *x_gpu, float *x, size_t n);
-void cuda_pull_array(float *x_gpu, float *x, size_t n);
-void cuda_set_device(int n);
-void cuda_free(float *x_gpu);
+int *cuda_make_int_array(int *x, size_t n);
 void cuda_random(float *x_gpu, size_t n);
 float cuda_compare(float *x_gpu, float *x, size_t n, char *s);
 dim3 cuda_gridsize(size_t n);
diff --git a/image.darknet/inst/include/darknet/src/data.c b/image.darknet/inst/include/darknet/src/data.c
index 05e5a91..59051b4 100644
--- a/image.darknet/inst/include/darknet/src/data.c
+++ b/image.darknet/inst/include/darknet/src/data.c
@@ -102,7 +102,7 @@ matrix load_image_paths(char **paths, int n, int w, int h)
     return X;
 }
 
-matrix load_image_augment_paths(char **paths, int n, int min, int max, int size, float angle, float aspect, float hue, float saturation, float exposure)
+matrix load_image_augment_paths(char **paths, int n, int min, int max, int size, float angle, float aspect, float hue, float saturation, float exposure, int center)
 {
     int i;
     matrix X;
@@ -112,7 +112,12 @@ matrix load_image_augment_paths(char **paths, int n, int min, int max, int size,
 
     for(i = 0; i < n; ++i){
         image im = load_image_color(paths[i], 0, 0);
-        image crop = random_augment_image(im, angle, aspect, min, max, size);
+        image crop;
+        if(center){
+            crop = center_crop_image(im, size, size);
+        } else {
+            crop = random_augment_image(im, angle, aspect, min, max, size, size);
+        }
         int flip = rand()%2;
         if (flip) flip_image(crop);
         random_distort_image(crop, hue, saturation, exposure);
@@ -122,6 +127,7 @@ matrix load_image_augment_paths(char **paths, int n, int min, int max, int size,
         show_image(crop, "crop");
         cvWaitKey(0);
         */
+        //grayscale_image_3c(crop);
         free_image(im);
         X.vals[i] = crop.data;
         X.cols = crop.h*crop.w*crop.c;
@@ -132,14 +138,18 @@ matrix load_image_augment_paths(char **paths, int n, int min, int max, int size,
 
 box_label *read_boxes(char *filename, int *n)
 {
-    box_label *boxes = calloc(1, sizeof(box_label));
     FILE *file = fopen(filename, "r");
     if(!file) file_error(filename);
     float x, y, h, w;
     int id;
     int count = 0;
+    int size = 64;
+    box_label *boxes = calloc(size, sizeof(box_label));
     while(fscanf(file, "%d %f %f %f %f", &id, &x, &y, &w, &h) == 5){
-        boxes = realloc(boxes, (count+1)*sizeof(box_label));
+        if(count == size) {
+            size = size * 2;
+            boxes = realloc(boxes, size*sizeof(box_label));
+        }
         boxes[count].id = id;
         boxes[count].x = x;
         boxes[count].y = y;
@@ -221,7 +231,7 @@ void fill_truth_swag(char *path, float *truth, int classes, int flip, float dx,
     int id;
     int i;
 
-    for (i = 0; i < count && i < 30; ++i) {
+    for (i = 0; i < count && i < 90; ++i) {
         x =  boxes[i].x;
         y =  boxes[i].y;
         w =  boxes[i].w;
@@ -290,6 +300,150 @@ void fill_truth_region(char *path, float *truth, int classes, int num_boxes, int
     free(boxes);
 }
 
+void load_rle(image im, int *rle, int n) /* expand n run lengths into im.data as alternating 0/1 runs, starting with 0 */
+{
+    int count = 0; /* next write position in im.data */
+    int curr = 0;  /* value of the current run; flips after every run */
+    int i,j;
+    for(i = 0; i < n; ++i){
+        for(j = 0; j < rle[i] && count < im.h*im.w*im.c; ++j){ /* clamp: runs longer than the image must not write past im.data */
+            im.data[count++] = curr;
+        }
+        curr = 1 - curr;
+    }
+    for(; count < im.h*im.w*im.c; ++count){ /* pad the remainder with the value following the last run */
+        im.data[count] = curr;
+    }
+}
+
+void or_image(image src, image dest, int c) /* set dest channel c to 1 wherever the first src.w*src.h values of src are nonzero */
+{
+    int i;
+    for(i = 0; i < src.w*src.h; ++i){
+        if(src.data[i]) dest.data[dest.w*dest.h*c + i] = 1; /* NOTE(review): c is not range-checked here; presumably callers guarantee 0 <= c < dest.c and matching w/h */
+    }
+}
+
+void exclusive_image(image src) /* per pixel, keep only the lowest-index nonzero channel; zero every higher channel */
+{
+    int k, j, i;
+    int s = src.w*src.h; /* number of values in one channel plane */
+    for(k = 0; k < src.c-1; ++k){
+        for(i = 0; i < s; ++i){
+            if (src.data[k*s + i]){
+                for(j = k+1; j < src.c; ++j){ /* channel k claims pixel i: clear it in all later channels */
+                    src.data[j*s + i] = 0;
+                }
+            }
+        }
+    }
+}
+
+box bound_image(image im) /* bounding box {min x, min y, width, height} of the nonzero values in the first w*h plane */
+{
+    int x,y;
+    int minx = im.w; /* mins start past the right/bottom edge, maxes at 0, so any nonzero pixel tightens them */
+    int miny = im.h;
+    int maxx = 0;
+    int maxy = 0;
+    for(y = 0; y < im.h; ++y){
+        for(x = 0; x < im.w; ++x){
+            if(im.data[y*im.w + x]){ /* indexes only the first plane of im.data */
+                minx = (x < minx) ? x : minx;
+                miny = (y < miny) ? y : miny;
+                maxx = (x > maxx) ? x : maxx;
+                maxy = (y > maxy) ? y : maxy;
+            }
+        }
+    }
+    box b = {minx, miny, maxx-minx + 1, maxy-miny + 1}; /* if no value is nonzero this yields width 1-im.w and height 1-im.h (both <= 0) */
+    //printf("%f %f %f %f\n", b.x, b.y, b.w, b.h);
+    return b;
+}
+
+void fill_truth_iseg(char *path, int num_boxes, float *truth, int classes, int w, int h, augment_args aug, int flip, int mw, int mh) /* fill truth with up to num_boxes records of [id, mw*mh mask values] read from the image's mask file */
+{
+    char labelpath[4096];
+    find_replace(path, "images", "mask", labelpath); /* derive the mask .txt path from the image path */
+    find_replace(labelpath, "JPEGImages", "mask", labelpath);
+    find_replace(labelpath, ".jpg", ".txt", labelpath);
+    find_replace(labelpath, ".JPG", ".txt", labelpath);
+    find_replace(labelpath, ".JPEG", ".txt", labelpath);
+    FILE *file = fopen(labelpath, "r");
+    if(!file) file_error(labelpath);
+    char buff[32788];
+    int id;
+    int i = 0; /* number of records written so far */
+    int j;
+    image part = make_image(w, h, 1); /* scratch single-channel mask, reused for every record */
+    while(i < num_boxes && fscanf(file, "%d %32787s", &id, buff) == 2){ /* field width keeps the token inside buff (32788 bytes incl. NUL) */
+        int n = 0;
+        int *rle = read_intlist(buff, &n, 0);
+        load_rle(part, rle, n);
+        image sized = rotate_crop_image(part, aug.rad, aug.scale, aug.w, aug.h, aug.dx, aug.dy, aug.aspect); /* apply the augmentation described by aug to the mask */
+        if(flip) flip_image(sized);
+
+        image mask = resize_image(sized, mw, mh);
+        truth[i*(mw*mh+1)] = id; /* record layout: id first, then mw*mh mask values */
+        for(j = 0; j < mw*mh; ++j){
+            truth[i*(mw*mh + 1) + 1 + j] = mask.data[j];
+        }
+        ++i;
+
+        free_image(mask);
+        free_image(sized);
+        free(rle);
+    }
+    if(i < num_boxes) truth[i*(mw*mh+1)] = -1; /* id of -1 marks the end of the used records */
+    fclose(file);
+    free_image(part);
+}
+
+void fill_truth_mask(char *path, int num_boxes, float *truth, int classes, int w, int h, augment_args aug, int flip, int mw, int mh) /* fill truth with up to num_boxes records of [cx, cy, w, h, mw*mh mask values, id] */
+{
+    char labelpath[4096];
+    find_replace(path, "images", "mask", labelpath); /* derive the mask .txt path from the image path */
+    find_replace(labelpath, "JPEGImages", "mask", labelpath);
+    find_replace(labelpath, ".jpg", ".txt", labelpath);
+    find_replace(labelpath, ".JPG", ".txt", labelpath);
+    find_replace(labelpath, ".JPEG", ".txt", labelpath);
+    FILE *file = fopen(labelpath, "r");
+    if(!file) file_error(labelpath);
+    char buff[32788];
+    int id;
+    int i = 0; /* number of records written so far */
+    image part = make_image(w, h, 1); /* scratch single-channel mask, reused for every record */
+    while(i < num_boxes && fscanf(file, "%d %32787s", &id, buff) == 2){ /* field width keeps the token inside buff (32788 bytes incl. NUL) */
+        int n = 0;
+        int *rle = read_intlist(buff, &n, 0);
+        load_rle(part, rle, n);
+        image sized = rotate_crop_image(part, aug.rad, aug.scale, aug.w, aug.h, aug.dx, aug.dy, aug.aspect);
+        if(flip) flip_image(sized);
+        box b = bound_image(sized);
+        if(b.w > 0){ /* bound_image yields a non-positive width when the mask is empty: skip such records */
+            image crop = crop_image(sized, b.x, b.y, b.w, b.h);
+            image mask = resize_image(crop, mw, mh);
+            truth[i*(4 + mw*mh + 1) + 0] = (b.x + b.w/2.)/sized.w; /* box centre and size, divided by the augmented image size */
+            truth[i*(4 + mw*mh + 1) + 1] = (b.y + b.h/2.)/sized.h;
+            truth[i*(4 + mw*mh + 1) + 2] = b.w/sized.w;
+            truth[i*(4 + mw*mh + 1) + 3] = b.h/sized.h;
+            int j;
+            for(j = 0; j < mw*mh; ++j){
+                truth[i*(4 + mw*mh + 1) + 4 + j] = mask.data[j];
+            }
+            truth[i*(4 + mw*mh + 1) + 4 + mw*mh] = id; /* id is stored last in each record */
+            free_image(crop);
+            free_image(mask);
+            ++i;
+        }
+        free_image(sized);
+        free(rle);
+    }
+    fclose(file);
+    free_image(part);
+}
+
+
 void fill_truth_detection(char *path, int num_boxes, float *truth, int classes, int flip, float dx, float dy, float sx, float sy)
 {
     char labelpath[4096];
@@ -309,6 +463,7 @@ void fill_truth_detection(char *path, int num_boxes, float *truth, int classes,
     float x,y,w,h;
     int id;
     int i;
+    int sub = 0;
 
     for (i = 0; i < count; ++i) {
         x =  boxes[i].x;
@@ -317,13 +472,16 @@ void fill_truth_detection(char *path, int num_boxes, float *truth, int classes,
         h =  boxes[i].h;
         id = boxes[i].id;
 
-        if ((w < .005 || h < .005)) continue;
+        if ((w < .001 || h < .001)) {
+            ++sub;
+            continue;
+        }
 
-        truth[i*5+0] = x;
-        truth[i*5+1] = y;
-        truth[i*5+2] = w;
-        truth[i*5+3] = h;
-        truth[i*5+4] = id;
+        truth[(i-sub)*5+0] = x;
+        truth[(i-sub)*5+1] = y;
+        truth[(i-sub)*5+2] = w;
+        truth[(i-sub)*5+3] = h;
+        truth[(i-sub)*5+4] = id;
     }
     free(boxes);
 }
@@ -391,9 +549,10 @@ void fill_truth(char *path, char **labels, int k, float *truth)
         if(strstr(path, labels[i])){
             truth[i] = 1;
             ++count;
+            //printf("%s %s %d\n", path, labels[i], i);
         }
     }
-    if(count != 1) printf("Too many or too few labels: %d, %s\n", count, path);
+    if(count != 1 && (k != 1 || count != 0)) printf("Too many or too few labels: %d, %s\n", count, path);
 }
 
 void fill_hierarchy(float *truth, int k, tree *hierarchy)
@@ -428,6 +587,36 @@ void fill_hierarchy(float *truth, int k, tree *hierarchy)
     }
 }
 
+matrix load_regression_labels_paths(char **paths, int n, int k) /* build an n x k matrix: row i holds up to k floats read from the label file of paths[i] */
+{
+    matrix y = make_matrix(n, k);
+    int i,j;
+    for(i = 0; i < n; ++i){
+        char labelpath[4096];
+        find_replace(paths[i], "images", "labels", labelpath); /* map the image path to its labels .txt path */
+        find_replace(labelpath, "JPEGImages", "labels", labelpath);
+        find_replace(labelpath, ".BMP", ".txt", labelpath);
+        find_replace(labelpath, ".JPEG", ".txt", labelpath);
+        find_replace(labelpath, ".JPG", ".txt", labelpath);
+        find_replace(labelpath, ".JPeG", ".txt", labelpath);
+        find_replace(labelpath, ".Jpeg", ".txt", labelpath);
+        find_replace(labelpath, ".PNG", ".txt", labelpath);
+        find_replace(labelpath, ".TIF", ".txt", labelpath);
+        find_replace(labelpath, ".bmp", ".txt", labelpath);
+        find_replace(labelpath, ".jpeg", ".txt", labelpath);
+        find_replace(labelpath, ".jpg", ".txt", labelpath);
+        find_replace(labelpath, ".png", ".txt", labelpath);
+        find_replace(labelpath, ".tif", ".txt", labelpath);
+        FILE *file = fopen(labelpath, "r");
+        if(!file) file_error(labelpath); /* a missing label file previously reached fscanf/fclose with NULL */
+        for(j = 0; j < k; ++j){
+            fscanf(file, "%f", &(y.vals[i][j])); /* a short file leaves the remaining entries as make_matrix set them */
+        }
+        fclose(file);
+    }
+    return y;
+}
+
 matrix load_labels_paths(char **paths, int n, char **labels, int k, tree *hierarchy)
 {
     matrix y = make_matrix(n, k);
@@ -445,18 +634,14 @@ matrix load_tags_paths(char **paths, int n, int k)
 {
     matrix y = make_matrix(n, k);
     int i;
-    int count = 0;
+    //int count = 0;
     for(i = 0; i < n; ++i){
         char label[4096];
-        find_replace(paths[i], "imgs", "labels", label);
-        find_replace(label, "_iconl.jpeg", ".txt", label);
+        find_replace(paths[i], "images", "labels", label);
+        find_replace(label, ".jpg", ".txt", label);
         FILE *file = fopen(label, "r");
-        if(!file){
-            find_replace(label, "labels", "labels2", label);
-            file = fopen(label, "r");
-            if(!file) continue;
-        }
-        ++count;
+        if (!file) continue;
+        //++count;
         int tag;
         while(fscanf(file, "%d", &tag) == 1){
             if(tag < k){
@@ -465,7 +650,7 @@ matrix load_tags_paths(char **paths, int n, int k)
         }
         fclose(file);
     }
-    printf("%d/%d\n", count, n);
+    //printf("%d/%d\n", count, n);
     return y;
 }
 
@@ -488,6 +673,195 @@ void free_data(data d)
     }
 }
 
+image get_segmentation_image(char *path, int w, int h, int classes) /* build a w x h x classes mask image from the RLE records in the image's mask file */
+{
+    char labelpath[4096];
+    find_replace(path, "images", "mask", labelpath); /* derive the mask .txt path from the image path */
+    find_replace(labelpath, "JPEGImages", "mask", labelpath);
+    find_replace(labelpath, ".jpg", ".txt", labelpath);
+    find_replace(labelpath, ".JPG", ".txt", labelpath);
+    find_replace(labelpath, ".JPEG", ".txt", labelpath);
+    image mask = make_image(w, h, classes);
+    FILE *file = fopen(labelpath, "r");
+    if(!file) file_error(labelpath);
+    char buff[32788];
+    int id;
+    image part = make_image(w, h, 1); /* scratch single-channel mask, reused for every record */
+    while(fscanf(file, "%d %32787s", &id, buff) == 2){ /* field width keeps the token inside buff (32788 bytes incl. NUL) */
+        int n = 0;
+        int *rle = read_intlist(buff, &n, 0);
+        load_rle(part, rle, n);
+        if(id >= 0 && id < classes) or_image(part, mask, id); /* ids outside the mask's channels would index past mask.data */
+        free(rle);
+    }
+    //exclusive_image(mask);
+    fclose(file);
+    free_image(part);
+    return mask;
+}
+
+image get_segmentation_image2(char *path, int w, int h, int classes) /* like get_segmentation_image, plus an extra last channel that is 1 where no record covers the pixel */
+{
+    char labelpath[4096];
+    find_replace(path, "images", "mask", labelpath); /* derive the mask .txt path from the image path */
+    find_replace(labelpath, "JPEGImages", "mask", labelpath);
+    find_replace(labelpath, ".jpg", ".txt", labelpath);
+    find_replace(labelpath, ".JPG", ".txt", labelpath);
+    find_replace(labelpath, ".JPEG", ".txt", labelpath);
+    image mask = make_image(w, h, classes+1);
+    int i;
+    for(i = 0; i < w*h; ++i){
+        mask.data[w*h*classes + i] = 1; /* channel index `classes` starts fully set */
+    }
+    FILE *file = fopen(labelpath, "r");
+    if(!file) file_error(labelpath);
+    char buff[32788];
+    int id;
+    image part = make_image(w, h, 1); /* scratch single-channel mask, reused for every record */
+    while(fscanf(file, "%d %32787s", &id, buff) == 2){ /* field width keeps the token inside buff (32788 bytes incl. NUL) */
+        int n = 0;
+        int *rle = read_intlist(buff, &n, 0);
+        load_rle(part, rle, n);
+        if(id >= 0 && id < classes) or_image(part, mask, id); /* ids outside the class channels would corrupt the last channel or index past mask.data */
+        for(i = 0; i < w*h; ++i){
+            if(part.data[i]) mask.data[w*h*classes + i] = 0; /* covered pixels are cleared from the last channel */
+        }
+        free(rle);
+    }
+    //exclusive_image(mask);
+    fclose(file);
+    free_image(part);
+    return mask;
+}
+
+data load_data_seg(int n, char **paths, int m, int w, int h, int classes, int min, int max, float angle, float aspect, float hue, float saturation, float exposure, int div) /* load n randomly chosen images (X) with matching augmented segmentation masks (y) */
+{
+    char **random_paths = get_random_paths(paths, n, m);
+    int i;
+    data d = {0};
+    d.shallow = 0;
+
+    d.X.rows = n;
+    d.X.vals = calloc(d.X.rows, sizeof(float*));
+    d.X.cols = h*w*3;
+
+
+    d.y.rows = n;
+    d.y.cols = h*w*classes/div/div; /* NOTE(review): the mask below is requested at a.w/div x a.h/div; presumably equal only when w and h are multiples of div - confirm */
+    d.y.vals = calloc(d.X.rows, sizeof(float*)); /* sized with d.X.rows, which equals d.y.rows (both n) */
+
+    for(i = 0; i < n; ++i){
+        image orig = load_image_color(random_paths[i], 0, 0);
+        augment_args a = random_augment_args(orig, angle, aspect, min, max, w, h);
+        image sized = rotate_crop_image(orig, a.rad, a.scale, a.w, a.h, a.dx, a.dy, a.aspect);
+
+        int flip = rand()%2;
+        if(flip) flip_image(sized);
+        random_distort_image(sized, hue, saturation, exposure);
+        d.X.vals[i] = sized.data; /* the row takes over sized's buffer, so sized is not freed here */
+
+        image mask = get_segmentation_image(random_paths[i], orig.w, orig.h, classes);
+        //image mask = make_image(orig.w, orig.h, classes+1);
+        image sized_m = rotate_crop_image(mask, a.rad, a.scale/div, a.w/div, a.h/div, a.dx/div, a.dy/div, a.aspect); /* same augmentation as the image, with scale, size and offsets divided by div */
+
+        if(flip) flip_image(sized_m); /* keep the mask mirrored consistently with the image */
+        d.y.vals[i] = sized_m.data; /* the row takes over sized_m's buffer */
+
+        free_image(orig);
+        free_image(mask);
+
+        /*
+           image rgb = mask_to_rgb(sized_m, classes);
+           show_image(rgb, "part");
+           show_image(sized, "orig");
+           cvWaitKey(0);
+           free_image(rgb);
+         */
+    }
+    free(random_paths);
+    return d;
+}
+
+data load_data_iseg(int n, char **paths, int m, int w, int h, int classes, int boxes, int div, int min, int max, float angle, float aspect, float hue, float saturation, float exposure) /* load n randomly chosen images (X) with per-instance mask truth rows (y) */
+{
+    char **random_paths = get_random_paths(paths, n, m);
+    int i;
+    data d = {0};
+    d.shallow = 0;
+
+    d.X.rows = n;
+    d.X.vals = calloc(d.X.rows, sizeof(float*));
+    d.X.cols = h*w*3;
+
+    d.y = make_matrix(n, (((w/div)*(h/div))+1)*boxes); /* per box: one id plus a (w/div)*(h/div) mask, matching fill_truth_iseg's record stride */
+
+    for(i = 0; i < n; ++i){
+        image orig = load_image_color(random_paths[i], 0, 0);
+        augment_args a = random_augment_args(orig, angle, aspect, min, max, w, h);
+        image sized = rotate_crop_image(orig, a.rad, a.scale, a.w, a.h, a.dx, a.dy, a.aspect);
+
+        int flip = rand()%2;
+        if(flip) flip_image(sized);
+        random_distort_image(sized, hue, saturation, exposure);
+        d.X.vals[i] = sized.data; /* the row takes over sized's buffer, so sized is not freed here */
+        //show_image(sized, "image");
+
+        fill_truth_iseg(random_paths[i], boxes, d.y.vals[i], classes, orig.w, orig.h, a, flip, w/div, h/div); /* masks get the same augmentation args and flip as the image */
+
+        free_image(orig);
+
+        /*
+           image rgb = mask_to_rgb(sized_m, classes);
+           show_image(rgb, "part");
+           show_image(sized, "orig");
+           cvWaitKey(0);
+           free_image(rgb);
+         */
+    }
+    free(random_paths);
+    return d;
+}
+
+data load_data_mask(int n, char **paths, int m, int w, int h, int classes, int boxes, int coords, int min, int max, float angle, float aspect, float hue, float saturation, float exposure) /* load n randomly chosen images (X) with box + mask truth rows (y) */
+{
+    char **random_paths = get_random_paths(paths, n, m);
+    int i;
+    data d = {0};
+    d.shallow = 0;
+
+    d.X.rows = n;
+    d.X.vals = calloc(d.X.rows, sizeof(float*));
+    d.X.cols = h*w*3;
+
+    d.y = make_matrix(n, (coords+1)*boxes); /* NOTE(review): fill_truth_mask below writes 4 + 14*14 + 1 = 201 floats per box, so this presumably requires coords == 200 - confirm */
+
+    for(i = 0; i < n; ++i){
+        image orig = load_image_color(random_paths[i], 0, 0);
+        augment_args a = random_augment_args(orig, angle, aspect, min, max, w, h);
+        image sized = rotate_crop_image(orig, a.rad, a.scale, a.w, a.h, a.dx, a.dy, a.aspect);
+
+        int flip = rand()%2;
+        if(flip) flip_image(sized);
+        random_distort_image(sized, hue, saturation, exposure);
+        d.X.vals[i] = sized.data; /* the row takes over sized's buffer, so sized is not freed here */
+        //show_image(sized, "image");
+
+        fill_truth_mask(random_paths[i], boxes, d.y.vals[i], classes, orig.w, orig.h, a, flip, 14, 14); /* mask size is hard-coded to 14x14 */
+
+        free_image(orig);
+
+        /*
+           image rgb = mask_to_rgb(sized_m, classes);
+           show_image(rgb, "part");
+           show_image(sized, "orig");
+           cvWaitKey(0);
+           free_image(rgb);
+         */
+    }
+    free(random_paths);
+    return d;
+}
+
 data load_data_region(int n, char **paths, int m, int w, int h, int size, int classes, float jitter, float hue, float saturation, float exposure)
 {
     char **random_paths = get_random_paths(paths, n, m);
@@ -624,7 +998,7 @@ data load_data_swag(char **paths, int n, int classes, float jitter)
     d.X.vals = calloc(d.X.rows, sizeof(float*));
     d.X.cols = h*w*3;
 
-    int k = (4+classes)*30;
+    int k = (4+classes)*90;
     d.y = make_matrix(1, k);
 
     int dw = w*jitter;
@@ -673,45 +1047,46 @@ data load_data_detection(int n, char **paths, int m, int w, int h, int boxes, in
     d.y = make_matrix(n, 5*boxes);
     for(i = 0; i < n; ++i){
         image orig = load_image_color(random_paths[i], 0, 0);
+        image sized = make_image(w, h, orig.c);
+        fill_image(sized, .5);
 
-        int oh = orig.h;
-        int ow = orig.w;
+        float dw = jitter * orig.w;
+        float dh = jitter * orig.h;
 
-        int dw = (ow*jitter);
-        int dh = (oh*jitter);
+        float new_ar = (orig.w + rand_uniform(-dw, dw)) / (orig.h + rand_uniform(-dh, dh));
+        //float scale = rand_uniform(.25, 2);
+        float scale = 1;
 
-        int pleft  = rand_uniform(-dw, dw);
-        int pright = rand_uniform(-dw, dw);
-        int ptop   = rand_uniform(-dh, dh);
-        int pbot   = rand_uniform(-dh, dh);
+        float nw, nh;
 
-        int swidth =  ow - pleft - pright;
-        int sheight = oh - ptop - pbot;
+        if(new_ar < 1){
+            nh = scale * h;
+            nw = nh * new_ar;
+        } else {
+            nw = scale * w;
+            nh = nw / new_ar;
+        }
 
-        float sx = (float)swidth  / ow;
-        float sy = (float)sheight / oh;
+        float dx = rand_uniform(0, w - nw);
+        float dy = rand_uniform(0, h - nh);
 
-        int flip = rand()%2;
-        image cropped = crop_image(orig, pleft, ptop, swidth, sheight);
+        place_image(orig, nw, nh, dx, dy, sized);
 
-        float dx = ((float)pleft/ow)/sx;
-        float dy = ((float)ptop /oh)/sy;
+        random_distort_image(sized, hue, saturation, exposure);
 
-        image sized = resize_image(cropped, w, h);
+        int flip = rand()%2;
         if(flip) flip_image(sized);
-        random_distort_image(sized, hue, saturation, exposure);
         d.X.vals[i] = sized.data;
 
-        fill_truth_detection(random_paths[i], boxes, d.y.vals[i], classes, flip, dx, dy, 1./sx, 1./sy);
+
+        fill_truth_detection(random_paths[i], boxes, d.y.vals[i], classes, flip, -dx/w, -dy/h, nw/w, nh/h);
 
         free_image(orig);
-        free_image(cropped);
     }
     free(random_paths);
     return d;
 }
 
-
 void *load_thread(void *ptr)
 {
     //printf("Loading data: %d\n", rand());
@@ -722,12 +1097,20 @@ void *load_thread(void *ptr)
 
     if (a.type == OLD_CLASSIFICATION_DATA){
         *a.d = load_data_old(a.paths, a.n, a.m, a.labels, a.classes, a.w, a.h);
+    } else if (a.type == REGRESSION_DATA){
+        *a.d = load_data_regression(a.paths, a.n, a.m, a.classes, a.min, a.max, a.size, a.angle, a.aspect, a.hue, a.saturation, a.exposure);
     } else if (a.type == CLASSIFICATION_DATA){
-        *a.d = load_data_augment(a.paths, a.n, a.m, a.labels, a.classes, a.hierarchy, a.min, a.max, a.size, a.angle, a.aspect, a.hue, a.saturation, a.exposure);
+        *a.d = load_data_augment(a.paths, a.n, a.m, a.labels, a.classes, a.hierarchy, a.min, a.max, a.size, a.angle, a.aspect, a.hue, a.saturation, a.exposure, a.center);
     } else if (a.type == SUPER_DATA){
         *a.d = load_data_super(a.paths, a.n, a.m, a.w, a.h, a.scale);
     } else if (a.type == WRITING_DATA){
         *a.d = load_data_writing(a.paths, a.n, a.m, a.w, a.h, a.out_w, a.out_h);
+    } else if (a.type == ISEG_DATA){
+        *a.d = load_data_iseg(a.n, a.paths, a.m, a.w, a.h, a.classes, a.num_boxes, a.scale, a.min, a.max, a.angle, a.aspect, a.hue, a.saturation, a.exposure);
+    } else if (a.type == INSTANCE_DATA){
+        *a.d = load_data_mask(a.n, a.paths, a.m, a.w, a.h, a.classes, a.num_boxes, a.coords, a.min, a.max, a.angle, a.aspect, a.hue, a.saturation, a.exposure);
+    } else if (a.type == SEGMENTATION_DATA){
+        *a.d = load_data_seg(a.n, a.paths, a.m, a.w, a.h, a.classes, a.min, a.max, a.angle, a.aspect, a.hue, a.saturation, a.exposure, a.scale);
     } else if (a.type == REGION_DATA){
         *a.d = load_data_region(a.n, a.paths, a.m, a.w, a.h, a.num_boxes, a.classes, a.jitter, a.hue, a.saturation, a.exposure);
     } else if (a.type == DETECTION_DATA){
@@ -739,6 +1122,9 @@ void *load_thread(void *ptr)
     } else if (a.type == IMAGE_DATA){
         *(a.im) = load_image_color(a.path, 0, 0);
         *(a.resized) = resize_image(*(a.im), a.w, a.h);
+    } else if (a.type == LETTERBOX_DATA){
+        *(a.im) = load_image_color(a.path, 0, 0);
+        *(a.resized) = letterbox_image(*(a.im), a.w, a.h);
     } else if (a.type == TAG_DATA){
         *a.d = load_data_tag(a.paths, a.n, a.m, a.classes, a.min, a.max, a.size, a.angle, a.aspect, a.hue, a.saturation, a.exposure);
     }
@@ -784,6 +1170,13 @@ void *load_threads(void *ptr)
     return 0;
 }
 
+void load_data_blocking(load_args args)
+{ // Runs load_thread directly on the calling thread instead of spawning a pthread.
+    struct load_args *ptr = calloc(1, sizeof(struct load_args)); // heap copy, because load_thread takes its arguments through a pointer
+    *ptr = args;
+    load_thread(ptr); // ptr is not freed here; presumably load_thread releases it -- TODO confirm
+}
+
 pthread_t load_data(load_args args)
 {
     pthread_t thread;
@@ -863,12 +1256,95 @@ data load_data_super(char **paths, int n, int m, int w, int h, int scale)
     return d;
 }
 
-data load_data_augment(char **paths, int n, int m, char **labels, int k, tree *hierarchy, int min, int max, int size, float angle, float aspect, float hue, float saturation, float exposure)
+data load_data_regression(char **paths, int n, int m, int k, int min, int max, int size, float angle, float aspect, float hue, float saturation, float exposure)
 {
     if(m) paths = get_random_paths(paths, n, m);
     data d = {0};
     d.shallow = 0;
-    d.X = load_image_augment_paths(paths, n, min, max, size, angle, aspect, hue, saturation, exposure);
+    d.X = load_image_augment_paths(paths, n, min, max, size, angle, aspect, hue, saturation, exposure, 0);
+    d.y = load_regression_labels_paths(paths, n, k);
+    if(m) free(paths);
+    return d;
+}
+
+data select_data(data *orig, int *inds)
+{ // Builds a shallow data set whose row i is borrowed from orig[inds[i]].
+    data d = {0};
+    d.shallow = 1; // rows below are borrowed pointers, not copies
+    d.w = orig[0].w;
+    d.h = orig[0].h;
+
+    d.X.rows = orig[0].X.rows;
+    d.y.rows = orig[0].X.rows; // y takes its row count from X; assumes X and y have the same number of rows
+
+    d.X.cols = orig[0].X.cols;
+    d.y.cols = orig[0].y.cols;
+
+    d.X.vals = calloc(orig[0].X.rows, sizeof(float *));
+    d.y.vals = calloc(orig[0].y.rows, sizeof(float *));
+    int i;
+    for(i = 0; i < d.X.rows; ++i){
+        d.X.vals[i] = orig[inds[i]].X.vals[i]; // inds[i] picks which source data set supplies row i
+        d.y.vals[i] = orig[inds[i]].y.vals[i];
+    }
+    return d;
+}
+
+data *tile_data(data orig, int divs, int size)
+{ // Returns a calloc'd array of divs*divs data sets, one per grid cell, each holding a crop of every image in orig.
+    data *ds = calloc(divs*divs, sizeof(data));
+    int i, j;
+#pragma omp parallel for
+    for(i = 0; i < divs*divs; ++i){
+        data d = {0}; // zero-init so num_boxes/boxes are not copied into ds[i] as indeterminate values
+        d.shallow = 0;
+        d.w = orig.w/divs * size; // tile extent is one grid cell scaled by size
+        d.h = orig.h/divs * size;
+        d.X.rows = orig.X.rows;
+        d.X.cols = d.w*d.h*3;
+        d.X.vals = calloc(d.X.rows, sizeof(float*));
+
+        d.y = copy_matrix(orig.y); // every tile gets its own copy of the labels
+#pragma omp parallel for
+        for(j = 0; j < orig.X.rows; ++j){
+            int x = (i%divs) * orig.w / divs - (d.w - orig.w/divs)/2; // cell origin shifted back by half the extra margin, so the crop is centred on the cell
+            int y = (i/divs) * orig.h / divs - (d.h - orig.h/divs)/2;
+            image im = float_to_image(orig.w, orig.h, 3, orig.X.vals[j]); // views row j as a 3-channel orig.w x orig.h image
+            d.X.vals[j] = crop_image(im, x, y, d.w, d.h).data; // the row keeps the crop's pixel buffer
+        }
+        ds[i] = d;
+    }
+    return ds;
+}
+
+data resize_data(data orig, int w, int h)
+{ // Returns a new data set with every image of orig resized to w x h and a copy of orig's labels.
+    data d = {0};
+    d.shallow = 0;
+    d.w = w;
+    d.h = h;
+    int i;
+    d.X.rows = orig.X.rows;
+    d.X.cols = w*h*3; // rows are treated as 3-channel images
+    d.X.vals = calloc(d.X.rows, sizeof(float*));
+
+    d.y = copy_matrix(orig.y);
+#pragma omp parallel for
+    for(i = 0; i < orig.X.rows; ++i){
+        image im = float_to_image(orig.w, orig.h, 3, orig.X.vals[i]); // views row i as an orig.w x orig.h image
+        d.X.vals[i] = resize_image(im, w, h).data; // the row keeps the resized image's pixel buffer
+    }
+    return d;
+}
+
+data load_data_augment(char **paths, int n, int m, char **labels, int k, tree *hierarchy, int min, int max, int size, float angle, float aspect, float hue, float saturation, float exposure, int center)
+{
+    if(m) paths = get_random_paths(paths, n, m);
+    data d = {0};
+    d.shallow = 0;
+    d.w=size;
+    d.h=size;
+    d.X = load_image_augment_paths(paths, n, min, max, size, angle, aspect, hue, saturation, exposure, center);
     d.y = load_labels_paths(paths, n, labels, k, hierarchy);
     if(m) free(paths);
     return d;
@@ -881,7 +1357,7 @@ data load_data_tag(char **paths, int n, int m, int k, int min, int max, int size
     d.w = size;
     d.h = size;
     d.shallow = 0;
-    d.X = load_image_augment_paths(paths, n, min, max, size, angle, aspect, hue, saturation, exposure);
+    d.X = load_image_augment_paths(paths, n, min, max, size, angle, aspect, hue, saturation, exposure, 0);
     d.y = load_tags_paths(paths, n, k);
     if(m) free(paths);
     return d;
@@ -909,6 +1385,8 @@ data concat_data(data d1, data d2)
     d.shallow = 1;
     d.X = concat_matrix(d1.X, d2.X);
     d.y = concat_matrix(d1.y, d2.y);
+    d.w = d1.w;
+    d.h = d1.h;
     return d;
 }
 
@@ -962,7 +1440,6 @@ data load_cifar10_data(char *filename)
             X.vals[i][j] = (double)bytes[j+1];
         }
     }
-    //translate_data_rows(d, -128);
     scale_data_rows(d, 1./255);
     //normalize_data_rows(d);
     fclose(fp);
@@ -985,7 +1462,7 @@ void get_next_batch(data d, int n, int offset, float *X, float *y)
     for(j = 0; j < n; ++j){
         int index = offset + j;
         memcpy(X+j*d.X.cols, d.X.vals[index], d.X.cols*sizeof(float));
-        memcpy(y+j*d.y.cols, d.y.vals[index], d.y.cols*sizeof(float));
+        if(y) memcpy(y+j*d.y.cols, d.y.vals[index], d.y.cols*sizeof(float));
     }
 }
 
@@ -1029,7 +1506,6 @@ data load_all_cifar10()
         fclose(fp);
     }
     //normalize_data_rows(d);
-    //translate_data_rows(d, -128);
     scale_data_rows(d, 1./255);
     smooth_data(d);
     return d;
@@ -1113,6 +1589,19 @@ void translate_data_rows(data d, float s)
     }
 }
 
+data copy_data(data d)
+{ // Returns a non-shallow copy of d with X and y duplicated through copy_matrix.
+    data c = {0};
+    c.w = d.w;
+    c.h = d.h;
+    c.shallow = 0;
+    c.num_boxes = d.num_boxes; // pointer copied as-is: c and d refer to the same num_boxes array
+    c.boxes = d.boxes; // likewise shared with d, not duplicated
+    c.X = copy_matrix(d.X);
+    c.y = copy_matrix(d.y);
+    return c;
+}
+
 void normalize_data_rows(data d)
 {
     int i;
diff --git a/image.darknet/inst/include/darknet/src/data.h b/image.darknet/inst/include/darknet/src/data.h
index 3f6ef61..781906f 100644
--- a/image.darknet/inst/include/darknet/src/data.h
+++ b/image.darknet/inst/include/darknet/src/data.h
@@ -2,6 +2,7 @@
 #define DATA_H
 #include <pthread.h>
 
+#include "darknet.h"
 #include "matrix.h"
 #include "list.h"
 #include "image.h"
@@ -17,93 +18,32 @@ static inline float distance_from_edge(int x, int max)
     if (dist > 1) dist = 1;
     return dist;
 }
+void load_data_blocking(load_args args);
 
-typedef struct{
-    int w, h;
-    matrix X;
-    matrix y;
-    int shallow;
-    int *num_boxes;
-    box **boxes;
-} data;
-
-typedef enum {
-    CLASSIFICATION_DATA, DETECTION_DATA, CAPTCHA_DATA, REGION_DATA, IMAGE_DATA, COMPARE_DATA, WRITING_DATA, SWAG_DATA, TAG_DATA, OLD_CLASSIFICATION_DATA, STUDY_DATA, DET_DATA, SUPER_DATA
-} data_type;
-
-typedef struct load_args{
-    int threads;
-    char **paths;
-    char *path;
-    int n;
-    int m;
-    char **labels;
-    int h;
-    int w;
-    int out_w;
-    int out_h;
-    int nh;
-    int nw;
-    int num_boxes;
-    int min, max, size;
-    int classes;
-    int background;
-    int scale;
-    float jitter;
-    float angle;
-    float aspect;
-    float saturation;
-    float exposure;
-    float hue;
-    data *d;
-    image *im;
-    image *resized;
-    data_type type;
-    tree *hierarchy;
-} load_args;
-
-typedef struct{
-    int id;
-    float x,y,w,h;
-    float left, right, top, bottom;
-} box_label;
-
-void free_data(data d);
-
-pthread_t load_data(load_args args);
-
-pthread_t load_data_in_thread(load_args args);
 
 void print_letters(float *pred, int n);
 data load_data_captcha(char **paths, int n, int m, int k, int w, int h);
 data load_data_captcha_encode(char **paths, int n, int m, int w, int h);
-data load_data_old(char **paths, int n, int m, char **labels, int k, int w, int h);
 data load_data_detection(int n, char **paths, int m, int w, int h, int boxes, int classes, float jitter, float hue, float saturation, float exposure);
 data load_data_tag(char **paths, int n, int m, int k, int min, int max, int size, float angle, float aspect, float hue, float saturation, float exposure);
-matrix load_image_augment_paths(char **paths, int n, int min, int max, int size, float angle, float aspect, float hue, float saturation, float exposure);
+matrix load_image_augment_paths(char **paths, int n, int min, int max, int size, float angle, float aspect, float hue, float saturation, float exposure, int center);
 data load_data_super(char **paths, int n, int m, int w, int h, int scale);
-data load_data_augment(char **paths, int n, int m, char **labels, int k, tree *hierarchy, int min, int max, int size, float angle, float aspect, float hue, float saturation, float exposure);
+data load_data_augment(char **paths, int n, int m, char **labels, int k, tree *hierarchy, int min, int max, int size, float angle, float aspect, float hue, float saturation, float exposure, int center);
+data load_data_regression(char **paths, int n, int m, int classes, int min, int max, int size, float angle, float aspect, float hue, float saturation, float exposure);
 data load_go(char *filename);
 
-box_label *read_boxes(char *filename, int *n);
-data load_cifar10_data(char *filename);
-data load_all_cifar10();
 
 data load_data_writing(char **paths, int n, int m, int w, int h, int out_w, int out_h);
 
-list *get_paths(char *filename);
-char **get_labels(char *filename);
 void get_random_batch(data d, int n, float *X, float *y);
 data get_data_part(data d, int part, int total);
 data get_random_data(data d, int num);
-void get_next_batch(data d, int n, int offset, float *X, float *y);
 data load_categorical_data_csv(char *filename, int target, int k);
 void normalize_data_rows(data d);
 void scale_data_rows(data d, float s);
 void translate_data_rows(data d, float s);
 void randomize_data(data d);
 data *split_data(data d, int part, int total);
-data concat_data(data d1, data d2);
 data concat_datas(data *d, int n);
 void fill_truth(char *path, char **labels, int k, float *truth);
 
diff --git a/image.darknet/inst/include/darknet/src/deconvolutional_kernels.cu b/image.darknet/inst/include/darknet/src/deconvolutional_kernels.cu
index d6259fb..8267dcf 100644
--- a/image.darknet/inst/include/darknet/src/deconvolutional_kernels.cu
+++ b/image.darknet/inst/include/darknet/src/deconvolutional_kernels.cu
@@ -5,6 +5,7 @@
 extern "C" {
 #include "convolutional_layer.h"
 #include "deconvolutional_layer.h"
+#include "batchnorm_layer.h"
 #include "gemm.h"
 #include "blas.h"
 #include "im2col.h"
@@ -13,97 +14,126 @@ extern "C" {
 #include "cuda.h"
 }
 
-extern "C" void forward_deconvolutional_layer_gpu(deconvolutional_layer layer, network_state state)
+extern "C" void forward_deconvolutional_layer_gpu(layer l, network net)
 {
     int i;
-    int out_h = deconvolutional_out_height(layer);
-    int out_w = deconvolutional_out_width(layer);
-    int size = out_h*out_w;
 
-    int m = layer.size*layer.size*layer.n;
-    int n = layer.h*layer.w;
-    int k = layer.c;
+    int m = l.size*l.size*l.n;
+    int n = l.h*l.w;
+    int k = l.c;
 
-    fill_ongpu(layer.outputs*layer.batch, 0, layer.output_gpu, 1);
+    fill_gpu(l.outputs*l.batch, 0, l.output_gpu, 1);
 
-    for(i = 0; i < layer.batch; ++i){
-        float *a = layer.weights_gpu;
-        float *b = state.input + i*layer.c*layer.h*layer.w;
-        float *c = layer.col_image_gpu;
+    for(i = 0; i < l.batch; ++i){
+        float *a = l.weights_gpu;
+        float *b = net.input_gpu + i*l.c*l.h*l.w;
+        float *c = net.workspace;
 
-        gemm_ongpu(1,0,m,n,k,1,a,m,b,n,0,c,n);
+        gemm_gpu(1,0,m,n,k,1,a,m,b,n,0,c,n);
 
-        col2im_ongpu(c, layer.n, out_h, out_w, layer.size, layer.stride, 0, layer.output_gpu+i*layer.n*size);
+        col2im_gpu(net.workspace, l.out_c, l.out_h, l.out_w, l.size, l.stride, l.pad, l.output_gpu+i*l.outputs);
     }
-    add_bias_gpu(layer.output_gpu, layer.biases_gpu, layer.batch, layer.n, size);
-    activate_array(layer.output_gpu, layer.batch*layer.n*size, layer.activation);
+    if (l.batch_normalize) {
+        forward_batchnorm_layer_gpu(l, net);
+    } else {
+        add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h);
+    }
+    activate_array_gpu(l.output_gpu, l.batch*l.n*l.out_w*l.out_h, l.activation);
 }
 
-extern "C" void backward_deconvolutional_layer_gpu(deconvolutional_layer layer, network_state state)
+extern "C" void backward_deconvolutional_layer_gpu(layer l, network net)
 {
-    float alpha = 1./layer.batch;
-    int out_h = deconvolutional_out_height(layer);
-    int out_w = deconvolutional_out_width(layer);
-    int size = out_h*out_w;
     int i;
 
-    gradient_array(layer.output_gpu, size*layer.n*layer.batch, layer.activation, layer.delta_gpu);
-    backward_bias(layer.bias_updates_gpu, layer.delta, layer.batch, layer.n, size);
+    //constrain_gpu(l.outputs*l.batch, 1, l.delta_gpu, 1);
+    gradient_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
+
+    if(l.batch_normalize){
+        backward_batchnorm_layer_gpu(l, net);
+    } else {
+        backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.n, l.out_w*l.out_h);
+    }
 
-    if(state.delta) memset(state.delta, 0, layer.batch*layer.h*layer.w*layer.c*sizeof(float));
+    //if(net.delta_gpu) memset(net.delta_gpu, 0, l.batch*l.h*l.w*l.c*sizeof(float));
 
-    for(i = 0; i < layer.batch; ++i){
-        int m = layer.c;
-        int n = layer.size*layer.size*layer.n;
-        int k = layer.h*layer.w;
+    for(i = 0; i < l.batch; ++i){
+        int m = l.c;
+        int n = l.size*l.size*l.n;
+        int k = l.h*l.w;
 
-        float *a = state.input + i*m*n;
-        float *b = layer.col_image_gpu;
-        float *c = layer.weight_updates_gpu;
+        float *a = net.input_gpu + i*m*k;
+        float *b = net.workspace;
+        float *c = l.weight_updates_gpu;
 
-        im2col_ongpu(layer.delta_gpu + i*layer.n*size, layer.n, out_h, out_w, 
-                layer.size, layer.stride, 0, b);
-        gemm_ongpu(0,1,m,n,k,alpha,a,k,b,k,1,c,n);
+        im2col_gpu(l.delta_gpu + i*l.outputs, l.out_c, l.out_h, l.out_w, 
+                l.size, l.stride, l.pad, b);
+        gemm_gpu(0,1,m,n,k,1,a,k,b,k,1,c,n);
 
-        if(state.delta){
-            int m = layer.c;
-            int n = layer.h*layer.w;
-            int k = layer.size*layer.size*layer.n;
+        if(net.delta_gpu){
+            int m = l.c;
+            int n = l.h*l.w;
+            int k = l.size*l.size*l.n;
 
-            float *a = layer.weights_gpu;
-            float *b = layer.col_image_gpu;
-            float *c = state.delta + i*n*m;
+            float *a = l.weights_gpu;
+            float *b = net.workspace;
+            float *c = net.delta_gpu + i*n*m;
 
-            gemm(0,0,m,n,k,1,a,k,b,n,1,c,n);
+            gemm_gpu(0,0,m,n,k,1,a,k,b,n,1,c,n);
         }
     }
 }
 
-extern "C" void pull_deconvolutional_layer(deconvolutional_layer layer)
+extern "C" void pull_deconvolutional_layer(layer l)
 {
-    cuda_pull_array(layer.weights_gpu, layer.weights, layer.c*layer.n*layer.size*layer.size);
-    cuda_pull_array(layer.biases_gpu, layer.biases, layer.n);
-    cuda_pull_array(layer.weight_updates_gpu, layer.weight_updates, layer.c*layer.n*layer.size*layer.size);
-    cuda_pull_array(layer.bias_updates_gpu, layer.bias_updates, layer.n);
+    cuda_pull_array(l.weights_gpu, l.weights, l.c*l.n*l.size*l.size);
+    cuda_pull_array(l.biases_gpu, l.biases, l.n);
+    cuda_pull_array(l.weight_updates_gpu, l.weight_updates, l.c*l.n*l.size*l.size);
+    cuda_pull_array(l.bias_updates_gpu, l.bias_updates, l.n);
+    if (l.batch_normalize){
+        cuda_pull_array(l.scales_gpu, l.scales, l.n);
+        cuda_pull_array(l.rolling_mean_gpu, l.rolling_mean, l.n);
+        cuda_pull_array(l.rolling_variance_gpu, l.rolling_variance, l.n);
+    }
 }
 
-extern "C" void push_deconvolutional_layer(deconvolutional_layer layer)
+extern "C" void push_deconvolutional_layer(layer l)
 {
-    cuda_push_array(layer.weights_gpu, layer.weights, layer.c*layer.n*layer.size*layer.size);
-    cuda_push_array(layer.biases_gpu, layer.biases, layer.n);
-    cuda_push_array(layer.weight_updates_gpu, layer.weight_updates, layer.c*layer.n*layer.size*layer.size);
-    cuda_push_array(layer.bias_updates_gpu, layer.bias_updates, layer.n);
+    cuda_push_array(l.weights_gpu, l.weights, l.c*l.n*l.size*l.size);
+    cuda_push_array(l.biases_gpu, l.biases, l.n);
+    cuda_push_array(l.weight_updates_gpu, l.weight_updates, l.c*l.n*l.size*l.size);
+    cuda_push_array(l.bias_updates_gpu, l.bias_updates, l.n);
+    if (l.batch_normalize){
+        cuda_push_array(l.scales_gpu, l.scales, l.n);
+        cuda_push_array(l.rolling_mean_gpu, l.rolling_mean, l.n);
+        cuda_push_array(l.rolling_variance_gpu, l.rolling_variance, l.n);
+    }
 }
 
-extern "C" void update_deconvolutional_layer_gpu(deconvolutional_layer layer, float learning_rate, float momentum, float decay)
+extern "C" void update_deconvolutional_layer_gpu(layer l, update_args a) // explicit C linkage, matching the other entry points in this file
 {
-    int size = layer.size*layer.size*layer.c*layer.n;
+    float learning_rate = a.learning_rate*l.learning_rate_scale; // global rate scaled by this layer's own factor
+    float momentum = a.momentum;
+    float decay = a.decay;
+    int batch = a.batch;
+    // Applies one optimizer step to the GPU weights, biases and (when allocated) batch-norm scales.
+    if(a.adam){
+        adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.nweights, batch, a.t);
+        adam_update_gpu(l.biases_gpu, l.bias_updates_gpu, l.bias_m_gpu, l.bias_v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.n, batch, a.t);
+        if(l.scales_gpu){ // scales are only updated when the scales buffer exists
+            adam_update_gpu(l.scales_gpu, l.scale_updates_gpu, l.scale_m_gpu, l.scale_v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.n, batch, a.t);
+        }
+    }else{
+        axpy_gpu(l.nweights, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1); // weight decay is added to the accumulated updates before the step
+        axpy_gpu(l.nweights, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
+        scal_gpu(l.nweights, momentum, l.weight_updates_gpu, 1); // updates are scaled by momentum rather than cleared
 
-    axpy_ongpu(layer.n, learning_rate, layer.bias_updates_gpu, 1, layer.biases_gpu, 1);
-    scal_ongpu(layer.n, momentum, layer.bias_updates_gpu, 1);
+        axpy_gpu(l.n, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1); // no decay term is applied to biases
+        scal_gpu(l.n, momentum, l.bias_updates_gpu, 1);
 
-    axpy_ongpu(size, -decay, layer.weights_gpu, 1, layer.weight_updates_gpu, 1);
-    axpy_ongpu(size, learning_rate, layer.weight_updates_gpu, 1, layer.weights_gpu, 1);
-    scal_ongpu(size, momentum, layer.weight_updates_gpu, 1);
+        if(l.scales_gpu){
+            axpy_gpu(l.n, learning_rate/batch, l.scale_updates_gpu, 1, l.scales_gpu, 1);
+            scal_gpu(l.n, momentum, l.scale_updates_gpu, 1);
+        }
+    }
 }
 
diff --git a/image.darknet/inst/include/darknet/src/deconvolutional_layer.c b/image.darknet/inst/include/darknet/src/deconvolutional_layer.c
index fbef9d5..00c0e85 100644
--- a/image.darknet/inst/include/darknet/src/deconvolutional_layer.c
+++ b/image.darknet/inst/include/darknet/src/deconvolutional_layer.c
@@ -1,52 +1,41 @@
 #include "deconvolutional_layer.h"
 #include "convolutional_layer.h"
+#include "batchnorm_layer.h"
 #include "utils.h"
 #include "im2col.h"
 #include "col2im.h"
 #include "blas.h"
 #include "gemm.h"
+
 #include <stdio.h>
 #include <time.h>
 
-int deconvolutional_out_height(deconvolutional_layer l)
-{
-    int h = l.stride*(l.h - 1) + l.size;
-    return h;
-}
 
-int deconvolutional_out_width(deconvolutional_layer l)
-{
-    int w = l.stride*(l.w - 1) + l.size;
-    return w;
-}
-
-int deconvolutional_out_size(deconvolutional_layer l)
-{
-    return deconvolutional_out_height(l) * deconvolutional_out_width(l);
+static size_t get_workspace_size(layer l){ // bytes of scratch for one image: (size*size*n) x (h*w) floats
+    return (size_t)l.h*l.w*l.size*l.size*l.n*sizeof(float); // cast comes first so the whole product is computed in size_t
 }
 
-image get_deconvolutional_image(deconvolutional_layer l)
+void bilinear_init(layer l)
 {
-    int h,w,c;
-    h = deconvolutional_out_height(l);
-    w = deconvolutional_out_width(l);
-    c = l.n;
-    return float_to_image(w,h,c,l.output);
+    int i,j,f;
+    float center = (l.size-1) / 2.;
+    for(f = 0; f < l.n; ++f){
+        for(j = 0; j < l.size; ++j){
+            for(i = 0; i < l.size; ++i){
+                float val = (1 - fabs(i - center)) * (1 - fabs(j - center));
+                int c = f%l.c;
+                int ind = f*l.size*l.size*l.c + c*l.size*l.size + j*l.size + i;
+                l.weights[ind] = val;
+            }
+        }
+    }
 }
 
-image get_deconvolutional_delta(deconvolutional_layer l)
-{
-    int h,w,c;
-    h = deconvolutional_out_height(l);
-    w = deconvolutional_out_width(l);
-    c = l.n;
-    return float_to_image(w,h,c,l.delta);
-}
 
-deconvolutional_layer make_deconvolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, ACTIVATION activation)
+layer make_deconvolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int adam)
 {
     int i;
-    deconvolutional_layer l = {0};
+    layer l = {0};
     l.type = DECONVOLUTIONAL;
 
     l.h = h;
@@ -57,82 +46,182 @@ deconvolutional_layer make_deconvolutional_layer(int batch, int h, int w, int c,
     l.stride = stride;
     l.size = size;
 
+    l.nweights = c*n*size*size;
+    l.nbiases = n;
+
     l.weights = calloc(c*n*size*size, sizeof(float));
     l.weight_updates = calloc(c*n*size*size, sizeof(float));
 
     l.biases = calloc(n, sizeof(float));
     l.bias_updates = calloc(n, sizeof(float));
-    float scale = 1./sqrt(size*size*c);
+    //float scale = n/(size*size*c);
+    //printf("scale: %f\n", scale);
+    float scale = .02;
     for(i = 0; i < c*n*size*size; ++i) l.weights[i] = scale*rand_normal();
+    //bilinear_init(l);
     for(i = 0; i < n; ++i){
-        l.biases[i] = scale;
+        l.biases[i] = 0;
     }
-    int out_h = deconvolutional_out_height(l);
-    int out_w = deconvolutional_out_width(l);
+    l.pad = padding;
 
-    l.out_h = out_h;
-    l.out_w = out_w;
+    l.out_h = (l.h - 1) * l.stride + l.size - 2*l.pad;
+    l.out_w = (l.w - 1) * l.stride + l.size - 2*l.pad;
     l.out_c = n;
     l.outputs = l.out_w * l.out_h * l.out_c;
     l.inputs = l.w * l.h * l.c;
 
-    l.col_image = calloc(h*w*size*size*n, sizeof(float));
-    l.output = calloc(l.batch*out_h * out_w * n, sizeof(float));
-    l.delta  = calloc(l.batch*out_h * out_w * n, sizeof(float));
+    scal_cpu(l.nweights, (float)l.out_w*l.out_h/(l.w*l.h), l.weights, 1);
+
+    l.output = calloc(l.batch*l.outputs, sizeof(float));
+    l.delta  = calloc(l.batch*l.outputs, sizeof(float));
 
     l.forward = forward_deconvolutional_layer;
     l.backward = backward_deconvolutional_layer;
     l.update = update_deconvolutional_layer;
 
-    #ifdef GPU
-    l.weights_gpu = cuda_make_array(l.weights, c*n*size*size);
-    l.weight_updates_gpu = cuda_make_array(l.weight_updates, c*n*size*size);
+    l.batch_normalize = batch_normalize;
+
+    if(batch_normalize){
+        l.scales = calloc(n, sizeof(float));
+        l.scale_updates = calloc(n, sizeof(float));
+        for(i = 0; i < n; ++i){
+            l.scales[i] = 1;
+        }
+
+        l.mean = calloc(n, sizeof(float));
+        l.variance = calloc(n, sizeof(float));
+
+        l.mean_delta = calloc(n, sizeof(float));
+        l.variance_delta = calloc(n, sizeof(float));
+
+        l.rolling_mean = calloc(n, sizeof(float));
+        l.rolling_variance = calloc(n, sizeof(float));
+        l.x = calloc(l.batch*l.outputs, sizeof(float));
+        l.x_norm = calloc(l.batch*l.outputs, sizeof(float));
+    }
+    if(adam){
+        l.m = calloc(c*n*size*size, sizeof(float));
+        l.v = calloc(c*n*size*size, sizeof(float));
+        l.bias_m = calloc(n, sizeof(float));
+        l.scale_m = calloc(n, sizeof(float));
+        l.bias_v = calloc(n, sizeof(float));
+        l.scale_v = calloc(n, sizeof(float));
+    }
+
+#ifdef GPU
+    l.forward_gpu = forward_deconvolutional_layer_gpu;
+    l.backward_gpu = backward_deconvolutional_layer_gpu;
+    l.update_gpu = update_deconvolutional_layer_gpu;
+
+    if(gpu_index >= 0){
+
+        if (adam) {
+            l.m_gpu = cuda_make_array(l.m, c*n*size*size);
+            l.v_gpu = cuda_make_array(l.v, c*n*size*size);
+            l.bias_m_gpu = cuda_make_array(l.bias_m, n);
+            l.bias_v_gpu = cuda_make_array(l.bias_v, n);
+            l.scale_m_gpu = cuda_make_array(l.scale_m, n);
+            l.scale_v_gpu = cuda_make_array(l.scale_v, n);
+        }
+        l.weights_gpu = cuda_make_array(l.weights, c*n*size*size);
+        l.weight_updates_gpu = cuda_make_array(l.weight_updates, c*n*size*size);
+
+        l.biases_gpu = cuda_make_array(l.biases, n);
+        l.bias_updates_gpu = cuda_make_array(l.bias_updates, n);
 
-    l.biases_gpu = cuda_make_array(l.biases, n);
-    l.bias_updates_gpu = cuda_make_array(l.bias_updates, n);
+        l.delta_gpu = cuda_make_array(l.delta, l.batch*l.out_h*l.out_w*n);
+        l.output_gpu = cuda_make_array(l.output, l.batch*l.out_h*l.out_w*n);
 
-    l.col_image_gpu = cuda_make_array(l.col_image, h*w*size*size*n);
-    l.delta_gpu = cuda_make_array(l.delta, l.batch*out_h*out_w*n);
-    l.output_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);
+        if(batch_normalize){
+            l.mean_gpu = cuda_make_array(0, n);
+            l.variance_gpu = cuda_make_array(0, n);
+
+            l.rolling_mean_gpu = cuda_make_array(0, n);
+            l.rolling_variance_gpu = cuda_make_array(0, n);
+
+            l.mean_delta_gpu = cuda_make_array(0, n);
+            l.variance_delta_gpu = cuda_make_array(0, n);
+
+            l.scales_gpu = cuda_make_array(l.scales, n);
+            l.scale_updates_gpu = cuda_make_array(0, n);
+
+            l.x_gpu = cuda_make_array(0, l.batch*l.out_h*l.out_w*n);
+            l.x_norm_gpu = cuda_make_array(0, l.batch*l.out_h*l.out_w*n);
+        }
+    }
+    #ifdef CUDNN
+        cudnnCreateTensorDescriptor(&l.dstTensorDesc);
+        cudnnCreateTensorDescriptor(&l.normTensorDesc);
+        cudnnSetTensor4dDescriptor(l.dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l.batch, l.out_c, l.out_h, l.out_w); 
+        cudnnSetTensor4dDescriptor(l.normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l.out_c, 1, 1); 
     #endif
+#endif
 
     l.activation = activation;
+    l.workspace_size = get_workspace_size(l);
 
-    fprintf(stderr, "Deconvolutional Layer: %d x %d x %d image, %d filters -> %d x %d x %d image\n", h,w,c,n, out_h, out_w, n);
+    fprintf(stderr, "deconv%5d %2d x%2d /%2d  %4d x%4d x%4d   ->  %4d x%4d x%4d\n", n, size, size, stride, w, h, c, l.out_w, l.out_h, l.out_c);
 
     return l;
 }
 
-void resize_deconvolutional_layer(deconvolutional_layer *l, int h, int w)
+void denormalize_deconvolutional_layer(layer l)
+{ // Folds the rolling batch-norm statistics into weights and biases, then resets scales/mean/variance to 1/0/1.
+    int i, j;
+    for(i = 0; i < l.n; ++i){
+        float scale = l.scales[i]/sqrt(l.rolling_variance[i] + .00001); // the added constant guards against a zero variance
+        for(j = 0; j < l.c*l.size*l.size; ++j){
+            l.weights[i*l.c*l.size*l.size + j] *= scale; // NOTE(review): indexes weights as l.n blocks of c*size*size -- confirm this matches the layout the deconv gemm uses
+        }
+        l.biases[i] -= l.rolling_mean[i] * scale;
+        l.scales[i] = 1;
+        l.rolling_mean[i] = 0;
+        l.rolling_variance[i] = 1;
+    }
+}
+
+void resize_deconvolutional_layer(layer *l, int h, int w)
 {
     l->h = h;
     l->w = w;
-    int out_h = deconvolutional_out_height(*l);
-    int out_w = deconvolutional_out_width(*l);
-
-    l->col_image = realloc(l->col_image,
-                                out_h*out_w*l->size*l->size*l->c*sizeof(float));
-    l->output = realloc(l->output,
-                                l->batch*out_h * out_w * l->n*sizeof(float));
-    l->delta  = realloc(l->delta,
-                                l->batch*out_h * out_w * l->n*sizeof(float));
-    #ifdef GPU
-    cuda_free(l->col_image_gpu);
+    l->out_h = (l->h - 1) * l->stride + l->size - 2*l->pad;
+    l->out_w = (l->w - 1) * l->stride + l->size - 2*l->pad;
+
+    l->outputs = l->out_h * l->out_w * l->out_c;
+    l->inputs = l->w * l->h * l->c;
+
+    l->output = realloc(l->output, l->batch*l->outputs*sizeof(float));
+    l->delta  = realloc(l->delta,  l->batch*l->outputs*sizeof(float));
+    if(l->batch_normalize){
+        l->x = realloc(l->x, l->batch*l->outputs*sizeof(float));
+        l->x_norm  = realloc(l->x_norm, l->batch*l->outputs*sizeof(float));
+    }
+
+#ifdef GPU
     cuda_free(l->delta_gpu);
     cuda_free(l->output_gpu);
 
-    l->col_image_gpu = cuda_make_array(l->col_image, out_h*out_w*l->size*l->size*l->c);
-    l->delta_gpu = cuda_make_array(l->delta, l->batch*out_h*out_w*l->n);
-    l->output_gpu = cuda_make_array(l->output, l->batch*out_h*out_w*l->n);
+    l->delta_gpu =  cuda_make_array(l->delta,  l->batch*l->outputs);
+    l->output_gpu = cuda_make_array(l->output, l->batch*l->outputs);
+
+    if(l->batch_normalize){
+        cuda_free(l->x_gpu);
+        cuda_free(l->x_norm_gpu);
+
+        l->x_gpu = cuda_make_array(l->output, l->batch*l->outputs);
+        l->x_norm_gpu = cuda_make_array(l->output, l->batch*l->outputs);
+    }
+    #ifdef CUDNN
+        cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w); 
+        cudnnSetTensor4dDescriptor(l->normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l->out_c, 1, 1); 
     #endif
+#endif
+    l->workspace_size = get_workspace_size(*l);
 }
 
-void forward_deconvolutional_layer(const deconvolutional_layer l, network_state state)
+void forward_deconvolutional_layer(const layer l, network net)
 {
     int i;
-    int out_h = deconvolutional_out_height(l);
-    int out_w = deconvolutional_out_width(l);
-    int size = out_h*out_w;
 
     int m = l.size*l.size*l.n;
     int n = l.h*l.w;
@@ -142,63 +231,80 @@ void forward_deconvolutional_layer(const deconvolutional_layer l, network_state
 
     for(i = 0; i < l.batch; ++i){
         float *a = l.weights;
-        float *b = state.input + i*l.c*l.h*l.w;
-        float *c = l.col_image;
+        float *b = net.input + i*l.c*l.h*l.w;
+        float *c = net.workspace;
 
-        gemm(1,0,m,n,k,1,a,m,b,n,0,c,n);
+        gemm_cpu(1,0,m,n,k,1,a,m,b,n,0,c,n);
 
-        col2im_cpu(c, l.n, out_h, out_w, l.size, l.stride, 0, l.output+i*l.n*size);
+        col2im_cpu(net.workspace, l.out_c, l.out_h, l.out_w, l.size, l.stride, l.pad, l.output+i*l.outputs);
+    }
+    if (l.batch_normalize) {
+        forward_batchnorm_layer(l, net);
+    } else {
+        add_bias(l.output, l.biases, l.batch, l.n, l.out_w*l.out_h);
     }
-    add_bias(l.output, l.biases, l.batch, l.n, size);
-    activate_array(l.output, l.batch*l.n*size, l.activation);
+    activate_array(l.output, l.batch*l.n*l.out_w*l.out_h, l.activation);
 }
 
-void backward_deconvolutional_layer(deconvolutional_layer l, network_state state)
+void backward_deconvolutional_layer(layer l, network net)
 {
-    float alpha = 1./l.batch;
-    int out_h = deconvolutional_out_height(l);
-    int out_w = deconvolutional_out_width(l);
-    int size = out_h*out_w;
     int i;
 
-    gradient_array(l.output, size*l.n*l.batch, l.activation, l.delta);
-    backward_bias(l.bias_updates, l.delta, l.batch, l.n, size);
+    gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);
+
+    if(l.batch_normalize){
+        backward_batchnorm_layer(l, net);
+    } else {
+        backward_bias(l.bias_updates, l.delta, l.batch, l.n, l.out_w*l.out_h);
+    }
+
+    //if(net.delta) memset(net.delta, 0, l.batch*l.h*l.w*l.c*sizeof(float));
 
     for(i = 0; i < l.batch; ++i){
         int m = l.c;
         int n = l.size*l.size*l.n;
         int k = l.h*l.w;
 
-        float *a = state.input + i*m*n;
-        float *b = l.col_image;
+        float *a = net.input + i*m*k;
+        float *b = net.workspace;
         float *c = l.weight_updates;
 
-        im2col_cpu(l.delta + i*l.n*size, l.n, out_h, out_w, 
-                l.size, l.stride, 0, b);
-        gemm(0,1,m,n,k,alpha,a,k,b,k,1,c,n);
+        im2col_cpu(l.delta + i*l.outputs, l.out_c, l.out_h, l.out_w, 
+                l.size, l.stride, l.pad, b);
+        gemm_cpu(0,1,m,n,k,1,a,k,b,k,1,c,n);
 
-        if(state.delta){
+        if(net.delta){
             int m = l.c;
             int n = l.h*l.w;
             int k = l.size*l.size*l.n;
 
             float *a = l.weights;
-            float *b = l.col_image;
-            float *c = state.delta + i*n*m;
+            float *b = net.workspace;
+            float *c = net.delta + i*n*m;
 
-            gemm(0,0,m,n,k,1,a,k,b,n,1,c,n);
+            gemm_cpu(0,0,m,n,k,1,a,k,b,n,1,c,n);
         }
     }
 }
 
-void update_deconvolutional_layer(deconvolutional_layer l, float learning_rate, float momentum, float decay)
+void update_deconvolutional_layer(layer l, update_args a)
 {
+    float learning_rate = a.learning_rate*l.learning_rate_scale;
+    float momentum = a.momentum;
+    float decay = a.decay;
+    int batch = a.batch;
+
     int size = l.size*l.size*l.c*l.n;
-    axpy_cpu(l.n, learning_rate, l.bias_updates, 1, l.biases, 1);
+    axpy_cpu(l.n, learning_rate/batch, l.bias_updates, 1, l.biases, 1);
     scal_cpu(l.n, momentum, l.bias_updates, 1);
 
-    axpy_cpu(size, -decay, l.weights, 1, l.weight_updates, 1);
-    axpy_cpu(size, learning_rate, l.weight_updates, 1, l.weights, 1);
+    if(l.scales){
+        axpy_cpu(l.n, learning_rate/batch, l.scale_updates, 1, l.scales, 1);
+        scal_cpu(l.n, momentum, l.scale_updates, 1);
+    }
+
+    axpy_cpu(size, -decay*batch, l.weights, 1, l.weight_updates, 1);
+    axpy_cpu(size, learning_rate/batch, l.weight_updates, 1, l.weights, 1);
     scal_cpu(size, momentum, l.weight_updates, 1);
 }
 
diff --git a/image.darknet/inst/include/darknet/src/deconvolutional_layer.h b/image.darknet/inst/include/darknet/src/deconvolutional_layer.h
index 2d36e02..b254fb9 100644
--- a/image.darknet/inst/include/darknet/src/deconvolutional_layer.h
+++ b/image.darknet/inst/include/darknet/src/deconvolutional_layer.h
@@ -7,28 +7,19 @@
 #include "layer.h"
 #include "network.h"
 
-typedef layer deconvolutional_layer;
-
 #ifdef GPU
-void forward_deconvolutional_layer_gpu(deconvolutional_layer layer, network_state state);
-void backward_deconvolutional_layer_gpu(deconvolutional_layer layer, network_state state);
-void update_deconvolutional_layer_gpu(deconvolutional_layer layer, float learning_rate, float momentum, float decay);
-void push_deconvolutional_layer(deconvolutional_layer layer);
-void pull_deconvolutional_layer(deconvolutional_layer layer);
+void forward_deconvolutional_layer_gpu(layer l, network net);
+void backward_deconvolutional_layer_gpu(layer l, network net);
+void update_deconvolutional_layer_gpu(layer l, update_args a);
+void push_deconvolutional_layer(layer l);
+void pull_deconvolutional_layer(layer l);
 #endif
 
-deconvolutional_layer make_deconvolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, ACTIVATION activation);
-void resize_deconvolutional_layer(deconvolutional_layer *layer, int h, int w);
-void forward_deconvolutional_layer(const deconvolutional_layer layer, network_state state);
-void update_deconvolutional_layer(deconvolutional_layer layer, float learning_rate, float momentum, float decay);
-void backward_deconvolutional_layer(deconvolutional_layer layer, network_state state);
-
-image get_deconvolutional_image(deconvolutional_layer layer);
-image get_deconvolutional_delta(deconvolutional_layer layer);
-image get_deconvolutional_filter(deconvolutional_layer layer, int i);
-
-int deconvolutional_out_height(deconvolutional_layer layer);
-int deconvolutional_out_width(deconvolutional_layer layer);
+layer make_deconvolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int adam);
+void resize_deconvolutional_layer(layer *l, int h, int w);
+void forward_deconvolutional_layer(const layer l, network net);
+void update_deconvolutional_layer(layer l, update_args a);
+void backward_deconvolutional_layer(layer l, network net);
 
 #endif
 
diff --git a/image.darknet/inst/include/darknet/src/demo.c b/image.darknet/inst/include/darknet/src/demo.c
index 7818bc3..b89efb8 100644
--- a/image.darknet/inst/include/darknet/src/demo.c
+++ b/image.darknet/inst/include/darknet/src/demo.c
@@ -9,213 +9,339 @@
 #include "demo.h"
 #include <sys/time.h>
 
-#define FRAMES 3
+#define DEMO 1
 
 #ifdef OPENCV
-#include "opencv2/highgui/highgui_c.h"
-#include "opencv2/imgproc/imgproc_c.h"
-image get_image_from_stream(CvCapture *cap);
 
 static char **demo_names;
 static image **demo_alphabet;
 static int demo_classes;
 
-static float **probs;
-static box *boxes;
-static network net;
-static image in   ;
-static image in_s ;
-static image det  ;
-static image det_s;
-static image disp = {0};
-static CvCapture * cap;
+static network *net;
+static image buff [3];
+static image buff_letter[3];
+static int buff_index = 0;
+static void * cap;
 static float fps = 0;
 static float demo_thresh = 0;
-static float demo_hier_thresh = .5;
+static float demo_hier = .5;
+static int running = 0;
 
-static float *predictions[FRAMES];
+static int demo_frame = 3;
 static int demo_index = 0;
-static image images[FRAMES];
+static float **predictions;
 static float *avg;
+static int demo_done = 0;
+static int demo_total = 0;
+double demo_time;
 
-void *fetch_in_thread(void *ptr)
+detection *get_network_boxes(network *net, int w, int h, float thresh, float hier, int *map, int relative, int *num);
+
+int size_network(network *net)
 {
-    in = get_image_from_stream(cap);
-    if(!in.data){
-        error("Stream closed.");
+    int i;
+    int count = 0;
+    for(i = 0; i < net->n; ++i){
+        layer l = net->layers[i];
+        if(l.type == YOLO || l.type == REGION || l.type == DETECTION){
+            count += l.outputs;
+        }
     }
-    in_s = resize_image(in, net.w, net.h);
-    return 0;
+    return count;
+}
+
+void remember_network(network *net)
+{
+    int i;
+    int count = 0;
+    for(i = 0; i < net->n; ++i){
+        layer l = net->layers[i];
+        if(l.type == YOLO || l.type == REGION || l.type == DETECTION){
+            memcpy(predictions[demo_index] + count, net->layers[i].output, sizeof(float) * l.outputs);
+            count += l.outputs;
+        }
+    }
+}
+
+detection *avg_predictions(network *net, int *nboxes)
+{
+    int i, j;
+    int count = 0;
+    fill_cpu(demo_total, 0, avg, 1);
+    for(j = 0; j < demo_frame; ++j){
+        axpy_cpu(demo_total, 1./demo_frame, predictions[j], 1, avg, 1);
+    }
+    for(i = 0; i < net->n; ++i){
+        layer l = net->layers[i];
+        if(l.type == YOLO || l.type == REGION || l.type == DETECTION){
+            memcpy(l.output, avg + count, sizeof(float) * l.outputs);
+            count += l.outputs;
+        }
+    }
+    detection *dets = get_network_boxes(net, buff[0].w, buff[0].h, demo_thresh, demo_hier, 0, 1, nboxes);
+    return dets;
 }
 
 void *detect_in_thread(void *ptr)
 {
+    running = 1;
     float nms = .4;
 
-    layer l = net.layers[net.n-1];
-    float *X = det_s.data;
-    float *prediction = network_predict(net, X);
-
-    memcpy(predictions[demo_index], prediction, l.outputs*sizeof(float));
-    mean_arrays(predictions, FRAMES, l.outputs, avg);
-    l.output = avg;
-
-    free_image(det_s);
-    if(l.type == DETECTION){
-        get_detection_boxes(l, 1, 1, demo_thresh, probs, boxes, 0);
-    } else if (l.type == REGION){
-        get_region_boxes(l, 1, 1, demo_thresh, probs, boxes, 0, 0, demo_hier_thresh);
-    } else {
-        error("Last layer must produce detections\n");
+    layer l = net->layers[net->n-1];
+    float *X = buff_letter[(buff_index+2)%3].data;
+    network_predict(net, X);
+
+    /*
+       if(l.type == DETECTION){
+       get_detection_boxes(l, 1, 1, demo_thresh, probs, boxes, 0);
+       } else */
+    remember_network(net);
+    detection *dets = 0;
+    int nboxes = 0;
+    dets = avg_predictions(net, &nboxes);
+
+
+    /*
+       int i,j;
+       box zero = {0};
+       int classes = l.classes;
+       for(i = 0; i < demo_detections; ++i){
+       avg[i].objectness = 0;
+       avg[i].bbox = zero;
+       memset(avg[i].prob, 0, classes*sizeof(float));
+       for(j = 0; j < demo_frame; ++j){
+       axpy_cpu(classes, 1./demo_frame, dets[j][i].prob, 1, avg[i].prob, 1);
+       avg[i].objectness += dets[j][i].objectness * 1./demo_frame;
+       avg[i].bbox.x += dets[j][i].bbox.x * 1./demo_frame;
+       avg[i].bbox.y += dets[j][i].bbox.y * 1./demo_frame;
+       avg[i].bbox.w += dets[j][i].bbox.w * 1./demo_frame;
+       avg[i].bbox.h += dets[j][i].bbox.h * 1./demo_frame;
+       }
+    //copy_cpu(classes, dets[0][i].prob, 1, avg[i].prob, 1);
+    //avg[i].objectness = dets[0][i].objectness;
     }
-    if (nms > 0) do_nms(boxes, probs, l.w*l.h*l.n, l.classes, nms);
+     */
+
+    if (nms > 0) do_nms_obj(dets, nboxes, l.classes, nms);
+
     printf("\033[2J");
     printf("\033[1;1H");
     printf("\nFPS:%.1f\n",fps);
     printf("Objects:\n\n");
+    image display = buff[(buff_index+2) % 3];
+    draw_detections(display, dets, nboxes, demo_thresh, demo_names, demo_alphabet, demo_classes);
+    free_detections(dets, nboxes);
 
-    images[demo_index] = det;
-    det = images[(demo_index + FRAMES/2 + 1)%FRAMES];
-    demo_index = (demo_index + 1)%FRAMES;
-
-    draw_detections(det, l.w*l.h*l.n, demo_thresh, boxes, probs, demo_names, demo_alphabet, demo_classes);
+    demo_index = (demo_index + 1)%demo_frame;
+    running = 0;
+    return 0;
+}
 
+void *fetch_in_thread(void *ptr)
+{
+    free_image(buff[buff_index]);
+    buff[buff_index] = get_image_from_stream(cap);
+    if(buff[buff_index].data == 0) {
+        demo_done = 1;
+        return 0;
+    }
+    letterbox_image_into(buff[buff_index], net->w, net->h, buff_letter[buff_index]);
     return 0;
 }
 
-double get_wall_time()
+void *display_in_thread(void *ptr)
 {
-    struct timeval time;
-    if (gettimeofday(&time,NULL)){
+    int c = show_image(buff[(buff_index + 1)%3], "Demo", 1);
+    if (c != -1) c = c%256;
+    if (c == 27) {
+        demo_done = 1;
         return 0;
+    } else if (c == 82) {
+        demo_thresh += .02;
+    } else if (c == 84) {
+        demo_thresh -= .02;
+        if(demo_thresh <= .02) demo_thresh = .02;
+    } else if (c == 83) {
+        demo_hier += .02;
+    } else if (c == 81) {
+        demo_hier -= .02;
+        if(demo_hier <= .0) demo_hier = .0;
     }
-    return (double)time.tv_sec + (double)time.tv_usec * .000001;
+    return 0;
 }
 
-void demo(char *cfgfile, char *weightfile, float thresh, int cam_index, const char *filename, char **names, int classes, int frame_skip, char *prefix, float hier_thresh)
+void *display_loop(void *ptr)
 {
-    //skip = frame_skip;
+    while(1){
+        display_in_thread(0);
+    }
+}
+
+void *detect_loop(void *ptr)
+{
+    while(1){
+        detect_in_thread(0);
+    }
+}
+
+void demo(char *cfgfile, char *weightfile, float thresh, int cam_index, const char *filename, char **names, int classes, int delay, char *prefix, int avg_frames, float hier, int w, int h, int frames, int fullscreen)
+{
+    //demo_frame = avg_frames;
     image **alphabet = load_alphabet();
-    int delay = frame_skip;
     demo_names = names;
     demo_alphabet = alphabet;
     demo_classes = classes;
     demo_thresh = thresh;
-    demo_hier_thresh = hier_thresh;
+    demo_hier = hier;
     printf("Demo\n");
-    net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    set_batch_network(&net, 1);
+    net = load_network(cfgfile, weightfile, 0);
+    set_batch_network(net, 1);
+    pthread_t detect_thread;
+    pthread_t fetch_thread;
 
     srand(2222222);
 
+    int i;
+    demo_total = size_network(net);
+    predictions = calloc(demo_frame, sizeof(float*));
+    for (i = 0; i < demo_frame; ++i){
+        predictions[i] = calloc(demo_total, sizeof(float));
+    }
+    avg = calloc(demo_total, sizeof(float));
+
     if(filename){
         printf("video file: %s\n", filename);
-        cap = cvCaptureFromFile(filename);
+        cap = open_video_stream(filename, 0, 0, 0, 0);
     }else{
-        cap = cvCaptureFromCAM(cam_index);
+        cap = open_video_stream(0, cam_index, w, h, frames);
     }
 
     if(!cap) error("Couldn't connect to webcam.\n");
 
-    layer l = net.layers[net.n-1];
-    int j;
-
-    avg = (float *) calloc(l.outputs, sizeof(float));
-    for(j = 0; j < FRAMES; ++j) predictions[j] = (float *) calloc(l.outputs, sizeof(float));
-    for(j = 0; j < FRAMES; ++j) images[j] = make_image(1,1,3);
-
-    boxes = (box *)calloc(l.w*l.h*l.n, sizeof(box));
-    probs = (float **)calloc(l.w*l.h*l.n, sizeof(float *));
-    for(j = 0; j < l.w*l.h*l.n; ++j) probs[j] = (float *)calloc(l.classes, sizeof(float));
-
-    pthread_t fetch_thread;
-    pthread_t detect_thread;
-
-    fetch_in_thread(0);
-    det = in;
-    det_s = in_s;
-
-    fetch_in_thread(0);
-    detect_in_thread(0);
-    disp = det;
-    det = in;
-    det_s = in_s;
-
-    for(j = 0; j < FRAMES/2; ++j){
-        fetch_in_thread(0);
-        detect_in_thread(0);
-        disp = det;
-        det = in;
-        det_s = in_s;
-    }
+    buff[0] = get_image_from_stream(cap);
+    buff[1] = copy_image(buff[0]);
+    buff[2] = copy_image(buff[0]);
+    buff_letter[0] = letterbox_image(buff[0], net->w, net->h);
+    buff_letter[1] = letterbox_image(buff[0], net->w, net->h);
+    buff_letter[2] = letterbox_image(buff[0], net->w, net->h);
 
     int count = 0;
     if(!prefix){
-        cvNamedWindow("Demo", CV_WINDOW_NORMAL); 
-        cvMoveWindow("Demo", 0, 0);
-        cvResizeWindow("Demo", 1352, 1013);
+        make_window("Demo", 1352, 1013, fullscreen);
     }
 
-    double before = get_wall_time();
-
-    while(1){
-        ++count;
-        if(1){
-            if(pthread_create(&fetch_thread, 0, fetch_in_thread, 0)) error("Thread creation failed");
-            if(pthread_create(&detect_thread, 0, detect_in_thread, 0)) error("Thread creation failed");
-
-            if(!prefix){
-                show_image(disp, "Demo");
-                int c = cvWaitKey(1);
-                if (c == 10){
-                    if(frame_skip == 0) frame_skip = 60;
-                    else if(frame_skip == 4) frame_skip = 0;
-                    else if(frame_skip == 60) frame_skip = 4;   
-                    else frame_skip = 0;
-                }
-            }else{
-                char buff[256];
-                sprintf(buff, "%s_%08d", prefix, count);
-                save_image(disp, buff);
-            }
-
-            pthread_join(fetch_thread, 0);
-            pthread_join(detect_thread, 0);
-
-            if(delay == 0){
-                free_image(disp);
-                disp  = det;
-            }
-            det   = in;
-            det_s = in_s;
-        }else {
-            fetch_in_thread(0);
-            det   = in;
-            det_s = in_s;
-            detect_in_thread(0);
-            if(delay == 0) {
-                free_image(disp);
-                disp = det;
-            }
-            show_image(disp, "Demo");
-            cvWaitKey(1);
-        }
-        --delay;
-        if(delay < 0){
-            delay = frame_skip;
-
-            double after = get_wall_time();
-            float curr = 1./(after - before);
-            fps = curr;
-            before = after;
+    demo_time = what_time_is_it_now();
+
+    while(!demo_done){
+        buff_index = (buff_index + 1) %3;
+        if(pthread_create(&fetch_thread, 0, fetch_in_thread, 0)) error("Thread creation failed");
+        if(pthread_create(&detect_thread, 0, detect_in_thread, 0)) error("Thread creation failed");
+        if(!prefix){
+            fps = 1./(what_time_is_it_now() - demo_time);
+            demo_time = what_time_is_it_now();
+            display_in_thread(0);
+        }else{
+            char name[256];
+            sprintf(name, "%s_%08d", prefix, count);
+            save_image(buff[(buff_index + 1)%3], name);
         }
+        pthread_join(fetch_thread, 0);
+        pthread_join(detect_thread, 0);
+        ++count;
     }
 }
+
+/*
+   void demo_compare(char *cfg1, char *weight1, char *cfg2, char *weight2, float thresh, int cam_index, const char *filename, char **names, int classes, int delay, char *prefix, int avg_frames, float hier, int w, int h, int frames, int fullscreen)
+   {
+   demo_frame = avg_frames;
+   predictions = calloc(demo_frame, sizeof(float*));
+   image **alphabet = load_alphabet();
+   demo_names = names;
+   demo_alphabet = alphabet;
+   demo_classes = classes;
+   demo_thresh = thresh;
+   demo_hier = hier;
+   printf("Demo\n");
+   net = load_network(cfg1, weight1, 0);
+   set_batch_network(net, 1);
+   pthread_t detect_thread;
+   pthread_t fetch_thread;
+
+   srand(2222222);
+
+   if(filename){
+   printf("video file: %s\n", filename);
+   cap = cvCaptureFromFile(filename);
+   }else{
+   cap = cvCaptureFromCAM(cam_index);
+
+   if(w){
+   cvSetCaptureProperty(cap, CV_CAP_PROP_FRAME_WIDTH, w);
+   }
+   if(h){
+   cvSetCaptureProperty(cap, CV_CAP_PROP_FRAME_HEIGHT, h);
+   }
+   if(frames){
+   cvSetCaptureProperty(cap, CV_CAP_PROP_FPS, frames);
+   }
+   }
+
+   if(!cap) error("Couldn't connect to webcam.\n");
+
+   layer l = net->layers[net->n-1];
+   demo_detections = l.n*l.w*l.h;
+   int j;
+
+   avg = (float *) calloc(l.outputs, sizeof(float));
+   for(j = 0; j < demo_frame; ++j) predictions[j] = (float *) calloc(l.outputs, sizeof(float));
+
+   boxes = (box *)calloc(l.w*l.h*l.n, sizeof(box));
+   probs = (float **)calloc(l.w*l.h*l.n, sizeof(float *));
+   for(j = 0; j < l.w*l.h*l.n; ++j) probs[j] = (float *)calloc(l.classes+1, sizeof(float));
+
+   buff[0] = get_image_from_stream(cap);
+   buff[1] = copy_image(buff[0]);
+   buff[2] = copy_image(buff[0]);
+   buff_letter[0] = letterbox_image(buff[0], net->w, net->h);
+   buff_letter[1] = letterbox_image(buff[0], net->w, net->h);
+   buff_letter[2] = letterbox_image(buff[0], net->w, net->h);
+   ipl = cvCreateImage(cvSize(buff[0].w,buff[0].h), IPL_DEPTH_8U, buff[0].c);
+
+   int count = 0;
+   if(!prefix){
+   cvNamedWindow("Demo", CV_WINDOW_NORMAL); 
+   if(fullscreen){
+   cvSetWindowProperty("Demo", CV_WND_PROP_FULLSCREEN, CV_WINDOW_FULLSCREEN);
+   } else {
+   cvMoveWindow("Demo", 0, 0);
+   cvResizeWindow("Demo", 1352, 1013);
+   }
+   }
+
+   demo_time = what_time_is_it_now();
+
+   while(!demo_done){
+buff_index = (buff_index + 1) %3;
+if(pthread_create(&fetch_thread, 0, fetch_in_thread, 0)) error("Thread creation failed");
+if(pthread_create(&detect_thread, 0, detect_in_thread, 0)) error("Thread creation failed");
+if(!prefix){
+    fps = 1./(what_time_is_it_now() - demo_time);
+    demo_time = what_time_is_it_now();
+    display_in_thread(0);
+}else{
+    char name[256];
+    sprintf(name, "%s_%08d", prefix, count);
+    save_image(buff[(buff_index + 1)%3], name);
+}
+pthread_join(fetch_thread, 0);
+pthread_join(detect_thread, 0);
+++count;
+}
+}
+*/
 #else
-void demo(char *cfgfile, char *weightfile, float thresh, int cam_index, const char *filename, char **names, int classes, int frame_skip, char *prefix, float hier_thresh)
+void demo(char *cfgfile, char *weightfile, float thresh, int cam_index, const char *filename, char **names, int classes, int delay, char *prefix, int avg, float hier, int w, int h, int frames, int fullscreen)
 {
     fprintf(stderr, "Demo needs OpenCV for webcam images.\n");
 }
diff --git a/image.darknet/inst/include/darknet/src/demo.h b/image.darknet/inst/include/darknet/src/demo.h
index c3d6a61..86e4654 100644
--- a/image.darknet/inst/include/darknet/src/demo.h
+++ b/image.darknet/inst/include/darknet/src/demo.h
@@ -1,7 +1,6 @@
-#ifndef DEMO
-#define DEMO
+#ifndef DEMO_H
+#define DEMO_H
 
 #include "image.h"
-void demo(char *cfgfile, char *weightfile, float thresh, int cam_index, const char *filename, char **names, int classes, int frame_skip, char *prefix, float hier_thresh);
 
 #endif
diff --git a/image.darknet/inst/include/darknet/src/detection_layer.c b/image.darknet/inst/include/darknet/src/detection_layer.c
index cd98b4b..d0e0194 100644
--- a/image.darknet/inst/include/darknet/src/detection_layer.c
+++ b/image.darknet/inst/include/darknet/src/detection_layer.c
@@ -5,6 +5,7 @@
 #include "box.h"
 #include "cuda.h"
 #include "utils.h"
+
 #include <stdio.h>
 #include <assert.h>
 #include <string.h>
@@ -46,11 +47,11 @@ detection_layer make_detection_layer(int batch, int inputs, int n, int side, int
     return l;
 }
 
-void forward_detection_layer(const detection_layer l, network_state state)
+void forward_detection_layer(const detection_layer l, network net)
 {
     int locations = l.side*l.side;
     int i,j;
-    memcpy(l.output, state.input, l.outputs*l.batch*sizeof(float));
+    memcpy(l.output, net.input, l.outputs*l.batch*sizeof(float));
     //if(l.reorg) reorg(l.output, l.w*l.h, size*l.n, l.batch, 1);
     int b;
     if (l.softmax){
@@ -58,12 +59,12 @@ void forward_detection_layer(const detection_layer l, network_state state)
             int index = b*l.inputs;
             for (i = 0; i < locations; ++i) {
                 int offset = i*l.classes;
-                softmax(l.output + index + offset, l.classes, 1,
+                softmax(l.output + index + offset, l.classes, 1, 1,
                         l.output + index + offset);
             }
         }
     }
-    if(state.train){
+    if(net.train){
         float avg_iou = 0;
         float avg_cat = 0;
         float avg_allcat = 0;
@@ -77,7 +78,7 @@ void forward_detection_layer(const detection_layer l, network_state state)
             int index = b*l.inputs;
             for (i = 0; i < locations; ++i) {
                 int truth_index = (b*locations + i)*(1+l.coords+l.classes);
-                int is_obj = state.truth[truth_index];
+                int is_obj = net.truth[truth_index];
                 for (j = 0; j < l.n; ++j) {
                     int p_index = index + locations*l.classes + i*l.n + j;
                     l.delta[p_index] = l.noobject_scale*(0 - l.output[p_index]);
@@ -95,19 +96,19 @@ void forward_detection_layer(const detection_layer l, network_state state)
 
                 int class_index = index + i*l.classes;
                 for(j = 0; j < l.classes; ++j) {
-                    l.delta[class_index+j] = l.class_scale * (state.truth[truth_index+1+j] - l.output[class_index+j]);
-                    *(l.cost) += l.class_scale * pow(state.truth[truth_index+1+j] - l.output[class_index+j], 2);
-                    if(state.truth[truth_index + 1 + j]) avg_cat += l.output[class_index+j];
+                    l.delta[class_index+j] = l.class_scale * (net.truth[truth_index+1+j] - l.output[class_index+j]);
+                    *(l.cost) += l.class_scale * pow(net.truth[truth_index+1+j] - l.output[class_index+j], 2);
+                    if(net.truth[truth_index + 1 + j]) avg_cat += l.output[class_index+j];
                     avg_allcat += l.output[class_index+j];
                 }
 
-                box truth = float_to_box(state.truth + truth_index + 1 + l.classes);
+                box truth = float_to_box(net.truth + truth_index + 1 + l.classes, 1);
                 truth.x /= l.side;
                 truth.y /= l.side;
 
                 for(j = 0; j < l.n; ++j){
                     int box_index = index + locations*(l.classes + l.n) + (i*l.n + j) * l.coords;
-                    box out = float_to_box(l.output + box_index);
+                    box out = float_to_box(l.output + box_index, 1);
                     out.x /= l.side;
                     out.y /= l.side;
 
@@ -139,14 +140,14 @@ void forward_detection_layer(const detection_layer l, network_state state)
                         best_index = 0;
                     }
                 }
-                if(l.random && *(state.net.seen) < 64000){
+                if(l.random && *(net.seen) < 64000){
                     best_index = rand()%l.n;
                 }
 
                 int box_index = index + locations*(l.classes + l.n) + (i*l.n + best_index) * l.coords;
                 int tbox_index = truth_index + 1 + l.classes;
 
-                box out = float_to_box(l.output + box_index);
+                box out = float_to_box(l.output + box_index, 1);
                 out.x /= l.side;
                 out.y /= l.side;
                 if (l.sqrt) {
@@ -166,13 +167,13 @@ void forward_detection_layer(const detection_layer l, network_state state)
                     l.delta[p_index] = l.object_scale * (iou - l.output[p_index]);
                 }
 
-                l.delta[box_index+0] = l.coord_scale*(state.truth[tbox_index + 0] - l.output[box_index + 0]);
-                l.delta[box_index+1] = l.coord_scale*(state.truth[tbox_index + 1] - l.output[box_index + 1]);
-                l.delta[box_index+2] = l.coord_scale*(state.truth[tbox_index + 2] - l.output[box_index + 2]);
-                l.delta[box_index+3] = l.coord_scale*(state.truth[tbox_index + 3] - l.output[box_index + 3]);
+                l.delta[box_index+0] = l.coord_scale*(net.truth[tbox_index + 0] - l.output[box_index + 0]);
+                l.delta[box_index+1] = l.coord_scale*(net.truth[tbox_index + 1] - l.output[box_index + 1]);
+                l.delta[box_index+2] = l.coord_scale*(net.truth[tbox_index + 2] - l.output[box_index + 2]);
+                l.delta[box_index+3] = l.coord_scale*(net.truth[tbox_index + 3] - l.output[box_index + 3]);
                 if(l.sqrt){
-                    l.delta[box_index+2] = l.coord_scale*(sqrt(state.truth[tbox_index + 2]) - l.output[box_index + 2]);
-                    l.delta[box_index+3] = l.coord_scale*(sqrt(state.truth[tbox_index + 3]) - l.output[box_index + 3]);
+                    l.delta[box_index+2] = l.coord_scale*(sqrt(net.truth[tbox_index + 2]) - l.output[box_index + 2]);
+                    l.delta[box_index+3] = l.coord_scale*(sqrt(net.truth[tbox_index + 3]) - l.output[box_index + 3]);
                 }
 
                 *(l.cost) += pow(1-iou, 2);
@@ -216,12 +217,12 @@ void forward_detection_layer(const detection_layer l, network_state state)
     }
 }
 
-void backward_detection_layer(const detection_layer l, network_state state)
+void backward_detection_layer(const detection_layer l, network net)
 {
-    axpy_cpu(l.batch*l.inputs, 1, l.delta, 1, state.delta, 1);
+    axpy_cpu(l.batch*l.inputs, 1, l.delta, 1, net.delta, 1);
 }
 
-void get_detection_boxes(layer l, int w, int h, float thresh, float **probs, box *boxes, int only_objectness)
+void get_detection_detections(layer l, int w, int h, float thresh, detection *dets)
 {
     int i,j,n;
     float *predictions = l.output;
@@ -234,17 +235,17 @@ void get_detection_boxes(layer l, int w, int h, float thresh, float **probs, box
             int p_index = l.side*l.side*l.classes + i*l.n + n;
             float scale = predictions[p_index];
             int box_index = l.side*l.side*(l.classes + l.n) + (i*l.n + n)*4;
-            boxes[index].x = (predictions[box_index + 0] + col) / l.side * w;
-            boxes[index].y = (predictions[box_index + 1] + row) / l.side * h;
-            boxes[index].w = pow(predictions[box_index + 2], (l.sqrt?2:1)) * w;
-            boxes[index].h = pow(predictions[box_index + 3], (l.sqrt?2:1)) * h;
+            box b;
+            b.x = (predictions[box_index + 0] + col) / l.side * w;
+            b.y = (predictions[box_index + 1] + row) / l.side * h;
+            b.w = pow(predictions[box_index + 2], (l.sqrt?2:1)) * w;
+            b.h = pow(predictions[box_index + 3], (l.sqrt?2:1)) * h;
+            dets[index].bbox = b;
+            dets[index].objectness = scale;
             for(j = 0; j < l.classes; ++j){
                 int class_index = i*l.classes;
                 float prob = scale*predictions[class_index+j];
-                probs[index][j] = (prob > thresh) ? prob : 0;
-            }
-            if(only_objectness){
-                probs[index][0] = scale;
+                dets[index].prob[j] = (prob > thresh) ? prob : 0;
             }
         }
     }
@@ -252,36 +253,23 @@ void get_detection_boxes(layer l, int w, int h, float thresh, float **probs, box
 
 #ifdef GPU
 
-void forward_detection_layer_gpu(const detection_layer l, network_state state)
+void forward_detection_layer_gpu(const detection_layer l, network net)
 {
-    if(!state.train){
-        copy_ongpu(l.batch*l.inputs, state.input, 1, l.output_gpu, 1);
+    if(!net.train){
+        copy_gpu(l.batch*l.inputs, net.input_gpu, 1, l.output_gpu, 1);
         return;
     }
 
-    float *in_cpu = calloc(l.batch*l.inputs, sizeof(float));
-    float *truth_cpu = 0;
-    if(state.truth){
-        int num_truth = l.batch*l.side*l.side*(1+l.coords+l.classes);
-        truth_cpu = calloc(num_truth, sizeof(float));
-        cuda_pull_array(state.truth, truth_cpu, num_truth);
-    }
-    cuda_pull_array(state.input, in_cpu, l.batch*l.inputs);
-    network_state cpu_state = state;
-    cpu_state.train = state.train;
-    cpu_state.truth = truth_cpu;
-    cpu_state.input = in_cpu;
-    forward_detection_layer(l, cpu_state);
+    cuda_pull_array(net.input_gpu, net.input, l.batch*l.inputs);
+    forward_detection_layer(l, net);
     cuda_push_array(l.output_gpu, l.output, l.batch*l.outputs);
     cuda_push_array(l.delta_gpu, l.delta, l.batch*l.inputs);
-    free(cpu_state.input);
-    if(cpu_state.truth) free(cpu_state.truth);
 }
 
-void backward_detection_layer_gpu(detection_layer l, network_state state)
+void backward_detection_layer_gpu(detection_layer l, network net)
 {
-    axpy_ongpu(l.batch*l.inputs, 1, l.delta_gpu, 1, state.delta, 1);
-    //copy_ongpu(l.batch*l.inputs, l.delta_gpu, 1, state.delta, 1);
+    axpy_gpu(l.batch*l.inputs, 1, l.delta_gpu, 1, net.delta_gpu, 1);
+    //copy_gpu(l.batch*l.inputs, l.delta_gpu, 1, net.delta_gpu, 1);
 }
 #endif
 
diff --git a/image.darknet/inst/include/darknet/src/detection_layer.h b/image.darknet/inst/include/darknet/src/detection_layer.h
index e847a09..1c81853 100644
--- a/image.darknet/inst/include/darknet/src/detection_layer.h
+++ b/image.darknet/inst/include/darknet/src/detection_layer.h
@@ -7,13 +7,12 @@
 typedef layer detection_layer;
 
 detection_layer make_detection_layer(int batch, int inputs, int n, int size, int classes, int coords, int rescore);
-void forward_detection_layer(const detection_layer l, network_state state);
-void backward_detection_layer(const detection_layer l, network_state state);
-void get_detection_boxes(layer l, int w, int h, float thresh, float **probs, box *boxes, int only_objectness);
+void forward_detection_layer(const detection_layer l, network net);
+void backward_detection_layer(const detection_layer l, network net);
 
 #ifdef GPU
-void forward_detection_layer_gpu(const detection_layer l, network_state state);
-void backward_detection_layer_gpu(detection_layer l, network_state state);
+void forward_detection_layer_gpu(const detection_layer l, network net);
+void backward_detection_layer_gpu(detection_layer l, network net);
 #endif
 
 #endif
diff --git a/image.darknet/inst/include/darknet/src/detector.c b/image.darknet/inst/include/darknet/src/detector.c
deleted file mode 100644
index 1416c05..0000000
--- a/image.darknet/inst/include/darknet/src/detector.c
+++ /dev/null
@@ -1,552 +0,0 @@
-#include "network.h"
-#include "region_layer.h"
-#include "cost_layer.h"
-#include "utils.h"
-#include "parser.h"
-#include "box.h"
-#include "demo.h"
-#include "option_list.h"
-
-#ifdef OPENCV
-#include "opencv2/highgui/highgui_c.h"
-#endif
-static int coco_ids[] = {1,2,3,4,5,6,7,8,9,10,11,13,14,15,16,17,18,19,20,21,22,23,24,25,27,28,31,32,33,34,35,36,37,38,39,40,41,42,43,44,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,67,70,72,73,74,75,76,77,78,79,80,81,82,84,85,86,87,88,89,90};
-
-void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, int ngpus, int clear)
-{
-    list *options = read_data_cfg(datacfg);
-    char *train_images = option_find_str(options, "train", "data/train.list");
-    char *backup_directory = option_find_str(options, "backup", "/backup/");
-
-    srand(time(0));
-    char *base = basecfg(cfgfile);
-    printf("%s\n", base);
-    float avg_loss = -1;
-    network *nets = calloc(ngpus, sizeof(network));
-
-    srand(time(0));
-    int seed = rand();
-    int i;
-    for(i = 0; i < ngpus; ++i){
-        srand(seed);
-#ifdef GPU
-        cuda_set_device(gpus[i]);
-#endif
-        nets[i] = parse_network_cfg(cfgfile);
-        if(weightfile){
-            load_weights(&nets[i], weightfile);
-        }
-        if(clear) *nets[i].seen = 0;
-        nets[i].learning_rate *= ngpus;
-    }
-    srand(time(0));
-    network net = nets[0];
-
-    int imgs = net.batch * net.subdivisions * ngpus;
-    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
-    data train, buffer;
-
-    layer l = net.layers[net.n - 1];
-
-    int classes = l.classes;
-    float jitter = l.jitter;
-
-    list *plist = get_paths(train_images);
-    //int N = plist->size;
-    char **paths = (char **)list_to_array(plist);
-
-    load_args args = {0};
-    args.w = net.w;
-    args.h = net.h;
-    args.paths = paths;
-    args.n = imgs;
-    args.m = plist->size;
-    args.classes = classes;
-    args.jitter = jitter;
-    args.num_boxes = l.max_boxes;
-    args.d = &buffer;
-    args.type = DETECTION_DATA;
-    args.threads = 8;
-
-    args.angle = net.angle;
-    args.exposure = net.exposure;
-    args.saturation = net.saturation;
-    args.hue = net.hue;
-
-    pthread_t load_thread = load_data(args);
-    clock_t time;
-    int count = 0;
-    //while(i*imgs < N*120){
-    while(get_current_batch(net) < net.max_batches){
-        if(l.random && count++%10 == 0){
-            printf("Resizing\n");
-            int dim = (rand() % 10 + 10) * 32;
-            if (get_current_batch(net)+200 > net.max_batches) dim = 608;
-            //int dim = (rand() % 4 + 16) * 32;
-            printf("%d\n", dim);
-            args.w = dim;
-            args.h = dim;
-
-            pthread_join(load_thread, 0);
-            train = buffer;
-            free_data(train);
-            load_thread = load_data(args);
-
-            for(i = 0; i < ngpus; ++i){
-                resize_network(nets + i, dim, dim);
-            }
-            net = nets[0];
-        }
-        time=clock();
-        pthread_join(load_thread, 0);
-        train = buffer;
-        load_thread = load_data(args);
-
-        /*
-           int k;
-           for(k = 0; k < l.max_boxes; ++k){
-           box b = float_to_box(train.y.vals[10] + 1 + k*5);
-           if(!b.x) break;
-           printf("loaded: %f %f %f %f\n", b.x, b.y, b.w, b.h);
-           }
-           image im = float_to_image(448, 448, 3, train.X.vals[10]);
-           int k;
-           for(k = 0; k < l.max_boxes; ++k){
-           box b = float_to_box(train.y.vals[10] + 1 + k*5);
-           printf("%d %d %d %d\n", truth.x, truth.y, truth.w, truth.h);
-           draw_bbox(im, b, 8, 1,0,0);
-           }
-           save_image(im, "truth11");
-         */
-
-        printf("Loaded: %lf seconds\n", sec(clock()-time));
-
-        time=clock();
-        float loss = 0;
-#ifdef GPU
-        if(ngpus == 1){
-            loss = train_network(net, train);
-        } else {
-            loss = train_networks(nets, ngpus, train, 4);
-        }
-#else
-        loss = train_network(net, train);
-#endif
-        if (avg_loss < 0) avg_loss = loss;
-        avg_loss = avg_loss*.9 + loss*.1;
-
-        i = get_current_batch(net);
-        printf("%d: %f, %f avg, %f rate, %lf seconds, %d images\n", get_current_batch(net), loss, avg_loss, get_current_rate(net), sec(clock()-time), i*imgs);
-        if(i%1000==0 || (i < 1000 && i%100 == 0)){
-#ifdef GPU
-            if(ngpus != 1) sync_nets(nets, ngpus, 0);
-#endif
-            char buff[256];
-            sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i);
-            save_weights(net, buff);
-        }
-        free_data(train);
-    }
-#ifdef GPU
-    if(ngpus != 1) sync_nets(nets, ngpus, 0);
-#endif
-    char buff[256];
-    sprintf(buff, "%s/%s_final.weights", backup_directory, base);
-    save_weights(net, buff);
-}
-
-
-static int get_coco_image_id(char *filename)
-{
-    char *p = strrchr(filename, '_');
-    return atoi(p+1);
-}
-
-static void print_cocos(FILE *fp, char *image_path, box *boxes, float **probs, int num_boxes, int classes, int w, int h)
-{
-    int i, j;
-    int image_id = get_coco_image_id(image_path);
-    for(i = 0; i < num_boxes; ++i){
-        float xmin = boxes[i].x - boxes[i].w/2.;
-        float xmax = boxes[i].x + boxes[i].w/2.;
-        float ymin = boxes[i].y - boxes[i].h/2.;
-        float ymax = boxes[i].y + boxes[i].h/2.;
-
-        if (xmin < 0) xmin = 0;
-        if (ymin < 0) ymin = 0;
-        if (xmax > w) xmax = w;
-        if (ymax > h) ymax = h;
-
-        float bx = xmin;
-        float by = ymin;
-        float bw = xmax - xmin;
-        float bh = ymax - ymin;
-
-        for(j = 0; j < classes; ++j){
-            if (probs[i][j]) fprintf(fp, "{\"image_id\":%d, \"category_id\":%d, \"bbox\":[%f, %f, %f, %f], \"score\":%f},\n", image_id, coco_ids[j], bx, by, bw, bh, probs[i][j]);
-        }
-    }
-}
-
-void print_detector_detections(FILE **fps, char *id, box *boxes, float **probs, int total, int classes, int w, int h)
-{
-    int i, j;
-    for(i = 0; i < total; ++i){
-        float xmin = boxes[i].x - boxes[i].w/2.;
-        float xmax = boxes[i].x + boxes[i].w/2.;
-        float ymin = boxes[i].y - boxes[i].h/2.;
-        float ymax = boxes[i].y + boxes[i].h/2.;
-
-        if (xmin < 0) xmin = 0;
-        if (ymin < 0) ymin = 0;
-        if (xmax > w) xmax = w;
-        if (ymax > h) ymax = h;
-
-        for(j = 0; j < classes; ++j){
-            if (probs[i][j]) fprintf(fps[j], "%s %f %f %f %f %f\n", id, probs[i][j],
-                    xmin, ymin, xmax, ymax);
-        }
-    }
-}
-
-void print_imagenet_detections(FILE *fp, int id, box *boxes, float **probs, int total, int classes, int w, int h)
-{
-    int i, j;
-    for(i = 0; i < total; ++i){
-        float xmin = boxes[i].x - boxes[i].w/2.;
-        float xmax = boxes[i].x + boxes[i].w/2.;
-        float ymin = boxes[i].y - boxes[i].h/2.;
-        float ymax = boxes[i].y + boxes[i].h/2.;
-
-        if (xmin < 0) xmin = 0;
-        if (ymin < 0) ymin = 0;
-        if (xmax > w) xmax = w;
-        if (ymax > h) ymax = h;
-
-        for(j = 0; j < classes; ++j){
-            int class = j;
-            if (probs[i][class]) fprintf(fp, "%d %d %f %f %f %f %f\n", id, j+1, probs[i][class],
-                    xmin, ymin, xmax, ymax);
-        }
-    }
-}
-
-void validate_detector(char *datacfg, char *cfgfile, char *weightfile, char *outfile)
-{
-    int j;
-    list *options = read_data_cfg(datacfg);
-    char *valid_images = option_find_str(options, "valid", "data/train.list");
-    char *name_list = option_find_str(options, "names", "data/names.list");
-    char *prefix = option_find_str(options, "results", "results");
-    char **names = get_labels(name_list);
-    char *mapf = option_find_str(options, "map", 0);
-    int *map = 0;
-    if (mapf) map = read_map(mapf);
-
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    set_batch_network(&net, 1);
-    fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
-    srand(time(0));
-
-    list *plist = get_paths(valid_images);
-    char **paths = (char **)list_to_array(plist);
-
-    layer l = net.layers[net.n-1];
-    int classes = l.classes;
-
-    char buff[1024];
-    char *type = option_find_str(options, "eval", "voc");
-    FILE *fp = 0;
-    FILE **fps = 0;
-    int coco = 0;
-    int imagenet = 0;
-    if(0==strcmp(type, "coco")){
-        if(!outfile) outfile = "coco_results";
-        snprintf(buff, 1024, "%s/%s.json", prefix, outfile);
-        fp = fopen(buff, "w");
-        fprintf(fp, "[\n");
-        coco = 1;
-    } else if(0==strcmp(type, "imagenet")){
-        if(!outfile) outfile = "imagenet-detection";
-        snprintf(buff, 1024, "%s/%s.txt", prefix, outfile);
-        fp = fopen(buff, "w");
-        imagenet = 1;
-        classes = 200;
-    } else {
-        if(!outfile) outfile = "comp4_det_test_";
-        fps = calloc(classes, sizeof(FILE *));
-        for(j = 0; j < classes; ++j){
-            snprintf(buff, 1024, "%s/%s%s.txt", prefix, outfile, names[j]);
-            fps[j] = fopen(buff, "w");
-        }
-    }
-
-
-    box *boxes = calloc(l.w*l.h*l.n, sizeof(box));
-    float **probs = calloc(l.w*l.h*l.n, sizeof(float *));
-    for(j = 0; j < l.w*l.h*l.n; ++j) probs[j] = calloc(classes, sizeof(float *));
-
-    int m = plist->size;
-    int i=0;
-    int t;
-
-    float thresh = .005;
-    float nms = .45;
-
-    int nthreads = 4;
-    image *val = calloc(nthreads, sizeof(image));
-    image *val_resized = calloc(nthreads, sizeof(image));
-    image *buf = calloc(nthreads, sizeof(image));
-    image *buf_resized = calloc(nthreads, sizeof(image));
-    pthread_t *thr = calloc(nthreads, sizeof(pthread_t));
-
-    load_args args = {0};
-    args.w = net.w;
-    args.h = net.h;
-    args.type = IMAGE_DATA;
-
-    for(t = 0; t < nthreads; ++t){
-        args.path = paths[i+t];
-        args.im = &buf[t];
-        args.resized = &buf_resized[t];
-        thr[t] = load_data_in_thread(args);
-    }
-    time_t start = time(0);
-    for(i = nthreads; i < m+nthreads; i += nthreads){
-        fprintf(stderr, "%d\n", i);
-        for(t = 0; t < nthreads && i+t-nthreads < m; ++t){
-            pthread_join(thr[t], 0);
-            val[t] = buf[t];
-            val_resized[t] = buf_resized[t];
-        }
-        for(t = 0; t < nthreads && i+t < m; ++t){
-            args.path = paths[i+t];
-            args.im = &buf[t];
-            args.resized = &buf_resized[t];
-            thr[t] = load_data_in_thread(args);
-        }
-        for(t = 0; t < nthreads && i+t-nthreads < m; ++t){
-            char *path = paths[i+t-nthreads];
-            char *id = basecfg(path);
-            float *X = val_resized[t].data;
-            network_predict(net, X);
-            int w = val[t].w;
-            int h = val[t].h;
-            get_region_boxes(l, w, h, thresh, probs, boxes, 0, map, .5);
-            if (nms) do_nms_sort(boxes, probs, l.w*l.h*l.n, classes, nms);
-            if (coco){
-                print_cocos(fp, path, boxes, probs, l.w*l.h*l.n, classes, w, h);
-            } else if (imagenet){
-                print_imagenet_detections(fp, i+t-nthreads+1, boxes, probs, l.w*l.h*l.n, classes, w, h);
-            } else {
-                print_detector_detections(fps, id, boxes, probs, l.w*l.h*l.n, classes, w, h);
-            }
-            free(id);
-            free_image(val[t]);
-            free_image(val_resized[t]);
-        }
-    }
-    for(j = 0; j < classes; ++j){
-        if(fps) fclose(fps[j]);
-    }
-    if(coco){
-        fseek(fp, -2, SEEK_CUR); 
-        fprintf(fp, "\n]\n");
-        fclose(fp);
-    }
-    fprintf(stderr, "Total Detection Time: %f Seconds\n", (double)(time(0) - start));
-}
-
-void validate_detector_recall(char *cfgfile, char *weightfile)
-{
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    set_batch_network(&net, 1);
-    fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
-    srand(time(0));
-
-    list *plist = get_paths("data/voc.2007.test");
-    char **paths = (char **)list_to_array(plist);
-
-    layer l = net.layers[net.n-1];
-    int classes = l.classes;
-
-    int j, k;
-    box *boxes = calloc(l.w*l.h*l.n, sizeof(box));
-    float **probs = calloc(l.w*l.h*l.n, sizeof(float *));
-    for(j = 0; j < l.w*l.h*l.n; ++j) probs[j] = calloc(classes, sizeof(float *));
-
-    int m = plist->size;
-    int i=0;
-
-    float thresh = .001;
-    float iou_thresh = .5;
-    float nms = .4;
-
-    int total = 0;
-    int correct = 0;
-    int proposals = 0;
-    float avg_iou = 0;
-
-    for(i = 0; i < m; ++i){
-        char *path = paths[i];
-        image orig = load_image_color(path, 0, 0);
-        image sized = resize_image(orig, net.w, net.h);
-        char *id = basecfg(path);
-        network_predict(net, sized.data);
-        get_region_boxes(l, 1, 1, thresh, probs, boxes, 1, 0, .5);
-        if (nms) do_nms(boxes, probs, l.w*l.h*l.n, 1, nms);
-
-        char labelpath[4096];
-        find_replace(path, "images", "labels", labelpath);
-        find_replace(labelpath, "JPEGImages", "labels", labelpath);
-        find_replace(labelpath, ".jpg", ".txt", labelpath);
-        find_replace(labelpath, ".JPEG", ".txt", labelpath);
-
-        int num_labels = 0;
-        box_label *truth = read_boxes(labelpath, &num_labels);
-        for(k = 0; k < l.w*l.h*l.n; ++k){
-            if(probs[k][0] > thresh){
-                ++proposals;
-            }
-        }
-        for (j = 0; j < num_labels; ++j) {
-            ++total;
-            box t = {truth[j].x, truth[j].y, truth[j].w, truth[j].h};
-            float best_iou = 0;
-            for(k = 0; k < l.w*l.h*l.n; ++k){
-                float iou = box_iou(boxes[k], t);
-                if(probs[k][0] > thresh && iou > best_iou){
-                    best_iou = iou;
-                }
-            }
-            avg_iou += best_iou;
-            if(best_iou > iou_thresh){
-                ++correct;
-            }
-        }
-
-        fprintf(stderr, "%5d %5d %5d\tRPs/Img: %.2f\tIOU: %.2f%%\tRecall:%.2f%%\n", i, correct, total, (float)proposals/(i+1), avg_iou*100/total, 100.*correct/total);
-        free(id);
-        free_image(orig);
-        free_image(sized);
-    }
-}
-
-void test_detector(char *datacfg, char *cfgfile, char *weightfile, char *filename, float thresh, float hier_thresh)
-{
-    list *options = read_data_cfg(datacfg);
-    char *name_list = option_find_str(options, "names", "data/names.list");
-    char **names = get_labels(name_list);
-
-    image **alphabet = load_alphabet();
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    set_batch_network(&net, 1);
-    srand(2222222);
-    clock_t time;
-    char buff[256];
-    char *input = buff;
-    int j;
-    float nms=.4;
-    while(1){
-        if(filename){
-            strncpy(input, filename, 256);
-        } else {
-            printf("Enter Image Path: ");
-            fflush(stdout);
-            input = fgets(input, 256, stdin);
-            if(!input) return;
-            strtok(input, "\n");
-        }
-        image im = load_image_color(input,0,0);
-        image sized = resize_image(im, net.w, net.h);
-        layer l = net.layers[net.n-1];
-
-        box *boxes = calloc(l.w*l.h*l.n, sizeof(box));
-        float **probs = calloc(l.w*l.h*l.n, sizeof(float *));
-        for(j = 0; j < l.w*l.h*l.n; ++j) probs[j] = calloc(l.classes + 1, sizeof(float *));
-
-        float *X = sized.data;
-        time=clock();
-        network_predict(net, X);
-        printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time));
-        get_region_boxes(l, 1, 1, thresh, probs, boxes, 0, 0, hier_thresh);
-        if (l.softmax_tree && nms) do_nms_obj(boxes, probs, l.w*l.h*l.n, l.classes, nms);
-        else if (nms) do_nms_sort(boxes, probs, l.w*l.h*l.n, l.classes, nms);
-        draw_detections(im, l.w*l.h*l.n, thresh, boxes, probs, names, alphabet, l.classes);
-        save_image(im, "predictions");
-        show_image(im, "predictions");
-
-        free_image(im);
-        free_image(sized);
-        free(boxes);
-        free_ptrs((void **)probs, l.w*l.h*l.n);
-#ifdef OPENCV
-        cvWaitKey(0);
-        cvDestroyAllWindows();
-#endif
-        if (filename) break;
-    }
-}
-
-void run_detector(int argc, char **argv)
-{
-    char *prefix = find_char_arg(argc, argv, "-prefix", 0);
-    float thresh = find_float_arg(argc, argv, "-thresh", .24);
-    float hier_thresh = find_float_arg(argc, argv, "-hier", .5);
-    int cam_index = find_int_arg(argc, argv, "-c", 0);
-    int frame_skip = find_int_arg(argc, argv, "-s", 0);
-    if(argc < 4){
-        fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]);
-        return;
-    }
-    char *gpu_list = find_char_arg(argc, argv, "-gpus", 0);
-    char *outfile = find_char_arg(argc, argv, "-out", 0);
-    int *gpus = 0;
-    int gpu = 0;
-    int ngpus = 0;
-    if(gpu_list){
-        printf("%s\n", gpu_list);
-        int len = strlen(gpu_list);
-        ngpus = 1;
-        int i;
-        for(i = 0; i < len; ++i){
-            if (gpu_list[i] == ',') ++ngpus;
-        }
-        gpus = calloc(ngpus, sizeof(int));
-        for(i = 0; i < ngpus; ++i){
-            gpus[i] = atoi(gpu_list);
-            gpu_list = strchr(gpu_list, ',')+1;
-        }
-    } else {
-        gpu = gpu_index;
-        gpus = &gpu;
-        ngpus = 1;
-    }
-
-    int clear = find_arg(argc, argv, "-clear");
-
-    char *datacfg = argv[3];
-    char *cfg = argv[4];
-    char *weights = (argc > 5) ? argv[5] : 0;
-    char *filename = (argc > 6) ? argv[6]: 0;
-    if(0==strcmp(argv[2], "test")) test_detector(datacfg, cfg, weights, filename, thresh, hier_thresh);
-    else if(0==strcmp(argv[2], "train")) train_detector(datacfg, cfg, weights, gpus, ngpus, clear);
-    else if(0==strcmp(argv[2], "valid")) validate_detector(datacfg, cfg, weights, outfile);
-    else if(0==strcmp(argv[2], "recall")) validate_detector_recall(cfg, weights);
-    else if(0==strcmp(argv[2], "demo")) {
-        list *options = read_data_cfg(datacfg);
-        int classes = option_find_int(options, "classes", 20);
-        char *name_list = option_find_str(options, "names", "data/names.list");
-        char **names = get_labels(name_list);
-        demo(cfg, weights, thresh, cam_index, filename, names, classes, frame_skip, prefix, hier_thresh);
-    }
-}
diff --git a/image.darknet/inst/include/darknet/src/dropout_layer.c b/image.darknet/inst/include/darknet/src/dropout_layer.c
index b1381e6..780554f 100644
--- a/image.darknet/inst/include/darknet/src/dropout_layer.c
+++ b/image.darknet/inst/include/darknet/src/dropout_layer.c
@@ -35,26 +35,26 @@ void resize_dropout_layer(dropout_layer *l, int inputs)
     #endif
 }
 
-void forward_dropout_layer(dropout_layer l, network_state state)
+void forward_dropout_layer(dropout_layer l, network net)
 {
     int i;
-    if (!state.train) return;
+    if (!net.train) return;
     for(i = 0; i < l.batch * l.inputs; ++i){
         float r = rand_uniform(0, 1);
         l.rand[i] = r;
-        if(r < l.probability) state.input[i] = 0;
-        else state.input[i] *= l.scale;
+        if(r < l.probability) net.input[i] = 0;
+        else net.input[i] *= l.scale;
     }
 }
 
-void backward_dropout_layer(dropout_layer l, network_state state)
+void backward_dropout_layer(dropout_layer l, network net)
 {
     int i;
-    if(!state.delta) return;
+    if(!net.delta) return;
     for(i = 0; i < l.batch * l.inputs; ++i){
         float r = l.rand[i];
-        if(r < l.probability) state.delta[i] = 0;
-        else state.delta[i] *= l.scale;
+        if(r < l.probability) net.delta[i] = 0;
+        else net.delta[i] *= l.scale;
     }
 }
 
diff --git a/image.darknet/inst/include/darknet/src/dropout_layer.h b/image.darknet/inst/include/darknet/src/dropout_layer.h
index 691cfc5..01f94d4 100644
--- a/image.darknet/inst/include/darknet/src/dropout_layer.h
+++ b/image.darknet/inst/include/darknet/src/dropout_layer.h
@@ -8,13 +8,13 @@ typedef layer dropout_layer;
 
 dropout_layer make_dropout_layer(int batch, int inputs, float probability);
 
-void forward_dropout_layer(dropout_layer l, network_state state);
-void backward_dropout_layer(dropout_layer l, network_state state);
+void forward_dropout_layer(dropout_layer l, network net);
+void backward_dropout_layer(dropout_layer l, network net);
 void resize_dropout_layer(dropout_layer *l, int inputs);
 
 #ifdef GPU
-void forward_dropout_layer_gpu(dropout_layer l, network_state state);
-void backward_dropout_layer_gpu(dropout_layer l, network_state state);
+void forward_dropout_layer_gpu(dropout_layer l, network net);
+void backward_dropout_layer_gpu(dropout_layer l, network net);
 
 #endif
 #endif
diff --git a/image.darknet/inst/include/darknet/src/dropout_layer_kernels.cu b/image.darknet/inst/include/darknet/src/dropout_layer_kernels.cu
index 7e51bd5..bd12b67 100644
--- a/image.darknet/inst/include/darknet/src/dropout_layer_kernels.cu
+++ b/image.darknet/inst/include/darknet/src/dropout_layer_kernels.cu
@@ -14,9 +14,9 @@ __global__ void yoloswag420blazeit360noscope(float *input, int size, float *rand
     if(id < size) input[id] = (rand[id] < prob) ? 0 : input[id]*scale;
 }
 
-void forward_dropout_layer_gpu(dropout_layer layer, network_state state)
+void forward_dropout_layer_gpu(dropout_layer layer, network net)
 {
-    if (!state.train) return;
+    if (!net.train) return;
     int size = layer.inputs*layer.batch;
     cuda_random(layer.rand_gpu, size);
     /*
@@ -27,15 +27,15 @@ void forward_dropout_layer_gpu(dropout_layer layer, network_state state)
     cuda_push_array(layer.rand_gpu, layer.rand, size);
     */
 
-    yoloswag420blazeit360noscope<<<cuda_gridsize(size), BLOCK>>>(state.input, size, layer.rand_gpu, layer.probability, layer.scale);
+    yoloswag420blazeit360noscope<<<cuda_gridsize(size), BLOCK>>>(net.input_gpu, size, layer.rand_gpu, layer.probability, layer.scale);
     check_error(cudaPeekAtLastError());
 }
 
-void backward_dropout_layer_gpu(dropout_layer layer, network_state state)
+void backward_dropout_layer_gpu(dropout_layer layer, network net)
 {
-    if(!state.delta) return;
+    if(!net.delta_gpu) return;
     int size = layer.inputs*layer.batch;
 
-    yoloswag420blazeit360noscope<<<cuda_gridsize(size), BLOCK>>>(state.delta, size, layer.rand_gpu, layer.probability, layer.scale);
+    yoloswag420blazeit360noscope<<<cuda_gridsize(size), BLOCK>>>(net.delta_gpu, size, layer.rand_gpu, layer.probability, layer.scale);
     check_error(cudaPeekAtLastError());
 }
diff --git a/image.darknet/inst/include/darknet/src/gemm.c b/image.darknet/inst/include/darknet/src/gemm.c
index 3003be0..648027f 100644
--- a/image.darknet/inst/include/darknet/src/gemm.c
+++ b/image.darknet/inst/include/darknet/src/gemm.c
@@ -77,6 +77,7 @@ void gemm_nn(int M, int N, int K, float ALPHA,
         float *C, int ldc)
 {
     int i,j,k;
+    #pragma omp parallel for
     for(i = 0; i < M; ++i){
         for(k = 0; k < K; ++k){
             register float A_PART = ALPHA*A[i*lda+k];
@@ -93,6 +94,7 @@ void gemm_nt(int M, int N, int K, float ALPHA,
         float *C, int ldc)
 {
     int i,j,k;
+    #pragma omp parallel for
     for(i = 0; i < M; ++i){
         for(j = 0; j < N; ++j){
             register float sum = 0;
@@ -110,6 +112,7 @@ void gemm_tn(int M, int N, int K, float ALPHA,
         float *C, int ldc)
 {
     int i,j,k;
+    #pragma omp parallel for
     for(i = 0; i < M; ++i){
         for(k = 0; k < K; ++k){
             register float A_PART = ALPHA*A[k*lda+i];
@@ -126,6 +129,7 @@ void gemm_tt(int M, int N, int K, float ALPHA,
         float *C, int ldc)
 {
     int i,j,k;
+    #pragma omp parallel for
     for(i = 0; i < M; ++i){
         for(j = 0; j < N; ++j){
             register float sum = 0;
@@ -165,7 +169,7 @@ void gemm_cpu(int TA, int TB, int M, int N, int K, float ALPHA,
 
 #include <math.h>
 
-void gemm_ongpu(int TA, int TB, int M, int N, int K, float ALPHA, 
+void gemm_gpu(int TA, int TB, int M, int N, int K, float ALPHA, 
         float *A_gpu, int lda, 
         float *B_gpu, int ldb,
         float BETA,
@@ -177,24 +181,6 @@ void gemm_ongpu(int TA, int TB, int M, int N, int K, float ALPHA,
     check_error(status);
 }
 
-void gemm_gpu(int TA, int TB, int M, int N, int K, float ALPHA, 
-        float *A, int lda, 
-        float *B, int ldb,
-        float BETA,
-        float *C, int ldc)
-{
-    float *A_gpu = cuda_make_array(A, (TA ? lda*K:lda*M));
-    float *B_gpu = cuda_make_array(B, (TB ? ldb*N : ldb*K));
-    float *C_gpu = cuda_make_array(C, ldc*M);
-
-    gemm_ongpu(TA, TB, M, N, K, ALPHA, A_gpu, lda, B_gpu, ldb, BETA, C_gpu, ldc);
-
-    cuda_pull_array(C_gpu, C, ldc*M);
-    cuda_free(A_gpu);
-    cuda_free(B_gpu);
-    cuda_free(C_gpu);
-}
-
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -224,7 +210,7 @@ void time_gpu_random_matrix(int TA, int TB, int m, int k, int n)
     free(c);
 }
 
-void time_ongpu(int TA, int TB, int m, int k, int n)
+void time_gpu(int TA, int TB, int m, int k, int n)
 {
     int iter = 10;
     float *a = random_matrix(m,k);
@@ -242,7 +228,7 @@ void time_ongpu(int TA, int TB, int m, int k, int n)
     int i;
     clock_t start = clock(), end;
     for(i = 0; i<iter; ++i){
-        gemm_ongpu(TA,TB,m,n,k,1,a_cl,lda,b_cl,ldb,1,c_cl,n);
+        gemm_gpu(TA,TB,m,n,k,1,a_cl,lda,b_cl,ldb,1,c_cl,n);
         cudaThreadSynchronize();
     }
     double flop = ((double)m)*n*(2.*k + 2.)*iter;
@@ -313,24 +299,24 @@ int test_gpu_blas()
 
        test_gpu_accuracy(0,0,10,10,10); 
 
-       time_ongpu(0,0,64,2916,363); 
-       time_ongpu(0,0,64,2916,363); 
-       time_ongpu(0,0,64,2916,363); 
-       time_ongpu(0,0,192,729,1600); 
-       time_ongpu(0,0,384,196,1728); 
-       time_ongpu(0,0,256,196,3456); 
-       time_ongpu(0,0,256,196,2304); 
-       time_ongpu(0,0,128,4096,12544); 
-       time_ongpu(0,0,128,4096,4096); 
+       time_gpu(0,0,64,2916,363); 
+       time_gpu(0,0,64,2916,363); 
+       time_gpu(0,0,64,2916,363); 
+       time_gpu(0,0,192,729,1600); 
+       time_gpu(0,0,384,196,1728); 
+       time_gpu(0,0,256,196,3456); 
+       time_gpu(0,0,256,196,2304); 
+       time_gpu(0,0,128,4096,12544); 
+       time_gpu(0,0,128,4096,4096); 
      */
-    time_ongpu(0,0,64,75,12544); 
-    time_ongpu(0,0,64,75,12544); 
-    time_ongpu(0,0,64,75,12544); 
-    time_ongpu(0,0,64,576,12544); 
-    time_ongpu(0,0,256,2304,784); 
-    time_ongpu(1,1,2304,256,784); 
-    time_ongpu(0,0,512,4608,196); 
-    time_ongpu(1,1,4608,512,196); 
+    time_gpu(0,0,64,75,12544); 
+    time_gpu(0,0,64,75,12544); 
+    time_gpu(0,0,64,75,12544); 
+    time_gpu(0,0,64,576,12544); 
+    time_gpu(0,0,256,2304,784); 
+    time_gpu(1,1,2304,256,784); 
+    time_gpu(0,0,512,4608,196); 
+    time_gpu(1,1,4608,512,196); 
 
     return 0;
 }
diff --git a/image.darknet/inst/include/darknet/src/gemm.h b/image.darknet/inst/include/darknet/src/gemm.h
index f0231bf..3ebb0eb 100644
--- a/image.darknet/inst/include/darknet/src/gemm.h
+++ b/image.darknet/inst/include/darknet/src/gemm.h
@@ -19,7 +19,7 @@ void gemm_cpu(int TA, int TB, int M, int N, int K, float ALPHA,
         float *C, int ldc);
 
 #ifdef GPU
-void gemm_ongpu(int TA, int TB, int M, int N, int K, float ALPHA, 
+void gemm_gpu(int TA, int TB, int M, int N, int K, float ALPHA, 
         float *A_gpu, int lda, 
         float *B_gpu, int ldb,
         float BETA,
diff --git a/image.darknet/inst/include/darknet/src/go.c b/image.darknet/inst/include/darknet/src/go.c
deleted file mode 100644
index 89297b5..0000000
--- a/image.darknet/inst/include/darknet/src/go.c
+++ /dev/null
@@ -1,833 +0,0 @@
-#include "network.h"
-#include "utils.h"
-#include "parser.h"
-#include "option_list.h"
-#include "blas.h"
-
-#ifdef OPENCV
-#include "opencv2/highgui/highgui_c.h"
-#endif
-
-int inverted = 1;
-int noi = 1;
-static const int nind = 5;
-
-typedef struct {
-    char **data;
-    int n;
-} moves;
-
-char *fgetgo(FILE *fp)
-{
-    if(feof(fp)) return 0;
-    size_t size = 94;
-    char *line = malloc(size*sizeof(char));
-    if(size != fread(line, sizeof(char), size, fp)){
-        free(line);
-        return 0;
-    }
-
-    return line;
-}
-
-moves load_go_moves(char *filename)
-{
-    moves m;
-    m.n = 128;
-    m.data = calloc(128, sizeof(char*));
-    FILE *fp = fopen(filename, "rb");
-    int count = 0;
-    char *line = 0;
-    while((line = fgetgo(fp))){
-        if(count >= m.n){
-            m.n *= 2;
-            m.data = realloc(m.data, m.n*sizeof(char*));
-        }
-        m.data[count] = line;
-        ++count;
-    }
-    printf("%d\n", count);
-    m.n = count;
-    m.data = realloc(m.data, count*sizeof(char*));
-    return m;
-}
-
-void string_to_board(char *s, float *board)
-{
-    int i, j;
-    //memset(board, 0, 1*19*19*sizeof(float));
-    int count = 0;
-    for(i = 0; i < 91; ++i){
-        char c = s[i];
-        for(j = 0; j < 4; ++j){
-            int me = (c >> (2*j)) & 1;
-            int you = (c >> (2*j + 1)) & 1;
-            if (me) board[count] = 1;
-            else if (you) board[count] = -1;
-            else board[count] = 0;
-            ++count;
-            if(count >= 19*19) break;
-        }
-    }
-}
-
-void board_to_string(char *s, float *board)
-{
-    int i, j;
-    memset(s, 0, (19*19/4+1)*sizeof(char));
-    int count = 0;
-    for(i = 0; i < 91; ++i){
-        for(j = 0; j < 4; ++j){
-            int me = (board[count] == 1);
-            int you = (board[count] == -1);
-            if (me) s[i] = s[i] | (1<<(2*j));
-            if (you) s[i] = s[i] | (1<<(2*j + 1));
-            ++count;
-            if(count >= 19*19) break;
-        }
-    }
-}
-
-void random_go_moves(moves m, float *boards, float *labels, int n)
-{
-    int i;
-    memset(labels, 0, 19*19*n*sizeof(float));
-    for(i = 0; i < n; ++i){
-        char *b = m.data[rand()%m.n];
-        int row = b[0];
-        int col = b[1];
-        labels[col + 19*(row + i*19)] = 1;
-        string_to_board(b+2, boards+i*19*19);
-        boards[col + 19*(row + i*19)] = 0;
-
-        int flip = rand()%2;
-        int rotate = rand()%4;
-        image in = float_to_image(19, 19, 1, boards+i*19*19);
-        image out = float_to_image(19, 19, 1, labels+i*19*19);
-        if(flip){
-            flip_image(in);
-            flip_image(out);
-        }
-        rotate_image_cw(in, rotate);
-        rotate_image_cw(out, rotate);
-    }
-}
-
-
-void train_go(char *cfgfile, char *weightfile)
-{
-    srand(time(0));
-    float avg_loss = -1;
-    char *base = basecfg(cfgfile);
-    printf("%s\n", base);
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
-
-    char *backup_directory = "/home/pjreddie/backup/";
-
-    char buff[256];
-    float *board = calloc(19*19*net.batch, sizeof(float));
-    float *move = calloc(19*19*net.batch, sizeof(float));
-    moves m = load_go_moves("/home/pjreddie/backup/go.train");
-    //moves m = load_go_moves("games.txt");
-
-    int N = m.n;
-    int epoch = (*net.seen)/N;
-    while(get_current_batch(net) < net.max_batches || net.max_batches == 0){
-        clock_t time=clock();
-
-        random_go_moves(m, board, move, net.batch);
-        float loss = train_network_datum(net, board, move) / net.batch;
-        if(avg_loss == -1) avg_loss = loss;
-        avg_loss = avg_loss*.95 + loss*.05;
-        printf("%d, %.3f: %f, %f avg, %f rate, %lf seconds, %d images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen);
-        if(*net.seen/N > epoch){
-            epoch = *net.seen/N;
-            char buff[256];
-            sprintf(buff, "%s/%s_%d.weights", backup_directory,base, epoch);
-            save_weights(net, buff);
-
-        }
-        if(get_current_batch(net)%100 == 0){
-            char buff[256];
-            sprintf(buff, "%s/%s.backup",backup_directory,base);
-            save_weights(net, buff);
-        }
-        if(get_current_batch(net)%10000 == 0){
-            char buff[256];
-            sprintf(buff, "%s/%s_%d.backup",backup_directory,base,get_current_batch(net));
-            save_weights(net, buff);
-        }
-    }
-    sprintf(buff, "%s/%s.weights", backup_directory, base);
-    save_weights(net, buff);
-
-    free_network(net);
-    free(base);
-}
-
-void propagate_liberty(float *board, int *lib, int *visited, int row, int col, int side)
-{
-    if (row < 0 || row > 18 || col < 0 || col > 18) return;
-    int index = row*19 + col;
-    if (board[index] != side) return;
-    if (visited[index]) return;
-    visited[index] = 1;
-    lib[index] += 1;
-    propagate_liberty(board, lib, visited, row+1, col, side);
-    propagate_liberty(board, lib, visited, row-1, col, side);
-    propagate_liberty(board, lib, visited, row, col+1, side);
-    propagate_liberty(board, lib, visited, row, col-1, side);
-}
-
-
-int *calculate_liberties(float *board)
-{
-    int *lib = calloc(19*19, sizeof(int));
-    int visited[361];
-    int i, j;
-    for(j = 0; j < 19; ++j){
-        for(i = 0; i < 19; ++i){
-            memset(visited, 0, 19*19*sizeof(int));
-            int index = j*19 + i;
-            if(board[index] == 0){
-                if ((i > 0)  && board[index - 1]) propagate_liberty(board, lib, visited, j, i-1, board[index-1]);
-                if ((i < 18) && board[index + 1]) propagate_liberty(board, lib, visited, j, i+1, board[index+1]);
-                if ((j > 0)  && board[index - 19]) propagate_liberty(board, lib, visited, j-1, i, board[index-19]);
-                if ((j < 18) && board[index + 19]) propagate_liberty(board, lib, visited, j+1, i, board[index+19]);
-            }
-        }
-    }
-    return lib;
-}
-
-void print_board(float *board, int swap, int *indexes)
-{
-    //FILE *stream = stdout;
-    FILE *stream = stderr;
-    int i,j,n;
-    fprintf(stream, "\n\n");
-    fprintf(stream, "   ");
-    for(i = 0; i < 19; ++i){
-        fprintf(stream, "%c ", 'A' + i + 1*(i > 7 && noi));
-    }
-    fprintf(stream, "\n");
-    for(j = 0; j < 19; ++j){
-        fprintf(stream, "%2d", (inverted) ? 19-j : j+1);
-        for(i = 0; i < 19; ++i){
-            int index = j*19 + i;
-            if(indexes){
-                int found = 0;
-                for(n = 0; n < nind; ++n){
-                    if(index == indexes[n]){
-                        found = 1;
-                        /*
-                        if(n == 0) fprintf(stream, "\uff11");
-                        else if(n == 1) fprintf(stream, "\uff12");
-                        else if(n == 2) fprintf(stream, "\uff13");
-                        else if(n == 3) fprintf(stream, "\uff14");
-                        else if(n == 4) fprintf(stream, "\uff15");
-                        */
-                        if(n == 0) fprintf(stream, " 1");
-                        else if(n == 1) fprintf(stream, " 2");
-                        else if(n == 2) fprintf(stream, " 3");
-                        else if(n == 3) fprintf(stream, " 4");
-                        else if(n == 4) fprintf(stream, " 5");
-                    }
-                }
-                if(found) continue;
-            }
-            //if(board[index]*-swap > 0) fprintf(stream, "\u25C9 ");
-            //else if(board[index]*-swap < 0) fprintf(stream, "\u25EF ");
-            if(board[index]*-swap > 0) fprintf(stream, " O");
-            else if(board[index]*-swap < 0) fprintf(stream, " X");
-            else fprintf(stream, "  ");
-        }
-        fprintf(stream, "\n");
-    }
-}
-
-void flip_board(float *board)
-{
-    int i;
-    for(i = 0; i < 19*19; ++i){
-        board[i] = -board[i];
-    }
-}
-
-void predict_move(network net, float *board, float *move, int multi)
-{
-    float *output = network_predict(net, board);
-    copy_cpu(19*19, output, 1, move, 1);
-    int i;
-    if(multi){
-        image bim = float_to_image(19, 19, 1, board);
-        for(i = 1; i < 8; ++i){
-            rotate_image_cw(bim, i);
-            if(i >= 4) flip_image(bim);
-
-            float *output = network_predict(net, board);
-            image oim = float_to_image(19, 19, 1, output);
-
-            if(i >= 4) flip_image(oim);
-            rotate_image_cw(oim, -i);
-
-            axpy_cpu(19*19, 1, output, 1, move, 1);
-
-            if(i >= 4) flip_image(bim);
-            rotate_image_cw(bim, -i);
-        }
-        scal_cpu(19*19, 1./8., move, 1);
-    }
-    for(i = 0; i < 19*19; ++i){
-        if(board[i]) move[i] = 0;
-    }
-}
-
-void remove_connected(float *b, int *lib, int p, int r, int c)
-{
-    if (r < 0 || r >= 19 || c < 0 || c >= 19) return;
-    if (b[r*19 + c] != p) return;
-    if (lib[r*19 + c] != 1) return;
-    b[r*19 + c] = 0;
-    remove_connected(b, lib, p, r+1, c);
-    remove_connected(b, lib, p, r-1, c);
-    remove_connected(b, lib, p, r, c+1);
-    remove_connected(b, lib, p, r, c-1);
-}
-
-
-void move_go(float *b, int p, int r, int c)
-{
-    int *l = calculate_liberties(b);
-    b[r*19 + c] = p;
-    remove_connected(b, l, -p, r+1, c);
-    remove_connected(b, l, -p, r-1, c);
-    remove_connected(b, l, -p, r, c+1);
-    remove_connected(b, l, -p, r, c-1);
-    free(l);
-}
-
-int makes_safe_go(float *b, int *lib, int p, int r, int c){
-    if (r < 0 || r >= 19 || c < 0 || c >= 19) return 0;
-    if (b[r*19 + c] == -p){
-        if (lib[r*19 + c] > 1) return 0;
-        else return 1;
-    }
-    if (b[r*19 + c] == 0) return 1;
-    if (lib[r*19 + c] > 1) return 1;
-    return 0;
-}
-
-int suicide_go(float *b, int p, int r, int c)
-{
-    int *l = calculate_liberties(b);
-    int safe = 0;
-    safe = safe || makes_safe_go(b, l, p, r+1, c);
-    safe = safe || makes_safe_go(b, l, p, r-1, c);
-    safe = safe || makes_safe_go(b, l, p, r, c+1);
-    safe = safe || makes_safe_go(b, l, p, r, c-1);
-    free(l);
-    return !safe;
-}
-
-int legal_go(float *b, char *ko, int p, int r, int c)
-{
-    if (b[r*19 + c]) return 0;
-    char curr[91];
-    char next[91];
-    board_to_string(curr, b);
-    move_go(b, p, r, c);
-    board_to_string(next, b);
-    string_to_board(curr, b);
-    if(memcmp(next, ko, 91) == 0) return 0;
-    return 1;
-}
-
-int generate_move(network net, int player, float *board, int multi, float thresh, float temp, char *ko, int print)
-{
-    int i, j;
-    for(i = 0; i < net.n; ++i) net.layers[i].temperature = temp;
-
-    float move[361];
-    if (player < 0) flip_board(board);
-    predict_move(net, board, move, multi);
-    if (player < 0) flip_board(board);
-
-    
-    for(i = 0; i < 19; ++i){
-        for(j = 0; j < 19; ++j){
-            if (!legal_go(board, ko, player, i, j)) move[i*19 + j] = 0;
-        }
-    }
-
-    int indexes[nind];
-    top_k(move, 19*19, nind, indexes);
-    if(thresh > move[indexes[0]]) thresh = move[indexes[nind-1]];
-
-    for(i = 0; i < 19; ++i){
-        for(j = 0; j < 19; ++j){
-            if (move[i*19 + j] < thresh) move[i*19 + j] = 0;
-        }
-    }
-
-
-    int max = max_index(move, 19*19);
-    int row = max / 19;
-    int col = max % 19;
-    int index = sample_array(move, 19*19);
-
-    if(print){
-        top_k(move, 19*19, nind, indexes);
-        for(i = 0; i < nind; ++i){
-            if (!move[indexes[i]]) indexes[i] = -1;
-        }
-        print_board(board, player, indexes);
-        for(i = 0; i < nind; ++i){
-            fprintf(stderr, "%d: %f\n", i+1, move[indexes[i]]);
-        }
-    }
-
-    if(suicide_go(board, player, row, col)){
-        return -1; 
-    }
-    if(suicide_go(board, player, index/19, index%19)) index = max;
-    return index;
-}
-
-void valid_go(char *cfgfile, char *weightfile, int multi)
-{
-    srand(time(0));
-    char *base = basecfg(cfgfile);
-    printf("%s\n", base);
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    set_batch_network(&net, 1);
-    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
-
-    float *board = calloc(19*19, sizeof(float));
-    float *move = calloc(19*19, sizeof(float));
-    moves m = load_go_moves("/home/pjreddie/backup/go.test");
-
-    int N = m.n;
-    int i;
-    int correct = 0;
-    for(i = 0; i <N; ++i){
-        char *b = m.data[i];
-        int row = b[0];
-        int col = b[1];
-        int truth = col + 19*row;
-        string_to_board(b+2, board);
-        predict_move(net, board, move, multi);
-        int index = max_index(move, 19*19);
-        if(index == truth) ++correct;
-        printf("%d Accuracy %f\n", i, (float) correct/(i+1));
-    }
-}
-
-void engine_go(char *filename, char *weightfile, int multi)
-{
-    network net = parse_network_cfg(filename);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    srand(time(0));
-    set_batch_network(&net, 1);
-    float *board = calloc(19*19, sizeof(float));
-    char *one = calloc(91, sizeof(char));
-    char *two = calloc(91, sizeof(char));
-    int passed = 0;
-    while(1){
-        char buff[256];
-        int id = 0;
-        int has_id = (scanf("%d", &id) == 1);
-        scanf("%s", buff);
-        if (feof(stdin)) break;
-        char ids[256];
-        sprintf(ids, "%d", id);
-        //fprintf(stderr, "%s\n", buff);
-        if (!has_id) ids[0] = 0;
-        if (!strcmp(buff, "protocol_version")){
-            printf("=%s 2\n\n", ids);
-        } else if (!strcmp(buff, "name")){
-            printf("=%s DarkGo\n\n", ids);
-        } else if (!strcmp(buff, "version")){
-            printf("=%s 1.0\n\n", ids);
-        } else if (!strcmp(buff, "known_command")){
-            char comm[256];
-            scanf("%s", comm);
-            int known = (!strcmp(comm, "protocol_version") || 
-                    !strcmp(comm, "name") || 
-                    !strcmp(comm, "version") || 
-                    !strcmp(comm, "known_command") || 
-                    !strcmp(comm, "list_commands") || 
-                    !strcmp(comm, "quit") || 
-                    !strcmp(comm, "boardsize") || 
-                    !strcmp(comm, "clear_board") || 
-                    !strcmp(comm, "komi") || 
-                    !strcmp(comm, "final_status_list") || 
-                    !strcmp(comm, "play") || 
-                    !strcmp(comm, "genmove"));
-            if(known) printf("=%s true\n\n", ids);
-            else printf("=%s false\n\n", ids);
-        } else if (!strcmp(buff, "list_commands")){
-            printf("=%s protocol_version\nname\nversion\nknown_command\nlist_commands\nquit\nboardsize\nclear_board\nkomi\nplay\ngenmove\nfinal_status_list\n\n", ids);
-        } else if (!strcmp(buff, "quit")){
-            break;
-        } else if (!strcmp(buff, "boardsize")){
-            int boardsize = 0;
-            scanf("%d", &boardsize);
-            //fprintf(stderr, "%d\n", boardsize);
-            if(boardsize != 19){
-                printf("?%s unacceptable size\n\n", ids);
-            } else {
-                printf("=%s \n\n", ids);
-            }
-        } else if (!strcmp(buff, "clear_board")){
-            passed = 0;
-            memset(board, 0, 19*19*sizeof(float));
-            printf("=%s \n\n", ids);
-        } else if (!strcmp(buff, "komi")){
-            float komi = 0;
-            scanf("%f", &komi);
-            printf("=%s \n\n", ids);
-        } else if (!strcmp(buff, "play")){
-            char color[256];
-            scanf("%s ", color);
-            char c;
-            int r;
-            int count = scanf("%c%d", &c, &r);
-            int player = (color[0] == 'b' || color[0] == 'B') ? 1 : -1;
-            if(c == 'p' && count < 2) {
-                passed = 1;
-                printf("=%s \n\n", ids);
-                char *line = fgetl(stdin);
-                free(line);
-                fflush(stdout);
-                fflush(stderr);
-                continue;
-            } else {
-                passed = 0;
-            }
-            if(c >= 'A' && c <= 'Z') c = c - 'A';
-            if(c >= 'a' && c <= 'z') c = c - 'a';
-            if(c >= 8) --c;
-            r = 19 - r;
-            fprintf(stderr, "move: %d %d\n", r, c);
-
-            char *swap = two;
-            two = one;
-            one = swap;
-            move_go(board, player, r, c);
-            board_to_string(one, board);
-
-            printf("=%s \n\n", ids);
-            print_board(board, 1, 0);
-        } else if (!strcmp(buff, "genmove")){
-            char color[256];
-            scanf("%s", color);
-            int player = (color[0] == 'b' || color[0] == 'B') ? 1 : -1;
-
-            int index = generate_move(net, player, board, multi, .1, .7, two, 1);
-            if(passed || index < 0){
-                printf("=%s pass\n\n", ids);
-                passed = 0;
-            } else {
-                int row = index / 19;
-                int col = index % 19;
-
-                char *swap = two;
-                two = one;
-                one = swap;
-
-                move_go(board, player, row, col);
-                board_to_string(one, board);
-                row = 19 - row;
-                if (col >= 8) ++col;
-                printf("=%s %c%d\n\n", ids, 'A' + col, row);
-                print_board(board, 1, 0);
-            }
-
-        } else if (!strcmp(buff, "p")){
-            //print_board(board, 1, 0);
-        } else if (!strcmp(buff, "final_status_list")){
-            char type[256];
-            scanf("%s", type);
-            fprintf(stderr, "final_status\n");
-            char *line = fgetl(stdin);
-            free(line);
-            if(type[0] == 'd' || type[0] == 'D'){
-                FILE *f = fopen("game.txt", "w");
-                int i, j;
-                int count = 2;
-                fprintf(f, "boardsize 19\n");
-                fprintf(f, "clear_board\n");
-                for(j = 0; j < 19; ++j){
-                    for(i = 0; i < 19; ++i){
-                        if(board[j*19 + i] == 1) fprintf(f, "play black %c%d\n", 'A'+i+(i>=8), 19-j);
-                        if(board[j*19 + i] == -1) fprintf(f, "play white %c%d\n", 'A'+i+(i>=8), 19-j);
-                        if(board[j*19 + i]) ++count;
-                    }
-                }
-                fprintf(f, "final_status_list dead\n");
-                fclose(f);
-                FILE *p = popen("./gnugo --mode gtp < game.txt", "r");
-                for(i = 0; i < count; ++i){
-                    free(fgetl(p));
-                    free(fgetl(p));
-                }
-                char *l = 0;
-                while((l = fgetl(p))){
-                    printf("%s\n", l);
-                    free(l);
-                }
-            } else {
-                printf("?%s unknown command\n\n", ids);
-            }
-        } else {
-            char *line = fgetl(stdin);
-            free(line);
-            printf("?%s unknown command\n\n", ids);
-        }
-        fflush(stdout);
-        fflush(stderr);
-    }
-}
-
-void test_go(char *cfg, char *weights, int multi)
-{
-    network net = parse_network_cfg(cfg);
-    if(weights){
-        load_weights(&net, weights);
-    }
-    srand(time(0));
-    set_batch_network(&net, 1);
-    float *board = calloc(19*19, sizeof(float));
-    float *move = calloc(19*19, sizeof(float));
-    int color = 1;
-    while(1){
-        float *output = network_predict(net, board);
-        copy_cpu(19*19, output, 1, move, 1);
-        int i;
-        if(multi){
-            image bim = float_to_image(19, 19, 1, board);
-            for(i = 1; i < 8; ++i){
-                rotate_image_cw(bim, i);
-                if(i >= 4) flip_image(bim);
-
-                float *output = network_predict(net, board);
-                image oim = float_to_image(19, 19, 1, output);
-
-                if(i >= 4) flip_image(oim);
-                rotate_image_cw(oim, -i);
-
-                axpy_cpu(19*19, 1, output, 1, move, 1);
-
-                if(i >= 4) flip_image(bim);
-                rotate_image_cw(bim, -i);
-            }
-            scal_cpu(19*19, 1./8., move, 1);
-        }
-        for(i = 0; i < 19*19; ++i){
-            if(board[i]) move[i] = 0;
-        }
-
-        int indexes[nind];
-        int row, col;
-        top_k(move, 19*19, nind, indexes);
-        print_board(board, color, indexes);
-        for(i = 0; i < nind; ++i){
-            int index = indexes[i];
-            row = index / 19;
-            col = index % 19;
-            printf("%d: %c %d, %.2f%%\n", i+1, col + 'A' + 1*(col > 7 && noi), (inverted)?19 - row : row+1, move[index]*100);
-        }
-        //if(color == 1) printf("\u25EF Enter move: ");
-        //else printf("\u25C9 Enter move: ");
-        if(color == 1) printf("X Enter move: ");
-        else printf("O Enter move: ");
-
-        char c;
-        char *line = fgetl(stdin);
-        int picked = 1;
-        int dnum = sscanf(line, "%d", &picked);
-        int cnum = sscanf(line, "%c", &c);
-        if (strlen(line) == 0 || dnum) {
-            --picked;
-            if (picked < nind){
-                int index = indexes[picked];
-                row = index / 19;
-                col = index % 19;
-                board[row*19 + col] = 1;
-            }
-        } else if (cnum){
-            if (c <= 'T' && c >= 'A'){
-                int num = sscanf(line, "%c %d", &c, &row);
-                row = (inverted)?19 - row : row-1;
-                col = c - 'A';
-                if (col > 7 && noi) col -= 1;
-                if (num == 2) board[row*19 + col] = 1;
-            } else if (c == 'p') {
-                // Pass
-            } else if(c=='b' || c == 'w'){
-                char g;
-                int num = sscanf(line, "%c %c %d", &g, &c, &row);
-                row = (inverted)?19 - row : row-1;
-                col = c - 'A';
-                if (col > 7 && noi) col -= 1;
-                if (num == 3) board[row*19 + col] = (g == 'b') ? color : -color;
-            } else if(c == 'c'){
-                char g;
-                int num = sscanf(line, "%c %c %d", &g, &c, &row);
-                row = (inverted)?19 - row : row-1;
-                col = c - 'A';
-                if (col > 7 && noi) col -= 1;
-                if (num == 3) board[row*19 + col] = 0;
-            }
-        }
-        free(line);
-        flip_board(board);
-        color = -color;
-    }
-}
-
-float score_game(float *board)
-{
-    FILE *f = fopen("game.txt", "w");
-    int i, j;
-    int count = 3;
-    fprintf(f, "komi 6.5\n");
-    fprintf(f, "boardsize 19\n");
-    fprintf(f, "clear_board\n");
-    for(j = 0; j < 19; ++j){
-        for(i = 0; i < 19; ++i){
-            if(board[j*19 + i] == 1) fprintf(f, "play black %c%d\n", 'A'+i+(i>=8), 19-j);
-            if(board[j*19 + i] == -1) fprintf(f, "play white %c%d\n", 'A'+i+(i>=8), 19-j);
-            if(board[j*19 + i]) ++count;
-        }
-    }
-    fprintf(f, "final_score\n");
-    fclose(f);
-    FILE *p = popen("./gnugo --mode gtp < game.txt", "r");
-    for(i = 0; i < count; ++i){
-        free(fgetl(p));
-        free(fgetl(p));
-    }
-    char *l = 0;
-    float score = 0;
-    char player = 0;
-    while((l = fgetl(p))){
-        fprintf(stderr, "%s  \t", l);
-        int n = sscanf(l, "= %c+%f", &player, &score);
-        free(l);
-        if (n == 2) break;
-    }
-    if(player == 'W') score = -score;
-    pclose(p);
-    return score;
-}
-
-void self_go(char *filename, char *weightfile, char *f2, char *w2, int multi)
-{
-    network net = parse_network_cfg(filename);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-
-    network net2 = net;
-    if(f2){
-        net2 = parse_network_cfg(f2);
-        if(w2){
-            load_weights(&net2, w2);
-        }
-    }
-    srand(time(0));
-    char boards[300][93];
-    int count = 0;
-    set_batch_network(&net, 1);
-    set_batch_network(&net2, 1);
-    float *board = calloc(19*19, sizeof(float));
-    char *one = calloc(91, sizeof(char));
-    char *two = calloc(91, sizeof(char));
-    int done = 0;
-    int player = 1;
-    int p1 = 0;
-    int p2 = 0;
-    int total = 0;
-    while(1){
-        if (done || count >= 300){
-            float score = score_game(board);
-            int i = (score > 0)? 0 : 1;
-            if((score > 0) == (total%2==0)) ++p1;
-            else ++p2;
-            ++total;
-            fprintf(stderr, "Total: %d, Player 1: %f, Player 2: %f\n", total, (float)p1/total, (float)p2/total);
-            int j;
-            for(; i < count; i += 2){
-                for(j = 0; j < 93; ++j){
-                    printf("%c", boards[i][j]);
-                }
-                printf("\n");
-            }
-            memset(board, 0, 19*19*sizeof(float));
-            player = 1;
-            done = 0;
-            count = 0;
-            fflush(stdout);
-            fflush(stderr);
-        }
-        //print_board(board, 1, 0);
-        //sleep(1);
-        network use = ((total%2==0) == (player==1)) ? net : net2;
-        int index = generate_move(use, player, board, multi, .1, .7, two, 0);
-        if(index < 0){
-            done = 1;
-            continue;
-        }
-        int row = index / 19;
-        int col = index % 19;
-
-        char *swap = two;
-        two = one;
-        one = swap;
-
-        if(player < 0) flip_board(board);
-        boards[count][0] = row;
-        boards[count][1] = col;
-        board_to_string(boards[count] + 2, board);
-        if(player < 0) flip_board(board);
-        ++count;
-
-        move_go(board, player, row, col);
-        board_to_string(one, board);
-
-        player = -player;
-    }
-}
-
-void run_go(int argc, char **argv)
-{
-    //boards_go();
-    if(argc < 4){
-        fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]);
-        return;
-    }
-
-    char *cfg = argv[3];
-    char *weights = (argc > 4) ? argv[4] : 0;
-    char *c2 = (argc > 5) ? argv[5] : 0;
-    char *w2 = (argc > 6) ? argv[6] : 0;
-    int multi = find_arg(argc, argv, "-multi");
-    if(0==strcmp(argv[2], "train")) train_go(cfg, weights);
-    else if(0==strcmp(argv[2], "valid")) valid_go(cfg, weights, multi);
-    else if(0==strcmp(argv[2], "self")) self_go(cfg, weights, c2, w2, multi);
-    else if(0==strcmp(argv[2], "test")) test_go(cfg, weights, multi);
-    else if(0==strcmp(argv[2], "engine")) engine_go(cfg, weights, multi);
-}
-
-
diff --git a/image.darknet/inst/include/darknet/src/gru_layer.c b/image.darknet/inst/include/darknet/src/gru_layer.c
index b78e868..b6601d8 100644
--- a/image.darknet/inst/include/darknet/src/gru_layer.c
+++ b/image.darknet/inst/include/darknet/src/gru_layer.c
@@ -26,7 +26,7 @@ static void increment_layer(layer *l, int steps)
 #endif
 }
 
-layer make_gru_layer(int batch, int inputs, int outputs, int steps, int batch_normalize)
+layer make_gru_layer(int batch, int inputs, int outputs, int steps, int batch_normalize, int adam)
 {
     fprintf(stderr, "GRU Layer: %d inputs, %d outputs\n", inputs, outputs);
     batch = batch / steps;
@@ -36,39 +36,37 @@ layer make_gru_layer(int batch, int inputs, int outputs, int steps, int batch_no
     l.steps = steps;
     l.inputs = inputs;
 
-    l.input_z_layer = malloc(sizeof(layer));
+    l.uz = malloc(sizeof(layer));
     fprintf(stderr, "\t\t");
-    *(l.input_z_layer) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize);
-    l.input_z_layer->batch = batch;
+    *(l.uz) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize, adam);
+    l.uz->batch = batch;
 
-    l.state_z_layer = malloc(sizeof(layer));
+    l.wz = malloc(sizeof(layer));
     fprintf(stderr, "\t\t");
-    *(l.state_z_layer) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize);
-    l.state_z_layer->batch = batch;
+    *(l.wz) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam);
+    l.wz->batch = batch;
 
-
-
-    l.input_r_layer = malloc(sizeof(layer));
+    l.ur = malloc(sizeof(layer));
     fprintf(stderr, "\t\t");
-    *(l.input_r_layer) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize);
-    l.input_r_layer->batch = batch;
+    *(l.ur) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize, adam);
+    l.ur->batch = batch;
 
-    l.state_r_layer = malloc(sizeof(layer));
+    l.wr = malloc(sizeof(layer));
     fprintf(stderr, "\t\t");
-    *(l.state_r_layer) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize);
-    l.state_r_layer->batch = batch;
+    *(l.wr) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam);
+    l.wr->batch = batch;
 
 
 
-    l.input_h_layer = malloc(sizeof(layer));
+    l.uh = malloc(sizeof(layer));
     fprintf(stderr, "\t\t");
-    *(l.input_h_layer) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize);
-    l.input_h_layer->batch = batch;
+    *(l.uh) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize, adam);
+    l.uh->batch = batch;
 
-    l.state_h_layer = malloc(sizeof(layer));
+    l.wh = malloc(sizeof(layer));
     fprintf(stderr, "\t\t");
-    *(l.state_h_layer) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize);
-    l.state_h_layer->batch = batch;
+    *(l.wh) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam);
+    l.wh->batch = batch;
 
     l.batch_normalize = batch_normalize;
 
@@ -94,68 +92,80 @@ layer make_gru_layer(int batch, int inputs, int outputs, int steps, int batch_no
     l.backward_gpu = backward_gru_layer_gpu;
     l.update_gpu = update_gru_layer_gpu;
 
-    l.forgot_state_gpu = cuda_make_array(l.output, batch*outputs);
-    l.forgot_delta_gpu = cuda_make_array(l.output, batch*outputs);
-    l.prev_state_gpu = cuda_make_array(l.output, batch*outputs);
-    l.state_gpu = cuda_make_array(l.output, batch*outputs);
-    l.output_gpu = cuda_make_array(l.output, batch*outputs*steps);
-    l.delta_gpu = cuda_make_array(l.delta, batch*outputs*steps);
-    l.r_gpu = cuda_make_array(l.output_gpu, batch*outputs);
-    l.z_gpu = cuda_make_array(l.output_gpu, batch*outputs);
-    l.h_gpu = cuda_make_array(l.output_gpu, batch*outputs);
+    l.forgot_state_gpu = cuda_make_array(0, batch*outputs);
+    l.forgot_delta_gpu = cuda_make_array(0, batch*outputs);
+    l.prev_state_gpu = cuda_make_array(0, batch*outputs);
+    l.state_gpu = cuda_make_array(0, batch*outputs);
+    l.output_gpu = cuda_make_array(0, batch*outputs*steps);
+    l.delta_gpu = cuda_make_array(0, batch*outputs*steps);
+    l.r_gpu = cuda_make_array(0, batch*outputs);
+    l.z_gpu = cuda_make_array(0, batch*outputs);
+    l.h_gpu = cuda_make_array(0, batch*outputs);
+
+#ifdef CUDNN
+    cudnnSetTensor4dDescriptor(l.uz->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.uz->out_c, l.uz->out_h, l.uz->out_w); 
+    cudnnSetTensor4dDescriptor(l.uh->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.uh->out_c, l.uh->out_h, l.uh->out_w); 
+    cudnnSetTensor4dDescriptor(l.ur->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.ur->out_c, l.ur->out_h, l.ur->out_w); 
+    cudnnSetTensor4dDescriptor(l.wz->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wz->out_c, l.wz->out_h, l.wz->out_w); 
+    cudnnSetTensor4dDescriptor(l.wh->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wh->out_c, l.wh->out_h, l.wh->out_w); 
+    cudnnSetTensor4dDescriptor(l.wr->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wr->out_c, l.wr->out_h, l.wr->out_w); 
+#endif
 #endif
 
     return l;
 }
 
-void update_gru_layer(layer l, int batch, float learning_rate, float momentum, float decay)
+void update_gru_layer(layer l, update_args a)
 {
-    update_connected_layer(*(l.input_layer), batch, learning_rate, momentum, decay);
-    update_connected_layer(*(l.self_layer), batch, learning_rate, momentum, decay);
-    update_connected_layer(*(l.output_layer), batch, learning_rate, momentum, decay);
+    update_connected_layer(*(l.ur), a);
+    update_connected_layer(*(l.uz), a);
+    update_connected_layer(*(l.uh), a);
+    update_connected_layer(*(l.wr), a);
+    update_connected_layer(*(l.wz), a);
+    update_connected_layer(*(l.wh), a);
 }
 
-void forward_gru_layer(layer l, network_state state)
+void forward_gru_layer(layer l, network net)
 {
-    network_state s = {0};
-    s.train = state.train;
+    network s = net;
+    s.train = net.train;
     int i;
-    layer input_z_layer = *(l.input_z_layer);
-    layer input_r_layer = *(l.input_r_layer);
-    layer input_h_layer = *(l.input_h_layer);
-
-    layer state_z_layer = *(l.state_z_layer);
-    layer state_r_layer = *(l.state_r_layer);
-    layer state_h_layer = *(l.state_h_layer);
-
-    fill_cpu(l.outputs * l.batch * l.steps, 0, input_z_layer.delta, 1);
-    fill_cpu(l.outputs * l.batch * l.steps, 0, input_r_layer.delta, 1);
-    fill_cpu(l.outputs * l.batch * l.steps, 0, input_h_layer.delta, 1);
-
-    fill_cpu(l.outputs * l.batch * l.steps, 0, state_z_layer.delta, 1);
-    fill_cpu(l.outputs * l.batch * l.steps, 0, state_r_layer.delta, 1);
-    fill_cpu(l.outputs * l.batch * l.steps, 0, state_h_layer.delta, 1);
-    if(state.train) {
+    layer uz = *(l.uz);
+    layer ur = *(l.ur);
+    layer uh = *(l.uh);
+
+    layer wz = *(l.wz);
+    layer wr = *(l.wr);
+    layer wh = *(l.wh);
+
+    fill_cpu(l.outputs * l.batch * l.steps, 0, uz.delta, 1);
+    fill_cpu(l.outputs * l.batch * l.steps, 0, ur.delta, 1);
+    fill_cpu(l.outputs * l.batch * l.steps, 0, uh.delta, 1);
+
+    fill_cpu(l.outputs * l.batch * l.steps, 0, wz.delta, 1);
+    fill_cpu(l.outputs * l.batch * l.steps, 0, wr.delta, 1);
+    fill_cpu(l.outputs * l.batch * l.steps, 0, wh.delta, 1);
+    if(net.train) {
         fill_cpu(l.outputs * l.batch * l.steps, 0, l.delta, 1);
         copy_cpu(l.outputs*l.batch, l.state, 1, l.prev_state, 1);
     }
 
     for (i = 0; i < l.steps; ++i) {
         s.input = l.state;
-        forward_connected_layer(state_z_layer, s);
-        forward_connected_layer(state_r_layer, s);
+        forward_connected_layer(wz, s);
+        forward_connected_layer(wr, s);
 
-        s.input = state.input;
-        forward_connected_layer(input_z_layer, s);
-        forward_connected_layer(input_r_layer, s);
-        forward_connected_layer(input_h_layer, s);
+        s.input = net.input;
+        forward_connected_layer(uz, s);
+        forward_connected_layer(ur, s);
+        forward_connected_layer(uh, s);
 
 
-        copy_cpu(l.outputs*l.batch, input_z_layer.output, 1, l.z_cpu, 1);
-        axpy_cpu(l.outputs*l.batch, 1, state_z_layer.output, 1, l.z_cpu, 1);
+        copy_cpu(l.outputs*l.batch, uz.output, 1, l.z_cpu, 1);
+        axpy_cpu(l.outputs*l.batch, 1, wz.output, 1, l.z_cpu, 1);
 
-        copy_cpu(l.outputs*l.batch, input_r_layer.output, 1, l.r_cpu, 1);
-        axpy_cpu(l.outputs*l.batch, 1, state_r_layer.output, 1, l.r_cpu, 1);
+        copy_cpu(l.outputs*l.batch, ur.output, 1, l.r_cpu, 1);
+        axpy_cpu(l.outputs*l.batch, 1, wr.output, 1, l.r_cpu, 1);
 
         activate_array(l.z_cpu, l.outputs*l.batch, LOGISTIC);
         activate_array(l.r_cpu, l.outputs*l.batch, LOGISTIC);
@@ -164,34 +174,34 @@ void forward_gru_layer(layer l, network_state state)
         mul_cpu(l.outputs*l.batch, l.r_cpu, 1, l.forgot_state, 1);
 
         s.input = l.forgot_state;
-        forward_connected_layer(state_h_layer, s);
+        forward_connected_layer(wh, s);
 
-        copy_cpu(l.outputs*l.batch, input_h_layer.output, 1, l.h_cpu, 1);
-        axpy_cpu(l.outputs*l.batch, 1, state_h_layer.output, 1, l.h_cpu, 1);
+        copy_cpu(l.outputs*l.batch, uh.output, 1, l.h_cpu, 1);
+        axpy_cpu(l.outputs*l.batch, 1, wh.output, 1, l.h_cpu, 1);
 
-        #ifdef USET
-        activate_array(l.h_cpu, l.outputs*l.batch, TANH);
-        #else
-        activate_array(l.h_cpu, l.outputs*l.batch, LOGISTIC);
-        #endif
+        if(l.tanh){
+            activate_array(l.h_cpu, l.outputs*l.batch, TANH);
+        } else {
+            activate_array(l.h_cpu, l.outputs*l.batch, LOGISTIC);
+        }
 
         weighted_sum_cpu(l.state, l.h_cpu, l.z_cpu, l.outputs*l.batch, l.output);
 
         copy_cpu(l.outputs*l.batch, l.output, 1, l.state, 1);
 
-        state.input += l.inputs*l.batch;
+        net.input += l.inputs*l.batch;
         l.output += l.outputs*l.batch;
-        increment_layer(&input_z_layer, 1);
-        increment_layer(&input_r_layer, 1);
-        increment_layer(&input_h_layer, 1);
+        increment_layer(&uz, 1);
+        increment_layer(&ur, 1);
+        increment_layer(&uh, 1);
 
-        increment_layer(&state_z_layer, 1);
-        increment_layer(&state_r_layer, 1);
-        increment_layer(&state_h_layer, 1);
+        increment_layer(&wz, 1);
+        increment_layer(&wr, 1);
+        increment_layer(&wh, 1);
     }
 }
 
-void backward_gru_layer(layer l, network_state state)
+void backward_gru_layer(layer l, network net)
 {
 }
 
@@ -205,191 +215,192 @@ void push_gru_layer(layer l)
 {
 }
 
-void update_gru_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay)
+void update_gru_layer_gpu(layer l, update_args a)
 {
-    update_connected_layer_gpu(*(l.input_r_layer), batch, learning_rate, momentum, decay);
-    update_connected_layer_gpu(*(l.input_z_layer), batch, learning_rate, momentum, decay);
-    update_connected_layer_gpu(*(l.input_h_layer), batch, learning_rate, momentum, decay);
-    update_connected_layer_gpu(*(l.state_r_layer), batch, learning_rate, momentum, decay);
-    update_connected_layer_gpu(*(l.state_z_layer), batch, learning_rate, momentum, decay);
-    update_connected_layer_gpu(*(l.state_h_layer), batch, learning_rate, momentum, decay);
+    update_connected_layer_gpu(*(l.ur), a);
+    update_connected_layer_gpu(*(l.uz), a);
+    update_connected_layer_gpu(*(l.uh), a);
+    update_connected_layer_gpu(*(l.wr), a);
+    update_connected_layer_gpu(*(l.wz), a);
+    update_connected_layer_gpu(*(l.wh), a);
 }
 
-void forward_gru_layer_gpu(layer l, network_state state)
+void forward_gru_layer_gpu(layer l, network net)
 {
-    network_state s = {0};
-    s.train = state.train;
+    network s = {0};
+    s.train = net.train;
     int i;
-    layer input_z_layer = *(l.input_z_layer);
-    layer input_r_layer = *(l.input_r_layer);
-    layer input_h_layer = *(l.input_h_layer);
-
-    layer state_z_layer = *(l.state_z_layer);
-    layer state_r_layer = *(l.state_r_layer);
-    layer state_h_layer = *(l.state_h_layer);
-
-    fill_ongpu(l.outputs * l.batch * l.steps, 0, input_z_layer.delta_gpu, 1);
-    fill_ongpu(l.outputs * l.batch * l.steps, 0, input_r_layer.delta_gpu, 1);
-    fill_ongpu(l.outputs * l.batch * l.steps, 0, input_h_layer.delta_gpu, 1);
-
-    fill_ongpu(l.outputs * l.batch * l.steps, 0, state_z_layer.delta_gpu, 1);
-    fill_ongpu(l.outputs * l.batch * l.steps, 0, state_r_layer.delta_gpu, 1);
-    fill_ongpu(l.outputs * l.batch * l.steps, 0, state_h_layer.delta_gpu, 1);
-    if(state.train) {
-        fill_ongpu(l.outputs * l.batch * l.steps, 0, l.delta_gpu, 1);
-        copy_ongpu(l.outputs*l.batch, l.state_gpu, 1, l.prev_state_gpu, 1);
+    layer uz = *(l.uz);
+    layer ur = *(l.ur);
+    layer uh = *(l.uh);
+
+    layer wz = *(l.wz);
+    layer wr = *(l.wr);
+    layer wh = *(l.wh);
+
+    fill_gpu(l.outputs * l.batch * l.steps, 0, uz.delta_gpu, 1);
+    fill_gpu(l.outputs * l.batch * l.steps, 0, ur.delta_gpu, 1);
+    fill_gpu(l.outputs * l.batch * l.steps, 0, uh.delta_gpu, 1);
+
+    fill_gpu(l.outputs * l.batch * l.steps, 0, wz.delta_gpu, 1);
+    fill_gpu(l.outputs * l.batch * l.steps, 0, wr.delta_gpu, 1);
+    fill_gpu(l.outputs * l.batch * l.steps, 0, wh.delta_gpu, 1);
+    if(net.train) {
+        fill_gpu(l.outputs * l.batch * l.steps, 0, l.delta_gpu, 1);
+        copy_gpu(l.outputs*l.batch, l.state_gpu, 1, l.prev_state_gpu, 1);
     }
 
     for (i = 0; i < l.steps; ++i) {
-        s.input = l.state_gpu;
-        forward_connected_layer_gpu(state_z_layer, s);
-        forward_connected_layer_gpu(state_r_layer, s);
+        s.input_gpu = l.state_gpu;
+        forward_connected_layer_gpu(wz, s);
+        forward_connected_layer_gpu(wr, s);
 
-        s.input = state.input;
-        forward_connected_layer_gpu(input_z_layer, s);
-        forward_connected_layer_gpu(input_r_layer, s);
-        forward_connected_layer_gpu(input_h_layer, s);
+        s.input_gpu = net.input_gpu;
+        forward_connected_layer_gpu(uz, s);
+        forward_connected_layer_gpu(ur, s);
+        forward_connected_layer_gpu(uh, s);
 
+        copy_gpu(l.outputs*l.batch, uz.output_gpu, 1, l.z_gpu, 1);
+        axpy_gpu(l.outputs*l.batch, 1, wz.output_gpu, 1, l.z_gpu, 1);
 
-        copy_ongpu(l.outputs*l.batch, input_z_layer.output_gpu, 1, l.z_gpu, 1);
-        axpy_ongpu(l.outputs*l.batch, 1, state_z_layer.output_gpu, 1, l.z_gpu, 1);
+        copy_gpu(l.outputs*l.batch, ur.output_gpu, 1, l.r_gpu, 1);
+        axpy_gpu(l.outputs*l.batch, 1, wr.output_gpu, 1, l.r_gpu, 1);
 
-        copy_ongpu(l.outputs*l.batch, input_r_layer.output_gpu, 1, l.r_gpu, 1);
-        axpy_ongpu(l.outputs*l.batch, 1, state_r_layer.output_gpu, 1, l.r_gpu, 1);
+        activate_array_gpu(l.z_gpu, l.outputs*l.batch, LOGISTIC);
+        activate_array_gpu(l.r_gpu, l.outputs*l.batch, LOGISTIC);
 
-        activate_array_ongpu(l.z_gpu, l.outputs*l.batch, LOGISTIC);
-        activate_array_ongpu(l.r_gpu, l.outputs*l.batch, LOGISTIC);
+        copy_gpu(l.outputs*l.batch, l.state_gpu, 1, l.forgot_state_gpu, 1);
+        mul_gpu(l.outputs*l.batch, l.r_gpu, 1, l.forgot_state_gpu, 1);
 
-        copy_ongpu(l.outputs*l.batch, l.state_gpu, 1, l.forgot_state_gpu, 1);
-        mul_ongpu(l.outputs*l.batch, l.r_gpu, 1, l.forgot_state_gpu, 1);
+        s.input_gpu = l.forgot_state_gpu;
+        forward_connected_layer_gpu(wh, s);
 
-        s.input = l.forgot_state_gpu;
-        forward_connected_layer_gpu(state_h_layer, s);
+        copy_gpu(l.outputs*l.batch, uh.output_gpu, 1, l.h_gpu, 1);
+        axpy_gpu(l.outputs*l.batch, 1, wh.output_gpu, 1, l.h_gpu, 1);
 
-        copy_ongpu(l.outputs*l.batch, input_h_layer.output_gpu, 1, l.h_gpu, 1);
-        axpy_ongpu(l.outputs*l.batch, 1, state_h_layer.output_gpu, 1, l.h_gpu, 1);
-
-        #ifdef USET
-        activate_array_ongpu(l.h_gpu, l.outputs*l.batch, TANH);
-        #else
-        activate_array_ongpu(l.h_gpu, l.outputs*l.batch, LOGISTIC);
-        #endif
+        if(l.tanh){
+            activate_array_gpu(l.h_gpu, l.outputs*l.batch, TANH);
+        } else {
+            activate_array_gpu(l.h_gpu, l.outputs*l.batch, LOGISTIC);
+        }
 
         weighted_sum_gpu(l.state_gpu, l.h_gpu, l.z_gpu, l.outputs*l.batch, l.output_gpu);
+        copy_gpu(l.outputs*l.batch, l.output_gpu, 1, l.state_gpu, 1);
 
-        copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.state_gpu, 1);
-
-        state.input += l.inputs*l.batch;
+        net.input_gpu += l.inputs*l.batch;
         l.output_gpu += l.outputs*l.batch;
-        increment_layer(&input_z_layer, 1);
-        increment_layer(&input_r_layer, 1);
-        increment_layer(&input_h_layer, 1);
+        increment_layer(&uz, 1);
+        increment_layer(&ur, 1);
+        increment_layer(&uh, 1);
 
-        increment_layer(&state_z_layer, 1);
-        increment_layer(&state_r_layer, 1);
-        increment_layer(&state_h_layer, 1);
+        increment_layer(&wz, 1);
+        increment_layer(&wr, 1);
+        increment_layer(&wh, 1);
     }
 }
 
-void backward_gru_layer_gpu(layer l, network_state state)
+void backward_gru_layer_gpu(layer l, network net)
 {
-    network_state s = {0};
-    s.train = state.train;
+    network s = {0};
+    s.train = net.train;
     int i;
-    layer input_z_layer = *(l.input_z_layer);
-    layer input_r_layer = *(l.input_r_layer);
-    layer input_h_layer = *(l.input_h_layer);
+    layer uz = *(l.uz);
+    layer ur = *(l.ur);
+    layer uh = *(l.uh);
 
-    layer state_z_layer = *(l.state_z_layer);
-    layer state_r_layer = *(l.state_r_layer);
-    layer state_h_layer = *(l.state_h_layer);
+    layer wz = *(l.wz);
+    layer wr = *(l.wr);
+    layer wh = *(l.wh);
 
-    increment_layer(&input_z_layer, l.steps - 1);
-    increment_layer(&input_r_layer, l.steps - 1);
-    increment_layer(&input_h_layer, l.steps - 1);
+    increment_layer(&uz, l.steps - 1);
+    increment_layer(&ur, l.steps - 1);
+    increment_layer(&uh, l.steps - 1);
 
-    increment_layer(&state_z_layer, l.steps - 1);
-    increment_layer(&state_r_layer, l.steps - 1);
-    increment_layer(&state_h_layer, l.steps - 1);
+    increment_layer(&wz, l.steps - 1);
+    increment_layer(&wr, l.steps - 1);
+    increment_layer(&wh, l.steps - 1);
 
-    state.input += l.inputs*l.batch*(l.steps-1);
-    if(state.delta) state.delta += l.inputs*l.batch*(l.steps-1);
+    net.input_gpu += l.inputs*l.batch*(l.steps-1);
+    if(net.delta_gpu) net.delta_gpu += l.inputs*l.batch*(l.steps-1);
     l.output_gpu += l.outputs*l.batch*(l.steps-1);
     l.delta_gpu += l.outputs*l.batch*(l.steps-1);
+    float *end_state = l.output_gpu;
     for (i = l.steps-1; i >= 0; --i) {
-        if(i != 0) copy_ongpu(l.outputs*l.batch, l.output_gpu - l.outputs*l.batch, 1, l.prev_state_gpu, 1);
+        if(i != 0) copy_gpu(l.outputs*l.batch, l.output_gpu - l.outputs*l.batch, 1, l.state_gpu, 1);
+        else copy_gpu(l.outputs*l.batch, l.prev_state_gpu, 1, l.state_gpu, 1);
         float *prev_delta_gpu = (i == 0) ? 0 : l.delta_gpu - l.outputs*l.batch;
 
-        copy_ongpu(l.outputs*l.batch, input_z_layer.output_gpu, 1, l.z_gpu, 1);
-        axpy_ongpu(l.outputs*l.batch, 1, state_z_layer.output_gpu, 1, l.z_gpu, 1);
-
-        copy_ongpu(l.outputs*l.batch, input_r_layer.output_gpu, 1, l.r_gpu, 1);
-        axpy_ongpu(l.outputs*l.batch, 1, state_r_layer.output_gpu, 1, l.r_gpu, 1);
-
-        activate_array_ongpu(l.z_gpu, l.outputs*l.batch, LOGISTIC);
-        activate_array_ongpu(l.r_gpu, l.outputs*l.batch, LOGISTIC);
-
-        copy_ongpu(l.outputs*l.batch, input_h_layer.output_gpu, 1, l.h_gpu, 1);
-        axpy_ongpu(l.outputs*l.batch, 1, state_h_layer.output_gpu, 1, l.h_gpu, 1);
-
-        #ifdef USET
-        activate_array_ongpu(l.h_gpu, l.outputs*l.batch, TANH);
-        #else
-        activate_array_ongpu(l.h_gpu, l.outputs*l.batch, LOGISTIC);
-        #endif
-        
-        weighted_delta_gpu(l.prev_state_gpu, l.h_gpu, l.z_gpu, prev_delta_gpu, input_h_layer.delta_gpu, input_z_layer.delta_gpu, l.outputs*l.batch, l.delta_gpu);
-
-        #ifdef USET
-        gradient_array_ongpu(l.h_gpu, l.outputs*l.batch, TANH, input_h_layer.delta_gpu);
-        #else
-        gradient_array_ongpu(l.h_gpu, l.outputs*l.batch, LOGISTIC, input_h_layer.delta_gpu);
-        #endif
-
-        copy_ongpu(l.outputs*l.batch, input_h_layer.delta_gpu, 1, state_h_layer.delta_gpu, 1);
-        
-        copy_ongpu(l.outputs*l.batch, l.prev_state_gpu, 1, l.forgot_state_gpu, 1);
-        mul_ongpu(l.outputs*l.batch, l.r_gpu, 1, l.forgot_state_gpu, 1);
-        fill_ongpu(l.outputs*l.batch, 0, l.forgot_delta_gpu, 1);
-
-        s.input = l.forgot_state_gpu;
-        s.delta = l.forgot_delta_gpu;
-        
-        backward_connected_layer_gpu(state_h_layer, s);
+        copy_gpu(l.outputs*l.batch, uz.output_gpu, 1, l.z_gpu, 1);
+        axpy_gpu(l.outputs*l.batch, 1, wz.output_gpu, 1, l.z_gpu, 1);
+
+        copy_gpu(l.outputs*l.batch, ur.output_gpu, 1, l.r_gpu, 1);
+        axpy_gpu(l.outputs*l.batch, 1, wr.output_gpu, 1, l.r_gpu, 1);
+
+        activate_array_gpu(l.z_gpu, l.outputs*l.batch, LOGISTIC);
+        activate_array_gpu(l.r_gpu, l.outputs*l.batch, LOGISTIC);
+
+        copy_gpu(l.outputs*l.batch, uh.output_gpu, 1, l.h_gpu, 1);
+        axpy_gpu(l.outputs*l.batch, 1, wh.output_gpu, 1, l.h_gpu, 1);
+
+        if(l.tanh){
+            activate_array_gpu(l.h_gpu, l.outputs*l.batch, TANH);
+        } else {
+            activate_array_gpu(l.h_gpu, l.outputs*l.batch, LOGISTIC);
+        }
+
+        weighted_delta_gpu(l.state_gpu, l.h_gpu, l.z_gpu, prev_delta_gpu, uh.delta_gpu, uz.delta_gpu, l.outputs*l.batch, l.delta_gpu);
+
+        if(l.tanh){
+            gradient_array_gpu(l.h_gpu, l.outputs*l.batch, TANH, uh.delta_gpu);
+        } else {
+            gradient_array_gpu(l.h_gpu, l.outputs*l.batch, LOGISTIC, uh.delta_gpu);
+        }
+
+        copy_gpu(l.outputs*l.batch, uh.delta_gpu, 1, wh.delta_gpu, 1);
+
+        copy_gpu(l.outputs*l.batch, l.state_gpu, 1, l.forgot_state_gpu, 1);
+        mul_gpu(l.outputs*l.batch, l.r_gpu, 1, l.forgot_state_gpu, 1);
+        fill_gpu(l.outputs*l.batch, 0, l.forgot_delta_gpu, 1);
+
+        s.input_gpu = l.forgot_state_gpu;
+        s.delta_gpu = l.forgot_delta_gpu;
+
+        backward_connected_layer_gpu(wh, s);
         if(prev_delta_gpu) mult_add_into_gpu(l.outputs*l.batch, l.forgot_delta_gpu, l.r_gpu, prev_delta_gpu);
-        mult_add_into_gpu(l.outputs*l.batch, l.forgot_delta_gpu, l.prev_state_gpu, input_r_layer.delta_gpu);
-
-        gradient_array_ongpu(l.r_gpu, l.outputs*l.batch, LOGISTIC, input_r_layer.delta_gpu);
-        copy_ongpu(l.outputs*l.batch, input_r_layer.delta_gpu, 1, state_r_layer.delta_gpu, 1);
-
-        gradient_array_ongpu(l.z_gpu, l.outputs*l.batch, LOGISTIC, input_z_layer.delta_gpu);
-        copy_ongpu(l.outputs*l.batch, input_z_layer.delta_gpu, 1, state_z_layer.delta_gpu, 1);
-        
-        s.input = l.prev_state_gpu;
-        s.delta = prev_delta_gpu;
-        
-        backward_connected_layer_gpu(state_r_layer, s);
-        backward_connected_layer_gpu(state_z_layer, s);
-
-        s.input = state.input;
-        s.delta = state.delta;
-        
-        backward_connected_layer_gpu(input_h_layer, s);
-        backward_connected_layer_gpu(input_r_layer, s);
-        backward_connected_layer_gpu(input_z_layer, s);
-
-
-        state.input -= l.inputs*l.batch;
-        if(state.delta) state.delta -= l.inputs*l.batch;
+        mult_add_into_gpu(l.outputs*l.batch, l.forgot_delta_gpu, l.state_gpu, ur.delta_gpu);
+
+        gradient_array_gpu(l.r_gpu, l.outputs*l.batch, LOGISTIC, ur.delta_gpu);
+        copy_gpu(l.outputs*l.batch, ur.delta_gpu, 1, wr.delta_gpu, 1);
+
+        gradient_array_gpu(l.z_gpu, l.outputs*l.batch, LOGISTIC, uz.delta_gpu);
+        copy_gpu(l.outputs*l.batch, uz.delta_gpu, 1, wz.delta_gpu, 1);
+
+        s.input_gpu = l.state_gpu;
+        s.delta_gpu = prev_delta_gpu;
+
+        backward_connected_layer_gpu(wr, s);
+        backward_connected_layer_gpu(wz, s);
+
+        s.input_gpu = net.input_gpu;
+        s.delta_gpu = net.delta_gpu;
+
+        backward_connected_layer_gpu(uh, s);
+        backward_connected_layer_gpu(ur, s);
+        backward_connected_layer_gpu(uz, s);
+
+
+        net.input_gpu -= l.inputs*l.batch;
+        if(net.delta_gpu) net.delta_gpu -= l.inputs*l.batch;
         l.output_gpu -= l.outputs*l.batch;
         l.delta_gpu -= l.outputs*l.batch;
-        increment_layer(&input_z_layer, -1);
-        increment_layer(&input_r_layer, -1);
-        increment_layer(&input_h_layer, -1);
+        increment_layer(&uz, -1);
+        increment_layer(&ur, -1);
+        increment_layer(&uh, -1);
 
-        increment_layer(&state_z_layer, -1);
-        increment_layer(&state_r_layer, -1);
-        increment_layer(&state_h_layer, -1);
+        increment_layer(&wz, -1);
+        increment_layer(&wr, -1);
+        increment_layer(&wh, -1);
     }
+    copy_gpu(l.outputs*l.batch, end_state, 1, l.state_gpu, 1);
 }
 #endif
diff --git a/image.darknet/inst/include/darknet/src/gru_layer.h b/image.darknet/inst/include/darknet/src/gru_layer.h
index 9e19cee..9067942 100644
--- a/image.darknet/inst/include/darknet/src/gru_layer.h
+++ b/image.darknet/inst/include/darknet/src/gru_layer.h
@@ -6,16 +6,16 @@
 #include "layer.h"
 #include "network.h"
 
-layer make_gru_layer(int batch, int inputs, int outputs, int steps, int batch_normalize);
+layer make_gru_layer(int batch, int inputs, int outputs, int steps, int batch_normalize, int adam);
 
-void forward_gru_layer(layer l, network_state state);
-void backward_gru_layer(layer l, network_state state);
-void update_gru_layer(layer l, int batch, float learning_rate, float momentum, float decay);
+void forward_gru_layer(layer l, network state);
+void backward_gru_layer(layer l, network state);
+void update_gru_layer(layer l, update_args a);
 
 #ifdef GPU
-void forward_gru_layer_gpu(layer l, network_state state);
-void backward_gru_layer_gpu(layer l, network_state state);
-void update_gru_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay);
+void forward_gru_layer_gpu(layer l, network state);
+void backward_gru_layer_gpu(layer l, network state);
+void update_gru_layer_gpu(layer l, update_args a);
 void push_gru_layer(layer l);
 void pull_gru_layer(layer l);
 #endif
diff --git a/image.darknet/inst/include/darknet/src/im2col.h b/image.darknet/inst/include/darknet/src/im2col.h
index f0ddeee..02c4247 100644
--- a/image.darknet/inst/include/darknet/src/im2col.h
+++ b/image.darknet/inst/include/darknet/src/im2col.h
@@ -7,7 +7,7 @@ void im2col_cpu(float* data_im,
 
 #ifdef GPU
 
-void im2col_ongpu(float *im,
+void im2col_gpu(float *im,
          int channels, int height, int width,
          int ksize, int stride, int pad,float *data_col);
 
diff --git a/image.darknet/inst/include/darknet/src/im2col_kernels.cu b/image.darknet/inst/include/darknet/src/im2col_kernels.cu
index d42d600..07b5e67 100644
--- a/image.darknet/inst/include/darknet/src/im2col_kernels.cu
+++ b/image.darknet/inst/include/darknet/src/im2col_kernels.cu
@@ -45,7 +45,7 @@ __global__ void im2col_gpu_kernel(const int n, const float* data_im,
     }
 }
 
-void im2col_ongpu(float *im,
+void im2col_gpu(float *im,
          int channels, int height, int width,
          int ksize, int stride, int pad, float *data_col){
     // We are going to launch channels * height_col * width_col kernels, each
diff --git a/image.darknet/inst/include/darknet/src/image.c b/image.darknet/inst/include/darknet/src/image.c
index 5a90efd..4a2c6ba 100644
--- a/image.darknet/inst/include/darknet/src/image.c
+++ b/image.darknet/inst/include/darknet/src/image.c
@@ -10,12 +10,6 @@
 #define STB_IMAGE_WRITE_IMPLEMENTATION
 #include "stb_image_write.h"
 
-#ifdef OPENCV
-#include "opencv2/highgui/highgui_c.h"
-#include "opencv2/imgproc/imgproc_c.h"
-#endif
-
-
 int windows = 0;
 
 float colors[6][3] = { {1,0,1}, {0,0,1},{0,1,1},{0,1,0},{1,1,0},{1,0,0} };
@@ -31,6 +25,70 @@ float get_color(int c, int x, int max)
     return r;
 }
 
+/* Render a multi-channel mask as a 3-channel image: every mask channel is scaled by its own
+ * get_color() tint and accumulated onto the image returned by make_image(). Caller owns the result. */
+image mask_to_rgb(image mask)
+{
+    image rgb = make_image(mask.w, mask.h, 3);
+    int area = rgb.w*rgb.h;
+    int ch, p, k;
+    for(ch = 0; ch < mask.c; ++ch){
+        int offset = ch*123457 % mask.c;   /* per-channel palette index handed to get_color() */
+        float tint[3];                     /* tint[0] = red, tint[1] = green, tint[2] = blue */
+        for(k = 0; k < 3; ++k) tint[k] = get_color(2-k, offset, mask.c);
+        for(p = 0; p < area; ++p){
+            float weight = mask.data[ch*area + p];
+            for(k = 0; k < 3; ++k) rgb.data[p + k*area] += weight*tint[k];
+        }
+    }
+    return rgb;
+}
+
+/* Return the sample at column x, row y of channel c; indices are verified only by assert(). */
+static float get_pixel(image m, int x, int y, int c) {
+    assert(x >= 0 && y >= 0 && c >= 0 && x < m.w && y < m.h && c < m.c); /* negative indices are out of range too */
+    return m.data[c*m.h*m.w + y*m.w + x];
+}
+/*
+ * Bounds-tolerant variant of get_pixel(): any coordinate or channel outside
+ * the image reads as 0 (zero padding). Coordinates are not clamped to the
+ * nearest edge.
+ */
+static float get_pixel_extend(image m, int x, int y, int c)
+{
+    int in_plane = x >= 0 && x < m.w && y >= 0 && y < m.h;
+    int in_depth = c >= 0 && c < m.c;
+    if(!in_plane || !in_depth) return 0;
+    return get_pixel(m, x, y, c);
+}
+/* Store val at column x, row y of channel c; a write outside the image is silently ignored. */
+static void set_pixel(image m, int x, int y, int c, float val)
+{
+    int inside = x >= 0 && y >= 0 && c >= 0 && x < m.w && y < m.h && c < m.c;
+    if(inside) m.data[c*m.h*m.w + y*m.w + x] = val;
+}
+/* Add val to the sample at column x, row y of channel c; indices are verified only by assert(). */
+static void add_pixel(image m, int x, int y, int c, float val) {
+    assert(x >= 0 && y >= 0 && c >= 0 && x < m.w && y < m.h && c < m.c); /* negative indices are out of range too */
+    m.data[c*m.h*m.w + y*m.w + x] += val;
+}
+
+/* Sample channel c at fractional position (x, y) as a weighted blend of the four surrounding pixels. */
+static float bilinear_interpolate(image im, float x, float y, int c)
+{
+    int x0 = (int) floorf(x);
+    int y0 = (int) floorf(y);
+    float fx = x - x0;   /* weight given to column x0+1 */
+    float fy = y - y0;   /* weight given to row y0+1 */
+    float p00 = get_pixel_extend(im, x0, y0, c);   /* neighbors outside the image read as 0 */
+    float p01 = get_pixel_extend(im, x0, y0+1, c);
+    float p10 = get_pixel_extend(im, x0+1, y0, c);
+    float p11 = get_pixel_extend(im, x0+1, y0+1, c);
+    float blended = (1-fy)*(1-fx)*p00 + fy*(1-fx)*p01 + (1-fy)*fx*p10 + fy*fx*p11;
+    return blended;
+}
+
+
 void composite_image(image source, image dest, int dx, int dy)
 {
     int x,y,k;
@@ -73,6 +131,7 @@ image tile_images(image a, image b, int dx)
 
 image get_label(image **characters, char *string, int size)
 {
+    size = size/10;
     if(size > 7) size = 7;
     image label = make_empty_image(0,0,0);
     while(*string){
@@ -177,23 +236,36 @@ image **load_alphabet()
     return alphabets;
 }
 
-void draw_detections(image im, int num, float thresh, box *boxes, float **probs, char **names, image **alphabet, int classes)
+void draw_detections(image im, detection *dets, int num, float thresh, char **names, image **alphabet, int classes)
 {
-    int i;
+    int i,j;
 
     for(i = 0; i < num; ++i){
-        int class = max_index(probs[i], classes);
-        float prob = probs[i][class];
-        if(prob > thresh){
-
-            int width = im.h * .012;
-
-            if(0){
-                width = pow(prob, 1./2.)*10+1;
-                alphabet = 0;
+        char labelstr[4096] = {0};
+        int class = -1;
+        for(j = 0; j < classes; ++j){
+            if (dets[i].prob[j] > thresh){
+                if (class < 0) {
+                    strcat(labelstr, names[j]);
+                    class = j;
+                } else {
+                    strcat(labelstr, ", ");
+                    strcat(labelstr, names[j]);
+                }
+                printf("%s: %.0f%%\n", names[j], dets[i].prob[j]*100);
             }
+        }
+        if(class >= 0){
+            int width = im.h * .006;
 
-            printf("%s: %.0f%%\n", names[class], prob*100);
+            /*
+               if(0){
+               width = pow(prob, 1./2.)*10+1;
+               alphabet = 0;
+               }
+             */
+
+            //printf("%d %s: %.0f%%\n", i, names[class], prob*100);
             int offset = class*123457 % classes;
             float red = get_color(2,offset,classes);
             float green = get_color(1,offset,classes);
@@ -205,7 +277,8 @@ void draw_detections(image im, int num, float thresh, box *boxes, float **probs,
             rgb[0] = red;
             rgb[1] = green;
             rgb[2] = blue;
-            box b = boxes[i];
+            box b = dets[i].bbox;
+            //printf("%f %f %f %f\n", b.x, b.y, b.w, b.h);
 
             int left  = (b.x-b.w/2.)*im.w;
             int right = (b.x+b.w/2.)*im.w;
@@ -219,8 +292,18 @@ void draw_detections(image im, int num, float thresh, box *boxes, float **probs,
 
             draw_box_width(im, left, top, right, bot, width, red, green, blue);
             if (alphabet) {
-                image label = get_label(alphabet, names[class], (im.h*.03)/10);
+                image label = get_label(alphabet, labelstr, (im.h*.03));
                 draw_label(im, top + width, left, label, rgb);
+                free_image(label);
+            }
+            if (dets[i].mask){
+                image mask = float_to_image(14, 14, 1, dets[i].mask);
+                image resized_mask = resize_image(mask, b.w*im.w, b.h*im.h);
+                image tmask = threshold_image(resized_mask, .5);
+                embed_image(tmask, im, left, top);
+                free_image(mask);
+                free_image(resized_mask);
+                free_image(tmask);
             }
         }
     }
@@ -294,6 +377,54 @@ image image_distance(image a, image b)
     return dist;
 }
 
+/* Alpha-blend source onto dest at offset (dx, dy); source opacity falls off linearly from its center. */
+void ghost_image(image source, image dest, int dx, int dy)
+{
+    int x,y,k;
+    float max_dist = sqrt((-source.w/2. + .5)*(-source.w/2. + .5)); /* dist below for x = 0 with the y term omitted */
+    for(k = 0; k < source.c; ++k){
+        for(y = 0; y < source.h; ++y){
+            for(x = 0; x < source.w; ++x){
+                float dist = sqrt((x - source.w/2. + .5)*(x - source.w/2. + .5) + (y - source.h/2. + .5)*(y - source.h/2. + .5));
+                float alpha = (1 - dist/max_dist);
+                if(alpha < 0) alpha = 0;
+                float v1 = get_pixel(source, x,y,k);
+                float v2 = get_pixel_extend(dest, dx+x,dy+y,k); /* set_pixel() skips off-canvas writes, so the read must be safe there too */
+                set_pixel(dest, dx+x, dy+y, k, alpha*v1 + (1-alpha)*v2);
+            }
+        }
+    }
+}
+
+/* Pixelate im in place: every pixel takes the value at the top-left corner of its s-by-s tile. */
+void blocky_image(image im, int s)
+{
+    int i,j,k;
+    if(s == 0) return; /* a zero tile size would divide by zero in i/s below; leave im untouched */
+    for(k = 0; k < im.c; ++k){
+        for(j = 0; j < im.h; ++j){
+            for(i = 0; i < im.w; ++i) im.data[i + im.w*(j + im.h*k)] = im.data[i/s*s + im.w*(j/s*s + im.h*k)];
+        }
+    }
+}
+
+/* Pixelate, in place, the w-by-h rectangle of im whose top-left corner is (dx, dy), using 32-pixel tiles. */
+void censor_image(image im, int dx, int dy, int w, int h)
+{
+    const int tile = 32;
+    int x, y, ch;
+    /* A negative origin is clamped to 0; the extent (w, h) is then measured from the clamped origin. */
+    if(dx < 0) dx = 0;
+    if(dy < 0) dy = 0;
+    for(ch = 0; ch < im.c; ++ch){
+        float *plane = im.data + im.w*im.h*ch;
+        for(y = dy; y < dy + h && y < im.h; ++y){
+            const float *anchor_row = plane + im.w*(y/tile*tile);   /* row holding this tile's top-left pixel */
+            for(x = dx; x < dx + w && x < im.w; ++x) plane[x + im.w*y] = anchor_row[x/tile*tile];
+        }
+    }
+}
+
 void embed_image(image source, image dest, int dx, int dy)
 {
     int x,y,k;
@@ -380,6 +511,11 @@ void normalize_image2(image p)
     free(max);
 }
 
+void copy_image_into(image src, image dest)   // copy src's samples into dest's existing buffer; nothing is allocated here
+{
+    memcpy(dest.data, src.data, src.h*src.w*src.c*sizeof(float));   // byte count comes from src's dimensions only; dest's size is not checked
+}
+
 image copy_image(image p)
 {
     image copy = p;
@@ -398,145 +534,27 @@ void rgbgr_image(image im)
     }
 }
 
-#ifdef OPENCV
-void show_image_cv(image p, const char *name)
-{
-    int x,y,k;
-    image copy = copy_image(p);
-    constrain_image(copy);
-    if(p.c == 3) rgbgr_image(copy);
-    //normalize_image(copy);
-
-    char buff[256];
-    //sprintf(buff, "%s (%d)", name, windows);
-    sprintf(buff, "%s", name);
-
-    IplImage *disp = cvCreateImage(cvSize(p.w,p.h), IPL_DEPTH_8U, p.c);
-    int step = disp->widthStep;
-    cvNamedWindow(buff, CV_WINDOW_NORMAL); 
-    //cvMoveWindow(buff, 100*(windows%10) + 200*(windows/10), 100*(windows%10));
-    ++windows;
-    for(y = 0; y < p.h; ++y){
-        for(x = 0; x < p.w; ++x){
-            for(k= 0; k < p.c; ++k){
-                disp->imageData[y*step + x*p.c + k] = (unsigned char)(get_pixel(copy,x,y,k)*255);
-            }
-        }
-    }
-    free_image(copy);
-    if(0){
-        int w = 448;
-        int h = w*p.h/p.w;
-        if(h > 1000){
-            h = 1000;
-            w = h*p.w/p.h;
-        }
-        IplImage *buffer = disp;
-        disp = cvCreateImage(cvSize(w, h), buffer->depth, buffer->nChannels);
-        cvResize(buffer, disp, CV_INTER_LINEAR);
-        cvReleaseImage(&buffer);
-    }
-    cvShowImage(buff, disp);
-    cvReleaseImage(&disp);
-}
-#endif
-
-void show_image(image p, const char *name)
+int show_image(image p, const char *name, int ms)   // returns show_image_cv's result, or -1 when built without OPENCV
 {
 #ifdef OPENCV
+    int c = show_image_cv(p, name, ms);   // name and ms are forwarded unchanged
+    return c;
 #else
     fprintf(stderr, "Not compiled with OpenCV, saving to %s.png instead\n", name);
     save_image(p, name);
+    return -1;   // nothing was displayed: the image went through save_image() and ms was not used
 #endif
 }
 
-#ifdef OPENCV
-
-image ipl_to_image(IplImage* src)
-{
-    unsigned char *data = (unsigned char *)src->imageData;
-    int h = src->height;
-    int w = src->width;
-    int c = src->nChannels;
-    int step = src->widthStep;
-    image out = make_image(w, h, c);
-    int i, j, k, count=0;;
-
-    for(k= 0; k < c; ++k){
-        for(i = 0; i < h; ++i){
-            for(j = 0; j < w; ++j){
-                out.data[count++] = data[i*step + j*c + k]/255.;
-            }
-        }
-    }
-    return out;
-}
-
-image load_image_cv(char *filename, int channels)
-{
-    IplImage* src = 0;
-    int flag = -1;
-    if (channels == 0) flag = -1;
-    else if (channels == 1) flag = 0;
-    else if (channels == 3) flag = 1;
-    else {
-        fprintf(stderr, "OpenCV can't force load with %d channels\n", channels);
-    }
-
-    if( (src = cvLoadImage(filename, flag)) == 0 )
-    {
-        fprintf(stderr, "Cannot load image \"%s\"\n", filename);
-        char buff[256];
-        sprintf(buff, "echo %s >> bad.list", filename);
-        system(buff);
-        return make_image(10,10,3);
-        //exit(0);
-    }
-    image out = ipl_to_image(src);
-    cvReleaseImage(&src);
-    rgbgr_image(out);
-    return out;
-}
-
-image get_image_from_stream(CvCapture *cap)
-{
-    IplImage* src = cvQueryFrame(cap);
-    if (!src) return make_empty_image(0,0,0);
-    image im = ipl_to_image(src);
-    rgbgr_image(im);
-    return im;
-}
-
-void save_image_jpg(image p, const char *name)
-{
-    image copy = copy_image(p);
-    if(p.c == 3) rgbgr_image(copy);
-    int x,y,k;
-
-    char buff[256];
-    sprintf(buff, "%s.jpg", name);
-
-    IplImage *disp = cvCreateImage(cvSize(p.w,p.h), IPL_DEPTH_8U, p.c);
-    int step = disp->widthStep;
-    for(y = 0; y < p.h; ++y){
-        for(x = 0; x < p.w; ++x){
-            for(k= 0; k < p.c; ++k){
-                disp->imageData[y*step + x*p.c + k] = (unsigned char)(get_pixel(copy,x,y,k)*255);
-            }
-        }
-    }
-    cvSaveImage(buff, disp,0);
-    cvReleaseImage(&disp);
-    free_image(copy);
-}
-#endif
-
-void save_image_png(image im, const char *name)
+void save_image_options(image im, const char *name, IMTYPE f, int quality)
 {
     char buff[256];
     //sprintf(buff, "%s (%d)", name, windows);
-    sprintf(buff, "%s.png", name);
+    if(f == PNG)       snprintf(buff, sizeof(buff), "%s.png", name);   // bounded write: name is caller-supplied and buff holds only 256 bytes
+    else if (f == BMP) snprintf(buff, sizeof(buff), "%s.bmp", name);
+    else if (f == TGA) snprintf(buff, sizeof(buff), "%s.tga", name);
+    else if (f == JPG) snprintf(buff, sizeof(buff), "%s.jpg", name);
+    else               snprintf(buff, sizeof(buff), "%s.png", name);   // unrecognised type: a .png name is still built for the messages below
     unsigned char *data = calloc(im.w*im.h*im.c, sizeof(char));
     int i,k;
     for(k = 0; k < im.c; ++k){
@@ -544,21 +562,20 @@ void save_image_png(image im, const char *name)
             data[i*im.c+k] = (unsigned char) (255*im.data[i + k*im.w*im.h]);
         }
     }
-    int success = stbi_write_png(buff, im.w, im.h, im.c, data, im.w*im.c);
+    int success = 0;
+    if(f == PNG)       success = stbi_write_png(buff, im.w, im.h, im.c, data, im.w*im.c);
+    else if (f == BMP) success = stbi_write_bmp(buff, im.w, im.h, im.c, data);
+    else if (f == TGA) success = stbi_write_tga(buff, im.w, im.h, im.c, data);
+    else if (f == JPG) success = stbi_write_jpg(buff, im.w, im.h, im.c, data, quality);
     free(data);
     if(!success) fprintf(stderr, "Failed to write image %s\n", buff);
 }
 
 void save_image(image im, const char *name)
 {
-#ifdef OPENCV
-    save_image_jpg(im, name);
-#else
-    save_image_png(im, name);
-#endif
+    save_image_options(im, name, JPG, 80);   // always JPG with quality argument 80, regardless of OPENCV
 }
 
-
 void show_image_layers(image p, char *name)
 {
     int i;
@@ -566,7 +583,7 @@ void show_image_layers(image p, char *name)
     for(i = 0; i < p.c; ++i){
         sprintf(buff, "%s - Layer %d", name, i);
         image layer = get_image_layer(p, i);
-        show_image(layer, buff);
+        show_image(layer, buff, 1);
         free_image(layer);
     }
 }
@@ -574,7 +591,7 @@ void show_image_layers(image p, char *name)
 void show_image_collapsed(image p, char *name)
 {
     image c = collapse_image_layers(p, 1);
-    show_image(c, name);
+    show_image(c, name, 1);
     free_image(c);
 }
 
@@ -613,6 +630,29 @@ image float_to_image(int w, int h, int c, float *data)
     return out;
 }
 
+void place_image(image im, int w, int h, int dx, int dy, image canvas)   // draw im, rescaled to w x h, onto canvas with its top-left at (dx,dy)
+{
+    int col, row, ch;
+    for(ch = 0; ch < im.c; ++ch){
+        for(row = 0; row < h; ++row){
+            float src_y = ((float)row / h) * im.h;   // fractional source row for this output row
+            for(col = 0; col < w; ++col){
+                float src_x = ((float)col / w) * im.w;
+                float sample = bilinear_interpolate(im, src_x, src_y, ch);
+                set_pixel(canvas, col + dx, row + dy, ch, sample);
+            }
+        }
+    }
+}
+
+image center_crop_image(image im, int w, int h)   // take the largest centred square of im and return it resized to w x h
+{
+    int side = (im.h > im.w) ? im.w : im.h;   // shorter edge of im
+    image square = crop_image(im, (im.w - side) / 2, (im.h - side)/2, side, side);
+    image out = resize_image(square, w, h);
+    free_image(square);   // the intermediate crop is released; only the resized image is returned
+    return out;
+}
 
 image rotate_crop_image(image im, float rad, float s, int w, int h, float dx, float dy, float aspect)
 {
@@ -652,6 +692,12 @@ image rotate_image(image im, float rad)
     return rot;
 }
 
+void fill_image(image m, float s)   // set every sample of m to the constant s
+{
+    int left = m.h*m.w*m.c;   // total number of samples across all channels
+    while(left-- > 0) m.data[left] = s;
+}
+
 void translate_image(image m, float s)
 {
     int i;
@@ -676,9 +722,7 @@ image crop_image(image im, int dx, int dy, int w, int h)
                 float val = 0;
                 r = constrain_int(r, 0, im.h-1);
                 c = constrain_int(c, 0, im.w-1);
-                if (r >= 0 && r < im.h && c >= 0 && c < im.w) {
-                    val = get_pixel(im, c, r, k);
-                }
+                val = get_pixel(im, c, r, k);
                 set_pixel(cropped, i, j, k, val);
             }
         }
@@ -746,11 +790,44 @@ void composite_3d(char *f1, char *f2, char *out, int delta)
     for(i = 0; i < c.w*c.h; ++i){
         c.data[i] = a.data[i];
     }
-#ifdef OPENCV
-    save_image_jpg(c, out);
-#else
     save_image(c, out);
-#endif
+}
+
+void letterbox_image_into(image im, int w, int h, image boxed)   // resize im to fit inside w x h keeping its aspect ratio, then embed it centred in boxed
+{
+    int fit_w, fit_h;
+    // the axis with the smaller scale factor fills the target; the other is scaled by the same ratio
+    if (((float)h/im.h) > ((float)w/im.w)) {
+        fit_w = w;
+        fit_h = (im.h * w)/im.w;
+    } else {
+        fit_w = (im.w * h)/im.h;
+        fit_h = h;
+    }
+    image scaled = resize_image(im, fit_w, fit_h);
+    embed_image(scaled, boxed, (w-fit_w)/2, (h-fit_h)/2);   // boxed is not cleared or filled by this function
+    free_image(scaled);
+}
+
+image letterbox_image(image im, int w, int h)   // return a new w x h image holding im resized to fit (aspect ratio kept), centred on a .5-filled canvas
+{
+    int fit_w, fit_h;
+    // the axis with the smaller scale factor fills the target; the other is scaled by the same ratio
+    if (((float)h/im.h) > ((float)w/im.w)) {
+        fit_w = w;
+        fit_h = (im.h * w)/im.w;
+    } else {
+        fit_w = (im.w * h)/im.h;
+        fit_h = h;
+    }
+    image scaled = resize_image(im, fit_w, fit_h);
+    image canvas = make_image(w, h, im.c);
+    fill_image(canvas, .5);
+    // every sample is .5 before the resized picture is embedded,
+    // so the bars left around it keep that value
+    embed_image(scaled, canvas, (w-fit_w)/2, (h-fit_h)/2);
+    free_image(scaled);
+    return canvas;
 }
 
 image resize_max(image im, int max)
@@ -793,8 +870,9 @@ image random_crop_image(image im, int w, int h)
     return crop;
 }
 
-image random_augment_image(image im, float angle, float aspect, int low, int high, int size)
+augment_args random_augment_args(image im, float angle, float aspect, int low, int high, int w, int h)
 {
+    augment_args a = {0};
     aspect = rand_scale(aspect);
     int r = rand_int(low, high);
     int min = (im.h < im.w*aspect) ? im.h : im.w*aspect;
@@ -802,15 +880,27 @@ image random_augment_image(image im, float angle, float aspect, int low, int hig
 
     float rad = rand_uniform(-angle, angle) * TWO_PI / 360.;
 
-    float dx = (im.w*scale/aspect - size) / 2.;
-    float dy = (im.h*scale - size) / 2.;
-    if(dx < 0) dx = 0;
-    if(dy < 0) dy = 0;
+    float dx = (im.w*scale/aspect - w) / 2.;   // half the horizontal slack between the scaled image and the w-wide crop
+    float dy = (im.h*scale - h) / 2.;          // vertical slack is measured against the crop height h (was w, wrong for non-square crops)
+    //if(dx < 0) dx = 0;
+    //if(dy < 0) dy = 0;
     dx = rand_uniform(-dx, dx);
     dy = rand_uniform(-dy, dy);
 
-    image crop = rotate_crop_image(im, rad, scale, size, size, dx, dy, aspect);
+    a.rad = rad;
+    a.scale = scale;
+    a.w = w;
+    a.h = h;
+    a.dx = dx;
+    a.dy = dy;
+    a.aspect = aspect;
+    return a;
+}
 
+image random_augment_image(image im, float angle, float aspect, int low, int high, int w, int h)   // sample augmentation parameters and return the matching crop of im
+{
+    augment_args a = random_augment_args(im, angle, aspect, low, high, w, h);   // parameters are sampled by random_augment_args
+    image crop = rotate_crop_image(im, a.rad, a.scale, a.w, a.h, a.dx, a.dy, a.aspect);   // a.w and a.h carry the w and h passed in above
     return crop;
 }
 
@@ -824,6 +914,52 @@ float three_way_min(float a, float b, float c)
     return (a < b) ? ( (a < c) ? a : c) : ( (b < c) ? b : c) ;
 }
 
+void yuv_to_rgb(image im)   // convert a 3-channel image in place: channels 0/1/2 are read as Y/U/V and rewritten as R/G/B
+{
+    assert(im.c == 3);
+    int row, col;
+    int plane = im.h*im.w;   // samples per channel
+
+    for(row = 0; row < im.h; ++row){
+        for(col = 0; col < im.w; ++col){
+            float *p0 = im.data + row*im.w + col;   // this pixel in channel 0
+            float *p1 = p0 + plane;                 // same pixel, channel 1
+            float *p2 = p1 + plane;                 // same pixel, channel 2
+            float y = *p0;
+            float u = *p1;
+            float v = *p2;
+
+            // all three inputs are read before any of them is overwritten
+            *p0 = y + 1.13983*v;
+            *p1 = y - .39465*u - .58060*v;
+            *p2 = y + 2.03211*u;
+        }
+    }
+}
+
+void rgb_to_yuv(image im)   // convert a 3-channel image in place: channels 0/1/2 are read as R/G/B and rewritten as Y/U/V
+{
+    assert(im.c == 3);
+    int row, col;
+    int plane = im.h*im.w;   // samples per channel
+
+    for(row = 0; row < im.h; ++row){
+        for(col = 0; col < im.w; ++col){
+            float *p0 = im.data + row*im.w + col;   // this pixel in channel 0
+            float *p1 = p0 + plane;                 // same pixel, channel 1
+            float *p2 = p1 + plane;                 // same pixel, channel 2
+            float r = *p0;
+            float g = *p1;
+            float b = *p2;
+
+            // all three inputs are read before any of them is overwritten
+            *p0 = .299*r + .587*g + .114*b;
+            *p1 = -.14713*r - .28886*g + .436*b;
+            *p2 = .615*r - .51499*g - .10001*b;
+        }
+    }
+}
+
 // http://www.cs.rit.edu/~ncs/color/t_convert.html
 void rgb_to_hsv(image im)
 {
@@ -903,12 +1039,30 @@ void hsv_to_rgb(image im)
     }
 }
 
+void grayscale_image_3c(image im)   // in place: overwrite all three channels of each pixel with the same weighted sum of them
+{
+    assert(im.c == 3);
+    int row, col, ch;
+    const float weight[] = {0.299, 0.587, 0.114};   // weights applied to channels 0, 1, 2
+    const int plane = im.h*im.w;   // samples per channel
+    for(row = 0; row < im.h; ++row){
+        for(col = 0; col < im.w; ++col){
+            float *px = im.data + im.w*row + col;   // this pixel in channel 0
+            float luma = 0;
+            for(ch = 0; ch < 3; ++ch){
+                luma += weight[ch]*px[ch*plane];
+            }
+            px[0] = px[plane] = px[2*plane] = luma;
+        }
+    }
+}
+
 image grayscale_image(image im)
 {
     assert(im.c == 3);
     int i, j, k;
     image gray = make_image(im.w, im.h, 1);
-    float scale[] = {0.587, 0.299, 0.114};
+    float scale[] = {0.299, 0.587, 0.114};
     for(k = 0; k < im.c; ++k){
         for(j = 0; j < im.h; ++j){
             for(i = 0; i < im.w; ++i){
@@ -1042,21 +1196,6 @@ void saturate_exposure_image(image im, float sat, float exposure)
     constrain_image(im);
 }
 
-float bilinear_interpolate(image im, float x, float y, int c)
-{
-    int ix = (int) floorf(x);
-    int iy = (int) floorf(y);
-
-    float dx = x - ix;
-    float dy = y - iy;
-
-    float val = (1-dy) * (1-dx) * get_pixel_extend(im, ix, iy, c) + 
-        dy     * (1-dx) * get_pixel_extend(im, ix, iy+1, c) + 
-        (1-dy) *   dx   * get_pixel_extend(im, ix+1, iy, c) +
-        dy     *   dx   * get_pixel_extend(im, ix+1, iy+1, c);
-    return val;
-}
-
 image resize_image(image im, int w, int h)
 {
     image resized = make_image(w, h, im.c);   
@@ -1119,16 +1258,16 @@ void test_resize(char *filename)
     distort_image(c4, .1, .66666, 1.5);
 
 
-    show_image(im,   "Original");
-    show_image(gray, "Gray");
-    show_image(c1, "C1");
-    show_image(c2, "C2");
-    show_image(c3, "C3");
-    show_image(c4, "C4");
+    show_image(im,   "Original", 1);
+    show_image(gray, "Gray", 1);
+    show_image(c1, "C1", 1);
+    show_image(c2, "C2", 1);
+    show_image(c3, "C3", 1);
+    show_image(c4, "C4", 1);
 #ifdef OPENCV
     while(1){
-        image aug = random_augment_image(im, 0, .75, 320, 448, 320);
-        show_image(aug, "aug");
+        image aug = random_augment_image(im, 0, .75, 320, 448, 320, 320);
+        show_image(aug, "aug", 1);
         free_image(aug);
 
 
@@ -1143,10 +1282,9 @@ void test_resize(char *filename)
         float dhue = rand_uniform(-hue, hue);
 
         distort_image(c, dhue, dsat, dexp);
-        show_image(c, "rand");
+        show_image(c, "rand", 1);
         printf("%f %f %f\n", dhue, dsat, dexp);
         free_image(c);
-        cvWaitKey(0);
     }
 #endif
 }
@@ -1206,33 +1344,6 @@ image get_image_layer(image m, int l)
     }
     return out;
 }
-
-float get_pixel(image m, int x, int y, int c)
-{
-    assert(x < m.w && y < m.h && c < m.c);
-    return m.data[c*m.h*m.w + y*m.w + x];
-}
-float get_pixel_extend(image m, int x, int y, int c)
-{
-    if(x < 0) x = 0;
-    if(x >= m.w) x = m.w-1;
-    if(y < 0) y = 0;
-    if(y >= m.h) y = m.h-1;
-    if(c < 0 || c >= m.c) return 0;
-    return get_pixel(m, x, y, c);
-}
-void set_pixel(image m, int x, int y, int c, float val)
-{
-    if (x < 0 || y < 0 || c < 0 || x >= m.w || y >= m.h || c >= m.c) return;
-    assert(x < m.w && y < m.h && c < m.c);
-    m.data[c*m.h*m.w + y*m.w + x] = val;
-}
-void add_pixel(image m, int x, int y, int c, float val)
-{
-    assert(x < m.w && y < m.h && c < m.c);
-    m.data[c*m.h*m.w + y*m.w + x] += val;
-}
-
 void print_image(image m)
 {
     int i, j, k;
@@ -1325,7 +1436,7 @@ void show_image_normalized(image im, const char *name)
 {
     image c = copy_image(im);
     normalize_image(c);
-    show_image(c, name);
+    show_image(c, name, 1);
     free_image(c);
 }
 
@@ -1343,7 +1454,7 @@ void show_images(image *ims, int n, char *window)
      */
     normalize_image(m);
     save_image(m, window);
-    show_image(m, window);
+    show_image(m, window, 1);
     free_image(m);
 }
 
diff --git a/image.darknet/inst/include/darknet/src/image.h b/image.darknet/inst/include/darknet/src/image.h
index 39c3962..3392bb9 100644
--- a/image.darknet/inst/include/darknet/src/image.h
+++ b/image.darknet/inst/include/darknet/src/image.h
@@ -7,81 +7,63 @@
 #include <string.h>
 #include <math.h>
 #include "box.h"
+#include "darknet.h"
 
-typedef struct {
-    int h;
-    int w;
-    int c;
-    float *data;
-} image;
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef OPENCV
+void *open_video_stream(const char *f, int c, int w, int h, int fps);
+image get_image_from_stream(void *p);
+image load_image_cv(char *filename, int channels);
+int show_image_cv(image im, const char* name, int ms);
+#endif
 
 float get_color(int c, int x, int max);
-void flip_image(image a);
 void draw_box(image a, int x1, int y1, int x2, int y2, float r, float g, float b);
-void draw_box_width(image a, int x1, int y1, int x2, int y2, int w, float r, float g, float b);
 void draw_bbox(image a, box bbox, int w, float r, float g, float b);
-void draw_label(image a, int r, int c, image label, const float *rgb);
 void write_label(image a, int r, int c, image *characters, char *string, float *rgb);
-void draw_detections(image im, int num, float thresh, box *boxes, float **probs, char **names, image **labels, int classes);
 image image_distance(image a, image b);
 void scale_image(image m, float s);
-image crop_image(image im, int dx, int dy, int w, int h);
+image rotate_crop_image(image im, float rad, float s, int w, int h, float dx, float dy, float aspect);
 image random_crop_image(image im, int w, int h);
-image random_augment_image(image im, float angle, float aspect, int low, int high, int size);
-void random_distort_image(image im, float hue, float saturation, float exposure);
-image resize_image(image im, int w, int h);
-image resize_min(image im, int min);
+image random_augment_image(image im, float angle, float aspect, int low, int high, int w, int h);
+augment_args random_augment_args(image im, float angle, float aspect, int low, int high, int w, int h);
+void letterbox_image_into(image im, int w, int h, image boxed);
 image resize_max(image im, int max);
 void translate_image(image m, float s);
-void normalize_image(image p);
-image rotate_image(image m, float rad);
-void rotate_image_cw(image im, int times);
 void embed_image(image source, image dest, int dx, int dy);
+void place_image(image im, int w, int h, int dx, int dy, image canvas);
 void saturate_image(image im, float sat);
 void exposure_image(image im, float sat);
 void distort_image(image im, float hue, float sat, float val);
 void saturate_exposure_image(image im, float sat, float exposure);
+void rgb_to_hsv(image im);
 void hsv_to_rgb(image im);
-void rgbgr_image(image im);
-void constrain_image(image im);
-void composite_3d(char *f1, char *f2, char *out, int delta);
-int best_3d_shift_r(image a, image b, int min, int max);
+void yuv_to_rgb(image im);
+void rgb_to_yuv(image im);
 
-image grayscale_image(image im);
-image threshold_image(image im, float thresh);
 
 image collapse_image_layers(image source, int border);
 image collapse_images_horz(image *ims, int n);
 image collapse_images_vert(image *ims, int n);
 
-void show_image(image p, const char *name);
 void show_image_normalized(image im, const char *name);
-void save_image_png(image im, const char *name);
-void save_image(image p, const char *name);
 void show_images(image *ims, int n, char *window);
 void show_image_layers(image p, char *name);
 void show_image_collapsed(image p, char *name);
 
 void print_image(image m);
 
-image make_image(int w, int h, int c);
-image make_random_image(int w, int h, int c);
 image make_empty_image(int w, int h, int c);
-image float_to_image(int w, int h, int c, float *data);
-image copy_image(image p);
-image load_image(char *filename, int w, int h, int c);
-image load_image_color(char *filename, int w, int h);
-image **load_alphabet();
-
-float get_pixel(image m, int x, int y, int c);
-float get_pixel_extend(image m, int x, int y, int c);
-void set_pixel(image m, int x, int y, int c, float val);
-void add_pixel(image m, int x, int y, int c, float val);
-float bilinear_interpolate(image im, float x, float y, int c);
+void copy_image_into(image src, image dest);
 
 image get_image_layer(image m, int l);
 
-void free_image(image m);
-void test_resize(char *filename);
+#ifdef __cplusplus
+}
+#endif
+
 #endif
 
diff --git a/image.darknet/inst/include/darknet/src/image_opencv.cpp b/image.darknet/inst/include/darknet/src/image_opencv.cpp
new file mode 100644
index 0000000..7511280
--- /dev/null
+++ b/image.darknet/inst/include/darknet/src/image_opencv.cpp
@@ -0,0 +1,135 @@
+#ifdef OPENCV
+
+#include "stdio.h"
+#include "stdlib.h"
+#include "opencv2/opencv.hpp"
+#include "image.h"
+
+using namespace cv;
+
+extern "C" {
+
+IplImage *image_to_ipl(image im)   // planar float image -> newly created interleaved 8-bit IplImage (values are scaled by 255, not clamped here)
+{
+    IplImage *ipl = cvCreateImage(cvSize(im.w,im.h), IPL_DEPTH_8U, im.c);
+    const int stride = ipl->widthStep;   // row pitch of the IplImage buffer, taken from widthStep
+    const int plane = im.h*im.w;         // samples per channel in the planar source
+    for(int row = 0; row < im.h; ++row){
+        char *dst = ipl->imageData + row*stride;
+        for(int col = 0; col < im.w; ++col){
+            for(int ch = 0; ch < im.c; ++ch){
+                dst[col*im.c + ch] = (unsigned char)(im.data[ch*plane + row*im.w + col]*255);
+            }
+        }
+    }
+    return ipl;
+}
+
+image ipl_to_image(IplImage* src)   // interleaved 8-bit IplImage -> newly made planar float image with each byte divided by 255
+{
+    const int rows = src->height;
+    const int cols = src->width;
+    const int chans = src->nChannels;
+    const int stride = src->widthStep;   // row pitch of the source buffer, taken from widthStep
+    const unsigned char *bytes = (unsigned char *)src->imageData;
+    image out = make_image(cols, rows, chans);
+
+    for(int ch = 0; ch < chans; ++ch){
+        float *dst = out.data + ch*cols*rows;   // start of this channel's plane in the output
+        for(int r = 0; r < rows; ++r){
+            for(int c = 0; c < cols; ++c){
+                dst[r*cols + c] = bytes[r*stride + c*chans + ch]/255.;
+            }
+        }
+    }
+    return out;
+}
+
+Mat image_to_mat(image im)   // darknet image -> cv::Mat; works on a private copy, so im itself is not modified
+{
+    image bgr = copy_image(im);
+    constrain_image(bgr);   // presumably clamps samples to the displayable range before the 8-bit conversion - confirm in image.c
+    if(im.c == 3) rgbgr_image(bgr);   // channel-order swap applies to 3-channel images only
+
+    IplImage *tmp = image_to_ipl(bgr);
+    free_image(bgr);
+    Mat out = cvarrToMat(tmp, true);   // second argument requests a data copy, so tmp can be released below
+    cvReleaseImage(&tmp);
+    return out;
+}
+
+image mat_to_image(Mat m)   // cv::Mat -> newly made darknet image, with rgbgr_image applied after the conversion
+{
+    IplImage header = m;   // IplImage obtained through Mat's conversion; no explicit pixel copy is made here
+    image out = ipl_to_image(&header);
+    rgbgr_image(out);
+    return out;
+}
+
+void *open_video_stream(const char *f, int c, int w, int h, int fps)   // open file f, or camera index c when f is NULL; returns an opaque VideoCapture* or 0 on failure
+{
+    VideoCapture *cap;
+    if(f) cap = new VideoCapture(f);
+    else cap = new VideoCapture(c);
+    if(!cap->isOpened()){ delete cap; return 0; }   // release the capture object instead of leaking it when the stream cannot be opened
+    if(w) cap->set(CV_CAP_PROP_FRAME_WIDTH, w);     // each property is only requested when its argument is non-zero
+    if(h) cap->set(CV_CAP_PROP_FRAME_HEIGHT, h);    // was passing w: the requested height was ignored
+    if(fps) cap->set(CV_CAP_PROP_FPS, fps);         // was passing w: the requested frame rate was ignored
+    return (void *) cap;
+}
+
+image get_image_from_stream(void *p)   // read the next frame from the capture behind p; an empty 0x0x0 image means no frame was obtained
+{
+    VideoCapture *stream = static_cast<VideoCapture *>(p);   // p is expected to be the pointer returned by open_video_stream
+    Mat frame;
+    (*stream) >> frame;
+    if(!frame.empty()) return mat_to_image(frame);
+    return make_empty_image(0,0,0);
+}
+
+image load_image_cv(char *filename, int channels)   // load filename via imread; channels 0/1/3 select imread flag -1/0/1; on failure returns a 10x10x3 placeholder
+{
+    int flag = -1;
+    if (channels == 0) flag = -1;
+    else if (channels == 1) flag = 0;
+    else if (channels == 3) flag = 1;
+    else {
+        fprintf(stderr, "OpenCV can't force load with %d channels\n", channels);   // flag keeps its default of -1 in this case
+    }
+    Mat m;
+    m = imread(filename, flag);
+    if(!m.data){
+        fprintf(stderr, "Cannot load image \"%s\"\n", filename);
+        FILE *bad = fopen("bad.list", "a");   // append directly: no shell command built from the filename, no fixed-size buffer
+        if(bad) fprintf(bad, "%s\n", filename);
+        if(bad) fclose(bad);
+        return make_image(10,10,3);   // placeholder so callers still receive an image; its size ignores `channels`
+        //exit(0);
+    }
+    image im = mat_to_image(m);
+    return im;
+}
+
+int show_image_cv(image im, const char* name, int ms)   // show im in the window called name, pass ms to waitKey; returns -1 or the waitKey result modulo 256
+{
+    Mat frame = image_to_mat(im);
+    imshow(name, frame);
+    const int key = waitKey(ms);
+    if (key == -1) return -1;   // -1 from waitKey is passed through untouched
+    return key%256;
+}
+
+void make_window(char *name, int w, int h, int fullscreen)   // create window `name`; fullscreen if requested, otherwise sized w x h
+{
+    namedWindow(name, WINDOW_NORMAL); 
+    if (fullscreen) {
+        setWindowProperty(name, CV_WND_PROP_FULLSCREEN, CV_WINDOW_FULLSCREEN);
+        return;   // w and h are not used for a fullscreen window
+    }
+    resizeWindow(name, w, h);
+    if(!strcmp(name, "Demo")) moveWindow(name, 0, 0);   // only the window named exactly "Demo" is moved to (0,0)
+}
+
+}
+
+#endif
diff --git a/image.darknet/inst/include/darknet/src/iseg_layer.c b/image.darknet/inst/include/darknet/src/iseg_layer.c
new file mode 100644
index 0000000..2bf03a8
--- /dev/null
+++ b/image.darknet/inst/include/darknet/src/iseg_layer.c
@@ -0,0 +1,225 @@
+#include "iseg_layer.h"
+#include "activations.h"
+#include "blas.h"
+#include "box.h"
+#include "cuda.h"
+#include "utils.h"
+
+#include <stdio.h>
+#include <assert.h>
+#include <string.h>
+#include <stdlib.h>
+
+layer make_iseg_layer(int batch, int w, int h, int classes, int ids)   // build an ISEG layer over a w x h map with `classes` class channels followed by `ids` extra channels
+{
+    layer l = {0};
+    l.type = ISEG;
+
+    l.h = h;
+    l.w = w;
+    l.c = classes + ids;
+    l.out_w = l.w;   // output shape equals input shape
+    l.out_h = l.h;
+    l.out_c = l.c;
+    l.classes = classes;
+    l.batch = batch;
+    l.extra = ids;   // forward_iseg_layer reads the ids count back from l.extra
+    l.cost = calloc(1, sizeof(float));
+    l.outputs = h*w*l.c;
+    l.inputs = l.outputs;
+    l.truths = 90*(l.w*l.h+1);   // 90 truth records per image, each 1 + w*h floats (layout as indexed in forward_iseg_layer)
+    l.delta = calloc(batch*l.outputs, sizeof(float));
+    l.output = calloc(batch*l.outputs, sizeof(float));
+
+    l.counts = calloc(90, sizeof(int));   // one counter per truth record
+    l.sums = calloc(90, sizeof(float*));  // one ids-long accumulator per truth record, allocated below
+    if(ids){   // with ids == 0 every l.sums[i] stays NULL
+        int i;
+        for(i = 0; i < 90; ++i){
+            l.sums[i] = calloc(ids, sizeof(float));
+        }
+    }
+
+    l.forward = forward_iseg_layer;
+    l.backward = backward_iseg_layer;
+#ifdef GPU
+    l.forward_gpu = forward_iseg_layer_gpu;
+    l.backward_gpu = backward_iseg_layer_gpu;
+    l.output_gpu = cuda_make_array(l.output, batch*l.outputs);
+    l.delta_gpu = cuda_make_array(l.delta, batch*l.outputs);
+#endif
+
+    fprintf(stderr, "iseg\n");
+    srand(0);   // NOTE(review): reseeds the process-wide rand() state every time a layer is built
+
+    return l;
+}
+
+void resize_iseg_layer(layer *l, int w, int h)   // change the layer's spatial size to w x h and resize its output/delta buffers to match
+{
+    l->w = w;
+    l->h = h;
+
+    l->outputs = h*w*l->c;   // NOTE(review): l->truths, set from w*h in make_iseg_layer, is not recomputed here
+    l->inputs = l->outputs;
+
+    l->output = realloc(l->output, l->batch*l->outputs*sizeof(float));   // realloc results are not checked for NULL
+    l->delta = realloc(l->delta, l->batch*l->outputs*sizeof(float));
+
+#ifdef GPU
+    cuda_free(l->delta_gpu);
+    cuda_free(l->output_gpu);
+
+    l->delta_gpu =     cuda_make_array(l->delta, l->batch*l->outputs);
+    l->output_gpu =    cuda_make_array(l->output, l->batch*l->outputs);
+#endif
+}
+
+void forward_iseg_layer(const layer l, network net)   // CPU forward pass: copies net.input to l.output, fills l.delta from net.truth and stores the cost in *l.cost
+{
+
+    double time = what_time_is_it_now();   // start time, used only for the timing printout at the end
+    int i,b,j,k;
+    int ids = l.extra;   // number of extra channels after the class channels (set in make_iseg_layer)
+    memcpy(l.output, net.input, l.outputs*l.batch*sizeof(float));
+    memset(l.delta, 0, l.outputs * l.batch * sizeof(float));
+
+#ifndef GPU
+    for (b = 0; b < l.batch; ++b){
+        int index = b*l.outputs;
+        activate_array(l.output + index, l.classes*l.w*l.h, LOGISTIC);   // class channels only; in GPU builds forward_iseg_layer_gpu activates before calling this function
+    }
+#endif
+
+    for (b = 0; b < l.batch; ++b){
+        // a priori, each pixel has no class
+        for(i = 0; i < l.classes; ++i){
+            for(k = 0; k < l.w*l.h; ++k){
+                int index = b*l.outputs + i*l.w*l.h + k;
+                l.delta[index] = 0 - l.output[index];
+            }
+        }
+
+        // a priori, embedding should be small magnitude
+        for(i = 0; i < ids; ++i){
+            for(k = 0; k < l.w*l.h; ++k){
+                int index = b*l.outputs + (i+l.classes)*l.w*l.h + k;
+                l.delta[index] = .1 * (0 - l.output[index]);
+            }
+        }
+
+
+        memset(l.counts, 0, 90*sizeof(int));
+        for(i = 0; i < 90; ++i){   // truth record i of image b starts at b*l.truths + i*(w*h+1)
+            fill_cpu(ids, 0, l.sums[i], 1);
+            
+            int c = net.truth[b*l.truths + i*(l.w*l.h+1)];   // first float of the record is the class index
+            if(c < 0) break;   // a negative class index ends the list of records for this image
+            // add up metric embeddings for each instance
+            for(k = 0; k < l.w*l.h; ++k){
+                int index = b*l.outputs + c*l.w*l.h + k;
+                float v = net.truth[b*l.truths + i*(l.w*l.h + 1) + 1 + k];   // the remaining w*h floats of the record are per-pixel values
+                if(v){
+                    l.delta[index] = v - l.output[index];   // replaces the "no class" default for class c at this pixel
+                    axpy_cpu(ids, 1, l.output + b*l.outputs + l.classes*l.w*l.h + k, l.w*l.h, l.sums[i], 1);   // stride w*h walks the ids channels of pixel k
+                    ++l.counts[i];
+                }
+            }
+        }
+
+        float *mse = calloc(90, sizeof(float));   // per-record spread of the ids channels; only used in the printout below
+        for(i = 0; i < 90; ++i){
+            int c = net.truth[b*l.truths + i*(l.w*l.h+1)];
+            if(c < 0) break;
+            for(k = 0; k < l.w*l.h; ++k){
+                float v = net.truth[b*l.truths + i*(l.w*l.h + 1) + 1 + k];
+                if(v){
+                    int z;
+                    float sum = 0;
+                    for(z = 0; z < ids; ++z){
+                        int index = b*l.outputs + (l.classes + z)*l.w*l.h + k;
+                        sum += pow(l.sums[i][z]/l.counts[i] - l.output[index], 2);   // l.sums still holds raw sums here, hence the division by the count
+                    }
+                    mse[i] += sum;
+                }
+            }
+            mse[i] /= l.counts[i];   // NOTE(review): divides by zero when a record has no non-zero pixel; such entries are skipped by the printout below
+        }
+
+        // Calculate average embedding
+        for(i = 0; i < 90; ++i){
+            if(!l.counts[i]) continue;
+            scal_cpu(ids, 1.f/l.counts[i], l.sums[i], 1);   // from here on l.sums[i] holds the mean, not the sum
+            if(b == 0 && net.gpu_index == 0){   // diagnostics are printed for the first image only, and only when gpu_index is 0
+                printf("%4d, %6.3f, ", l.counts[i], mse[i]);
+                for(j = 0; j < ids; ++j){
+                    printf("%6.3f,", l.sums[i][j]);
+                }
+                printf("\n");
+            }
+        }
+        free(mse);
+
+        // Calculate embedding loss
+        for(i = 0; i < 90; ++i){
+            if(!l.counts[i]) continue;
+            for(k = 0; k < l.w*l.h; ++k){
+                float v = net.truth[b*l.truths + i*(l.w*l.h + 1) + 1 + k];
+                if(v){
+                    for(j = 0; j < 90; ++j){   // compare this pixel of record i against the mean of every non-empty record j
+                        if(!l.counts[j])continue;
+                        int z;
+                        for(z = 0; z < ids; ++z){
+                            int index = b*l.outputs + (l.classes + z)*l.w*l.h + k;
+                            float diff = l.sums[j][z] - l.output[index];
+                            if (j == i) l.delta[index] +=   diff < 0? -.1 : .1;   // own record: fixed-size step in the direction of its mean
+                            else        l.delta[index] += -(diff < 0? -.1 : .1);  // other records: fixed-size step in the opposite direction
+                        }
+                    }
+                }
+            }
+        }
+
+        for(i = 0; i < ids; ++i){   // finally scale every delta of the ids channels by .01
+            for(k = 0; k < l.w*l.h; ++k){
+                int index = b*l.outputs + (i+l.classes)*l.w*l.h + k;
+                l.delta[index] *= .01;
+            }
+        }
+    }
+
+    *(l.cost) = pow(mag_array(l.delta, l.outputs * l.batch), 2);   // cost is the square of mag_array over the whole delta buffer
+    printf("took %lf sec\n", what_time_is_it_now() - time);
+}
+
+void backward_iseg_layer(const layer l, network net)   // CPU backward pass: accumulate this layer's delta into net.delta
+{
+    axpy_cpu(l.batch*l.inputs, 1, l.delta, 1, net.delta, 1);   // net.delta += l.delta over the whole batch; no activation gradient is applied here
+}
+
+#ifdef GPU
+
+void forward_iseg_layer_gpu(const layer l, network net)   // GPU forward pass: activate on the device, then compute the deltas with the CPU routine
+{
+    copy_gpu(l.batch*l.inputs, net.input_gpu, 1, l.output_gpu, 1);
+    int b;
+    for (b = 0; b < l.batch; ++b){
+        activate_array_gpu(l.output_gpu + b*l.outputs, l.classes*l.w*l.h, LOGISTIC);   // logistic on the class channels only
+        //if(l.extra) activate_array_gpu(l.output_gpu + b*l.outputs + l.classes*l.w*l.h, l.extra*l.w*l.h, LOGISTIC);
+    }
+
+    cuda_pull_array(l.output_gpu, net.input, l.batch*l.inputs);   // activated output is pulled into the host net.input buffer, overwriting it
+    forward_iseg_layer(l, net);   // the CPU routine copies net.input into l.output and fills l.delta
+    cuda_push_array(l.delta_gpu, l.delta, l.batch*l.outputs);
+}
+
+void backward_iseg_layer_gpu(const layer l, network net)   // GPU backward pass: accumulate this layer's delta into net.delta_gpu
+{
+    int b;
+    for (b = 0; b < l.batch; ++b){   // loop body is commented out, so this loop currently does nothing
+        //if(l.extra) gradient_array_gpu(l.output_gpu + b*l.outputs + l.classes*l.w*l.h, l.extra*l.w*l.h, LOGISTIC, l.delta_gpu + b*l.outputs + l.classes*l.w*l.h);
+    }
+    axpy_gpu(l.batch*l.inputs, 1, l.delta_gpu, 1, net.delta_gpu, 1);   // net.delta_gpu += l.delta_gpu over the whole batch
+}
+#endif
+
diff --git a/image.darknet/inst/include/darknet/src/iseg_layer.h b/image.darknet/inst/include/darknet/src/iseg_layer.h
new file mode 100644
index 0000000..dd8e64e
--- /dev/null
+++ b/image.darknet/inst/include/darknet/src/iseg_layer.h
@@ -0,0 +1,19 @@
+#ifndef ISEG_LAYER_H
+#define ISEG_LAYER_H
+
+#include "darknet.h"
+#include "layer.h"
+#include "network.h"
+
+layer make_iseg_layer(int batch, int w, int h, int classes, int ids);   // constructor: classes class channels followed by ids extra channels
+void forward_iseg_layer(const layer l, network net);    // CPU forward: fills l.output, l.delta and *l.cost
+void backward_iseg_layer(const layer l, network net);   // CPU backward: adds l.delta into net.delta
+void resize_iseg_layer(layer *l, int w, int h);         // change spatial size and resize the output/delta buffers
+int iseg_num_detections(layer l, float thresh);         // NOTE(review): no definition appears in iseg_layer.c as added by this patch
+
+#ifdef GPU
+void forward_iseg_layer_gpu(const layer l, network net);
+void backward_iseg_layer_gpu(layer l, network net);     // defined in iseg_layer.c with a `const layer l` parameter
+#endif
+
+#endif
diff --git a/image.darknet/inst/include/darknet/src/l2norm_layer.c b/image.darknet/inst/include/darknet/src/l2norm_layer.c
new file mode 100644
index 0000000..d099479
--- /dev/null
+++ b/image.darknet/inst/include/darknet/src/l2norm_layer.c
@@ -0,0 +1,63 @@
+#include "l2norm_layer.h"
+#include "activations.h"
+#include "blas.h"
+#include "cuda.h"
+
+#include <float.h>
+#include <math.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+
+layer make_l2norm_layer(int batch, int inputs) // builds an L2NORM layer; output size equals input size (inputs values per batch item)
+{
+    fprintf(stderr, "l2norm                                         %4d\n",  inputs);
+    layer l = {0};
+    l.type = L2NORM;
+    l.batch = batch;
+    l.inputs = inputs;
+    l.outputs = inputs;
+    l.output = calloc(inputs*batch, sizeof(float));
+    l.scales = calloc(inputs*batch, sizeof(float)); // same size as the output; handed to l2normalize_cpu by the forward pass
+    l.delta = calloc(inputs*batch, sizeof(float));
+
+    l.forward = forward_l2norm_layer;
+    l.backward = backward_l2norm_layer;
+    #ifdef GPU
+    l.forward_gpu = forward_l2norm_layer_gpu;
+    l.backward_gpu = backward_l2norm_layer_gpu;
+
+    l.output_gpu = cuda_make_array(l.output, inputs*batch); 
+    l.scales_gpu = cuda_make_array(l.scales, inputs*batch); // device mirror of l.scales (was created from l.output by copy-paste)
+    l.delta_gpu = cuda_make_array(l.delta, inputs*batch); 
+    #endif
+    return l; // NOTE(review): out_c/out_w/out_h, read by the forward pass, are not set here — presumably filled in by the caller
+}
+
+void forward_l2norm_layer(const layer l, network net) // CPU forward pass: copy the input, then normalize the copy in place
+{
+    copy_cpu(l.outputs*l.batch, net.input, 1, l.output, 1); // l.output starts as a copy of net.input
+    l2normalize_cpu(l.output, l.scales, l.batch, l.out_c, l.out_w*l.out_h); // presumably normalizes over the out_c channels per spatial position and fills l.scales — confirm in blas.c
+}
+
+void backward_l2norm_layer(const layer l, network net) // CPU backward pass; NOTE(review): the l.scales term below is disabled here but active in backward_l2norm_layer_gpu
+{
+    //axpy_cpu(l.inputs*l.batch, 1, l.scales, 1, l.delta, 1);
+    axpy_cpu(l.inputs*l.batch, 1, l.delta, 1, net.delta, 1); // presumably net.delta += 1 * l.delta (BLAS-style axpy) — confirm in blas.c
+}
+
+#ifdef GPU
+
+void forward_l2norm_layer_gpu(const layer l, network net) // GPU forward pass: same two steps as the CPU version, on the *_gpu buffers
+{
+    copy_gpu(l.outputs*l.batch, net.input_gpu, 1, l.output_gpu, 1); // l.output_gpu starts as a copy of net.input_gpu
+    l2normalize_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_w*l.out_h); // GPU counterpart of l2normalize_cpu; presumably also fills l.scales_gpu — confirm
+}
+
+void backward_l2norm_layer_gpu(const layer l, network net) // GPU backward pass
+{
+    axpy_gpu(l.batch*l.inputs, 1, l.scales_gpu, 1, l.delta_gpu, 1); // NOTE(review): backward_l2norm_layer has the equivalent l.scales line commented out — confirm which path is intended
+    axpy_gpu(l.batch*l.inputs, 1, l.delta_gpu, 1, net.delta_gpu, 1); // presumably net.delta_gpu += 1 * l.delta_gpu (BLAS-style axpy)
+}
+
+#endif
diff --git a/image.darknet/inst/include/darknet/src/l2norm_layer.h b/image.darknet/inst/include/darknet/src/l2norm_layer.h
new file mode 100644
index 0000000..1ca6f71
--- /dev/null
+++ b/image.darknet/inst/include/darknet/src/l2norm_layer.h
@@ -0,0 +1,15 @@
+#ifndef L2NORM_LAYER_H
+#define L2NORM_LAYER_H
+#include "layer.h"
+#include "network.h"
+
+layer make_l2norm_layer(int batch, int inputs); // allocates output, scales and delta buffers of inputs*batch floats each
+void forward_l2norm_layer(const layer l, network net); // copies net.input into l.output, then runs l2normalize_cpu on it
+void backward_l2norm_layer(const layer l, network net); // axpy of l.delta into net.delta
+
+#ifdef GPU
+void forward_l2norm_layer_gpu(const layer l, network net); // same steps as the CPU forward pass, on the *_gpu buffers
+void backward_l2norm_layer_gpu(const layer l, network net); // axpy of l.scales_gpu into l.delta_gpu, then of l.delta_gpu into net.delta_gpu
+#endif
+
+#endif
diff --git a/image.darknet/inst/include/darknet/src/layer.c b/image.darknet/inst/include/darknet/src/layer.c
index 622cf26..c27b477 100644
--- a/image.darknet/inst/include/darknet/src/layer.c
+++ b/image.darknet/inst/include/darknet/src/layer.c
@@ -1,5 +1,6 @@
 #include "layer.h"
 #include "cuda.h"
+
 #include <stdlib.h>
 
 void free_layer(layer l)
@@ -32,7 +33,6 @@ void free_layer(layer l)
     if(l.scale_updates)      free(l.scale_updates);
     if(l.weights)            free(l.weights);
     if(l.weight_updates)     free(l.weight_updates);
-    if(l.col_image)          free(l.col_image);
     if(l.delta)              free(l.delta);
     if(l.output)             free(l.output);
     if(l.squared)            free(l.squared);
@@ -80,7 +80,6 @@ void free_layer(layer l)
     if(l.rolling_variance_gpu)    cuda_free(l.rolling_variance_gpu);
     if(l.variance_delta_gpu)      cuda_free(l.variance_delta_gpu);
     if(l.mean_delta_gpu)          cuda_free(l.mean_delta_gpu);
-    if(l.col_image_gpu)           cuda_free(l.col_image_gpu);
     if(l.x_gpu)                   cuda_free(l.x_gpu);
     if(l.x_norm_gpu)              cuda_free(l.x_norm_gpu);
     if(l.weights_gpu)             cuda_free(l.weights_gpu);
diff --git a/image.darknet/inst/include/darknet/src/layer.h b/image.darknet/inst/include/darknet/src/layer.h
index 806542b..af6cd2a 100644
--- a/image.darknet/inst/include/darknet/src/layer.h
+++ b/image.darknet/inst/include/darknet/src/layer.h
@@ -1,271 +1 @@
-#ifndef BASE_LAYER_H
-#define BASE_LAYER_H
-
-#include "activations.h"
-#include "stddef.h"
-#include "tree.h"
-
-struct network_state;
-
-struct layer;
-typedef struct layer layer;
-
-typedef enum {
-    CONVOLUTIONAL,
-    DECONVOLUTIONAL,
-    CONNECTED,
-    MAXPOOL,
-    SOFTMAX,
-    DETECTION,
-    DROPOUT,
-    CROP,
-    ROUTE,
-    COST,
-    NORMALIZATION,
-    AVGPOOL,
-    LOCAL,
-    SHORTCUT,
-    ACTIVE,
-    RNN,
-    GRU,
-    CRNN,
-    BATCHNORM,
-    NETWORK,
-    XNOR,
-    REGION,
-    REORG,
-    BLANK
-} LAYER_TYPE;
-
-typedef enum{
-    SSE, MASKED, SMOOTH
-} COST_TYPE;
-
-struct layer{
-    LAYER_TYPE type;
-    ACTIVATION activation;
-    COST_TYPE cost_type;
-    void (*forward)   (struct layer, struct network_state);
-    void (*backward)  (struct layer, struct network_state);
-    void (*update)    (struct layer, int, float, float, float);
-    void (*forward_gpu)   (struct layer, struct network_state);
-    void (*backward_gpu)  (struct layer, struct network_state);
-    void (*update_gpu)    (struct layer, int, float, float, float);
-    int batch_normalize;
-    int shortcut;
-    int batch;
-    int forced;
-    int flipped;
-    int inputs;
-    int outputs;
-    int truths;
-    int h,w,c;
-    int out_h, out_w, out_c;
-    int n;
-    int max_boxes;
-    int groups;
-    int size;
-    int side;
-    int stride;
-    int reverse;
-    int pad;
-    int sqrt;
-    int flip;
-    int index;
-    int binary;
-    int xnor;
-    int steps;
-    int hidden;
-    float dot;
-    float angle;
-    float jitter;
-    float saturation;
-    float exposure;
-    float shift;
-    float ratio;
-    int softmax;
-    int classes;
-    int coords;
-    int background;
-    int rescore;
-    int objectness;
-    int does_cost;
-    int joint;
-    int noadjust;
-    int reorg;
-    int log;
-
-    int adam;
-    float B1;
-    float B2;
-    float eps;
-    int t;
-
-    float alpha;
-    float beta;
-    float kappa;
-
-    float coord_scale;
-    float object_scale;
-    float noobject_scale;
-    float class_scale;
-    int bias_match;
-    int random;
-    float thresh;
-    int classfix;
-    int absolute;
-
-    int dontload;
-    int dontloadscales;
-
-    float temperature;
-    float probability;
-    float scale;
-
-    char  * cweights;
-    int   * indexes;
-    int   * input_layers;
-    int   * input_sizes;
-    int   * map;
-    float * rand;
-    float * cost;
-    float * state;
-    float * prev_state;
-    float * forgot_state;
-    float * forgot_delta;
-    float * state_delta;
-
-    float * concat;
-    float * concat_delta;
-
-    float * binary_weights;
-
-    float * biases;
-    float * bias_updates;
-
-    float * scales;
-    float * scale_updates;
-
-    float * weights;
-    float * weight_updates;
-
-    float * col_image;
-    float * delta;
-    float * output;
-    float * squared;
-    float * norms;
-
-    float * spatial_mean;
-    float * mean;
-    float * variance;
-
-    float * mean_delta;
-    float * variance_delta;
-
-    float * rolling_mean;
-    float * rolling_variance;
-
-    float * x;
-    float * x_norm;
-
-    float * m;
-    float * v;
-
-    float * z_cpu;
-    float * r_cpu;
-    float * h_cpu;
-
-    float * binary_input;
-
-    struct layer *input_layer;
-    struct layer *self_layer;
-    struct layer *output_layer;
-
-    struct layer *input_gate_layer;
-    struct layer *state_gate_layer;
-    struct layer *input_save_layer;
-    struct layer *state_save_layer;
-    struct layer *input_state_layer;
-    struct layer *state_state_layer;
-
-    struct layer *input_z_layer;
-    struct layer *state_z_layer;
-
-    struct layer *input_r_layer;
-    struct layer *state_r_layer;
-
-    struct layer *input_h_layer;
-    struct layer *state_h_layer;
-
-    tree *softmax_tree;
-
-    size_t workspace_size;
-
-    #ifdef GPU
-    int *indexes_gpu;
-
-    float *z_gpu;
-    float *r_gpu;
-    float *h_gpu;
-
-    float *m_gpu;
-    float *v_gpu;
-
-    float * prev_state_gpu;
-    float * forgot_state_gpu;
-    float * forgot_delta_gpu;
-    float * state_gpu;
-    float * state_delta_gpu;
-    float * gate_gpu;
-    float * gate_delta_gpu;
-    float * save_gpu;
-    float * save_delta_gpu;
-    float * concat_gpu;
-    float * concat_delta_gpu;
-
-    float *binary_input_gpu;
-    float *binary_weights_gpu;
-
-    float * mean_gpu;
-    float * variance_gpu;
-
-    float * rolling_mean_gpu;
-    float * rolling_variance_gpu;
-
-    float * variance_delta_gpu;
-    float * mean_delta_gpu;
-
-    float * col_image_gpu;
-
-    float * x_gpu;
-    float * x_norm_gpu;
-    float * weights_gpu;
-    float * weight_updates_gpu;
-
-    float * biases_gpu;
-    float * bias_updates_gpu;
-
-    float * scales_gpu;
-    float * scale_updates_gpu;
-
-    float * output_gpu;
-    float * delta_gpu;
-    float * rand_gpu;
-    float * squared_gpu;
-    float * norms_gpu;
-    #ifdef CUDNN
-    cudnnTensorDescriptor_t srcTensorDesc, dstTensorDesc;
-    cudnnTensorDescriptor_t dsrcTensorDesc, ddstTensorDesc;
-    cudnnFilterDescriptor_t weightDesc;
-    cudnnFilterDescriptor_t dweightDesc;
-    cudnnConvolutionDescriptor_t convDesc;
-    cudnnConvolutionFwdAlgo_t fw_algo;
-    cudnnConvolutionBwdDataAlgo_t bd_algo;
-    cudnnConvolutionBwdFilterAlgo_t bf_algo;
-    #endif
-    #endif
-};
-
-void free_layer(layer);
-
-#endif
+#include "darknet.h"
diff --git a/image.darknet/inst/include/darknet/src/list.h b/image.darknet/inst/include/darknet/src/list.h
index fb818c2..6b445c7 100644
--- a/image.darknet/inst/include/darknet/src/list.h
+++ b/image.darknet/inst/include/darknet/src/list.h
@@ -1,26 +1,13 @@
 #ifndef LIST_H
 #define LIST_H
-
-typedef struct node{
-    void *val;
-    struct node *next;
-    struct node *prev;
-} node;
-
-typedef struct list{
-    int size;
-    node *front;
-    node *back;
-} list;
+#include "darknet.h"
 
 list *make_list();
 int list_find(list *l, void *val);
 
 void list_insert(list *, void *);
 
-void **list_to_array(list *l);
 
-void free_list(list *l);
 void free_list_contents(list *l);
 
 #endif
diff --git a/image.darknet/inst/include/darknet/src/local_layer.c b/image.darknet/inst/include/darknet/src/local_layer.c
index 31f0ca6..74f6910 100644
--- a/image.darknet/inst/include/darknet/src/local_layer.c
+++ b/image.darknet/inst/include/darknet/src/local_layer.c
@@ -57,9 +57,10 @@ local_layer make_local_layer(int batch, int h, int w, int c, int n, int size, in
     float scale = sqrt(2./(size*size*c));
     for(i = 0; i < c*n*size*size; ++i) l.weights[i] = scale*rand_uniform(-1,1);
 
-    l.col_image = calloc(out_h*out_w*size*size*c, sizeof(float));
     l.output = calloc(l.batch*out_h * out_w * n, sizeof(float));
     l.delta  = calloc(l.batch*out_h * out_w * n, sizeof(float));
+
+    l.workspace_size = out_h*out_w*size*size*c;
     
     l.forward = forward_local_layer;
     l.backward = backward_local_layer;
@@ -76,7 +77,6 @@ local_layer make_local_layer(int batch, int h, int w, int c, int n, int size, in
     l.biases_gpu = cuda_make_array(l.biases, l.outputs);
     l.bias_updates_gpu = cuda_make_array(l.bias_updates, l.outputs);
 
-    l.col_image_gpu = cuda_make_array(l.col_image, out_h*out_w*size*size*c);
     l.delta_gpu = cuda_make_array(l.delta, l.batch*out_h*out_w*n);
     l.output_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);
 
@@ -88,7 +88,7 @@ local_layer make_local_layer(int batch, int h, int w, int c, int n, int size, in
     return l;
 }
 
-void forward_local_layer(const local_layer l, network_state state)
+void forward_local_layer(const local_layer l, network net)
 {
     int out_h = local_out_height(l);
     int out_w = local_out_width(l);
@@ -100,13 +100,13 @@ void forward_local_layer(const local_layer l, network_state state)
     }
 
     for(i = 0; i < l.batch; ++i){
-        float *input = state.input + i*l.w*l.h*l.c;
+        float *input = net.input + i*l.w*l.h*l.c;
         im2col_cpu(input, l.c, l.h, l.w, 
-                l.size, l.stride, l.pad, l.col_image);
+                l.size, l.stride, l.pad, net.workspace);
         float *output = l.output + i*l.outputs;
         for(j = 0; j < locations; ++j){
             float *a = l.weights + j*l.size*l.size*l.c*l.n;
-            float *b = l.col_image + j;
+            float *b = net.workspace + j;
             float *c = output + j;
 
             int m = l.n;
@@ -119,7 +119,7 @@ void forward_local_layer(const local_layer l, network_state state)
     activate_array(l.output, l.outputs*l.batch, l.activation);
 }
 
-void backward_local_layer(local_layer l, network_state state)
+void backward_local_layer(local_layer l, network net)
 {
     int i, j;
     int locations = l.out_w*l.out_h;
@@ -131,13 +131,13 @@ void backward_local_layer(local_layer l, network_state state)
     }
 
     for(i = 0; i < l.batch; ++i){
-        float *input = state.input + i*l.w*l.h*l.c;
+        float *input = net.input + i*l.w*l.h*l.c;
         im2col_cpu(input, l.c, l.h, l.w, 
-                l.size, l.stride, l.pad, l.col_image);
+                l.size, l.stride, l.pad, net.workspace);
 
         for(j = 0; j < locations; ++j){ 
             float *a = l.delta + i*l.outputs + j;
-            float *b = l.col_image + j;
+            float *b = net.workspace + j;
             float *c = l.weight_updates + j*l.size*l.size*l.c*l.n;
             int m = l.n;
             int n = l.size*l.size*l.c;
@@ -146,11 +146,11 @@ void backward_local_layer(local_layer l, network_state state)
             gemm(0,1,m,n,k,1,a,locations,b,locations,1,c,n);
         }
 
-        if(state.delta){
+        if(net.delta){
             for(j = 0; j < locations; ++j){ 
                 float *a = l.weights + j*l.size*l.size*l.c*l.n;
                 float *b = l.delta + i*l.outputs + j;
-                float *c = l.col_image + j;
+                float *c = net.workspace + j;
 
                 int m = l.size*l.size*l.c;
                 int n = 1;
@@ -159,13 +159,18 @@ void backward_local_layer(local_layer l, network_state state)
                 gemm(1,0,m,n,k,1,a,m,b,locations,0,c,locations);
             }
 
-            col2im_cpu(l.col_image, l.c,  l.h,  l.w,  l.size,  l.stride, l.pad, state.delta+i*l.c*l.h*l.w);
+            col2im_cpu(net.workspace, l.c,  l.h,  l.w,  l.size,  l.stride, l.pad, net.delta+i*l.c*l.h*l.w);
         }
     }
 }
 
-void update_local_layer(local_layer l, int batch, float learning_rate, float momentum, float decay)
+void update_local_layer(local_layer l, update_args a)
 {
+    float learning_rate = a.learning_rate*l.learning_rate_scale;
+    float momentum = a.momentum;
+    float decay = a.decay;
+    int batch = a.batch;
+
     int locations = l.out_w*l.out_h;
     int size = l.size*l.size*l.c*l.n*locations;
     axpy_cpu(l.outputs, learning_rate/batch, l.bias_updates, 1, l.biases, 1);
@@ -178,7 +183,7 @@ void update_local_layer(local_layer l, int batch, float learning_rate, float mom
 
 #ifdef GPU
 
-void forward_local_layer_gpu(const local_layer l, network_state state)
+void forward_local_layer_gpu(const local_layer l, network net)
 {
     int out_h = local_out_height(l);
     int out_w = local_out_width(l);
@@ -186,83 +191,88 @@ void forward_local_layer_gpu(const local_layer l, network_state state)
     int locations = out_h * out_w;
 
     for(i = 0; i < l.batch; ++i){
-        copy_ongpu(l.outputs, l.biases_gpu, 1, l.output_gpu + i*l.outputs, 1);
+        copy_gpu(l.outputs, l.biases_gpu, 1, l.output_gpu + i*l.outputs, 1);
     }
 
     for(i = 0; i < l.batch; ++i){
-        float *input = state.input + i*l.w*l.h*l.c;
-        im2col_ongpu(input, l.c, l.h, l.w, 
-                l.size, l.stride, l.pad, l.col_image_gpu);
+        float *input = net.input_gpu + i*l.w*l.h*l.c;
+        im2col_gpu(input, l.c, l.h, l.w, 
+                l.size, l.stride, l.pad, net.workspace);
         float *output = l.output_gpu + i*l.outputs;
         for(j = 0; j < locations; ++j){
             float *a = l.weights_gpu + j*l.size*l.size*l.c*l.n;
-            float *b = l.col_image_gpu + j;
+            float *b = net.workspace + j;
             float *c = output + j;
 
             int m = l.n;
             int n = 1;
             int k = l.size*l.size*l.c;
 
-            gemm_ongpu(0,0,m,n,k,1,a,k,b,locations,1,c,locations);
+            gemm_gpu(0,0,m,n,k,1,a,k,b,locations,1,c,locations);
         }
     }
-    activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
+    activate_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation);
 }
 
-void backward_local_layer_gpu(local_layer l, network_state state)
+void backward_local_layer_gpu(local_layer l, network net) // GPU backward pass; reads net.input_gpu and uses net.workspace as scratch
 {
     int i, j;
     int locations = l.out_w*l.out_h;
 
-    gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
+    gradient_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu); // presumably applies the activation gradient to l.delta_gpu — confirm in activations
     for(i = 0; i < l.batch; ++i){
-        axpy_ongpu(l.outputs, 1, l.delta_gpu + i*l.outputs, 1, l.bias_updates_gpu, 1);
+        axpy_gpu(l.outputs, 1, l.delta_gpu + i*l.outputs, 1, l.bias_updates_gpu, 1); // accumulate batch item i's delta into the bias updates
     }
 
     for(i = 0; i < l.batch; ++i){
-        float *input = state.input + i*l.w*l.h*l.c;
-        im2col_ongpu(input, l.c, l.h, l.w, 
-                l.size, l.stride, l.pad, l.col_image_gpu);
+        float *input = net.input_gpu + i*l.w*l.h*l.c; // batch item i of the device-side input
+        im2col_gpu(input, l.c, l.h, l.w, 
+                l.size, l.stride, l.pad, net.workspace); // NOTE(review): the CPU path also uses net.workspace — confirm it is device memory in GPU builds
 
         for(j = 0; j < locations; ++j){ 
             float *a = l.delta_gpu + i*l.outputs + j;
-            float *b = l.col_image_gpu + j;
+            float *b = net.workspace + j; // column j of the im2col result
             float *c = l.weight_updates_gpu + j*l.size*l.size*l.c*l.n;
             int m = l.n;
             int n = l.size*l.size*l.c;
             int k = 1;
 
-            gemm_ongpu(0,1,m,n,k,1,a,locations,b,locations,1,c,n);
+            gemm_gpu(0,1,m,n,k,1,a,locations,b,locations,1,c,n); // accumulates into location j's slice of l.weight_updates_gpu
         }
 
-        if(state.delta){
+        if(net.delta_gpu){ // input gradients are only computed when net.delta_gpu is set
             for(j = 0; j < locations; ++j){ 
                 float *a = l.weights_gpu + j*l.size*l.size*l.c*l.n;
                 float *b = l.delta_gpu + i*l.outputs + j;
-                float *c = l.col_image_gpu + j;
+                float *c = net.workspace + j; // workspace is reused to hold the column-form gradient
 
                 int m = l.size*l.size*l.c;
                 int n = 1;
                 int k = l.n;
 
-                gemm_ongpu(1,0,m,n,k,1,a,m,b,locations,0,c,locations);
+                gemm_gpu(1,0,m,n,k,1,a,m,b,locations,0,c,locations); // writes column j of the workspace
             }
 
-            col2im_ongpu(l.col_image_gpu, l.c,  l.h,  l.w,  l.size,  l.stride, l.pad, state.delta+i*l.c*l.h*l.w);
+            col2im_gpu(net.workspace, l.c,  l.h,  l.w,  l.size,  l.stride, l.pad, net.delta_gpu+i*l.c*l.h*l.w); // folds the columns back into batch item i of net.delta_gpu
         }
     }
 }
 
-void update_local_layer_gpu(local_layer l, int batch, float learning_rate, float momentum, float decay)
+void update_local_layer_gpu(local_layer l, update_args a) // GPU parameter update; hyper-parameters now arrive bundled in update_args
 {
+    float learning_rate = a.learning_rate*l.learning_rate_scale; // global rate scaled by this layer's learning_rate_scale
+    float momentum = a.momentum;
+    float decay = a.decay;
+    int batch = a.batch;
+
     int locations = l.out_w*l.out_h;
     int size = l.size*l.size*l.c*l.n*locations;
-    axpy_ongpu(l.outputs, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);
-    scal_ongpu(l.outputs, momentum, l.bias_updates_gpu, 1);
+    axpy_gpu(l.outputs, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1); // presumably biases += (learning_rate/batch) * bias_updates
+    scal_gpu(l.outputs, momentum, l.bias_updates_gpu, 1); // presumably bias_updates *= momentum
 
-    axpy_ongpu(size, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
-    axpy_ongpu(size, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
-    scal_ongpu(size, momentum, l.weight_updates_gpu, 1);
+    axpy_gpu(size, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1); // presumably weight_updates -= decay*batch * weights
+    axpy_gpu(size, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1); // presumably weights += (learning_rate/batch) * weight_updates
+    scal_gpu(size, momentum, l.weight_updates_gpu, 1); // presumably weight_updates *= momentum
 }
 
 void pull_local_layer(local_layer l)
diff --git a/image.darknet/inst/include/darknet/src/local_layer.h b/image.darknet/inst/include/darknet/src/local_layer.h
index 28915d8..776e572 100644
--- a/image.darknet/inst/include/darknet/src/local_layer.h
+++ b/image.darknet/inst/include/darknet/src/local_layer.h
@@ -10,9 +10,9 @@
 typedef layer local_layer;
 
 #ifdef GPU
-void forward_local_layer_gpu(local_layer layer, network_state state);
-void backward_local_layer_gpu(local_layer layer, network_state state);
-void update_local_layer_gpu(local_layer layer, int batch, float learning_rate, float momentum, float decay);
+void forward_local_layer_gpu(local_layer layer, network net);
+void backward_local_layer_gpu(local_layer layer, network net);
+void update_local_layer_gpu(local_layer layer, update_args a);
 
 void push_local_layer(local_layer layer);
 void pull_local_layer(local_layer layer);
@@ -20,9 +20,9 @@ void pull_local_layer(local_layer layer);
 
 local_layer make_local_layer(int batch, int h, int w, int c, int n, int size, int stride, int pad, ACTIVATION activation);
 
-void forward_local_layer(const local_layer layer, network_state state);
-void backward_local_layer(local_layer layer, network_state state);
-void update_local_layer(local_layer layer, int batch, float learning_rate, float momentum, float decay);
+void forward_local_layer(const local_layer layer, network net);
+void backward_local_layer(local_layer layer, network net);
+void update_local_layer(local_layer layer, update_args a);
 
 void bias_output(float *output, float *biases, int batch, int n, int size);
 void backward_bias(float *bias_updates, float *delta, int batch, int n, int size);
diff --git a/image.darknet/inst/include/darknet/src/logistic_layer.c b/image.darknet/inst/include/darknet/src/logistic_layer.c
new file mode 100644
index 0000000..b2b3d6b
--- /dev/null
+++ b/image.darknet/inst/include/darknet/src/logistic_layer.c
@@ -0,0 +1,71 @@
+#include "logistic_layer.h"
+#include "activations.h"
+#include "blas.h"
+#include "cuda.h"
+
+#include <float.h>
+#include <math.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+
+layer make_logistic_layer(int batch, int inputs) // builds a LOGXENT layer whose output size equals its input size
+{
+    fprintf(stderr, "logistic x entropy                             %4d\n",  inputs);
+    layer l = {0};
+    l.type = LOGXENT;
+    l.inputs = inputs;
+    l.outputs = inputs;
+    l.batch = batch;
+    l.output = calloc(inputs*batch, sizeof(float));
+    l.delta = calloc(inputs*batch, sizeof(float));
+    l.loss = calloc(inputs*batch, sizeof(float)); // per-element loss, summed into l.cost by the forward pass
+    l.cost = calloc(1, sizeof(float)); // single scalar
+
+    l.backward = backward_logistic_layer;
+    l.forward = forward_logistic_layer;
+    #ifdef GPU
+    l.backward_gpu = backward_logistic_layer_gpu;
+    l.forward_gpu = forward_logistic_layer_gpu;
+
+    l.delta_gpu = cuda_make_array(l.delta, inputs*batch); 
+    l.loss_gpu = cuda_make_array(l.loss, inputs*batch); 
+    l.output_gpu = cuda_make_array(l.output, inputs*batch); 
+    #endif
+    return l;
+}
+
+void forward_logistic_layer(const layer l, network net) // CPU forward pass: logistic activation, plus loss and delta when net.truth is set
+{
+    copy_cpu(l.outputs*l.batch, net.input, 1, l.output, 1); // l.output starts as a copy of net.input
+    activate_array(l.output, l.outputs*l.batch, LOGISTIC); // apply the LOGISTIC activation to l.output
+    if(net.truth){ // the loss is only evaluated when a truth buffer is supplied
+        logistic_x_ent_cpu(l.batch*l.inputs, l.output, net.truth, l.delta, l.loss); // presumably fills l.delta and the per-element l.loss — confirm in blas.c
+        l.cost[0] = sum_array(l.loss, l.batch*l.inputs); // cost = sum_array over all batch*inputs loss entries
+    }
+}
+
+void backward_logistic_layer(const layer l, network net) // CPU backward pass: folds l.delta into net.delta
+{
+    axpy_cpu(l.inputs*l.batch, 1, l.delta, 1, net.delta, 1); // presumably net.delta += 1 * l.delta (BLAS-style axpy) — confirm in blas.c
+}
+
+#ifdef GPU
+
+void forward_logistic_layer_gpu(const layer l, network net) // GPU forward pass: same steps as the CPU version, with the loss summed on the host
+{
+    copy_gpu(l.outputs*l.batch, net.input_gpu, 1, l.output_gpu, 1); // l.output_gpu starts as a copy of net.input_gpu
+    activate_array_gpu(l.output_gpu, l.outputs*l.batch, LOGISTIC); // apply the LOGISTIC activation on the device
+    if(net.truth){ // NOTE(review): gates on the host pointer net.truth while the call below reads net.truth_gpu — confirm both are always set together
+        logistic_x_ent_gpu(l.batch*l.inputs, l.output_gpu, net.truth_gpu, l.delta_gpu, l.loss_gpu); // presumably fills l.delta_gpu and l.loss_gpu — confirm
+        cuda_pull_array(l.loss_gpu, l.loss, l.batch*l.inputs); // presumably device->host copy so the loss can be summed on the CPU
+        l.cost[0] = sum_array(l.loss, l.batch*l.inputs); // cost = sum_array over all batch*inputs loss entries
+    }
+}
+
+void backward_logistic_layer_gpu(const layer l, network net) // GPU backward pass: folds l.delta_gpu into net.delta_gpu
+{
+    axpy_gpu(l.batch*l.inputs, 1, l.delta_gpu, 1, net.delta_gpu, 1); // presumably net.delta_gpu += 1 * l.delta_gpu (BLAS-style axpy)
+}
+
+#endif
diff --git a/image.darknet/inst/include/darknet/src/logistic_layer.h b/image.darknet/inst/include/darknet/src/logistic_layer.h
new file mode 100644
index 0000000..9c25bee
--- /dev/null
+++ b/image.darknet/inst/include/darknet/src/logistic_layer.h
@@ -0,0 +1,15 @@
+#ifndef LOGISTIC_LAYER_H
+#define LOGISTIC_LAYER_H
+#include "layer.h"
+#include "network.h"
+
+layer make_logistic_layer(int batch, int inputs); // allocates loss, output and delta buffers of inputs*batch floats plus a one-float cost
+void forward_logistic_layer(const layer l, network net); // LOGISTIC activation of a copy of net.input; computes loss and cost when net.truth is set
+void backward_logistic_layer(const layer l, network net); // axpy of l.delta into net.delta
+
+#ifdef GPU
+void forward_logistic_layer_gpu(const layer l, network net); // same steps on the *_gpu buffers; the loss is pulled to the host for summing
+void backward_logistic_layer_gpu(const layer l, network net); // axpy of l.delta_gpu into net.delta_gpu
+#endif
+
+#endif
diff --git a/image.darknet/inst/include/darknet/src/lstm_layer.c b/image.darknet/inst/include/darknet/src/lstm_layer.c
new file mode 100644
index 0000000..fb07de2
--- /dev/null
+++ b/image.darknet/inst/include/darknet/src/lstm_layer.c
@@ -0,0 +1,626 @@
+#include "lstm_layer.h"
+#include "connected_layer.h"
+#include "utils.h"
+#include "cuda.h"
+#include "blas.h"
+#include "gemm.h"
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* Slide a layer's per-timestep buffer pointers by `steps` slices of outputs*batch elements; callers pass negative values to rewind. */
+static void increment_layer(layer *l, int steps)
+{
+    const int shift = l->outputs*l->batch*steps;
+    l->x_norm += shift;
+    l->x      += shift;
+    l->delta  += shift;
+    l->output += shift;
+#ifdef GPU
+    l->x_norm_gpu += shift;
+    l->x_gpu      += shift;
+    l->delta_gpu  += shift;
+    l->output_gpu += shift;
+#endif
+}
+
+/* Defined later in this file; declared here so the constructor can install it. */
+void backward_lstm_layer(layer l, network state);
+
+/* Allocate one connected sub-layer of an LSTM layer: it is built for
+ * batch*steps samples, then its batch field is reset to the per-step batch. */
+static layer *make_lstm_gate(int batch, int steps, int inputs, int outputs, int batch_normalize, int adam)
+{
+    layer *gate = malloc(sizeof(layer));
+    fprintf(stderr, "\t\t");
+    *gate = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize, adam);
+    gate->batch = batch;
+    return gate;
+}
+
+/*
+ * Build an LSTM layer.
+ *
+ *   batch            total batch size; divided by steps to get the per-step batch
+ *   inputs           input width per sample
+ *   outputs          output width per sample
+ *   steps            number of timesteps; must be non-zero (used as a divisor)
+ *   batch_normalize  forwarded to every connected sub-layer
+ *   adam             forwarded to every connected sub-layer
+ *
+ * Returns the populated layer by value. forward_lstm_layer feeds the u*
+ * sub-layers the external input and the w* sub-layers the hidden state.
+ */
+layer make_lstm_layer(int batch, int inputs, int outputs, int steps, int batch_normalize, int adam)
+{
+    fprintf(stderr, "LSTM Layer: %d inputs, %d outputs\n", inputs, outputs);
+    batch = batch / steps;
+    layer l = { 0 };
+    l.batch = batch;
+    l.type = LSTM;
+    l.steps = steps;
+    l.inputs = inputs;
+
+    /* Sub-layers are created in the order uf, ui, ug, uo, wf, wi, wg, wo. */
+    l.uf = make_lstm_gate(batch, steps, inputs, outputs, batch_normalize, adam);
+    l.ui = make_lstm_gate(batch, steps, inputs, outputs, batch_normalize, adam);
+    l.ug = make_lstm_gate(batch, steps, inputs, outputs, batch_normalize, adam);
+    l.uo = make_lstm_gate(batch, steps, inputs, outputs, batch_normalize, adam);
+    l.wf = make_lstm_gate(batch, steps, outputs, outputs, batch_normalize, adam);
+    l.wi = make_lstm_gate(batch, steps, outputs, outputs, batch_normalize, adam);
+    l.wg = make_lstm_gate(batch, steps, outputs, outputs, batch_normalize, adam);
+    l.wo = make_lstm_gate(batch, steps, outputs, outputs, batch_normalize, adam);
+
+    l.batch_normalize = batch_normalize;
+    l.outputs = outputs;
+
+    l.output = calloc(outputs*batch*steps, sizeof(float));
+    /* forward_lstm_layer zeroes l.delta when training and backward_lstm_layer
+     * reads it, so the CPU path needs this buffer as well. */
+    l.delta = calloc(outputs*batch*steps, sizeof(float));
+    l.state = calloc(outputs*batch, sizeof(float));
+
+    l.forward = forward_lstm_layer;
+    l.backward = backward_lstm_layer;
+    l.update = update_lstm_layer;
+
+    l.prev_state_cpu =  calloc(batch*outputs, sizeof(float));
+    l.prev_cell_cpu =   calloc(batch*outputs, sizeof(float));
+    l.cell_cpu =        calloc(batch*outputs*steps, sizeof(float));
+
+    l.f_cpu =           calloc(batch*outputs, sizeof(float));
+    l.i_cpu =           calloc(batch*outputs, sizeof(float));
+    l.g_cpu =           calloc(batch*outputs, sizeof(float));
+    l.o_cpu =           calloc(batch*outputs, sizeof(float));
+    l.c_cpu =           calloc(batch*outputs, sizeof(float));
+    l.h_cpu =           calloc(batch*outputs, sizeof(float));
+    l.temp_cpu =        calloc(batch*outputs, sizeof(float));
+    l.temp2_cpu =       calloc(batch*outputs, sizeof(float));
+    l.temp3_cpu =       calloc(batch*outputs, sizeof(float));
+    l.dc_cpu =          calloc(batch*outputs, sizeof(float));
+    l.dh_cpu =          calloc(batch*outputs, sizeof(float));
+
+#ifdef GPU
+    l.forward_gpu = forward_lstm_layer_gpu;
+    l.backward_gpu = backward_lstm_layer_gpu;
+    l.update_gpu = update_lstm_layer_gpu;
+
+    l.output_gpu = cuda_make_array(0, batch*outputs*steps);
+    l.delta_gpu = cuda_make_array(0, batch*l.outputs*steps);
+
+    l.prev_state_gpu = cuda_make_array(0, batch*outputs);
+    l.prev_cell_gpu = cuda_make_array(0, batch*outputs);
+    l.cell_gpu = cuda_make_array(0, batch*outputs*steps);
+
+    l.f_gpu = cuda_make_array(0, batch*outputs);
+    l.i_gpu = cuda_make_array(0, batch*outputs);
+    l.g_gpu = cuda_make_array(0, batch*outputs);
+    l.o_gpu = cuda_make_array(0, batch*outputs);
+    l.c_gpu = cuda_make_array(0, batch*outputs);
+    l.h_gpu = cuda_make_array(0, batch*outputs);
+    l.temp_gpu =  cuda_make_array(0, batch*outputs);
+    l.temp2_gpu = cuda_make_array(0, batch*outputs);
+    l.temp3_gpu = cuda_make_array(0, batch*outputs);
+    l.dc_gpu = cuda_make_array(0, batch*outputs);
+    l.dh_gpu = cuda_make_array(0, batch*outputs);
+#ifdef CUDNN
+    cudnnSetTensor4dDescriptor(l.wf->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wf->out_c, l.wf->out_h, l.wf->out_w);
+    cudnnSetTensor4dDescriptor(l.wi->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wi->out_c, l.wi->out_h, l.wi->out_w);
+    cudnnSetTensor4dDescriptor(l.wg->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wg->out_c, l.wg->out_h, l.wg->out_w);
+    cudnnSetTensor4dDescriptor(l.wo->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wo->out_c, l.wo->out_h, l.wo->out_w);
+
+    cudnnSetTensor4dDescriptor(l.uf->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.uf->out_c, l.uf->out_h, l.uf->out_w);
+    cudnnSetTensor4dDescriptor(l.ui->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.ui->out_c, l.ui->out_h, l.ui->out_w);
+    cudnnSetTensor4dDescriptor(l.ug->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.ug->out_c, l.ug->out_h, l.ug->out_w);
+    cudnnSetTensor4dDescriptor(l.uo->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.uo->out_c, l.uo->out_h, l.uo->out_w);
+#endif
+#endif
+
+    return l;
+}
+
+/* Forward the update arguments to update_connected_layer for each of the
+ * eight sub-layers, in the order wf, wi, wg, wo, uf, ui, ug, uo. */
+void update_lstm_layer(layer l, update_args a)
+{
+    layer *subs[] = { l.wf, l.wi, l.wg, l.wo, l.uf, l.ui, l.ug, l.uo };
+    int k;
+    const int count = sizeof(subs)/sizeof(subs[0]);
+    for (k = 0; k < count; ++k) {
+        update_connected_layer(*subs[k], a);
+    }
+}
+
+void forward_lstm_layer(layer l, network state)
+{   /* CPU forward pass over l.steps timesteps. l and state are by-value copies, so the pointer bumps below do not reach the caller. */
+    network s = { 0 };
+    s.train = state.train;
+    int i;
+    layer wf = *(l.wf);
+    layer wi = *(l.wi);
+    layer wg = *(l.wg);
+    layer wo = *(l.wo);
+    /* Local struct copies: increment_layer() moves their buffer pointers without touching the stored sub-layers. */
+    layer uf = *(l.uf);
+    layer ui = *(l.ui);
+    layer ug = *(l.ug);
+    layer uo = *(l.uo);
+    /* Fill the sub-layers' delta buffers with 0 across all timesteps; l.delta only when training. */
+    fill_cpu(l.outputs * l.batch * l.steps, 0, wf.delta, 1);
+    fill_cpu(l.outputs * l.batch * l.steps, 0, wi.delta, 1);
+    fill_cpu(l.outputs * l.batch * l.steps, 0, wg.delta, 1);
+    fill_cpu(l.outputs * l.batch * l.steps, 0, wo.delta, 1);
+
+    fill_cpu(l.outputs * l.batch * l.steps, 0, uf.delta, 1);
+    fill_cpu(l.outputs * l.batch * l.steps, 0, ui.delta, 1);
+    fill_cpu(l.outputs * l.batch * l.steps, 0, ug.delta, 1);
+    fill_cpu(l.outputs * l.batch * l.steps, 0, uo.delta, 1);
+    if (state.train) {
+        fill_cpu(l.outputs * l.batch * l.steps, 0, l.delta, 1);
+    }
+    /* One iteration per timestep; the w* sub-layers read the hidden state l.h_cpu left by the previous iteration. */
+    for (i = 0; i < l.steps; ++i) {
+        s.input = l.h_cpu;
+        forward_connected_layer(wf, s);							
+        forward_connected_layer(wi, s);							
+        forward_connected_layer(wg, s);							
+        forward_connected_layer(wo, s);							
+        /* The u* sub-layers read this timestep's slice of the external input. */
+        s.input = state.input;
+        forward_connected_layer(uf, s);							
+        forward_connected_layer(ui, s);							
+        forward_connected_layer(ug, s);							
+        forward_connected_layer(uo, s);							
+        /* Gate pre-activations: w?.output combined with u?.output (assumes axpy_cpu accumulates into its last pointer - confirm in blas.h). */
+        copy_cpu(l.outputs*l.batch, wf.output, 1, l.f_cpu, 1);
+        axpy_cpu(l.outputs*l.batch, 1, uf.output, 1, l.f_cpu, 1);
+
+        copy_cpu(l.outputs*l.batch, wi.output, 1, l.i_cpu, 1);	
+        axpy_cpu(l.outputs*l.batch, 1, ui.output, 1, l.i_cpu, 1);	
+
+        copy_cpu(l.outputs*l.batch, wg.output, 1, l.g_cpu, 1);	
+        axpy_cpu(l.outputs*l.batch, 1, ug.output, 1, l.g_cpu, 1);	
+
+        copy_cpu(l.outputs*l.batch, wo.output, 1, l.o_cpu, 1);	
+        axpy_cpu(l.outputs*l.batch, 1, uo.output, 1, l.o_cpu, 1);	
+        /* Activations: LOGISTIC for f, i and o; TANH for g. */
+        activate_array(l.f_cpu, l.outputs*l.batch, LOGISTIC);		
+        activate_array(l.i_cpu, l.outputs*l.batch, LOGISTIC);		
+        activate_array(l.g_cpu, l.outputs*l.batch, TANH);			
+        activate_array(l.o_cpu, l.outputs*l.batch, LOGISTIC);		
+        /* Cell update, presumably c = f*c + i*g with temp holding i*g (mul_cpu semantics assumed - confirm). */
+        copy_cpu(l.outputs*l.batch, l.i_cpu, 1, l.temp_cpu, 1);		
+        mul_cpu(l.outputs*l.batch, l.g_cpu, 1, l.temp_cpu, 1);		
+        mul_cpu(l.outputs*l.batch, l.f_cpu, 1, l.c_cpu, 1);			
+        axpy_cpu(l.outputs*l.batch, 1, l.temp_cpu, 1, l.c_cpu, 1);	
+        /* Hidden state, presumably h = o * tanh(c). */
+        copy_cpu(l.outputs*l.batch, l.c_cpu, 1, l.h_cpu, 1);			
+        activate_array(l.h_cpu, l.outputs*l.batch, TANH);		
+        mul_cpu(l.outputs*l.batch, l.o_cpu, 1, l.h_cpu, 1);	
+        /* Store this timestep's cell state and hidden state in the per-timestep buffers. */
+        copy_cpu(l.outputs*l.batch, l.c_cpu, 1, l.cell_cpu, 1);		
+        copy_cpu(l.outputs*l.batch, l.h_cpu, 1, l.output, 1);
+        /* Advance every per-timestep pointer by one slice. */
+        state.input += l.inputs*l.batch;
+        l.output    += l.outputs*l.batch;
+        l.cell_cpu      += l.outputs*l.batch;
+
+        increment_layer(&wf, 1);
+        increment_layer(&wi, 1);
+        increment_layer(&wg, 1);
+        increment_layer(&wo, 1);
+
+        increment_layer(&uf, 1);
+        increment_layer(&ui, 1);
+        increment_layer(&ug, 1);
+        increment_layer(&uo, 1);
+    }
+}
+
+void backward_lstm_layer(layer l, network state)
+{   /* CPU backward pass: walks the timesteps from last to first. l and state are by-value copies, so pointer arithmetic here stays local. */
+    network s = { 0 };
+    s.train = state.train;
+    int i;
+    layer wf = *(l.wf);
+    layer wi = *(l.wi);
+    layer wg = *(l.wg);
+    layer wo = *(l.wo);
+
+    layer uf = *(l.uf);
+    layer ui = *(l.ui);
+    layer ug = *(l.ug);
+    layer uo = *(l.uo);
+    /* Position the sub-layers' per-timestep pointers on the last timestep. */
+    increment_layer(&wf, l.steps - 1);
+    increment_layer(&wi, l.steps - 1);
+    increment_layer(&wg, l.steps - 1);
+    increment_layer(&wo, l.steps - 1);
+
+    increment_layer(&uf, l.steps - 1);
+    increment_layer(&ui, l.steps - 1);
+    increment_layer(&ug, l.steps - 1);
+    increment_layer(&uo, l.steps - 1);
+    /* Same for the external input and, when one is supplied, its gradient buffer. */
+    state.input += l.inputs*l.batch*(l.steps - 1);
+    if (state.delta) state.delta += l.inputs*l.batch*(l.steps - 1);
+    /* ...and for this layer's own output, cell history and delta. */
+    l.output += l.outputs*l.batch*(l.steps - 1);
+    l.cell_cpu += l.outputs*l.batch*(l.steps - 1);
+    l.delta += l.outputs*l.batch*(l.steps - 1);
+    /* i counts down; every pointer above is stepped back one slice at the bottom of the loop body. */
+    for (i = l.steps - 1; i >= 0; --i) {
+        if (i != 0) copy_cpu(l.outputs*l.batch, l.cell_cpu - l.outputs*l.batch, 1, l.prev_cell_cpu, 1); /* at i == 0 prev_cell_cpu keeps its earlier contents */
+        copy_cpu(l.outputs*l.batch, l.cell_cpu, 1, l.c_cpu, 1);
+        if (i != 0) copy_cpu(l.outputs*l.batch, l.output - l.outputs*l.batch, 1, l.prev_state_cpu, 1); /* likewise prev_state_cpu */
+        copy_cpu(l.outputs*l.batch, l.output, 1, l.h_cpu, 1);
+        /* dh_cpu is repointed (in this local copy of l) at the previous timestep's slice of l.delta, or null at i == 0. */
+        l.dh_cpu = (i == 0) ? 0 : l.delta - l.outputs*l.batch;
+        /* Rebuild the four gate activations from the stored sub-layer outputs, as in the forward pass. */
+        copy_cpu(l.outputs*l.batch, wf.output, 1, l.f_cpu, 1);			
+        axpy_cpu(l.outputs*l.batch, 1, uf.output, 1, l.f_cpu, 1);			
+
+        copy_cpu(l.outputs*l.batch, wi.output, 1, l.i_cpu, 1);			
+        axpy_cpu(l.outputs*l.batch, 1, ui.output, 1, l.i_cpu, 1);			
+
+        copy_cpu(l.outputs*l.batch, wg.output, 1, l.g_cpu, 1);			
+        axpy_cpu(l.outputs*l.batch, 1, ug.output, 1, l.g_cpu, 1);			
+
+        copy_cpu(l.outputs*l.batch, wo.output, 1, l.o_cpu, 1);			
+        axpy_cpu(l.outputs*l.batch, 1, uo.output, 1, l.o_cpu, 1);			
+
+        activate_array(l.f_cpu, l.outputs*l.batch, LOGISTIC);			
+        activate_array(l.i_cpu, l.outputs*l.batch, LOGISTIC);		
+        activate_array(l.g_cpu, l.outputs*l.batch, TANH);			
+        activate_array(l.o_cpu, l.outputs*l.batch, LOGISTIC);		
+        /* temp3 <- this timestep's slice of l.delta. */
+        copy_cpu(l.outputs*l.batch, l.delta, 1, l.temp3_cpu, 1);		
+        /* temp <- tanh(c). */
+        copy_cpu(l.outputs*l.batch, l.c_cpu, 1, l.temp_cpu, 1);			
+        activate_array(l.temp_cpu, l.outputs*l.batch, TANH);			
+        /* temp2 <- temp3 combined with o via mul_cpu (presumably an element-wise product - confirm in blas.h). */
+        copy_cpu(l.outputs*l.batch, l.temp3_cpu, 1, l.temp2_cpu, 1);		
+        mul_cpu(l.outputs*l.batch, l.o_cpu, 1, l.temp2_cpu, 1);			
+        /* Apply the TANH gradient to temp2, then add dc_cpu written by the previous loop iteration (the later timestep). */
+        gradient_array(l.temp_cpu, l.outputs*l.batch, TANH, l.temp2_cpu);
+        axpy_cpu(l.outputs*l.batch, 1, l.dc_cpu, 1, l.temp2_cpu, 1);		
+        /* Output gate o: temp <- tanh(c) combined with temp3, through the LOGISTIC gradient at o; handed to wo, then uo. */
+        copy_cpu(l.outputs*l.batch, l.c_cpu, 1, l.temp_cpu, 1);			
+        activate_array(l.temp_cpu, l.outputs*l.batch, TANH);			
+        mul_cpu(l.outputs*l.batch, l.temp3_cpu, 1, l.temp_cpu, 1);		
+        gradient_array(l.o_cpu, l.outputs*l.batch, LOGISTIC, l.temp_cpu);
+        copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, wo.delta, 1);
+        s.input = l.prev_state_cpu;
+        s.delta = l.dh_cpu;															
+        backward_connected_layer(wo, s);	
+
+        copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, uo.delta, 1);
+        s.input = state.input;
+        s.delta = state.delta;
+        backward_connected_layer(uo, s);									
+        /* Gate g: temp <- temp2 combined with i, through the TANH gradient at g; handed to wg, then ug. */
+        copy_cpu(l.outputs*l.batch, l.temp2_cpu, 1, l.temp_cpu, 1);			
+        mul_cpu(l.outputs*l.batch, l.i_cpu, 1, l.temp_cpu, 1);				
+        gradient_array(l.g_cpu, l.outputs*l.batch, TANH, l.temp_cpu);		
+        copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, wg.delta, 1);
+        s.input = l.prev_state_cpu;
+        s.delta = l.dh_cpu;														
+        backward_connected_layer(wg, s);	
+
+        copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, ug.delta, 1);
+        s.input = state.input;
+        s.delta = state.delta;
+        backward_connected_layer(ug, s);																
+        /* Input gate i: temp <- temp2 combined with g, through the LOGISTIC gradient at i; handed to wi, then ui. */
+        copy_cpu(l.outputs*l.batch, l.temp2_cpu, 1, l.temp_cpu, 1);			
+        mul_cpu(l.outputs*l.batch, l.g_cpu, 1, l.temp_cpu, 1);				
+        gradient_array(l.i_cpu, l.outputs*l.batch, LOGISTIC, l.temp_cpu);	
+        copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, wi.delta, 1);
+        s.input = l.prev_state_cpu;
+        s.delta = l.dh_cpu;
+        backward_connected_layer(wi, s);						
+
+        copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, ui.delta, 1);
+        s.input = state.input;
+        s.delta = state.delta;
+        backward_connected_layer(ui, s);									
+        /* Forget gate f: temp <- temp2 combined with prev_cell, through the LOGISTIC gradient at f; handed to wf, then uf. */
+        copy_cpu(l.outputs*l.batch, l.temp2_cpu, 1, l.temp_cpu, 1);		
+        mul_cpu(l.outputs*l.batch, l.prev_cell_cpu, 1, l.temp_cpu, 1);
+        gradient_array(l.f_cpu, l.outputs*l.batch, LOGISTIC, l.temp_cpu);
+        copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, wf.delta, 1);
+        s.input = l.prev_state_cpu;
+        s.delta = l.dh_cpu;
+        backward_connected_layer(wf, s);						
+
+        copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, uf.delta, 1);
+        s.input = state.input;
+        s.delta = state.delta;
+        backward_connected_layer(uf, s);									
+        /* dc_cpu <- temp2 combined with f: read back by the next loop iteration (the earlier timestep). */
+        copy_cpu(l.outputs*l.batch, l.temp2_cpu, 1, l.temp_cpu, 1);			
+        mul_cpu(l.outputs*l.batch, l.f_cpu, 1, l.temp_cpu, 1);				
+        copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, l.dc_cpu, 1);				
+        /* Step every per-timestep pointer back by one slice. */
+        state.input -= l.inputs*l.batch;
+        if (state.delta) state.delta -= l.inputs*l.batch;
+        l.output -= l.outputs*l.batch;
+        l.cell_cpu -= l.outputs*l.batch;
+        l.delta -= l.outputs*l.batch;
+
+        increment_layer(&wf, -1);
+        increment_layer(&wi, -1);
+        increment_layer(&wg, -1);
+        increment_layer(&wo, -1);
+
+        increment_layer(&uf, -1);
+        increment_layer(&ui, -1);
+        increment_layer(&ug, -1);
+        increment_layer(&uo, -1);
+    }
+}
+
+#ifdef GPU
+/* Forward the update arguments to update_connected_layer_gpu for each of the
+ * eight sub-layers, in the order wf, wi, wg, wo, uf, ui, ug, uo. */
+void update_lstm_layer_gpu(layer l, update_args a)
+{
+    layer *subs[] = { l.wf, l.wi, l.wg, l.wo, l.uf, l.ui, l.ug, l.uo };
+    int k;
+    const int count = sizeof(subs)/sizeof(subs[0]);
+    for (k = 0; k < count; ++k) {
+        update_connected_layer_gpu(*subs[k], a);
+    }
+}
+
+void forward_lstm_layer_gpu(layer l, network state)
+{   /* GPU forward pass over l.steps timesteps; mirrors forward_lstm_layer on the *_gpu buffers. l and state are by-value copies. */
+    network s = { 0 };
+    s.train = state.train;
+    int i;
+    layer wf = *(l.wf);
+    layer wi = *(l.wi);
+    layer wg = *(l.wg);
+    layer wo = *(l.wo);
+    /* Local struct copies: increment_layer() moves their buffer pointers without touching the stored sub-layers. */
+    layer uf = *(l.uf);
+    layer ui = *(l.ui);
+    layer ug = *(l.ug);
+    layer uo = *(l.uo);
+    /* Fill the sub-layers' delta_gpu buffers with 0 across all timesteps; l.delta_gpu only when training. */
+    fill_gpu(l.outputs * l.batch * l.steps, 0, wf.delta_gpu, 1);
+    fill_gpu(l.outputs * l.batch * l.steps, 0, wi.delta_gpu, 1);
+    fill_gpu(l.outputs * l.batch * l.steps, 0, wg.delta_gpu, 1);
+    fill_gpu(l.outputs * l.batch * l.steps, 0, wo.delta_gpu, 1);
+
+    fill_gpu(l.outputs * l.batch * l.steps, 0, uf.delta_gpu, 1);
+    fill_gpu(l.outputs * l.batch * l.steps, 0, ui.delta_gpu, 1);
+    fill_gpu(l.outputs * l.batch * l.steps, 0, ug.delta_gpu, 1);
+    fill_gpu(l.outputs * l.batch * l.steps, 0, uo.delta_gpu, 1);
+    if (state.train) {
+        fill_gpu(l.outputs * l.batch * l.steps, 0, l.delta_gpu, 1);
+    }
+    /* One iteration per timestep; the w* sub-layers read the hidden state l.h_gpu left by the previous iteration. */
+    for (i = 0; i < l.steps; ++i) {
+        s.input_gpu = l.h_gpu;
+        forward_connected_layer_gpu(wf, s);							
+        forward_connected_layer_gpu(wi, s);							
+        forward_connected_layer_gpu(wg, s);							
+        forward_connected_layer_gpu(wo, s);							
+        /* The u* sub-layers read this timestep's slice of the external input. */
+        s.input_gpu = state.input_gpu;
+        forward_connected_layer_gpu(uf, s);							
+        forward_connected_layer_gpu(ui, s);							
+        forward_connected_layer_gpu(ug, s);							
+        forward_connected_layer_gpu(uo, s);							
+        /* Gate pre-activations: w?.output_gpu combined with u?.output_gpu (assumes axpy_gpu accumulates into its last pointer - confirm). */
+        copy_gpu(l.outputs*l.batch, wf.output_gpu, 1, l.f_gpu, 1);
+        axpy_gpu(l.outputs*l.batch, 1, uf.output_gpu, 1, l.f_gpu, 1);
+
+        copy_gpu(l.outputs*l.batch, wi.output_gpu, 1, l.i_gpu, 1);	
+        axpy_gpu(l.outputs*l.batch, 1, ui.output_gpu, 1, l.i_gpu, 1);	
+
+        copy_gpu(l.outputs*l.batch, wg.output_gpu, 1, l.g_gpu, 1);	
+        axpy_gpu(l.outputs*l.batch, 1, ug.output_gpu, 1, l.g_gpu, 1);	
+
+        copy_gpu(l.outputs*l.batch, wo.output_gpu, 1, l.o_gpu, 1);	
+        axpy_gpu(l.outputs*l.batch, 1, uo.output_gpu, 1, l.o_gpu, 1);	
+        /* Activations: LOGISTIC for f, i and o; TANH for g. */
+        activate_array_gpu(l.f_gpu, l.outputs*l.batch, LOGISTIC);		
+        activate_array_gpu(l.i_gpu, l.outputs*l.batch, LOGISTIC);		
+        activate_array_gpu(l.g_gpu, l.outputs*l.batch, TANH);			
+        activate_array_gpu(l.o_gpu, l.outputs*l.batch, LOGISTIC);		
+        /* Cell update, presumably c = f*c + i*g with temp holding i*g (mul_gpu semantics assumed - confirm). */
+        copy_gpu(l.outputs*l.batch, l.i_gpu, 1, l.temp_gpu, 1);		
+        mul_gpu(l.outputs*l.batch, l.g_gpu, 1, l.temp_gpu, 1);		
+        mul_gpu(l.outputs*l.batch, l.f_gpu, 1, l.c_gpu, 1);			
+        axpy_gpu(l.outputs*l.batch, 1, l.temp_gpu, 1, l.c_gpu, 1);	
+        /* Hidden state, presumably h = o * tanh(c). */
+        copy_gpu(l.outputs*l.batch, l.c_gpu, 1, l.h_gpu, 1);			
+        activate_array_gpu(l.h_gpu, l.outputs*l.batch, TANH);		
+        mul_gpu(l.outputs*l.batch, l.o_gpu, 1, l.h_gpu, 1);	
+        /* Store this timestep's cell state and hidden state in the per-timestep buffers. */
+        copy_gpu(l.outputs*l.batch, l.c_gpu, 1, l.cell_gpu, 1);		
+        copy_gpu(l.outputs*l.batch, l.h_gpu, 1, l.output_gpu, 1);
+        /* Advance every per-timestep pointer by one slice. */
+        state.input_gpu += l.inputs*l.batch;
+        l.output_gpu    += l.outputs*l.batch;
+        l.cell_gpu      += l.outputs*l.batch;
+
+        increment_layer(&wf, 1);
+        increment_layer(&wi, 1);
+        increment_layer(&wg, 1);
+        increment_layer(&wo, 1);
+
+        increment_layer(&uf, 1);
+        increment_layer(&ui, 1);
+        increment_layer(&ug, 1);
+        increment_layer(&uo, 1);
+    }
+}
+
+void backward_lstm_layer_gpu(layer l, network state)
+{   /* GPU backward pass: walks the timesteps from last to first on the *_gpu buffers. l and state are by-value copies. */
+    network s = { 0 };
+    s.train = state.train;
+    int i;
+    layer wf = *(l.wf);
+    layer wi = *(l.wi);
+    layer wg = *(l.wg);
+    layer wo = *(l.wo);
+
+    layer uf = *(l.uf);
+    layer ui = *(l.ui);
+    layer ug = *(l.ug);
+    layer uo = *(l.uo);
+    /* Position the sub-layers' per-timestep pointers on the last timestep. */
+    increment_layer(&wf, l.steps - 1);
+    increment_layer(&wi, l.steps - 1);
+    increment_layer(&wg, l.steps - 1);
+    increment_layer(&wo, l.steps - 1);
+
+    increment_layer(&uf, l.steps - 1);
+    increment_layer(&ui, l.steps - 1);
+    increment_layer(&ug, l.steps - 1);
+    increment_layer(&uo, l.steps - 1);
+    /* Same for the external input and, when one is supplied, its gradient buffer. */
+    state.input_gpu += l.inputs*l.batch*(l.steps - 1);
+    if (state.delta_gpu) state.delta_gpu += l.inputs*l.batch*(l.steps - 1);
+    /* ...and for this layer's own output, cell history and delta. */
+    l.output_gpu += l.outputs*l.batch*(l.steps - 1);
+    l.cell_gpu += l.outputs*l.batch*(l.steps - 1);
+    l.delta_gpu += l.outputs*l.batch*(l.steps - 1);
+    /* i counts down; every pointer above is stepped back one slice at the bottom of the loop body. */
+    for (i = l.steps - 1; i >= 0; --i) {
+        if (i != 0) copy_gpu(l.outputs*l.batch, l.cell_gpu - l.outputs*l.batch, 1, l.prev_cell_gpu, 1); /* at i == 0 prev_cell_gpu keeps its earlier contents */
+        copy_gpu(l.outputs*l.batch, l.cell_gpu, 1, l.c_gpu, 1);
+        if (i != 0) copy_gpu(l.outputs*l.batch, l.output_gpu - l.outputs*l.batch, 1, l.prev_state_gpu, 1); /* likewise prev_state_gpu */
+        copy_gpu(l.outputs*l.batch, l.output_gpu, 1, l.h_gpu, 1);
+        /* dh_gpu is repointed (in this local copy of l) at the previous timestep's slice of l.delta_gpu, or null at i == 0. */
+        l.dh_gpu = (i == 0) ? 0 : l.delta_gpu - l.outputs*l.batch;
+        /* Rebuild the four gate activations from the stored sub-layer outputs, as in the forward pass. */
+        copy_gpu(l.outputs*l.batch, wf.output_gpu, 1, l.f_gpu, 1);			
+        axpy_gpu(l.outputs*l.batch, 1, uf.output_gpu, 1, l.f_gpu, 1);			
+
+        copy_gpu(l.outputs*l.batch, wi.output_gpu, 1, l.i_gpu, 1);			
+        axpy_gpu(l.outputs*l.batch, 1, ui.output_gpu, 1, l.i_gpu, 1);			
+
+        copy_gpu(l.outputs*l.batch, wg.output_gpu, 1, l.g_gpu, 1);			
+        axpy_gpu(l.outputs*l.batch, 1, ug.output_gpu, 1, l.g_gpu, 1);			
+
+        copy_gpu(l.outputs*l.batch, wo.output_gpu, 1, l.o_gpu, 1);			
+        axpy_gpu(l.outputs*l.batch, 1, uo.output_gpu, 1, l.o_gpu, 1);			
+
+        activate_array_gpu(l.f_gpu, l.outputs*l.batch, LOGISTIC);			
+        activate_array_gpu(l.i_gpu, l.outputs*l.batch, LOGISTIC);		
+        activate_array_gpu(l.g_gpu, l.outputs*l.batch, TANH);			
+        activate_array_gpu(l.o_gpu, l.outputs*l.batch, LOGISTIC);		
+        /* temp3 <- this timestep's slice of l.delta_gpu. */
+        copy_gpu(l.outputs*l.batch, l.delta_gpu, 1, l.temp3_gpu, 1);		
+        /* temp <- tanh(c). */
+        copy_gpu(l.outputs*l.batch, l.c_gpu, 1, l.temp_gpu, 1);			
+        activate_array_gpu(l.temp_gpu, l.outputs*l.batch, TANH);			
+        /* temp2 <- temp3 combined with o via mul_gpu (presumably an element-wise product - confirm). */
+        copy_gpu(l.outputs*l.batch, l.temp3_gpu, 1, l.temp2_gpu, 1);		
+        mul_gpu(l.outputs*l.batch, l.o_gpu, 1, l.temp2_gpu, 1);			
+        /* Apply the TANH gradient to temp2, then add dc_gpu written by the previous loop iteration (the later timestep). */
+        gradient_array_gpu(l.temp_gpu, l.outputs*l.batch, TANH, l.temp2_gpu);
+        axpy_gpu(l.outputs*l.batch, 1, l.dc_gpu, 1, l.temp2_gpu, 1);		
+        /* Output gate o: temp <- tanh(c) combined with temp3, through the LOGISTIC gradient at o; handed to wo, then uo. */
+        copy_gpu(l.outputs*l.batch, l.c_gpu, 1, l.temp_gpu, 1);			
+        activate_array_gpu(l.temp_gpu, l.outputs*l.batch, TANH);			
+        mul_gpu(l.outputs*l.batch, l.temp3_gpu, 1, l.temp_gpu, 1);		
+        gradient_array_gpu(l.o_gpu, l.outputs*l.batch, LOGISTIC, l.temp_gpu);
+        copy_gpu(l.outputs*l.batch, l.temp_gpu, 1, wo.delta_gpu, 1);
+        s.input_gpu = l.prev_state_gpu;
+        s.delta_gpu = l.dh_gpu;															
+        backward_connected_layer_gpu(wo, s);	
+
+        copy_gpu(l.outputs*l.batch, l.temp_gpu, 1, uo.delta_gpu, 1);
+        s.input_gpu = state.input_gpu;
+        s.delta_gpu = state.delta_gpu;
+        backward_connected_layer_gpu(uo, s);									
+        /* Gate g: temp <- temp2 combined with i, through the TANH gradient at g; handed to wg, then ug. */
+        copy_gpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1);			
+        mul_gpu(l.outputs*l.batch, l.i_gpu, 1, l.temp_gpu, 1);				
+        gradient_array_gpu(l.g_gpu, l.outputs*l.batch, TANH, l.temp_gpu);		
+        copy_gpu(l.outputs*l.batch, l.temp_gpu, 1, wg.delta_gpu, 1);
+        s.input_gpu = l.prev_state_gpu;
+        s.delta_gpu = l.dh_gpu;														
+        backward_connected_layer_gpu(wg, s);	
+
+        copy_gpu(l.outputs*l.batch, l.temp_gpu, 1, ug.delta_gpu, 1);
+        s.input_gpu = state.input_gpu;
+        s.delta_gpu = state.delta_gpu;
+        backward_connected_layer_gpu(ug, s);																
+        /* Input gate i: temp <- temp2 combined with g, through the LOGISTIC gradient at i; handed to wi, then ui. */
+        copy_gpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1);			
+        mul_gpu(l.outputs*l.batch, l.g_gpu, 1, l.temp_gpu, 1);				
+        gradient_array_gpu(l.i_gpu, l.outputs*l.batch, LOGISTIC, l.temp_gpu);	
+        copy_gpu(l.outputs*l.batch, l.temp_gpu, 1, wi.delta_gpu, 1);
+        s.input_gpu = l.prev_state_gpu;
+        s.delta_gpu = l.dh_gpu;
+        backward_connected_layer_gpu(wi, s);						
+
+        copy_gpu(l.outputs*l.batch, l.temp_gpu, 1, ui.delta_gpu, 1);
+        s.input_gpu = state.input_gpu;
+        s.delta_gpu = state.delta_gpu;
+        backward_connected_layer_gpu(ui, s);									
+        /* Forget gate f: temp <- temp2 combined with prev_cell, through the LOGISTIC gradient at f; handed to wf, then uf. */
+        copy_gpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1);		
+        mul_gpu(l.outputs*l.batch, l.prev_cell_gpu, 1, l.temp_gpu, 1);
+        gradient_array_gpu(l.f_gpu, l.outputs*l.batch, LOGISTIC, l.temp_gpu);
+        copy_gpu(l.outputs*l.batch, l.temp_gpu, 1, wf.delta_gpu, 1);
+        s.input_gpu = l.prev_state_gpu;
+        s.delta_gpu = l.dh_gpu;
+        backward_connected_layer_gpu(wf, s);						
+
+        copy_gpu(l.outputs*l.batch, l.temp_gpu, 1, uf.delta_gpu, 1);
+        s.input_gpu = state.input_gpu;
+        s.delta_gpu = state.delta_gpu;
+        backward_connected_layer_gpu(uf, s);									
+        /* dc_gpu <- temp2 combined with f: read back by the next loop iteration (the earlier timestep). */
+        copy_gpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1);			
+        mul_gpu(l.outputs*l.batch, l.f_gpu, 1, l.temp_gpu, 1);				
+        copy_gpu(l.outputs*l.batch, l.temp_gpu, 1, l.dc_gpu, 1);				
+        /* Step every per-timestep pointer back by one slice. */
+        state.input_gpu -= l.inputs*l.batch;
+        if (state.delta_gpu) state.delta_gpu -= l.inputs*l.batch;
+        l.output_gpu -= l.outputs*l.batch;
+        l.cell_gpu -= l.outputs*l.batch;
+        l.delta_gpu -= l.outputs*l.batch;
+
+        increment_layer(&wf, -1);
+        increment_layer(&wi, -1);
+        increment_layer(&wg, -1);
+        increment_layer(&wo, -1);
+
+        increment_layer(&uf, -1);
+        increment_layer(&ui, -1);
+        increment_layer(&ug, -1);
+        increment_layer(&uo, -1);
+    }
+}
+#endif
diff --git a/image.darknet/inst/include/darknet/src/lstm_layer.h b/image.darknet/inst/include/darknet/src/lstm_layer.h
new file mode 100644
index 0000000..b9f07e6
--- /dev/null
+++ b/image.darknet/inst/include/darknet/src/lstm_layer.h
@@ -0,0 +1,20 @@
+#ifndef LSTM_LAYER_H
+#define LSTM_LAYER_H
+
+#include "activations.h"
+#include "layer.h"
+#include "network.h"
+#define USET
+
+layer make_lstm_layer(int batch, int inputs, int outputs, int steps, int batch_normalize, int adam);
+
+void forward_lstm_layer(layer l, network net);    /* CPU forward pass over all timesteps */
+void backward_lstm_layer(layer l, network net);   /* CPU backward pass; non-static definition in lstm_layer.c */
+void update_lstm_layer(layer l, update_args a);   /* updates the eight connected sub-layers */
+
+#ifdef GPU   /* CUDA variants, only declared when GPU is defined */
+void forward_lstm_layer_gpu(layer l, network net);
+void backward_lstm_layer_gpu(layer l, network net);
+void update_lstm_layer_gpu(layer l, update_args a);
+#endif
+#endif
diff --git a/image.darknet/inst/include/darknet/src/matrix.c b/image.darknet/inst/include/darknet/src/matrix.c
index ee14979..799916b 100644
--- a/image.darknet/inst/include/darknet/src/matrix.c
+++ b/image.darknet/inst/include/darknet/src/matrix.c
@@ -1,5 +1,6 @@
 #include "matrix.h"
 #include "utils.h"
+#include "blas.h"
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -73,6 +74,20 @@ void matrix_add_matrix(matrix from, matrix to)
     }
 }
 
+matrix copy_matrix(matrix m)
+{   /* Deep copy of m: a new row-pointer table plus one newly allocated row per source row, each filled via copy_cpu. */
+    int r;
+    matrix dup = {0};
+    dup.cols = m.cols;
+    dup.rows = m.rows;
+    dup.vals = calloc(m.rows, sizeof(float *));
+    for(r = 0; r < m.rows; ++r){
+        dup.vals[r] = calloc(m.cols, sizeof(float));
+        copy_cpu(m.cols, m.vals[r], 1, dup.vals[r], 1);
+    }
+    return dup;
+}
+
 matrix make_matrix(int rows, int cols)
 {
     int i;
diff --git a/image.darknet/inst/include/darknet/src/matrix.h b/image.darknet/inst/include/darknet/src/matrix.h
index 641b596..879acd7 100644
--- a/image.darknet/inst/include/darknet/src/matrix.h
+++ b/image.darknet/inst/include/darknet/src/matrix.h
@@ -1,20 +1,11 @@
 #ifndef MATRIX_H
 #define MATRIX_H
-typedef struct matrix{
-    int rows, cols;
-    float **vals;
-} matrix;
+#include "darknet.h"
 
-matrix make_matrix(int rows, int cols);
-void free_matrix(matrix m);
+matrix copy_matrix(matrix m);
 void print_matrix(matrix m);
 
-matrix csv_to_matrix(char *filename);
-void matrix_to_csv(matrix m);
 matrix hold_out_matrix(matrix *m, int n);
-float matrix_topk_accuracy(matrix truth, matrix guess, int k);
-void matrix_add_matrix(matrix from, matrix to);
-void scale_matrix(matrix m, float scale);
 matrix resize_matrix(matrix m, int size);
 
 float *pop_column(matrix *m, int c);
diff --git a/image.darknet/inst/include/darknet/src/maxpool_layer.c b/image.darknet/inst/include/darknet/src/maxpool_layer.c
index 031d116..fb05635 100644
--- a/image.darknet/inst/include/darknet/src/maxpool_layer.c
+++ b/image.darknet/inst/include/darknet/src/maxpool_layer.c
@@ -27,8 +27,8 @@ maxpool_layer make_maxpool_layer(int batch, int h, int w, int c, int size, int s
     l.w = w;
     l.c = c;
     l.pad = padding;
-    l.out_w = (w + 2*padding)/stride;
-    l.out_h = (h + 2*padding)/stride;
+    l.out_w = (w + padding - size)/stride + 1;
+    l.out_h = (h + padding - size)/stride + 1;
     l.out_c = c;
     l.outputs = l.out_h * l.out_w * l.out_c;
     l.inputs = h*w*c;
@@ -43,7 +43,7 @@ maxpool_layer make_maxpool_layer(int batch, int h, int w, int c, int size, int s
     #ifdef GPU
     l.forward_gpu = forward_maxpool_layer_gpu;
     l.backward_gpu = backward_maxpool_layer_gpu;
-    l.indexes_gpu = cuda_make_int_array(output_size);
+    l.indexes_gpu = cuda_make_int_array(0, output_size);
     l.output_gpu  = cuda_make_array(l.output, output_size);
     l.delta_gpu   = cuda_make_array(l.delta, output_size);
     #endif
@@ -57,8 +57,8 @@ void resize_maxpool_layer(maxpool_layer *l, int w, int h)
     l->w = w;
     l->inputs = h*w*l->c;
 
-    l->out_w = (w + 2*l->pad)/l->stride;
-    l->out_h = (h + 2*l->pad)/l->stride;
+    l->out_w = (w + l->pad - l->size)/l->stride + 1;
+    l->out_h = (h + l->pad - l->size)/l->stride + 1;
     l->outputs = l->out_w * l->out_h * l->c;
     int output_size = l->outputs * l->batch;
 
@@ -70,17 +70,17 @@ void resize_maxpool_layer(maxpool_layer *l, int w, int h)
     cuda_free((float *)l->indexes_gpu);
     cuda_free(l->output_gpu);
     cuda_free(l->delta_gpu);
-    l->indexes_gpu = cuda_make_int_array(output_size);
+    l->indexes_gpu = cuda_make_int_array(0, output_size);
     l->output_gpu  = cuda_make_array(l->output, output_size);
     l->delta_gpu   = cuda_make_array(l->delta,  output_size);
     #endif
 }
 
-void forward_maxpool_layer(const maxpool_layer l, network_state state)
+void forward_maxpool_layer(const maxpool_layer l, network net)
 {
     int b,i,j,k,m,n;
-    int w_offset = -l.pad;
-    int h_offset = -l.pad;
+    int w_offset = -l.pad/2;
+    int h_offset = -l.pad/2;
 
     int h = l.out_h;
     int w = l.out_w;
@@ -100,7 +100,7 @@ void forward_maxpool_layer(const maxpool_layer l, network_state state)
                             int index = cur_w + l.w*(cur_h + l.h*(k + b*l.c));
                             int valid = (cur_h >= 0 && cur_h < l.h &&
                                          cur_w >= 0 && cur_w < l.w);
-                            float val = (valid != 0) ? state.input[index] : -FLT_MAX;
+                            float val = (valid != 0) ? net.input[index] : -FLT_MAX;
                             max_i = (val > max) ? index : max_i;
                             max   = (val > max) ? val   : max;
                         }
@@ -113,7 +113,7 @@ void forward_maxpool_layer(const maxpool_layer l, network_state state)
     }
 }
 
-void backward_maxpool_layer(const maxpool_layer l, network_state state)
+void backward_maxpool_layer(const maxpool_layer l, network net)
 {
     int i;
     int h = l.out_h;
@@ -121,7 +121,7 @@ void backward_maxpool_layer(const maxpool_layer l, network_state state)
     int c = l.c;
     for(i = 0; i < h*w*c*l.batch; ++i){
         int index = l.indexes[i];
-        state.delta[index] += l.delta[i];
+        net.delta[index] += l.delta[i];
     }
 }
 
diff --git a/image.darknet/inst/include/darknet/src/maxpool_layer.h b/image.darknet/inst/include/darknet/src/maxpool_layer.h
index ce56dd8..ceb5190 100644
--- a/image.darknet/inst/include/darknet/src/maxpool_layer.h
+++ b/image.darknet/inst/include/darknet/src/maxpool_layer.h
@@ -11,12 +11,12 @@ typedef layer maxpool_layer;
 image get_maxpool_image(maxpool_layer l);
 maxpool_layer make_maxpool_layer(int batch, int h, int w, int c, int size, int stride, int padding);
 void resize_maxpool_layer(maxpool_layer *l, int w, int h);
-void forward_maxpool_layer(const maxpool_layer l, network_state state);
-void backward_maxpool_layer(const maxpool_layer l, network_state state);
+void forward_maxpool_layer(const maxpool_layer l, network net);
+void backward_maxpool_layer(const maxpool_layer l, network net);
 
 #ifdef GPU
-void forward_maxpool_layer_gpu(maxpool_layer l, network_state state);
-void backward_maxpool_layer_gpu(maxpool_layer l, network_state state);
+void forward_maxpool_layer_gpu(maxpool_layer l, network net);
+void backward_maxpool_layer_gpu(maxpool_layer l, network net);
 #endif
 
 #endif
diff --git a/image.darknet/inst/include/darknet/src/maxpool_layer_kernels.cu b/image.darknet/inst/include/darknet/src/maxpool_layer_kernels.cu
index 6381cc1..869ef46 100644
--- a/image.darknet/inst/include/darknet/src/maxpool_layer_kernels.cu
+++ b/image.darknet/inst/include/darknet/src/maxpool_layer_kernels.cu
@@ -9,8 +9,8 @@ extern "C" {
 
 __global__ void forward_maxpool_layer_kernel(int n, int in_h, int in_w, int in_c, int stride, int size, int pad, float *input, float *output, int *indexes)
 {
-    int h = (in_h + 2*pad)/stride;
-    int w = (in_w + 2*pad)/stride;
+    int h = (in_h + pad - size)/stride + 1;
+    int w = (in_w + pad - size)/stride + 1;
     int c = in_c;
 
     int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
@@ -24,8 +24,8 @@ __global__ void forward_maxpool_layer_kernel(int n, int in_h, int in_w, int in_c
     id /= c;
     int b = id;
 
-    int w_offset = -pad;
-    int h_offset = -pad;
+    int w_offset = -pad/2;
+    int h_offset = -pad/2;
 
     int out_index = j + w*(i + h*(k + c*b));
     float max = -INFINITY;
@@ -49,8 +49,8 @@ __global__ void forward_maxpool_layer_kernel(int n, int in_h, int in_w, int in_c
 
 __global__ void backward_maxpool_layer_kernel(int n, int in_h, int in_w, int in_c, int stride, int size, int pad, float *delta, float *prev_delta, int *indexes)
 {
-    int h = (in_h + 2*pad)/stride;
-    int w = (in_w + 2*pad)/stride;
+    int h = (in_h + pad - size)/stride + 1;
+    int w = (in_w + pad - size)/stride + 1;
     int c = in_c;
     int area = (size-1)/stride;
 
@@ -66,8 +66,8 @@ __global__ void backward_maxpool_layer_kernel(int n, int in_h, int in_w, int in_
     id /= in_c;
     int b = id;
 
-    int w_offset = -pad;
-    int h_offset = -pad;
+    int w_offset = -pad/2;
+    int h_offset = -pad/2;
 
     float d = 0;
     int l, m;
@@ -84,7 +84,7 @@ __global__ void backward_maxpool_layer_kernel(int n, int in_h, int in_w, int in_
     prev_delta[index] += d;
 }
 
-extern "C" void forward_maxpool_layer_gpu(maxpool_layer layer, network_state state)
+extern "C" void forward_maxpool_layer_gpu(maxpool_layer layer, network net)
 {
     int h = layer.out_h;
     int w = layer.out_w;
@@ -92,15 +92,15 @@ extern "C" void forward_maxpool_layer_gpu(maxpool_layer layer, network_state sta
 
     size_t n = h*w*c*layer.batch;
 
-    forward_maxpool_layer_kernel<<<cuda_gridsize(n), BLOCK>>>(n, layer.h, layer.w, layer.c, layer.stride, layer.size, layer.pad, state.input, layer.output_gpu, layer.indexes_gpu);
+    forward_maxpool_layer_kernel<<<cuda_gridsize(n), BLOCK>>>(n, layer.h, layer.w, layer.c, layer.stride, layer.size, layer.pad, net.input_gpu, layer.output_gpu, layer.indexes_gpu);
     check_error(cudaPeekAtLastError());
 }
 
-extern "C" void backward_maxpool_layer_gpu(maxpool_layer layer, network_state state)
+extern "C" void backward_maxpool_layer_gpu(maxpool_layer layer, network net)
 {
     size_t n = layer.h*layer.w*layer.c*layer.batch;
 
-    backward_maxpool_layer_kernel<<<cuda_gridsize(n), BLOCK>>>(n, layer.h, layer.w, layer.c, layer.stride, layer.size, layer.pad, layer.delta_gpu, state.delta, layer.indexes_gpu);
+    backward_maxpool_layer_kernel<<<cuda_gridsize(n), BLOCK>>>(n, layer.h, layer.w, layer.c, layer.stride, layer.size, layer.pad, layer.delta_gpu, net.delta_gpu, layer.indexes_gpu);
     check_error(cudaPeekAtLastError());
 }
 
diff --git a/image.darknet/inst/include/darknet/src/network.c b/image.darknet/inst/include/darknet/src/network.c
index 0914e37..aaab799 100644
--- a/image.darknet/inst/include/darknet/src/network.c
+++ b/image.darknet/inst/include/darknet/src/network.c
@@ -17,6 +17,7 @@
 #include "activation_layer.h"
 #include "detection_layer.h"
 #include "region_layer.h"
+#include "yolo_layer.h"
 #include "normalization_layer.h"
 #include "batchnorm_layer.h"
 #include "maxpool_layer.h"
@@ -26,55 +27,95 @@
 #include "softmax_layer.h"
 #include "dropout_layer.h"
 #include "route_layer.h"
+#include "upsample_layer.h"
 #include "shortcut_layer.h"
+#include "parser.h"
+#include "data.h"
+
+load_args get_base_args(network *net)
+{
+    load_args args = {0};
+    args.w = net->w;
+    args.h = net->h;
+    args.size = net->w;
+
+    args.min = net->min_crop;
+    args.max = net->max_crop;
+    args.angle = net->angle;
+    args.aspect = net->aspect;
+    args.exposure = net->exposure;
+    args.center = net->center;
+    args.saturation = net->saturation;
+    args.hue = net->hue;
+    return args;
+}
+
+network *load_network(char *cfg, char *weights, int clear)
+{
+    network *net = parse_network_cfg(cfg);
+    if(weights && weights[0] != 0){
+        load_weights(net, weights);
+    }
+    if(clear) (*net->seen) = 0;
+    return net;
+}
 
-int get_current_batch(network net)
+size_t get_current_batch(network *net)
 {
-    int batch_num = (*net.seen)/(net.batch*net.subdivisions);
+    size_t batch_num = (*net->seen)/(net->batch*net->subdivisions);
     return batch_num;
 }
 
-void reset_momentum(network net)
+void reset_network_state(network *net, int b)
 {
-    if (net.momentum == 0) return;
-    net.learning_rate = 0;
-    net.momentum = 0;
-    net.decay = 0;
-    #ifdef GPU
-        //if(net.gpu_index >= 0) update_network_gpu(net);
-    #endif
+    int i;
+    for (i = 0; i < net->n; ++i) {
+        #ifdef GPU
+        layer l = net->layers[i];
+        if(l.state_gpu){
+            fill_gpu(l.outputs, 0, l.state_gpu + l.outputs*b, 1);
+        }
+        if(l.h_gpu){
+            fill_gpu(l.outputs, 0, l.h_gpu + l.outputs*b, 1);
+        }
+        #endif
+    }
 }
 
-float get_current_rate(network net)
+void reset_rnn(network *net)
 {
-    int batch_num = get_current_batch(net);
+    reset_network_state(net, 0);
+}
+
+float get_current_rate(network *net)
+{
+    size_t batch_num = get_current_batch(net);
     int i;
     float rate;
-    switch (net.policy) {
+    if (batch_num < net->burn_in) return net->learning_rate * pow((float)batch_num / net->burn_in, net->power);
+    switch (net->policy) {
         case CONSTANT:
-            return net.learning_rate;
+            return net->learning_rate;
         case STEP:
-            return net.learning_rate * pow(net.scale, batch_num/net.step);
+            return net->learning_rate * pow(net->scale, batch_num/net->step);
         case STEPS:
-            rate = net.learning_rate;
-            for(i = 0; i < net.num_steps; ++i){
-                if(net.steps[i] > batch_num) return rate;
-                rate *= net.scales[i];
-                //if(net.steps[i] > batch_num - 1 && net.scales[i] > 1) reset_momentum(net);
+            rate = net->learning_rate;
+            for(i = 0; i < net->num_steps; ++i){
+                if(net->steps[i] > batch_num) return rate;
+                rate *= net->scales[i];
             }
             return rate;
         case EXP:
-            return net.learning_rate * pow(net.gamma, batch_num);
+            return net->learning_rate * pow(net->gamma, batch_num);
         case POLY:
-            if (batch_num < net.burn_in) return net.learning_rate * pow((float)batch_num / net.burn_in, net.power);
-            return net.learning_rate * pow(1 - (float)batch_num / net.max_batches, net.power);
+            return net->learning_rate * pow(1 - (float)batch_num / net->max_batches, net->power);
         case RANDOM:
-            return net.learning_rate * pow(rand_uniform(0,1), net.power);
+            return net->learning_rate * pow(rand_uniform(0,1), net->power);
         case SIG:
-            return net.learning_rate * (1./(1.+exp(net.gamma*(batch_num - net.step))));
+            return net->learning_rate * (1./(1.+exp(net->gamma*(batch_num - net->step))));
         default:
             fprintf(stderr, "Policy is weird!\n");
-            return net.learning_rate;
+            return net->learning_rate;
     }
 }
 
@@ -95,6 +136,8 @@ char *get_layer_string(LAYER_TYPE a)
             return "rnn";
         case GRU:
             return "gru";
+        case LSTM:
+	    return "lstm";
         case CRNN:
             return "crnn";
         case MAXPOOL:
@@ -109,6 +152,8 @@ char *get_layer_string(LAYER_TYPE a)
             return "detection";
         case REGION:
             return "region";
+        case YOLO:
+            return "yolo";
         case DROPOUT:
             return "dropout";
         case CROP:
@@ -129,59 +174,75 @@ char *get_layer_string(LAYER_TYPE a)
     return "none";
 }
 
-network make_network(int n)
+network *make_network(int n)
 {
-    network net = {0};
-    net.n = n;
-    net.layers = calloc(net.n, sizeof(layer));
-    net.seen = calloc(1, sizeof(int));
-    #ifdef GPU
-    net.input_gpu = calloc(1, sizeof(float *));
-    net.truth_gpu = calloc(1, sizeof(float *));
-    #endif
+    network *net = calloc(1, sizeof(network));
+    net->n = n;
+    net->layers = calloc(net->n, sizeof(layer));
+    net->seen = calloc(1, sizeof(size_t));
+    net->t    = calloc(1, sizeof(int));
+    net->cost = calloc(1, sizeof(float));
     return net;
 }
 
-void forward_network(network net, network_state state)
+void forward_network(network *netp)
 {
-    state.workspace = net.workspace;
+#ifdef GPU
+    if(netp->gpu_index >= 0){
+        forward_network_gpu(netp);   
+        return;
+    }
+#endif
+    network net = *netp;
     int i;
     for(i = 0; i < net.n; ++i){
-        state.index = i;
+        net.index = i;
         layer l = net.layers[i];
         if(l.delta){
-            scal_cpu(l.outputs * l.batch, 0, l.delta, 1);
+            fill_cpu(l.outputs * l.batch, 0, l.delta, 1);
+        }
+        l.forward(l, net);
+        net.input = l.output;
+        if(l.truth) {
+            net.truth = l.output;
         }
-        l.forward(l, state);
-        state.input = l.output;
     }
+    calc_network_cost(netp);
 }
 
-void update_network(network net)
+void update_network(network *netp)
 {
+#ifdef GPU
+    if(netp->gpu_index >= 0){
+        update_network_gpu(netp);   
+        return;
+    }
+#endif
+    network net = *netp;
     int i;
-    int update_batch = net.batch*net.subdivisions;
-    float rate = get_current_rate(net);
+    update_args a = {0};
+    a.batch = net.batch*net.subdivisions;
+    a.learning_rate = get_current_rate(netp);
+    a.momentum = net.momentum;
+    a.decay = net.decay;
+    a.adam = net.adam;
+    a.B1 = net.B1;
+    a.B2 = net.B2;
+    a.eps = net.eps;
+    ++*net.t;
+    a.t = *net.t;
+
     for(i = 0; i < net.n; ++i){
         layer l = net.layers[i];
         if(l.update){
-            l.update(l, update_batch, rate, net.momentum, net.decay);
+            l.update(l, a);
         }
     }
 }
 
-float *get_network_output(network net)
-{
-#ifdef GPU
-    if (gpu_index >= 0) return get_network_output_gpu(net);
-#endif 
-    int i;
-    for(i = net.n-1; i > 0; --i) if(net.layers[i].type != COST) break;
-    return net.layers[i].output;
-}
-
-float get_network_cost(network net)
+void calc_network_cost(network *netp)
 {
+    network net = *netp;
     int i;
     float sum = 0;
     int count = 0;
@@ -191,120 +252,90 @@ float get_network_cost(network net)
             ++count;
         }
     }
-    return sum/count;
+    *net.cost = sum/count;
 }
 
-int get_predicted_class_network(network net)
+int get_predicted_class_network(network *net)
 {
-    float *out = get_network_output(net);
-    int k = get_network_output_size(net);
-    return max_index(out, k);
+    return max_index(net->output, net->outputs);
 }
 
-void backward_network(network net, network_state state)
+void backward_network(network *netp)
 {
+#ifdef GPU
+    if(netp->gpu_index >= 0){
+        backward_network_gpu(netp);   
+        return;
+    }
+#endif
+    network net = *netp;
     int i;
-    float *original_input = state.input;
-    float *original_delta = state.delta;
-    state.workspace = net.workspace;
+    network orig = net;
     for(i = net.n-1; i >= 0; --i){
-        state.index = i;
+        layer l = net.layers[i];
+        if(l.stopbackward) break;
         if(i == 0){
-            state.input = original_input;
-            state.delta = original_delta;
+            net = orig;
         }else{
             layer prev = net.layers[i-1];
-            state.input = prev.output;
-            state.delta = prev.delta;
+            net.input = prev.output;
+            net.delta = prev.delta;
         }
-        layer l = net.layers[i];
-        l.backward(l, state);
+        net.index = i;
+        l.backward(l, net);
     }
 }
 
-float train_network_datum(network net, float *x, float *y)
+float train_network_datum(network *net)
 {
-#ifdef GPU
-    if(gpu_index >= 0) return train_network_datum_gpu(net, x, y);
-#endif
-    network_state state;
-    *net.seen += net.batch;
-    state.index = 0;
-    state.net = net;
-    state.input = x;
-    state.delta = 0;
-    state.truth = y;
-    state.train = 1;
-    forward_network(net, state);
-    backward_network(net, state);
-    float error = get_network_cost(net);
-    if(((*net.seen)/net.batch)%net.subdivisions == 0) update_network(net);
+    *net->seen += net->batch;
+    net->train = 1;
+    forward_network(net);
+    backward_network(net);
+    float error = *net->cost;
+    if(((*net->seen)/net->batch)%net->subdivisions == 0) update_network(net);
     return error;
 }
 
-float train_network_sgd(network net, data d, int n)
+float train_network_sgd(network *net, data d, int n)
 {
-    int batch = net.batch;
-    float *X = calloc(batch*d.X.cols, sizeof(float));
-    float *y = calloc(batch*d.y.cols, sizeof(float));
+    int batch = net->batch;
 
     int i;
     float sum = 0;
     for(i = 0; i < n; ++i){
-        get_random_batch(d, batch, X, y);
-        float err = train_network_datum(net, X, y);
+        get_random_batch(d, batch, net->input, net->truth);
+        float err = train_network_datum(net);
         sum += err;
     }
-    free(X);
-    free(y);
     return (float)sum/(n*batch);
 }
 
-float train_network(network net, data d)
+float train_network(network *net, data d)
 {
-    assert(d.X.rows % net.batch == 0);
-    int batch = net.batch;
+    assert(d.X.rows % net->batch == 0);
+    int batch = net->batch;
     int n = d.X.rows / batch;
-    float *X = calloc(batch*d.X.cols, sizeof(float));
-    float *y = calloc(batch*d.y.cols, sizeof(float));
 
     int i;
     float sum = 0;
     for(i = 0; i < n; ++i){
-        get_next_batch(d, batch, i*batch, X, y);
-        float err = train_network_datum(net, X, y);
+        get_next_batch(d, batch, i*batch, net->input, net->truth);
+        float err = train_network_datum(net);
         sum += err;
     }
-    free(X);
-    free(y);
     return (float)sum/(n*batch);
 }
 
-
-float train_network_batch(network net, data d, int n)
+void set_temp_network(network *net, float t)
 {
-    int i,j;
-    network_state state;
-    state.index = 0;
-    state.net = net;
-    state.train = 1;
-    state.delta = 0;
-    float sum = 0;
-    int batch = 2;
-    for(i = 0; i < n; ++i){
-        for(j = 0; j < batch; ++j){
-            int index = rand()%d.X.rows;
-            state.input = d.X.vals[index];
-            state.truth = d.y.vals[index];
-            forward_network(net, state);
-            backward_network(net, state);
-            sum += get_network_cost(net);
-        }
-        update_network(net);
+    int i;
+    for(i = 0; i < net->n; ++i){
+        net->layers[i].temperature = t;
     }
-    return (float)sum/(n*batch);
 }
 
+
 void set_batch_network(network *net, int b)
 {
     net->batch = b;
@@ -315,6 +346,11 @@ void set_batch_network(network *net, int b)
         if(net->layers[i].type == CONVOLUTIONAL){
             cudnn_convolutional_setup(net->layers + i);
         }
+        if(net->layers[i].type == DECONVOLUTIONAL){
+            layer *l = net->layers + i;
+            cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l->out_c, l->out_h, l->out_w);
+            cudnnSetTensor4dDescriptor(l->normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l->out_c, 1, 1); 
+        }
 #endif
     }
 }
@@ -323,9 +359,7 @@ int resize_network(network *net, int w, int h)
 {
 #ifdef GPU
     cuda_set_device(net->gpu_index);
-    if(gpu_index >= 0){
-        cuda_free(net->workspace);
-    }
+    cuda_free(net->workspace);
 #endif
     int i;
     //if(w == net->w && h == net->h) return 0;
@@ -345,8 +379,14 @@ int resize_network(network *net, int w, int h)
             resize_maxpool_layer(&l, w, h);
         }else if(l.type == REGION){
             resize_region_layer(&l, w, h);
+        }else if(l.type == YOLO){
+            resize_yolo_layer(&l, w, h);
         }else if(l.type == ROUTE){
             resize_route_layer(&l, net);
+        }else if(l.type == SHORTCUT){
+            resize_shortcut_layer(&l, w, h);
+        }else if(l.type == UPSAMPLE){
+            resize_upsample_layer(&l, w, h);
         }else if(l.type == REORG){
             resize_reorg_layer(&l, w, h);
         }else if(l.type == AVGPOOL){
@@ -359,21 +399,32 @@ int resize_network(network *net, int w, int h)
             error("Cannot resize this type of layer");
         }
         if(l.workspace_size > workspace_size) workspace_size = l.workspace_size;
+        if(l.workspace_size > 2000000000) assert(0);
         inputs = l.outputs;
         net->layers[i] = l;
         w = l.out_w;
         h = l.out_h;
         if(l.type == AVGPOOL) break;
     }
+    layer out = get_network_output_layer(net);
+    net->inputs = net->layers[0].inputs;
+    net->outputs = out.outputs;
+    net->truths = out.outputs;
+    if(net->layers[net->n-1].truths) net->truths = net->layers[net->n-1].truths;
+    net->output = out.output;
+    free(net->input);
+    free(net->truth);
+    net->input = calloc(net->inputs*net->batch, sizeof(float));
+    net->truth = calloc(net->truths*net->batch, sizeof(float));
 #ifdef GPU
     if(gpu_index >= 0){
-        if(net->input_gpu) {
-            cuda_free(*net->input_gpu);
-            *net->input_gpu = 0;
-            cuda_free(*net->truth_gpu);
-            *net->truth_gpu = 0;
+        cuda_free(net->input_gpu);
+        cuda_free(net->truth_gpu);
+        net->input_gpu = cuda_make_array(net->input, net->inputs*net->batch);
+        net->truth_gpu = cuda_make_array(net->truth, net->truths*net->batch);
+        if(workspace_size){
+            net->workspace = cuda_make_array(0, (workspace_size-1)/sizeof(float)+1);
         }
-        net->workspace = cuda_make_array(0, (workspace_size-1)/sizeof(float)+1);
     }else {
         free(net->workspace);
         net->workspace = calloc(1, workspace_size);
@@ -386,34 +437,25 @@ int resize_network(network *net, int w, int h)
     return 0;
 }
 
-int get_network_output_size(network net)
+layer get_network_detection_layer(network *net)
 {
     int i;
-    for(i = net.n-1; i > 0; --i) if(net.layers[i].type != COST) break;
-    return net.layers[i].outputs;
-}
-
-int get_network_input_size(network net)
-{
-    return net.layers[0].inputs;
-}
-
-detection_layer get_network_detection_layer(network net)
-{
-    int i;
-    for(i = 0; i < net.n; ++i){
-        if(net.layers[i].type == DETECTION){
-            return net.layers[i];
+    for(i = 0; i < net->n; ++i){
+        if(net->layers[i].type == DETECTION){
+            return net->layers[i];
         }
     }
     fprintf(stderr, "Detection layer not found!!\n");
-    detection_layer l = {0};
+    layer l = {0};
     return l;
 }
 
-image get_network_image_layer(network net, int i)
+image get_network_image_layer(network *net, int i)
 {
-    layer l = net.layers[i];
+    layer l = net->layers[i];
+#ifdef GPU
+    //cuda_pull_array(l.output_gpu, l.output, l.outputs);
+#endif
     if (l.out_w && l.out_h && l.out_c){
         return float_to_image(l.out_w, l.out_h, l.out_c, l.output);
     }
@@ -421,10 +463,10 @@ image get_network_image_layer(network net, int i)
     return def;
 }
 
-image get_network_image(network net)
+image get_network_image(network *net)
 {
     int i;
-    for(i = net.n-1; i >= 0; --i){
+    for(i = net->n-1; i >= 0; --i){
         image m = get_network_image_layer(net, i);
         if(m.h != 0) return m;
     }
@@ -432,60 +474,134 @@ image get_network_image(network net)
     return def;
 }
 
-void visualize_network(network net)
+void visualize_network(network *net)
 {
     image *prev = 0;
     int i;
     char buff[256];
-    for(i = 0; i < net.n; ++i){
+    for(i = 0; i < net->n; ++i){
         sprintf(buff, "Layer %d", i);
-        layer l = net.layers[i];
+        layer l = net->layers[i];
         if(l.type == CONVOLUTIONAL){
             prev = visualize_convolutional_layer(l, buff, prev);
         }
     } 
 }
 
-void top_predictions(network net, int k, int *index)
+void top_predictions(network *net, int k, int *index)
 {
-    int size = get_network_output_size(net);
-    float *out = get_network_output(net);
-    top_k(out, size, k, index);
+    top_k(net->output, net->outputs, k, index);
 }
 
 
-float *network_predict(network net, float *input)
+float *network_predict(network *net, float *input)
 {
-#ifdef GPU
-    if(gpu_index >= 0)  return network_predict_gpu(net, input);
-#endif
-
-    network_state state;
-    state.net = net;
-    state.index = 0;
-    state.input = input;
-    state.truth = 0;
-    state.train = 0;
-    state.delta = 0;
-    forward_network(net, state);
-    float *out = get_network_output(net);
+    network orig = *net;
+    net->input = input;
+    net->truth = 0;
+    net->train = 0;
+    net->delta = 0;
+    forward_network(net);
+    float *out = net->output;
+    *net = orig;
     return out;
 }
 
-matrix network_predict_data_multi(network net, data test, int n)
+int num_detections(network *net, float thresh)
+{
+    int i;
+    int s = 0;
+    for(i = 0; i < net->n; ++i){
+        layer l = net->layers[i];
+        if(l.type == YOLO){
+            s += yolo_num_detections(l, thresh);
+        }
+        if(l.type == DETECTION || l.type == REGION){
+            s += l.w*l.h*l.n;
+        }
+    }
+    return s;
+}
+
+detection *make_network_boxes(network *net, float thresh, int *num)
+{
+    layer l = net->layers[net->n - 1];
+    int i;
+    int nboxes = num_detections(net, thresh);
+    if(num) *num = nboxes;
+    detection *dets = calloc(nboxes, sizeof(detection));
+    for(i = 0; i < nboxes; ++i){
+        dets[i].prob = calloc(l.classes, sizeof(float));
+        if(l.coords > 4){
+            dets[i].mask = calloc(l.coords-4, sizeof(float));
+        }
+    }
+    return dets;
+}
+
+void fill_network_boxes(network *net, int w, int h, float thresh, float hier, int *map, int relative, detection *dets)
+{
+    int j;
+    for(j = 0; j < net->n; ++j){
+        layer l = net->layers[j];
+        if(l.type == YOLO){
+            int count = get_yolo_detections(l, w, h, net->w, net->h, thresh, map, relative, dets);
+            dets += count;
+        }
+        if(l.type == REGION){
+            get_region_detections(l, w, h, net->w, net->h, thresh, map, hier, relative, dets);
+            dets += l.w*l.h*l.n;
+        }
+        if(l.type == DETECTION){
+            get_detection_detections(l, w, h, thresh, dets);
+            dets += l.w*l.h*l.n;
+        }
+    }
+}
+
+detection *get_network_boxes(network *net, int w, int h, float thresh, float hier, int *map, int relative, int *num)
+{
+    detection *dets = make_network_boxes(net, thresh, num);
+    fill_network_boxes(net, w, h, thresh, hier, map, relative, dets);
+    return dets;
+}
+
+void free_detections(detection *dets, int n)
+{
+    int i;
+    for(i = 0; i < n; ++i){
+        free(dets[i].prob);
+        if(dets[i].mask) free(dets[i].mask);
+    }
+    free(dets);
+}
+
+float *network_predict_image(network *net, image im)
+{
+    image imr = letterbox_image(im, net->w, net->h);
+    set_batch_network(net, 1);
+    float *p = network_predict(net, imr.data);
+    free_image(imr);
+    return p;
+}
+
+int network_width(network *net){return net->w;}
+int network_height(network *net){return net->h;}
+
+matrix network_predict_data_multi(network *net, data test, int n)
 {
     int i,j,b,m;
-    int k = get_network_output_size(net);
+    int k = net->outputs;
     matrix pred = make_matrix(test.X.rows, k);
-    float *X = calloc(net.batch*test.X.rows, sizeof(float));
-    for(i = 0; i < test.X.rows; i += net.batch){
-        for(b = 0; b < net.batch; ++b){
+    float *X = calloc(net->batch*test.X.rows, sizeof(float));
+    for(i = 0; i < test.X.rows; i += net->batch){
+        for(b = 0; b < net->batch; ++b){
             if(i+b == test.X.rows) break;
             memcpy(X+b*test.X.cols, test.X.vals[i+b], test.X.cols*sizeof(float));
         }
         for(m = 0; m < n; ++m){
             float *out = network_predict(net, X);
-            for(b = 0; b < net.batch; ++b){
+            for(b = 0; b < net->batch; ++b){
                 if(i+b == test.X.rows) break;
                 for(j = 0; j < k; ++j){
                     pred.vals[i+b][j] += out[j+b*k]/n;
@@ -497,19 +613,19 @@ matrix network_predict_data_multi(network net, data test, int n)
     return pred;   
 }
 
-matrix network_predict_data(network net, data test)
+matrix network_predict_data(network *net, data test)
 {
     int i,j,b;
-    int k = get_network_output_size(net);
+    int k = net->outputs;
     matrix pred = make_matrix(test.X.rows, k);
-    float *X = calloc(net.batch*test.X.cols, sizeof(float));
-    for(i = 0; i < test.X.rows; i += net.batch){
-        for(b = 0; b < net.batch; ++b){
+    float *X = calloc(net->batch*test.X.cols, sizeof(float));
+    for(i = 0; i < test.X.rows; i += net->batch){
+        for(b = 0; b < net->batch; ++b){
             if(i+b == test.X.rows) break;
             memcpy(X+b*test.X.cols, test.X.vals[i+b], test.X.cols*sizeof(float));
         }
         float *out = network_predict(net, X);
-        for(b = 0; b < net.batch; ++b){
+        for(b = 0; b < net->batch; ++b){
             if(i+b == test.X.rows) break;
             for(j = 0; j < k; ++j){
                 pred.vals[i+b][j] = out[j+b*k];
@@ -520,11 +636,11 @@ matrix network_predict_data(network net, data test)
     return pred;   
 }
 
-void print_network(network net)
+void print_network(network *net)
 {
     int i,j;
-    for(i = 0; i < net.n; ++i){
-        layer l = net.layers[i];
+    for(i = 0; i < net->n; ++i){
+        layer l = net->layers[i];
         float *output = l.output;
         int n = l.outputs;
         float mean = mean_array(output, n);
@@ -537,7 +653,7 @@ void print_network(network net)
     }
 }
 
-void compare_networks(network n1, network n2, data test)
+void compare_networks(network *n1, network *n2, data test)
 {
     matrix g1 = network_predict_data(n1, test);
     matrix g2 = network_predict_data(n2, test);
@@ -562,7 +678,7 @@ void compare_networks(network n1, network n2, data test)
     printf("%f\n", num/den); 
 }
 
-float network_accuracy(network net, data d)
+float network_accuracy(network *net, data d)
 {
     matrix guess = network_predict_data(net, d);
     float acc = matrix_topk_accuracy(d.y, guess,1);
@@ -570,7 +686,7 @@ float network_accuracy(network net, data d)
     return acc;
 }
 
-float *network_accuracies(network net, data d, int n)
+float *network_accuracies(network *net, data d, int n)
 {
     static float acc[2];
     matrix guess = network_predict_data(net, d);
@@ -580,7 +696,16 @@ float *network_accuracies(network net, data d, int n)
     return acc;
 }
 
-float network_accuracy_multi(network net, data d, int n)
+layer get_network_output_layer(network *net)
+{
+    int i;
+    for(i = net->n - 1; i >= 0; --i){
+        if(net->layers[i].type != COST) break;
+    }
+    return net->layers[i];
+}
+
+float network_accuracy_multi(network *net, data d, int n)
 {
     matrix guess = network_predict_data_multi(net, d, n);
     float acc = matrix_topk_accuracy(d.y, guess,1);
@@ -588,17 +713,417 @@ float network_accuracy_multi(network net, data d, int n)
     return acc;
 }
 
-void free_network(network net)
+void free_network(network *net) /* frees each layer, the layer array, the input/truth buffers, then net itself */
 {
     int i;
-    for(i = 0; i < net.n; ++i){
-        free_layer(net.layers[i]);
+    for(i = 0; i < net->n; ++i){
+        free_layer(net->layers[i]); /* per-layer cleanup is delegated to free_layer() */
     }
-    free(net.layers);
+    free(net->layers);
+    if(net->input) free(net->input); /* host-side input buffer, only if set */
+    if(net->truth) free(net->truth); /* host-side truth buffer, only if set */
 #ifdef GPU
-    if(*net.input_gpu) cuda_free(*net.input_gpu);
-    if(*net.truth_gpu) cuda_free(*net.truth_gpu);
-    if(net.input_gpu) free(net.input_gpu);
-    if(net.truth_gpu) free(net.truth_gpu);
+    if(net->input_gpu) cuda_free(net->input_gpu); /* input_gpu is now a plain pointer (was float**), so one cuda_free suffices */
+    if(net->truth_gpu) cuda_free(net->truth_gpu);
 #endif
+    free(net); /* net itself is freed here: callers must not touch the pointer afterwards */
+}
+
+// Some day...
+// ^ What the hell is this comment for?
+
+
+/* Return (by value) the last layer whose type is not COST; layer 0 is the fallback.
+ * Assumes net->n >= 1. The `i > 0` bound keeps i from reaching -1 (layers[-1]) when every layer is COST. */
+layer network_output_layer(network *net)
+{
+    int i;
+    for(i = net->n - 1; i > 0; --i) if(net->layers[i].type != COST) break;
+    return net->layers[i];
 }
+
+/* Input count of the network, read from the `inputs` field of its first layer. */
+int network_inputs(network *net) {
+    return net->layers->inputs;
+}
+
+int network_outputs(network *net) { /* `outputs` field of the layer chosen by network_output_layer(net) */
+    const layer last = network_output_layer(net);
+    return last.outputs;
+}
+
+float *network_output(network *net) { /* `output` pointer of the layer chosen by network_output_layer(net) */
+    const layer last = network_output_layer(net);
+    return last.output;
+}
+
+#ifdef GPU
+
+void forward_network_gpu(network *netp) /* run each layer's forward_gpu in order, then pull the output and compute the cost */
+{
+    network net = *netp; /* by-value copy: the input/truth rewiring below stays local to this call */
+    cuda_set_device(net.gpu_index);
+    cuda_push_array(net.input_gpu, net.input, net.inputs*net.batch); /* presumably a host -> device copy of the input batch */
+    if(net.truth){
+        cuda_push_array(net.truth_gpu, net.truth, net.truths*net.batch); /* truth is only pushed when a host truth buffer is set */
+    }
+
+    int i;
+    for(i = 0; i < net.n; ++i){
+        net.index = i; /* lets the layer know its own position via the net copy it receives */
+        layer l = net.layers[i];
+        if(l.delta_gpu){
+            fill_gpu(l.outputs * l.batch, 0, l.delta_gpu, 1); /* fill the layer's delta buffer with 0 before it runs */
+        }
+        l.forward_gpu(l, net);
+        net.input_gpu = l.output_gpu; /* the next layer reads this layer's output as its input */
+        net.input = l.output;
+        if(l.truth) {
+            net.truth_gpu = l.output_gpu; /* a layer flagged `truth` also supplies the truth seen by later layers */
+            net.truth = l.output;
+        }
+    }
+    pull_network_output(netp); /* both calls take the original netp, not the locally rewired copy */
+    calc_network_cost(netp);
+}
+
+void backward_network_gpu(network *netp) /* call each layer's backward_gpu from the last layer down to layer 0 */
+{
+    int i;
+    network net = *netp; /* by-value working copy, rewired per layer below */
+    network orig = net; /* untouched snapshot, restored for layer 0 */
+    cuda_set_device(net.gpu_index);
+    for(i = net.n-1; i >= 0; --i){
+        layer l = net.layers[i];
+        if(l.stopbackward) break; /* stops before this layer's own backward pass runs */
+        if(i == 0){
+            net = orig; /* layer 0 has no predecessor: it sees the network's original input/delta */
+        }else{
+            layer prev = net.layers[i-1];
+            net.input = prev.output; /* layer i's input is layer i-1's output ... */
+            net.delta = prev.delta; /* ... and the delta it writes into is layer i-1's delta */
+            net.input_gpu = prev.output_gpu;
+            net.delta_gpu = prev.delta_gpu;
+        }
+        net.index = i;
+        l.backward_gpu(l, net);
+    }
+}
+
+void update_network_gpu(network *netp) /* build one update_args from the network settings and pass it to every layer's update_gpu */
+{
+    network net = *netp;
+    cuda_set_device(net.gpu_index);
+    int i;
+    update_args a = {0};
+    a.batch = net.batch*net.subdivisions; /* batch size used for the update is batch * subdivisions */
+    a.learning_rate = get_current_rate(netp);
+    a.momentum = net.momentum;
+    a.decay = net.decay;
+    a.adam = net.adam;
+    a.B1 = net.B1;
+    a.B2 = net.B2;
+    a.eps = net.eps;
+    ++*net.t; /* t is a pointer, so this increments the counter shared with *netp even though net is a copy */
+    a.t = (*net.t);
+
+    for(i = 0; i < net.n; ++i){
+        layer l = net.layers[i];
+        if(l.update_gpu){ /* layers without an update_gpu callback are skipped */
+            l.update_gpu(l, a);
+        }
+    }
+}
+
+/* Fill every layer's GPU weight/bias/scale update buffers with 0 (via fill_gpu) instead of applying them. */
+void harmless_update_network_gpu(network *netp)
+{
+    int idx;
+    cuda_set_device(netp->gpu_index);
+    for(idx = 0; idx < netp->n; ++idx){
+        layer cur = netp->layers[idx];
+        if(cur.weight_updates_gpu) fill_gpu(cur.nweights, 0, cur.weight_updates_gpu, 1);
+        if(cur.bias_updates_gpu) fill_gpu(cur.nbiases, 0, cur.bias_updates_gpu, 1);
+        if(cur.scale_updates_gpu) fill_gpu(cur.nbiases, 0, cur.scale_updates_gpu, 1);
+    }
+}
+
+typedef struct { /* argument bundle handed to train_thread through pthread_create's void* */
+    network *net; /* network to train */
+    data d; /* data passed to train_network */
+    float *err; /* where train_thread stores train_network's return value */
+} train_args;
+
+/* pthread entry point: copies and frees the heap train_args in ptr, then stores train_network()'s result in *err. */
+void *train_thread(void *ptr) {
+    train_args job = *(train_args *)ptr;
+    free(ptr);
+    cuda_set_device(job.net->gpu_index);
+    *job.err = train_network(job.net, job.d);
+    return 0;
+}
+
+/* Start train_thread for `net` on data `d`; the worker writes its result to *err. Returns the new thread's id. */
+pthread_t train_network_in_thread(network *net, data d, float *err) {
+    pthread_t tid;
+    train_args *job = (train_args *)calloc(1, sizeof(train_args)); /* released by train_thread */
+    job->net = net;
+    job->d = d;
+    job->err = err;
+    if(pthread_create(&tid, 0, train_thread, job)) error("Thread creation failed");
+    return tid;
+}
+
+/* Add the values staged in l's host *_updates buffers (filled by pull_weights) into base's biases/weights/scales. */
+void merge_weights(layer l, layer base) {
+    if (l.type == CONVOLUTIONAL || l.type == DECONVOLUTIONAL) { /* same layer types pull_weights/distribute_weights handle */
+        axpy_cpu(l.n, 1, l.bias_updates, 1, base.biases, 1);
+        axpy_cpu(l.nweights, 1, l.weight_updates, 1, base.weights, 1);
+        if (l.scales) {
+            axpy_cpu(l.n, 1, l.scale_updates, 1, base.scales, 1);
+        }
+    } else if(l.type == CONNECTED) {
+        axpy_cpu(l.outputs, 1, l.bias_updates, 1, base.biases, 1);
+        axpy_cpu(l.outputs*l.inputs, 1, l.weight_updates, 1, base.weights, 1);
+    }
+}
+
+/* Scale l's host biases/weights/scales by s; sync_layer calls this with 0 before merging and 1./n afterwards. */
+void scale_weights(layer l, float s) {
+    if (l.type == CONVOLUTIONAL || l.type == DECONVOLUTIONAL) { /* deconvolutional layers must be cleared/averaged too */
+        scal_cpu(l.n, s, l.biases, 1);
+        scal_cpu(l.nweights, s, l.weights, 1);
+        if (l.scales) {
+            scal_cpu(l.n, s, l.scales, 1);
+        }
+    } else if(l.type == CONNECTED) {
+        scal_cpu(l.outputs, s, l.biases, 1);
+        scal_cpu(l.outputs*l.inputs, s, l.weights, 1);
+    }
+}
+
+
+/* Pull l's GPU biases/weights/scales (cuda_pull_array) into its host *_updates buffers, not into the host weights. */
+void pull_weights(layer l) {
+    if(l.type == CONNECTED){
+        cuda_pull_array(l.biases_gpu, l.bias_updates, l.outputs);
+        cuda_pull_array(l.weights_gpu, l.weight_updates, l.outputs*l.inputs);
+    } else if(l.type == CONVOLUTIONAL || l.type == DECONVOLUTIONAL){
+        cuda_pull_array(l.biases_gpu, l.bias_updates, l.n);
+        cuda_pull_array(l.weights_gpu, l.weight_updates, l.nweights);
+        if(l.scales) cuda_pull_array(l.scales_gpu, l.scale_updates, l.n);
+    }
+}
+
+/* Push l's host biases/weights/scales to its own GPU buffers (cuda_push_array); other layer types are ignored. */
+void push_weights(layer l) {
+    if(l.type == CONNECTED){
+        cuda_push_array(l.biases_gpu, l.biases, l.outputs);
+        cuda_push_array(l.weights_gpu, l.weights, l.outputs*l.inputs);
+    } else if(l.type == CONVOLUTIONAL || l.type == DECONVOLUTIONAL){
+        cuda_push_array(l.biases_gpu, l.biases, l.n);
+        cuda_push_array(l.weights_gpu, l.weights, l.nweights);
+        if(l.scales) cuda_push_array(l.scales_gpu, l.scales, l.n);
+    }
+}
+
+/* Push base's host biases/weights/scales into l's GPU buffers (cuda_push_array); sizes are taken from l. */
+void distribute_weights(layer l, layer base) {
+    if (l.type == CONNECTED) {
+        cuda_push_array(l.biases_gpu, base.biases, l.outputs);
+        cuda_push_array(l.weights_gpu, base.weights, l.outputs*l.inputs);
+    } else if (l.type == CONVOLUTIONAL || l.type == DECONVOLUTIONAL) {
+        cuda_push_array(l.biases_gpu, base.biases, l.n);
+        cuda_push_array(l.weights_gpu, base.weights, l.nweights);
+        if (base.scales) cuda_push_array(l.scales_gpu, base.scales, l.n);
+    }
+}
+
+
+/*
+
+   void pull_updates(layer l)
+   {
+   if(l.type == CONVOLUTIONAL){
+   cuda_pull_array(l.bias_updates_gpu, l.bias_updates, l.n);
+   cuda_pull_array(l.weight_updates_gpu, l.weight_updates, l.nweights);
+   if(l.scale_updates) cuda_pull_array(l.scale_updates_gpu, l.scale_updates, l.n);
+   } else if(l.type == CONNECTED){
+   cuda_pull_array(l.bias_updates_gpu, l.bias_updates, l.outputs);
+   cuda_pull_array(l.weight_updates_gpu, l.weight_updates, l.outputs*l.inputs);
+   }
+   }
+
+   void push_updates(layer l)
+   {
+   if(l.type == CONVOLUTIONAL){
+   cuda_push_array(l.bias_updates_gpu, l.bias_updates, l.n);
+   cuda_push_array(l.weight_updates_gpu, l.weight_updates, l.nweights);
+   if(l.scale_updates) cuda_push_array(l.scale_updates_gpu, l.scale_updates, l.n);
+   } else if(l.type == CONNECTED){
+   cuda_push_array(l.bias_updates_gpu, l.bias_updates, l.outputs);
+   cuda_push_array(l.weight_updates_gpu, l.weight_updates, l.outputs*l.inputs);
+   }
+   }
+
+   void update_layer(layer l, network net)
+   {
+   int update_batch = net.batch*net.subdivisions;
+   float rate = get_current_rate(net);
+   l.t = get_current_batch(net);
+   if(l.update_gpu){
+   l.update_gpu(l, update_batch, rate*l.learning_rate_scale, net.momentum, net.decay);
+   }
+   }
+   void merge_updates(layer l, layer base)
+   {
+   if (l.type == CONVOLUTIONAL) {
+   axpy_cpu(l.n, 1, l.bias_updates, 1, base.bias_updates, 1);
+   axpy_cpu(l.nweights, 1, l.weight_updates, 1, base.weight_updates, 1);
+   if (l.scale_updates) {
+   axpy_cpu(l.n, 1, l.scale_updates, 1, base.scale_updates, 1);
+   }
+   } else if(l.type == CONNECTED) {
+   axpy_cpu(l.outputs, 1, l.bias_updates, 1, base.bias_updates, 1);
+   axpy_cpu(l.outputs*l.inputs, 1, l.weight_updates, 1, base.weight_updates, 1);
+   }
+   }
+
+   void distribute_updates(layer l, layer base)
+   {
+   if(l.type == CONVOLUTIONAL || l.type == DECONVOLUTIONAL){
+   cuda_push_array(l.bias_updates_gpu, base.bias_updates, l.n);
+   cuda_push_array(l.weight_updates_gpu, base.weight_updates, l.nweights);
+   if(base.scale_updates) cuda_push_array(l.scale_updates_gpu, base.scale_updates, l.n);
+   } else if(l.type == CONNECTED){
+   cuda_push_array(l.bias_updates_gpu, base.bias_updates, l.outputs);
+   cuda_push_array(l.weight_updates_gpu, base.weight_updates, l.outputs*l.inputs);
+   }
+   }
+ */
+
+/*
+   void sync_layer(network *nets, int n, int j)
+   {
+   int i;
+   network net = nets[0];
+   layer base = net.layers[j];
+   scale_weights(base, 0);
+   for (i = 0; i < n; ++i) {
+   cuda_set_device(nets[i].gpu_index);
+   layer l = nets[i].layers[j];
+   pull_weights(l);
+   merge_weights(l, base);
+   }
+   scale_weights(base, 1./n);
+   for (i = 0; i < n; ++i) {
+   cuda_set_device(nets[i].gpu_index);
+   layer l = nets[i].layers[j];
+   distribute_weights(l, base);
+   }
+   }
+ */
+
+void sync_layer(network **nets, int n, int j) /* combine layer j across the n networks and push the result back to each */
+{
+    int i;
+    network *net = nets[0];
+    layer base = net->layers[j]; /* nets[0]'s layer j is the accumulator (struct copy, same underlying buffers) */
+    scale_weights(base, 0); /* scale the accumulator by 0 before merging */
+    for (i = 0; i < n; ++i) { /* starts at 0: nets[0]'s own pulled weights are merged as well */
+        cuda_set_device(nets[i]->gpu_index);
+        layer l = nets[i]->layers[j];
+        pull_weights(l);
+        merge_weights(l, base);
+    }
+    scale_weights(base, 1./n); /* merged sum scaled by 1/n */
+    for (i = 0; i < n; ++i) {
+        cuda_set_device(nets[i]->gpu_index);
+        layer l = nets[i]->layers[j];
+        distribute_weights(l, base);
+    }
+}
+
+typedef struct{ /* argument bundle handed to sync_layer_thread through pthread_create's void* */
+    network **nets; /* array of network pointers to synchronise */
+    int n; /* number of entries in nets */
+    int j; /* index of the layer to synchronise */
+} sync_args;
+
+/* pthread entry point: run sync_layer() with the heap-allocated sync_args in ptr, then free it. */
+void *sync_layer_thread(void *ptr) {
+    sync_args *job = (sync_args *)ptr;
+    sync_layer(job->nets, job->n, job->j);
+    free(job);
+    return 0;
+}
+
+/* Start sync_layer_thread for layer j of the n networks in `nets`. Returns the new thread's id. */
+pthread_t sync_layer_in_thread(network **nets, int n, int j) {
+    pthread_t tid;
+    sync_args *job = (sync_args *)calloc(1, sizeof(sync_args)); /* released by sync_layer_thread */
+    job->nets = nets;
+    job->n = n;
+    job->j = j;
+    if(pthread_create(&tid, 0, sync_layer_thread, job)) error("Thread creation failed");
+    return tid;
+}
+
+void sync_nets(network **nets, int n, int interval) /* align the `seen` counters, then sync every layer in its own thread */
+{
+    int j;
+    int layers = nets[0]->n; /* layer count is taken from nets[0] */
+    pthread_t *threads = (pthread_t *) calloc(layers, sizeof(pthread_t));
+
+    *(nets[0]->seen) += interval * (n-1) * nets[0]->batch * nets[0]->subdivisions; /* presumably credits nets[0] with what the other n-1 networks processed */
+    for (j = 0; j < n; ++j){
+        *(nets[j]->seen) = *(nets[0]->seen); /* every network gets nets[0]'s counter value */
+    }
+    for (j = 0; j < layers; ++j) {
+        threads[j] = sync_layer_in_thread(nets, n, j); /* one thread per layer */
+    }
+    for (j = 0; j < layers; ++j) {
+        pthread_join(threads[j], 0); /* wait for every per-layer sync before returning */
+    }
+    free(threads);
+}
+
+float train_networks(network **nets, int n, data d, int interval) /* train n networks in parallel on parts of d; returns the mean of their results */
+{
+    int i;
+    int batch = nets[0]->batch;
+    int subdivisions = nets[0]->subdivisions;
+    assert(batch * subdivisions * n == d.X.rows); /* d must hold exactly batch*subdivisions rows per network */
+    pthread_t *threads = (pthread_t *) calloc(n, sizeof(pthread_t));
+    float *errors = (float *) calloc(n, sizeof(float)); /* slot i receives network i's train_network result */
+
+    float sum = 0;
+    for(i = 0; i < n; ++i){
+        data p = get_data_part(d, i, n); /* part i of n of the data */
+        threads[i] = train_network_in_thread(nets[i], p, errors + i);
+    }
+    for(i = 0; i < n; ++i){
+        pthread_join(threads[i], 0); /* errors[i] is only read after its thread has finished */
+        //printf("%f\n", errors[i]);
+        sum += errors[i];
+    }
+    //cudaDeviceSynchronize();
+    if (get_current_batch(nets[0]) % interval == 0) { /* sync only on every interval-th batch; interval must be non-zero */
+        printf("Syncing... ");
+        fflush(stdout);
+        sync_nets(nets, n, interval);
+        printf("Done!\n");
+    }
+    //cudaDeviceSynchronize();
+    free(threads);
+    free(errors);
+    return (float)sum/(n);
+}
+
+/* Pull the output layer's GPU output (outputs*batch values) into its host `output` buffer via cuda_pull_array. */
+void pull_network_output(network *net) {
+    layer out = get_network_output_layer(net);
+    cuda_pull_array(out.output_gpu, out.output, out.outputs*out.batch);
+}
+
+#endif
diff --git a/image.darknet/inst/include/darknet/src/network.h b/image.darknet/inst/include/darknet/src/network.h
index e48cbc2..1b0dfd1 100644
--- a/image.darknet/inst/include/darknet/src/network.h
+++ b/image.darknet/inst/include/darknet/src/network.h
@@ -1,129 +1,29 @@
 // Oh boy, why am I about to do this....
 #ifndef NETWORK_H
 #define NETWORK_H
+#include "darknet.h"
 
 #include "image.h"
 #include "layer.h"
 #include "data.h"
 #include "tree.h"
 
-typedef enum {
-    CONSTANT, STEP, EXP, POLY, STEPS, SIG, RANDOM
-} learning_rate_policy;
-
-typedef struct network{
-    float *workspace;
-    int n;
-    int batch;
-    int *seen;
-    float epoch;
-    int subdivisions;
-    float momentum;
-    float decay;
-    layer *layers;
-    int outputs;
-    float *output;
-    learning_rate_policy policy;
-
-    float learning_rate;
-    float gamma;
-    float scale;
-    float power;
-    int time_steps;
-    int step;
-    int max_batches;
-    float *scales;
-    int   *steps;
-    int num_steps;
-    int burn_in;
-
-    int adam;
-    float B1;
-    float B2;
-    float eps;
-
-    int inputs;
-    int h, w, c;
-    int max_crop;
-    int min_crop;
-    float angle;
-    float aspect;
-    float exposure;
-    float saturation;
-    float hue;
-
-    int gpu_index;
-    tree *hierarchy;
-
-    #ifdef GPU
-    float **input_gpu;
-    float **truth_gpu;
-    #endif
-} network;
-
-typedef struct network_state {
-    float *truth;
-    float *input;
-    float *delta;
-    float *workspace;
-    int train;
-    int index;
-    network net;
-} network_state;
 
 #ifdef GPU
-float train_networks(network *nets, int n, data d, int interval);
-void sync_nets(network *nets, int n, int interval);
-float train_network_datum_gpu(network net, float *x, float *y);
-float *network_predict_gpu(network net, float *input);
-float * get_network_output_gpu_layer(network net, int i);
-float * get_network_delta_gpu_layer(network net, int i);
-float *get_network_output_gpu(network net);
-void forward_network_gpu(network net, network_state state);
-void backward_network_gpu(network net, network_state state);
-void update_network_gpu(network net);
+void pull_network_output(network *net);
 #endif
 
-float get_current_rate(network net);
-int get_current_batch(network net);
-void free_network(network net);
-void compare_networks(network n1, network n2, data d);
+void compare_networks(network *n1, network *n2, data d);
 char *get_layer_string(LAYER_TYPE a);
 
-network make_network(int n);
-void forward_network(network net, network_state state);
-void backward_network(network net, network_state state);
-void update_network(network net);
+network *make_network(int n);
 
-float train_network(network net, data d);
-float train_network_batch(network net, data d, int n);
-float train_network_sgd(network net, data d, int n);
-float train_network_datum(network net, float *x, float *y);
 
-matrix network_predict_data(network net, data test);
-float *network_predict(network net, float *input);
-float network_accuracy(network net, data d);
-float *network_accuracies(network net, data d, int n);
-float network_accuracy_multi(network net, data d, int n);
-void top_predictions(network net, int n, int *index);
-float *get_network_output(network net);
-float *get_network_output_layer(network net, int i);
-float *get_network_delta_layer(network net, int i);
-float *get_network_delta(network net);
-int get_network_output_size_layer(network net, int i);
-int get_network_output_size(network net);
-image get_network_image(network net);
-image get_network_image_layer(network net, int i);
-int get_predicted_class_network(network net);
-void print_network(network net);
-void visualize_network(network net);
+float network_accuracy_multi(network *net, data d, int n);
+int get_predicted_class_network(network *net);
+void print_network(network *net);
 int resize_network(network *net, int w, int h);
-void set_batch_network(network *net, int b);
-int get_network_input_size(network net);
-float get_network_cost(network net);
-
-int get_network_nuisance(network net);
-int get_network_background(network net);
+void calc_network_cost(network *net);
 
 #endif
 
diff --git a/image.darknet/inst/include/darknet/src/network_kernels.cu b/image.darknet/inst/include/darknet/src/network_kernels.cu
deleted file mode 100644
index 313cd6d..0000000
--- a/image.darknet/inst/include/darknet/src/network_kernels.cu
+++ /dev/null
@@ -1,408 +0,0 @@
-#include "cuda_runtime.h"
-#include "curand.h"
-#include "cublas_v2.h"
-
-extern "C" {
-#include <stdio.h>
-#include <time.h>
-#include <assert.h>
-
-#include "network.h"
-#include "image.h"
-#include "data.h"
-#include "utils.h"
-#include "parser.h"
-
-#include "crop_layer.h"
-#include "connected_layer.h"
-#include "rnn_layer.h"
-#include "gru_layer.h"
-#include "crnn_layer.h"
-#include "detection_layer.h"
-#include "region_layer.h"
-#include "convolutional_layer.h"
-#include "activation_layer.h"
-#include "maxpool_layer.h"
-#include "reorg_layer.h"
-#include "avgpool_layer.h"
-#include "normalization_layer.h"
-#include "batchnorm_layer.h"
-#include "cost_layer.h"
-#include "local_layer.h"
-#include "softmax_layer.h"
-#include "dropout_layer.h"
-#include "route_layer.h"
-#include "shortcut_layer.h"
-#include "blas.h"
-}
-
-float * get_network_output_gpu_layer(network net, int i);
-float * get_network_delta_gpu_layer(network net, int i);
-float * get_network_output_gpu(network net);
-
-void forward_network_gpu(network net, network_state state)
-{
-    state.workspace = net.workspace;
-    int i;
-    for(i = 0; i < net.n; ++i){
-        state.index = i;
-        layer l = net.layers[i];
-        if(l.delta_gpu){
-            fill_ongpu(l.outputs * l.batch, 0, l.delta_gpu, 1);
-        }
-        l.forward_gpu(l, state);
-        state.input = l.output_gpu;
-    }
-}
-
-void backward_network_gpu(network net, network_state state)
-{
-    state.workspace = net.workspace;
-    int i;
-    float * original_input = state.input;
-    float * original_delta = state.delta;
-    for(i = net.n-1; i >= 0; --i){
-        state.index = i;
-        layer l = net.layers[i];
-        if(i == 0){
-            state.input = original_input;
-            state.delta = original_delta;
-        }else{
-            layer prev = net.layers[i-1];
-            state.input = prev.output_gpu;
-            state.delta = prev.delta_gpu;
-        }
-        l.backward_gpu(l, state);
-    }
-}
-
-void update_network_gpu(network net)
-{
-    cuda_set_device(net.gpu_index);
-    int i;
-    int update_batch = net.batch*net.subdivisions;
-    float rate = get_current_rate(net);
-    for(i = 0; i < net.n; ++i){
-        layer l = net.layers[i];
-        l.t = get_current_batch(net);
-        if(l.update_gpu){
-            l.update_gpu(l, update_batch, rate, net.momentum, net.decay);
-        }
-    }
-}
-
-void forward_backward_network_gpu(network net, float *x, float *y)
-{
-    network_state state;
-    state.index = 0;
-    state.net = net;
-    int x_size = get_network_input_size(net)*net.batch;
-    int y_size = get_network_output_size(net)*net.batch;
-    if(net.layers[net.n-1].truths) y_size = net.layers[net.n-1].truths*net.batch;
-    if(!*net.input_gpu){
-        *net.input_gpu = cuda_make_array(x, x_size);
-        *net.truth_gpu = cuda_make_array(y, y_size);
-    }else{
-        cuda_push_array(*net.input_gpu, x, x_size);
-        cuda_push_array(*net.truth_gpu, y, y_size);
-    }
-    state.input = *net.input_gpu;
-    state.delta = 0;
-    state.truth = *net.truth_gpu;
-    state.train = 1;
-    forward_network_gpu(net, state);
-    backward_network_gpu(net, state);
-}
-
-float train_network_datum_gpu(network net, float *x, float *y)
-{
-    *net.seen += net.batch;
-    forward_backward_network_gpu(net, x, y);
-    float error = get_network_cost(net);
-    if (((*net.seen) / net.batch) % net.subdivisions == 0) update_network_gpu(net);
-
-    return error;
-}
-
-typedef struct {
-    network net;
-    data d;
-    float *err;
-} train_args;
-
-void *train_thread(void *ptr)
-{
-    train_args args = *(train_args*)ptr;
-    free(ptr);
-    cuda_set_device(args.net.gpu_index);
-    *args.err = train_network(args.net, args.d);
-    return 0;
-}
-
-pthread_t train_network_in_thread(network net, data d, float *err)
-{
-    pthread_t thread;
-    train_args *ptr = (train_args *)calloc(1, sizeof(train_args));
-    ptr->net = net;
-    ptr->d = d;
-    ptr->err = err;
-    if(pthread_create(&thread, 0, train_thread, ptr)) error("Thread creation failed");
-    return thread;
-}
-
-void pull_updates(layer l)
-{
-    if(l.type == CONVOLUTIONAL){
-        cuda_pull_array(l.bias_updates_gpu, l.bias_updates, l.n);
-        cuda_pull_array(l.weight_updates_gpu, l.weight_updates, l.n*l.size*l.size*l.c);
-        if(l.scale_updates) cuda_pull_array(l.scale_updates_gpu, l.scale_updates, l.n);
-    } else if(l.type == CONNECTED){
-        cuda_pull_array(l.bias_updates_gpu, l.bias_updates, l.outputs);
-        cuda_pull_array(l.weight_updates_gpu, l.weight_updates, l.outputs*l.inputs);
-    }
-}
-
-void push_updates(layer l)
-{
-    if(l.type == CONVOLUTIONAL){
-        cuda_push_array(l.bias_updates_gpu, l.bias_updates, l.n);
-        cuda_push_array(l.weight_updates_gpu, l.weight_updates, l.n*l.size*l.size*l.c);
-        if(l.scale_updates) cuda_push_array(l.scale_updates_gpu, l.scale_updates, l.n);
-    } else if(l.type == CONNECTED){
-        cuda_push_array(l.bias_updates_gpu, l.bias_updates, l.outputs);
-        cuda_push_array(l.weight_updates_gpu, l.weight_updates, l.outputs*l.inputs);
-    }
-}
-
-void update_layer(layer l, network net)
-{
-    int update_batch = net.batch*net.subdivisions;
-    float rate = get_current_rate(net);
-    l.t = get_current_batch(net);
-    if(l.update_gpu){
-        l.update_gpu(l, update_batch, rate, net.momentum, net.decay);
-    }
-}
-
-void merge_weights(layer l, layer base)
-{
-    if (l.type == CONVOLUTIONAL) {
-        axpy_cpu(l.n, 1, l.biases, 1, base.biases, 1);
-        axpy_cpu(l.n*l.size*l.size*l.c, 1, l.weights, 1, base.weights, 1);
-        if (l.scales) {
-            axpy_cpu(l.n, 1, l.scales, 1, base.scales, 1);
-        }
-    } else if(l.type == CONNECTED) {
-        axpy_cpu(l.outputs, 1, l.biases, 1, base.biases, 1);
-        axpy_cpu(l.outputs*l.inputs, 1, l.weights, 1, base.weights, 1);
-    }
-}
-
-void scale_weights(layer l, float s)
-{
-    if (l.type == CONVOLUTIONAL) {
-        scal_cpu(l.n, s, l.biases, 1);
-        scal_cpu(l.n*l.size*l.size*l.c, s, l.weights, 1);
-        if (l.scales) {
-            scal_cpu(l.n, s, l.scales, 1);
-        }
-    } else if(l.type == CONNECTED) {
-        scal_cpu(l.outputs, s, l.biases, 1);
-        scal_cpu(l.outputs*l.inputs, s, l.weights, 1);
-    }
-}
-
-
-void pull_weights(layer l)
-{
-    if(l.type == CONVOLUTIONAL){
-        cuda_pull_array(l.biases_gpu, l.biases, l.n);
-        cuda_pull_array(l.weights_gpu, l.weights, l.n*l.size*l.size*l.c);
-        if(l.scales) cuda_pull_array(l.scales_gpu, l.scales, l.n);
-    } else if(l.type == CONNECTED){
-        cuda_pull_array(l.biases_gpu, l.biases, l.outputs);
-        cuda_pull_array(l.weights_gpu, l.weights, l.outputs*l.inputs);
-    }
-}
-
-void push_weights(layer l)
-{
-    if(l.type == CONVOLUTIONAL){
-        cuda_push_array(l.biases_gpu, l.biases, l.n);
-        cuda_push_array(l.weights_gpu, l.weights, l.n*l.size*l.size*l.c);
-        if(l.scales) cuda_push_array(l.scales_gpu, l.scales, l.n);
-    } else if(l.type == CONNECTED){
-        cuda_push_array(l.biases_gpu, l.biases, l.outputs);
-        cuda_push_array(l.weights_gpu, l.weights, l.outputs*l.inputs);
-    }
-}
-
-void distribute_weights(layer l, layer base)
-{
-    if(l.type == CONVOLUTIONAL){
-        cuda_push_array(l.biases_gpu, base.biases, l.n);
-        cuda_push_array(l.weights_gpu, base.weights, l.n*l.size*l.size*l.c);
-        if(base.scales) cuda_push_array(l.scales_gpu, base.scales, l.n);
-    } else if(l.type == CONNECTED){
-        cuda_push_array(l.biases_gpu, base.biases, l.outputs);
-        cuda_push_array(l.weights_gpu, base.weights, l.outputs*l.inputs);
-    }
-}
-
-
-void merge_updates(layer l, layer base)
-{
-    if (l.type == CONVOLUTIONAL) {
-        axpy_cpu(l.n, 1, l.bias_updates, 1, base.bias_updates, 1);
-        axpy_cpu(l.n*l.size*l.size*l.c, 1, l.weight_updates, 1, base.weight_updates, 1);
-        if (l.scale_updates) {
-            axpy_cpu(l.n, 1, l.scale_updates, 1, base.scale_updates, 1);
-        }
-    } else if(l.type == CONNECTED) {
-        axpy_cpu(l.outputs, 1, l.bias_updates, 1, base.bias_updates, 1);
-        axpy_cpu(l.outputs*l.inputs, 1, l.weight_updates, 1, base.weight_updates, 1);
-    }
-}
-
-void distribute_updates(layer l, layer base)
-{
-    if(l.type == CONVOLUTIONAL){
-        cuda_push_array(l.bias_updates_gpu, base.bias_updates, l.n);
-        cuda_push_array(l.weight_updates_gpu, base.weight_updates, l.n*l.size*l.size*l.c);
-        if(base.scale_updates) cuda_push_array(l.scale_updates_gpu, base.scale_updates, l.n);
-    } else if(l.type == CONNECTED){
-        cuda_push_array(l.bias_updates_gpu, base.bias_updates, l.outputs);
-        cuda_push_array(l.weight_updates_gpu, base.weight_updates, l.outputs*l.inputs);
-    }
-}
-
-void sync_layer(network *nets, int n, int j)
-{
-    //printf("Syncing layer %d\n", j);
-    int i;
-    network net = nets[0];
-    layer base = net.layers[j];
-    cuda_set_device(net.gpu_index);
-    pull_weights(base);
-    for (i = 1; i < n; ++i) {
-        cuda_set_device(nets[i].gpu_index);
-        layer l = nets[i].layers[j];
-        pull_weights(l);
-        merge_weights(l, base);
-    }
-    scale_weights(base, 1./n);
-    for (i = 0; i < n; ++i) {
-        cuda_set_device(nets[i].gpu_index);
-        layer l = nets[i].layers[j];
-        distribute_weights(l, base);
-    }
-    //printf("Done syncing layer %d\n", j);
-}
-
-typedef struct{
-    network *nets;
-    int n;
-    int j;
-} sync_args;
-
-void *sync_layer_thread(void *ptr)
-{
-    sync_args args = *(sync_args*)ptr;
-    sync_layer(args.nets, args.n, args.j);
-    free(ptr);
-    return 0;
-}
-
-pthread_t sync_layer_in_thread(network *nets, int n, int j)
-{
-    pthread_t thread;
-    sync_args *ptr = (sync_args *)calloc(1, sizeof(sync_args));
-    ptr->nets = nets;
-    ptr->n = n;
-    ptr->j = j;
-    if(pthread_create(&thread, 0, sync_layer_thread, ptr)) error("Thread creation failed");
-    return thread;
-}
-
-void sync_nets(network *nets, int n, int interval)
-{
-    int j;
-    int layers = nets[0].n;
-    pthread_t *threads = (pthread_t *) calloc(layers, sizeof(pthread_t));
-
-    *nets[0].seen += interval * (n-1) * nets[0].batch * nets[0].subdivisions;
-    for (j = 0; j < n; ++j){
-        *nets[j].seen = *nets[0].seen;
-    }
-    for (j = 0; j < layers; ++j) {
-        threads[j] = sync_layer_in_thread(nets, n, j);
-    }
-    for (j = 0; j < layers; ++j) {
-        pthread_join(threads[j], 0);
-    }
-    free(threads);
-}
-
-float train_networks(network *nets, int n, data d, int interval)
-{
-    int i;
-    int batch = nets[0].batch;
-    int subdivisions = nets[0].subdivisions;
-    assert(batch * subdivisions * n == d.X.rows);
-    pthread_t *threads = (pthread_t *) calloc(n, sizeof(pthread_t));
-    float *errors = (float *) calloc(n, sizeof(float));
-
-    float sum = 0;
-    for(i = 0; i < n; ++i){
-        data p = get_data_part(d, i, n);
-        threads[i] = train_network_in_thread(nets[i], p, errors + i);
-    }
-    for(i = 0; i < n; ++i){
-        pthread_join(threads[i], 0);
-        //printf("%f\n", errors[i]);
-        sum += errors[i];
-    }
-    //cudaDeviceSynchronize();
-    if (get_current_batch(nets[0]) % interval == 0) {
-        printf("Syncing... ");
-        fflush(stdout);
-        sync_nets(nets, n, interval);
-        printf("Done!\n");
-    }
-    //cudaDeviceSynchronize();
-    free(threads);
-    free(errors);
-    return (float)sum/(n);
-}
-
-float *get_network_output_layer_gpu(network net, int i)
-{
-    layer l = net.layers[i];
-    if(l.type != REGION) cuda_pull_array(l.output_gpu, l.output, l.outputs*l.batch);
-    return l.output;
-}
-
-float *get_network_output_gpu(network net)
-{
-    int i;
-    for(i = net.n-1; i > 0; --i) if(net.layers[i].type != COST) break;
-    return get_network_output_layer_gpu(net, i);
-}
-
-float *network_predict_gpu(network net, float *input)
-{
-    cuda_set_device(net.gpu_index);
-    int size = get_network_input_size(net) * net.batch;
-    network_state state;
-    state.index = 0;
-    state.net = net;
-    state.input = cuda_make_array(input, size);
-    state.truth = 0;
-    state.train = 0;
-    state.delta = 0;
-    forward_network_gpu(net, state);
-    float *out = get_network_output_gpu(net);
-    cuda_free(state.input);
-    return out;
-}
-
diff --git a/image.darknet/inst/include/darknet/src/normalization_layer.c b/image.darknet/inst/include/darknet/src/normalization_layer.c
index 069a079..424714f 100644
--- a/image.darknet/inst/include/darknet/src/normalization_layer.c
+++ b/image.darknet/inst/include/darknet/src/normalization_layer.c
@@ -1,5 +1,6 @@
 #include "normalization_layer.h"
 #include "blas.h"
+
 #include <stdio.h>
 
 layer make_normalization_layer(int batch, int w, int h, int c, int size, float alpha, float beta, float kappa)
@@ -62,7 +63,7 @@ void resize_normalization_layer(layer *layer, int w, int h)
 #endif
 }
 
-void forward_normalization_layer(const layer layer, network_state state)
+void forward_normalization_layer(const layer layer, network net)
 {
     int k,b;
     int w = layer.w;
@@ -73,7 +74,7 @@ void forward_normalization_layer(const layer layer, network_state state)
     for(b = 0; b < layer.batch; ++b){
         float *squared = layer.squared + w*h*c*b;
         float *norms   = layer.norms + w*h*c*b;
-        float *input   = state.input + w*h*c*b;
+        float *input   = net.input + w*h*c*b;
         pow_cpu(w*h*c, 2, input, 1, squared, 1);
 
         const_cpu(w*h, layer.kappa, norms, 1);
@@ -90,10 +91,10 @@ void forward_normalization_layer(const layer layer, network_state state)
         }
     }
     pow_cpu(w*h*c*layer.batch, -layer.beta, layer.norms, 1, layer.output, 1);
-    mul_cpu(w*h*c*layer.batch, state.input, 1, layer.output, 1);
+    mul_cpu(w*h*c*layer.batch, net.input, 1, layer.output, 1);
 }
 
-void backward_normalization_layer(const layer layer, network_state state)
+void backward_normalization_layer(const layer layer, network net)
 {
     // TODO This is approximate ;-)
     // Also this should add in to delta instead of overwritting.
@@ -101,50 +102,50 @@ void backward_normalization_layer(const layer layer, network_state state)
     int w = layer.w;
     int h = layer.h;
     int c = layer.c;
-    pow_cpu(w*h*c*layer.batch, -layer.beta, layer.norms, 1, state.delta, 1);
-    mul_cpu(w*h*c*layer.batch, layer.delta, 1, state.delta, 1);
+    pow_cpu(w*h*c*layer.batch, -layer.beta, layer.norms, 1, net.delta, 1);
+    mul_cpu(w*h*c*layer.batch, layer.delta, 1, net.delta, 1);
 }
 
 #ifdef GPU
-void forward_normalization_layer_gpu(const layer layer, network_state state)
+void forward_normalization_layer_gpu(const layer layer, network net)
 {
     int k,b;
     int w = layer.w;
     int h = layer.h;
     int c = layer.c;
-    scal_ongpu(w*h*c*layer.batch, 0, layer.squared_gpu, 1);
+    scal_gpu(w*h*c*layer.batch, 0, layer.squared_gpu, 1);
 
     for(b = 0; b < layer.batch; ++b){
         float *squared = layer.squared_gpu + w*h*c*b;
         float *norms   = layer.norms_gpu + w*h*c*b;
-        float *input   = state.input + w*h*c*b;
-        pow_ongpu(w*h*c, 2, input, 1, squared, 1);
+        float *input   = net.input_gpu + w*h*c*b;
+        pow_gpu(w*h*c, 2, input, 1, squared, 1);
 
-        const_ongpu(w*h, layer.kappa, norms, 1);
+        const_gpu(w*h, layer.kappa, norms, 1);
         for(k = 0; k < layer.size/2; ++k){
-            axpy_ongpu(w*h, layer.alpha, squared + w*h*k, 1, norms, 1);
+            axpy_gpu(w*h, layer.alpha, squared + w*h*k, 1, norms, 1);
         }
 
         for(k = 1; k < layer.c; ++k){
-            copy_ongpu(w*h, norms + w*h*(k-1), 1, norms + w*h*k, 1);
+            copy_gpu(w*h, norms + w*h*(k-1), 1, norms + w*h*k, 1);
             int prev = k - ((layer.size-1)/2) - 1;
             int next = k + (layer.size/2);
-            if(prev >= 0)      axpy_ongpu(w*h, -layer.alpha, squared + w*h*prev, 1, norms + w*h*k, 1);
-            if(next < layer.c) axpy_ongpu(w*h,  layer.alpha, squared + w*h*next, 1, norms + w*h*k, 1);
+            if(prev >= 0)      axpy_gpu(w*h, -layer.alpha, squared + w*h*prev, 1, norms + w*h*k, 1);
+            if(next < layer.c) axpy_gpu(w*h,  layer.alpha, squared + w*h*next, 1, norms + w*h*k, 1);
         }
     }
-    pow_ongpu(w*h*c*layer.batch, -layer.beta, layer.norms_gpu, 1, layer.output_gpu, 1);
-    mul_ongpu(w*h*c*layer.batch, state.input, 1, layer.output_gpu, 1);
+    pow_gpu(w*h*c*layer.batch, -layer.beta, layer.norms_gpu, 1, layer.output_gpu, 1);
+    mul_gpu(w*h*c*layer.batch, net.input_gpu, 1, layer.output_gpu, 1);
 }
 
-void backward_normalization_layer_gpu(const layer layer, network_state state)
+void backward_normalization_layer_gpu(const layer layer, network net)
 {
     // TODO This is approximate ;-)
 
     int w = layer.w;
     int h = layer.h;
     int c = layer.c;
-    pow_ongpu(w*h*c*layer.batch, -layer.beta, layer.norms_gpu, 1, state.delta, 1);
-    mul_ongpu(w*h*c*layer.batch, layer.delta_gpu, 1, state.delta, 1);
+    pow_gpu(w*h*c*layer.batch, -layer.beta, layer.norms_gpu, 1, net.delta_gpu, 1);
+    mul_gpu(w*h*c*layer.batch, layer.delta_gpu, 1, net.delta_gpu, 1);
 }
 #endif
diff --git a/image.darknet/inst/include/darknet/src/normalization_layer.h b/image.darknet/inst/include/darknet/src/normalization_layer.h
index ab32776..665baa5 100644
--- a/image.darknet/inst/include/darknet/src/normalization_layer.h
+++ b/image.darknet/inst/include/darknet/src/normalization_layer.h
@@ -7,13 +7,13 @@
 
 layer make_normalization_layer(int batch, int w, int h, int c, int size, float alpha, float beta, float kappa);
 void resize_normalization_layer(layer *layer, int h, int w);
-void forward_normalization_layer(const layer layer, network_state state);
-void backward_normalization_layer(const layer layer, network_state state);
+void forward_normalization_layer(const layer layer, network net);
+void backward_normalization_layer(const layer layer, network net);
 void visualize_normalization_layer(layer layer, char *window);
 
 #ifdef GPU
-void forward_normalization_layer_gpu(const layer layer, network_state state);
-void backward_normalization_layer_gpu(const layer layer, network_state state);
+void forward_normalization_layer_gpu(const layer layer, network net);
+void backward_normalization_layer_gpu(const layer layer, network net);
 #endif
 
 #endif
diff --git a/image.darknet/inst/include/darknet/src/option_list.c b/image.darknet/inst/include/darknet/src/option_list.c
index f935af3..2f52781 100644
--- a/image.darknet/inst/include/darknet/src/option_list.c
+++ b/image.darknet/inst/include/darknet/src/option_list.c
@@ -32,6 +32,23 @@ list *read_data_cfg(char *filename)
     return options;
 }
 
+metadata get_metadata(char *file) // Read the data cfg at `file` and return its label names and class count.
+{
+    metadata m = {0}; // zero-initialised, so m.names stays 0 when no label file is configured
+    list *options = read_data_cfg(file);
+
+    char *name_list = option_find_str(options, "names", 0); // the "names" key is tried first
+    if(!name_list) name_list = option_find_str(options, "labels", 0); // fall back to the "labels" key
+    if(!name_list) {
+        fprintf(stderr, "No names or labels found\n"); // reported but not fatal; m is still returned
+    } else {
+        m.names = get_labels(name_list);
+    }
+    m.classes = option_find_int(options, "classes", 2); // 2 is the default passed when "classes" is absent
+    free_list(options);
+    return m;
+}
+
 int read_option(char *s, list *options)
 {
     size_t i;
diff --git a/image.darknet/inst/include/darknet/src/option_list.h b/image.darknet/inst/include/darknet/src/option_list.h
index 054b3fd..844bd87 100644
--- a/image.darknet/inst/include/darknet/src/option_list.h
+++ b/image.darknet/inst/include/darknet/src/option_list.h
@@ -9,13 +9,9 @@ typedef struct{
 } kvp;
 
 
-list *read_data_cfg(char *filename);
 int read_option(char *s, list *options);
 void option_insert(list *l, char *key, char *val);
 char *option_find(list *l, char *key);
-char *option_find_str(list *l, char *key, char *def);
-int option_find_int(list *l, char *key, int def);
-int option_find_int_quiet(list *l, char *key, int def);
 float option_find_float(list *l, char *key, float def);
 float option_find_float_quiet(list *l, char *key, float def);
 void option_unused(list *l);
diff --git a/image.darknet/inst/include/darknet/src/parser.c b/image.darknet/inst/include/darknet/src/parser.c
index 3f39a13..c8141c9 100644
--- a/image.darknet/inst/include/darknet/src/parser.c
+++ b/image.darknet/inst/include/darknet/src/parser.c
@@ -1,14 +1,17 @@
 #include <stdio.h>
 #include <string.h>
 #include <stdlib.h>
+#include <assert.h>
 
 #include "activation_layer.h"
+#include "logistic_layer.h"
+#include "l2norm_layer.h"
 #include "activations.h"
-#include "assert.h"
 #include "avgpool_layer.h"
 #include "batchnorm_layer.h"
 #include "blas.h"
 #include "connected_layer.h"
+#include "deconvolutional_layer.h"
 #include "convolutional_layer.h"
 #include "cost_layer.h"
 #include "crnn_layer.h"
@@ -23,11 +26,15 @@
 #include "option_list.h"
 #include "parser.h"
 #include "region_layer.h"
+#include "yolo_layer.h"
+#include "iseg_layer.h"
 #include "reorg_layer.h"
 #include "rnn_layer.h"
 #include "route_layer.h"
+#include "upsample_layer.h"
 #include "shortcut_layer.h"
 #include "softmax_layer.h"
+#include "lstm_layer.h"
 #include "utils.h"
 
 typedef struct{
@@ -45,14 +52,21 @@ LAYER_TYPE string_to_layer_type(char * type)
     if (strcmp(type, "[cost]")==0) return COST;
     if (strcmp(type, "[detection]")==0) return DETECTION;
     if (strcmp(type, "[region]")==0) return REGION;
+    if (strcmp(type, "[yolo]")==0) return YOLO;
+    if (strcmp(type, "[iseg]")==0) return ISEG;
     if (strcmp(type, "[local]")==0) return LOCAL;
     if (strcmp(type, "[conv]")==0
             || strcmp(type, "[convolutional]")==0) return CONVOLUTIONAL;
+    if (strcmp(type, "[deconv]")==0
+            || strcmp(type, "[deconvolutional]")==0) return DECONVOLUTIONAL;
     if (strcmp(type, "[activation]")==0) return ACTIVE;
+    if (strcmp(type, "[logistic]")==0) return LOGXENT;
+    if (strcmp(type, "[l2norm]")==0) return L2NORM;
     if (strcmp(type, "[net]")==0
             || strcmp(type, "[network]")==0) return NETWORK;
     if (strcmp(type, "[crnn]")==0) return CRNN;
     if (strcmp(type, "[gru]")==0) return GRU;
+    if (strcmp(type, "[lstm]") == 0) return LSTM;
     if (strcmp(type, "[rnn]")==0) return RNN;
     if (strcmp(type, "[conn]")==0
             || strcmp(type, "[connected]")==0) return CONNECTED;
@@ -68,6 +82,7 @@ LAYER_TYPE string_to_layer_type(char * type)
     if (strcmp(type, "[soft]")==0
             || strcmp(type, "[softmax]")==0) return SOFTMAX;
     if (strcmp(type, "[route]")==0) return ROUTE;
+    if (strcmp(type, "[upsample]")==0) return UPSAMPLE;
     return BLANK;
 }
 
@@ -111,7 +126,7 @@ typedef struct size_params{
     int c;
     int index;
     int time_steps;
-    network net;
+    network *net;
 } size_params;
 
 local_layer parse_local(list *options, size_params params)
@@ -135,6 +150,32 @@ local_layer parse_local(list *options, size_params params)
     return layer;
 }
 
+layer parse_deconvolutional(list *options, size_params params) // Build a deconvolutional layer from the options of one cfg section.
+{
+    int n = option_find_int(options, "filters",1);
+    int size = option_find_int(options, "size",1);
+    int stride = option_find_int(options, "stride",1);
+
+    char *activation_s = option_find_str(options, "activation", "logistic"); // "logistic" is the default passed when no activation is named
+    ACTIVATION activation = get_activation(activation_s);
+
+    int batch,h,w,c;
+    h = params.h;
+    w = params.w;
+    c = params.c;
+    batch=params.batch;
+    if(!(h && w && c)) error("Layer before deconvolutional layer must output image."); // rejects any zero input dimension
+    int batch_normalize = option_find_int_quiet(options, "batch_normalize", 0);
+    int pad = option_find_int_quiet(options, "pad",0);
+    int padding = option_find_int_quiet(options, "padding",0);
+    if(pad) padding = size/2; // a nonzero "pad" overrides any explicit "padding" value
+
+    layer l = make_deconvolutional_layer(batch,h,w,c,n,size,stride,padding, activation, batch_normalize, params.net->adam); // the net-level adam flag is forwarded
+
+    return l;
+}
+
+
 convolutional_layer parse_convolutional(list *options, size_params params)
 {
     int n = option_find_int(options, "filters",1);
@@ -142,6 +183,7 @@ convolutional_layer parse_convolutional(list *options, size_params params)
     int stride = option_find_int(options, "stride",1);
     int pad = option_find_int_quiet(options, "pad",0);
     int padding = option_find_int_quiet(options, "padding",0);
+    int groups = option_find_int_quiet(options, "groups", 1);
     if(pad) padding = size/2;
 
     char *activation_s = option_find_str(options, "activation", "logistic");
@@ -157,14 +199,9 @@ convolutional_layer parse_convolutional(list *options, size_params params)
     int binary = option_find_int_quiet(options, "binary", 0);
     int xnor = option_find_int_quiet(options, "xnor", 0);
 
-    convolutional_layer layer = make_convolutional_layer(batch,h,w,c,n,size,stride,padding,activation, batch_normalize, binary, xnor, params.net.adam);
+    convolutional_layer layer = make_convolutional_layer(batch,h,w,c,n,groups,size,stride,padding,activation, batch_normalize, binary, xnor, params.net->adam);
     layer.flipped = option_find_int_quiet(options, "flipped", 0);
     layer.dot = option_find_float_quiet(options, "dot", 0);
-    if(params.net.adam){
-        layer.B1 = params.net.B1;
-        layer.B2 = params.net.B2;
-        layer.eps = params.net.eps;
-    }
 
     return layer;
 }
@@ -187,13 +224,11 @@ layer parse_crnn(list *options, size_params params)
 layer parse_rnn(list *options, size_params params)
 {
     int output = option_find_int(options, "output",1);
-    int hidden = option_find_int(options, "hidden",1);
     char *activation_s = option_find_str(options, "activation", "logistic");
     ACTIVATION activation = get_activation(activation_s);
     int batch_normalize = option_find_int_quiet(options, "batch_normalize", 0);
-    int logistic = option_find_int_quiet(options, "logistic", 0);
 
-    layer l = make_rnn_layer(params.batch, params.inputs, hidden, output, params.time_steps, activation, batch_normalize, logistic);
+    layer l = make_rnn_layer(params.batch, params.inputs, output, params.time_steps, activation, batch_normalize, params.net->adam);
 
     l.shortcut = option_find_int_quiet(options, "shortcut", 0);
 
@@ -205,31 +240,114 @@ layer parse_gru(list *options, size_params params)
     int output = option_find_int(options, "output",1);
     int batch_normalize = option_find_int_quiet(options, "batch_normalize", 0);
 
-    layer l = make_gru_layer(params.batch, params.inputs, output, params.time_steps, batch_normalize);
+    layer l = make_gru_layer(params.batch, params.inputs, output, params.time_steps, batch_normalize, params.net->adam);
+    l.tanh = option_find_int_quiet(options, "tanh", 0);
+
+    return l;
+}
+
+layer parse_lstm(list *options, size_params params) // Build an LSTM layer from the options of one cfg section.
+{
+    int output = option_find_int(options, "output", 1);
+    int batch_normalize = option_find_int_quiet(options, "batch_normalize", 0);
+
+    layer l = make_lstm_layer(params.batch, params.inputs, output, params.time_steps, batch_normalize, params.net->adam); // the net-level adam flag is forwarded
 
     return l;
 }
 
-connected_layer parse_connected(list *options, size_params params)
+layer parse_connected(list *options, size_params params)
 {
     int output = option_find_int(options, "output",1);
     char *activation_s = option_find_str(options, "activation", "logistic");
     ACTIVATION activation = get_activation(activation_s);
     int batch_normalize = option_find_int_quiet(options, "batch_normalize", 0);
 
-    connected_layer layer = make_connected_layer(params.batch, params.inputs, output, activation, batch_normalize);
-
-    return layer;
+    layer l = make_connected_layer(params.batch, params.inputs, output, activation, batch_normalize, params.net->adam);
+    return l;
 }
 
-softmax_layer parse_softmax(list *options, size_params params)
+layer parse_softmax(list *options, size_params params)
 {
     int groups = option_find_int_quiet(options, "groups",1);
-    softmax_layer layer = make_softmax_layer(params.batch, params.inputs, groups);
-    layer.temperature = option_find_float_quiet(options, "temperature", 1);
+    layer l = make_softmax_layer(params.batch, params.inputs, groups);
+    l.temperature = option_find_float_quiet(options, "temperature", 1);
     char *tree_file = option_find_str(options, "tree", 0);
-    if (tree_file) layer.softmax_tree = read_tree(tree_file);
-    return layer;
+    if (tree_file) l.softmax_tree = read_tree(tree_file);
+    l.w = params.w;
+    l.h = params.h;
+    l.c = params.c;
+    l.spatial = option_find_float_quiet(options, "spatial", 0);
+    l.noloss =  option_find_int_quiet(options, "noloss", 0);
+    return l;
+}
+
+int *parse_yolo_mask(char *a, int *num) // Parse comma-separated ints from `a`; returns a calloc'd array, or 0 (with *num untouched) when `a` is 0.
+{
+    int *mask = 0;
+    if(a){
+        int len = strlen(a);
+        int n = 1; // entry count = number of commas + 1
+        int i;
+        for(i = 0; i < len; ++i){
+            if (a[i] == ',') ++n;
+        }
+        mask = calloc(n, sizeof(int));
+        for(i = 0; i < n; ++i){
+            mask[i] = atoi(a); // atoi stops at the next non-digit, i.e. the separating comma
+            a = strchr(a, ','); // 0 once the last entry has been consumed
+            if(a) ++a; // step past the comma; never form the invalid pointer NULL+1
+        }
+        *num = n; // report how many entries were parsed
+    }
+    return mask;
+}
+
+layer parse_yolo(list *options, size_params params) // Build a yolo layer from the options of one cfg section.
+{
+    int classes = option_find_int(options, "classes", 20);
+    int total = option_find_int(options, "num", 1);
+    int num = total; // overwritten with the mask length when a "mask" entry is present
+
+    char *a = option_find_str(options, "mask", 0);
+    int *mask = parse_yolo_mask(a, &num);
+    layer l = make_yolo_layer(params.batch, params.w, params.h, num, total, mask, classes);
+    assert(l.outputs == params.inputs); // layer size must match what the previous layer feeds in
+
+    l.max_boxes = option_find_int_quiet(options, "max",90);
+    l.jitter = option_find_float(options, "jitter", .2);
+
+    l.ignore_thresh = option_find_float(options, "ignore_thresh", .5);
+    l.truth_thresh = option_find_float(options, "truth_thresh", 1);
+    l.random = option_find_int_quiet(options, "random", 0);
+
+    char *map_file = option_find_str(options, "map", 0);
+    if (map_file) l.map = read_map(map_file);
+
+    a = option_find_str(options, "anchors", 0);
+    if(a){
+        int len = strlen(a);
+        int n = 1; // value count = number of commas + 1
+        int i;
+        for(i = 0; i < len; ++i){
+            if (a[i] == ',') ++n;
+        }
+        for(i = 0; i < n; ++i){ // NOTE(review): assumes l.biases has room for every listed value -- confirm against make_yolo_layer
+            l.biases[i] = atof(a);
+            a = strchr(a, ','); // 0 once the last value has been consumed
+            if(a) ++a; // step past the comma; never form the invalid pointer NULL+1
+        }
+    }
+    return l;
+}
+
+layer parse_iseg(list *options, size_params params) // Build an iseg layer from the options of one cfg section.
+{
+    int classes = option_find_int(options, "classes", 20);
+    int ids = option_find_int(options, "ids", 32);
+    layer l = make_iseg_layer(params.batch, params.w, params.h, classes, ids);
+    assert(l.outputs == params.inputs); // layer size must match what the previous layer feeds in
+    return l;
 }
 
 layer parse_region(list *options, size_params params)
@@ -245,6 +363,7 @@ layer parse_region(list *options, size_params params)
     l.sqrt = option_find_int_quiet(options, "sqrt", 0);
 
     l.softmax = option_find_int(options, "softmax", 0);
+    l.background = option_find_int_quiet(options, "background", 0);
     l.max_boxes = option_find_int_quiet(options, "max",30);
     l.jitter = option_find_float(options, "jitter", .2);
     l.rescore = option_find_int_quiet(options, "rescore",0);
@@ -257,6 +376,7 @@ layer parse_region(list *options, size_params params)
     l.coord_scale = option_find_float(options, "coord_scale", 1);
     l.object_scale = option_find_float(options, "object_scale", 1);
     l.noobject_scale = option_find_float(options, "noobject_scale", 1);
+    l.mask_scale = option_find_float(options, "mask_scale", 1);
     l.class_scale = option_find_float(options, "class_scale", 1);
     l.bias_match = option_find_int_quiet(options, "bias_match",0);
 
@@ -281,6 +401,7 @@ layer parse_region(list *options, size_params params)
     }
     return l;
 }
+
 detection_layer parse_detection(list *options, size_params params)
 {
     int coords = option_find_int(options, "coords", 1);
@@ -293,7 +414,7 @@ detection_layer parse_detection(list *options, size_params params)
     layer.softmax = option_find_int(options, "softmax", 0);
     layer.sqrt = option_find_int(options, "sqrt", 0);
 
-    layer.max_boxes = option_find_int_quiet(options, "max",30);
+    layer.max_boxes = option_find_int_quiet(options, "max",90);
     layer.coord_scale = option_find_float(options, "coord_scale", 1);
     layer.forced = option_find_int(options, "forced", 0);
     layer.object_scale = option_find_float(options, "object_scale", 1);
@@ -312,6 +433,8 @@ cost_layer parse_cost(list *options, size_params params)
     float scale = option_find_float_quiet(options, "scale",1);
     cost_layer layer = make_cost_layer(params.batch, params.inputs, type, scale);
     layer.ratio =  option_find_float_quiet(options, "ratio",0);
+    layer.noobject_scale =  option_find_float_quiet(options, "noobj", 1);
+    layer.thresh =  option_find_float_quiet(options, "thresh",0);
     return layer;
 }
 
@@ -343,6 +466,8 @@ layer parse_reorg(list *options, size_params params)
 {
     int stride = option_find_int(options, "stride",1);
     int reverse = option_find_int_quiet(options, "reverse",0);
+    int flatten = option_find_int_quiet(options, "flatten",0);
+    int extra = option_find_int_quiet(options, "extra",0);
 
     int batch,h,w,c;
     h = params.h;
@@ -351,7 +476,7 @@ layer parse_reorg(list *options, size_params params)
     batch=params.batch;
     if(!(h && w && c)) error("Layer before reorg layer must output image.");
 
-    layer layer = make_reorg_layer(batch,w,h,c,stride,reverse);
+    layer layer = make_reorg_layer(batch,w,h,c,stride,reverse, flatten, extra);
     return layer;
 }
 
@@ -359,7 +484,7 @@ maxpool_layer parse_maxpool(list *options, size_params params)
 {
     int stride = option_find_int(options, "stride",1);
     int size = option_find_int(options, "size",stride);
-    int padding = option_find_int_quiet(options, "padding", (size-1)/2);
+    int padding = option_find_int_quiet(options, "padding", size-1);
 
     int batch,h,w,c;
     h = params.h;
@@ -411,24 +536,45 @@ layer parse_batchnorm(list *options, size_params params)
     return l;
 }
 
-layer parse_shortcut(list *options, size_params params, network net)
+layer parse_shortcut(list *options, size_params params, network *net)
 {
-    char *l = option_find(options, "from");   
+    char *l = option_find(options, "from");
     int index = atoi(l);
     if(index < 0) index = params.index + index;
 
     int batch = params.batch;
-    layer from = net.layers[index];
+    layer from = net->layers[index];
 
     layer s = make_shortcut_layer(batch, index, params.w, params.h, params.c, from.out_w, from.out_h, from.out_c);
 
     char *activation_s = option_find_str(options, "activation", "linear");
     ACTIVATION activation = get_activation(activation_s);
     s.activation = activation;
+    s.alpha = option_find_float_quiet(options, "alpha", 1);
+    s.beta = option_find_float_quiet(options, "beta", 1);
     return s;
 }
 
 
+layer parse_l2norm(list *options, size_params params) // Build an l2norm layer sized from `params`; `options` is not read here.
+{
+    layer l = make_l2norm_layer(params.batch, params.inputs);
+    l.h = l.out_h = params.h; // input and output dimensions are both set to the incoming h/w/c
+    l.w = l.out_w = params.w;
+    l.c = l.out_c = params.c;
+    return l;
+}
+
+
+layer parse_logistic(list *options, size_params params) // Build a logistic layer sized from `params`; `options` is not read here.
+{
+    layer l = make_logistic_layer(params.batch, params.inputs);
+    l.h = l.out_h = params.h; // input and output dimensions are both set to the incoming h/w/c
+    l.w = l.out_w = params.w;
+    l.c = l.out_c = params.c;
+    return l;
+}
+
 layer parse_activation(list *options, size_params params)
 {
     char *activation_s = option_find_str(options, "activation", "linear");
@@ -436,19 +582,25 @@ layer parse_activation(list *options, size_params params)
 
     layer l = make_activation_layer(params.batch, params.inputs, activation);
 
-    l.out_h = params.h;
-    l.out_w = params.w;
-    l.out_c = params.c;
-    l.h = params.h;
-    l.w = params.w;
-    l.c = params.c;
+    l.h = l.out_h = params.h;
+    l.w = l.out_w = params.w;
+    l.c = l.out_c = params.c;
+
+    return l;
+}
+
+layer parse_upsample(list *options, size_params params, network *net) // Build an upsample layer from one cfg section; `net` is not used here.
+{
 
+    int stride = option_find_int(options, "stride",2); // 2 is the default passed when "stride" is absent
+    layer l = make_upsample_layer(params.batch, params.w, params.h, params.c, stride);
+    l.scale = option_find_float_quiet(options, "scale", 1);
     return l;
 }
 
-route_layer parse_route(list *options, size_params params, network net)
+route_layer parse_route(list *options, size_params params, network *net)
 {
-    char *l = option_find(options, "layers");   
+    char *l = option_find(options, "layers");
     int len = strlen(l);
     if(!l) error("Route Layer must specify input layers");
     int n = 1;
@@ -464,19 +616,19 @@ route_layer parse_route(list *options, size_params params, network net)
         l = strchr(l, ',')+1;
         if(index < 0) index = params.index + index;
         layers[i] = index;
-        sizes[i] = net.layers[index].outputs;
+        sizes[i] = net->layers[index].outputs;
     }
     int batch = params.batch;
 
     route_layer layer = make_route_layer(batch, n, layers, sizes);
 
-    convolutional_layer first = net.layers[layers[0]];
+    convolutional_layer first = net->layers[layers[0]];
     layer.out_w = first.out_w;
     layer.out_h = first.out_h;
     layer.out_c = first.out_c;
     for(i = 1; i < n; ++i){
         int index = layers[i];
-        convolutional_layer next = net.layers[index];
+        convolutional_layer next = net->layers[index];
         if(next.out_w == first.out_w && next.out_h == first.out_h){
             layer.out_c += next.out_c;
         }else{
@@ -508,15 +660,17 @@ void parse_net_options(list *options, network *net)
     net->decay = option_find_float(options, "decay", .0001);
     int subdivs = option_find_int(options, "subdivisions",1);
     net->time_steps = option_find_int_quiet(options, "time_steps",1);
+    net->notruth = option_find_int_quiet(options, "notruth",0);
     net->batch /= subdivs;
     net->batch *= net->time_steps;
     net->subdivisions = subdivs;
+    net->random = option_find_int_quiet(options, "random", 0);
 
     net->adam = option_find_int_quiet(options, "adam", 0);
     if(net->adam){
         net->B1 = option_find_float(options, "B1", .9);
         net->B2 = option_find_float(options, "B2", .999);
-        net->eps = option_find_float(options, "eps", .000001);
+        net->eps = option_find_float(options, "eps", .0000001);
     }
 
     net->h = option_find_int_quiet(options, "height",0);
@@ -525,6 +679,10 @@ void parse_net_options(list *options, network *net)
     net->inputs = option_find_int_quiet(options, "inputs", net->h * net->w * net->c);
     net->max_crop = option_find_int_quiet(options, "max_crop",net->w*2);
     net->min_crop = option_find_int_quiet(options, "min_crop",net->w);
+    net->max_ratio = option_find_float_quiet(options, "max_ratio", (float) net->max_crop / net->w);
+    net->min_ratio = option_find_float_quiet(options, "min_ratio", (float) net->min_crop / net->w);
+    net->center = option_find_int_quiet(options, "center",0);
+    net->clip = option_find_float_quiet(options, "clip", 0);
 
     net->angle = option_find_float_quiet(options, "angle", 0);
     net->aspect = option_find_float_quiet(options, "aspect", 1);
@@ -537,12 +695,13 @@ void parse_net_options(list *options, network *net)
     char *policy_s = option_find_str(options, "policy", "constant");
     net->policy = get_policy(policy_s);
     net->burn_in = option_find_int_quiet(options, "burn_in", 0);
+    net->power = option_find_float_quiet(options, "power", 4);
     if(net->policy == STEP){
         net->step = option_find_int(options, "step", 1);
         net->scale = option_find_float(options, "scale", 1);
     } else if (net->policy == STEPS){
-        char *l = option_find(options, "steps");   
-        char *p = option_find(options, "scales");   
+        char *l = option_find(options, "steps");
+        char *p = option_find(options, "scales");
         if(!l || !p) error("STEPS policy must have steps and scales in cfg file");
 
         int len = strlen(l);
@@ -570,7 +729,6 @@ void parse_net_options(list *options, network *net)
         net->gamma = option_find_float(options, "gamma", 1);
         net->step = option_find_int(options, "step", 1);
     } else if (net->policy == POLY || net->policy == RANDOM){
-        net->power = option_find_float(options, "power", 1);
     }
     net->max_batches = option_find_int(options, "max_batches", 0);
 }
@@ -581,26 +739,26 @@ int is_network(section *s)
             || strcmp(s->type, "[network]")==0);
 }
 
-network parse_network_cfg(char *filename)
+network *parse_network_cfg(char *filename)
 {
     list *sections = read_cfg(filename);
     node *n = sections->front;
     if(!n) error("Config file has no sections");
-    network net = make_network(sections->size - 1);
-    net.gpu_index = gpu_index;
+    network *net = make_network(sections->size - 1);
+    net->gpu_index = gpu_index;
     size_params params;
 
     section *s = (section *)n->val;
     list *options = s->options;
     if(!is_network(s)) error("First section must be [net] or [network]");
-    parse_net_options(options, &net);
-
-    params.h = net.h;
-    params.w = net.w;
-    params.c = net.c;
-    params.inputs = net.inputs;
-    params.batch = net.batch;
-    params.time_steps = net.time_steps;
+    parse_net_options(options, net);
+
+    params.h = net->h;
+    params.w = net->w;
+    params.c = net->c;
+    params.inputs = net->inputs;
+    params.batch = net->batch;
+    params.time_steps = net->time_steps;
     params.net = net;
 
     size_t workspace_size = 0;
@@ -617,14 +775,22 @@ network parse_network_cfg(char *filename)
         LAYER_TYPE lt = string_to_layer_type(s->type);
         if(lt == CONVOLUTIONAL){
             l = parse_convolutional(options, params);
+        }else if(lt == DECONVOLUTIONAL){
+            l = parse_deconvolutional(options, params);
         }else if(lt == LOCAL){
             l = parse_local(options, params);
         }else if(lt == ACTIVE){
             l = parse_activation(options, params);
+        }else if(lt == LOGXENT){
+            l = parse_logistic(options, params);
+        }else if(lt == L2NORM){
+            l = parse_l2norm(options, params);
         }else if(lt == RNN){
             l = parse_rnn(options, params);
         }else if(lt == GRU){
             l = parse_gru(options, params);
+        }else if (lt == LSTM) {
+            l = parse_lstm(options, params);
         }else if(lt == CRNN){
             l = parse_crnn(options, params);
         }else if(lt == CONNECTED){
@@ -635,11 +801,15 @@ network parse_network_cfg(char *filename)
             l = parse_cost(options, params);
         }else if(lt == REGION){
             l = parse_region(options, params);
+        }else if(lt == YOLO){
+            l = parse_yolo(options, params);
+        }else if(lt == ISEG){
+            l = parse_iseg(options, params);
         }else if(lt == DETECTION){
             l = parse_detection(options, params);
         }else if(lt == SOFTMAX){
             l = parse_softmax(options, params);
-            net.hierarchy = l.softmax_tree;
+            net->hierarchy = l.softmax_tree;
         }else if(lt == NORMALIZATION){
             l = parse_normalization(options, params);
         }else if(lt == BATCHNORM){
@@ -652,23 +822,33 @@ network parse_network_cfg(char *filename)
             l = parse_avgpool(options, params);
         }else if(lt == ROUTE){
             l = parse_route(options, params, net);
+        }else if(lt == UPSAMPLE){
+            l = parse_upsample(options, params, net);
         }else if(lt == SHORTCUT){
             l = parse_shortcut(options, params, net);
         }else if(lt == DROPOUT){
             l = parse_dropout(options, params);
-            l.output = net.layers[count-1].output;
-            l.delta = net.layers[count-1].delta;
+            l.output = net->layers[count-1].output;
+            l.delta = net->layers[count-1].delta;
 #ifdef GPU
-            l.output_gpu = net.layers[count-1].output_gpu;
-            l.delta_gpu = net.layers[count-1].delta_gpu;
+            l.output_gpu = net->layers[count-1].output_gpu;
+            l.delta_gpu = net->layers[count-1].delta_gpu;
 #endif
         }else{
             fprintf(stderr, "Type not recognized: %s\n", s->type);
         }
+        l.clip = net->clip;
+        l.truth = option_find_int_quiet(options, "truth", 0);
+        l.onlyforward = option_find_int_quiet(options, "onlyforward", 0);
+        l.stopbackward = option_find_int_quiet(options, "stopbackward", 0);
+        l.dontsave = option_find_int_quiet(options, "dontsave", 0);
         l.dontload = option_find_int_quiet(options, "dontload", 0);
+        l.numload = option_find_int_quiet(options, "numload", 0);
         l.dontloadscales = option_find_int_quiet(options, "dontloadscales", 0);
+        l.learning_rate_scale = option_find_float_quiet(options, "learning_rate", 1);
+        l.smooth = option_find_float_quiet(options, "smooth", 0);
         option_unused(options);
-        net.layers[count] = l;
+        net->layers[count] = l;
         if (l.workspace_size > workspace_size) workspace_size = l.workspace_size;
         free_section(s);
         n = n->next;
@@ -679,20 +859,30 @@ network parse_network_cfg(char *filename)
             params.c = l.out_c;
             params.inputs = l.outputs;
         }
-    }   
+    }
     free_list(sections);
-    net.outputs = get_network_output_size(net);
-    net.output = get_network_output(net);
+    layer out = get_network_output_layer(net);
+    net->outputs = out.outputs;
+    net->truths = out.outputs;
+    if(net->layers[net->n-1].truths) net->truths = net->layers[net->n-1].truths;
+    net->output = out.output;
+    net->input = calloc(net->inputs*net->batch, sizeof(float));
+    net->truth = calloc(net->truths*net->batch, sizeof(float));
+#ifdef GPU
+    net->output_gpu = out.output_gpu;
+    net->input_gpu = cuda_make_array(net->input, net->inputs*net->batch);
+    net->truth_gpu = cuda_make_array(net->truth, net->truths*net->batch);
+#endif
     if(workspace_size){
         //printf("%ld\n", workspace_size);
 #ifdef GPU
         if(gpu_index >= 0){
-            net.workspace = cuda_make_array(0, (workspace_size-1)/sizeof(float)+1);
+            net->workspace = cuda_make_array(0, (workspace_size-1)/sizeof(float)+1);
         }else {
-            net.workspace = calloc(1, workspace_size);
+            net->workspace = calloc(1, workspace_size);
         }
 #else
-        net.workspace = calloc(1, workspace_size);
+        net->workspace = calloc(1, workspace_size);
 #endif
     }
     return net;
@@ -704,7 +894,7 @@ list *read_cfg(char *filename)
     if(file == 0) file_error(filename);
     char *line;
     int nu = 0;
-    list *sections = make_list();
+    list *options = make_list();
     section *current = 0;
     while((line=fgetl(file)) != 0){
         ++ nu;
@@ -712,7 +902,7 @@ list *read_cfg(char *filename)
         switch(line[0]){
             case '[':
                 current = malloc(sizeof(section));
-                list_insert(sections, current);
+                list_insert(options, current);
                 current->options = make_list();
                 current->type = line;
                 break;
@@ -730,7 +920,7 @@ list *read_cfg(char *filename)
         }
     }
     fclose(file);
-    return sections;
+    return options;
 }
 
 void save_convolutional_weights_binary(layer l, FILE *fp)
@@ -776,7 +966,7 @@ void save_convolutional_weights(layer l, FILE *fp)
         pull_convolutional_layer(l);
     }
 #endif
-    int num = l.n*l.c*l.size*l.size;
+    int num = l.nweights;
     fwrite(l.biases, sizeof(float), l.n, fp);
     if (l.batch_normalize){
         fwrite(l.scales, sizeof(float), l.n, fp);
@@ -784,10 +974,6 @@ void save_convolutional_weights(layer l, FILE *fp)
         fwrite(l.rolling_variance, sizeof(float), l.n, fp);
     }
     fwrite(l.weights, sizeof(float), num, fp);
-    if(l.adam){
-        fwrite(l.m, sizeof(float), num, fp);
-        fwrite(l.v, sizeof(float), num, fp);
-    }
 }
 
 void save_batchnorm_weights(layer l, FILE *fp)
@@ -818,11 +1004,11 @@ void save_connected_weights(layer l, FILE *fp)
     }
 }
 
-void save_weights_upto(network net, char *filename, int cutoff)
+void save_weights_upto(network *net, char *filename, int cutoff)
 {
 #ifdef GPU
-    if(net.gpu_index >= 0){
-        cuda_set_device(net.gpu_index);
+    if(net->gpu_index >= 0){
+        cuda_set_device(net->gpu_index);
     }
 #endif
     fprintf(stderr, "Saving weights to %s\n", filename);
@@ -830,17 +1016,18 @@ void save_weights_upto(network net, char *filename, int cutoff)
     if(!fp) file_error(filename);
 
     int major = 0;
-    int minor = 1;
+    int minor = 2;
     int revision = 0;
     fwrite(&major, sizeof(int), 1, fp);
     fwrite(&minor, sizeof(int), 1, fp);
     fwrite(&revision, sizeof(int), 1, fp);
-    fwrite(net.seen, sizeof(int), 1, fp);
+    fwrite(net->seen, sizeof(size_t), 1, fp);
 
     int i;
-    for(i = 0; i < net.n && i < cutoff; ++i){
-        layer l = net.layers[i];
-        if(l.type == CONVOLUTIONAL){
+    for(i = 0; i < net->n && i < cutoff; ++i){
+        layer l = net->layers[i];
+        if (l.dontsave) continue;
+        if(l.type == CONVOLUTIONAL || l.type == DECONVOLUTIONAL){
             save_convolutional_weights(l, fp);
         } if(l.type == CONNECTED){
             save_connected_weights(l, fp);
@@ -850,14 +1037,29 @@ void save_weights_upto(network net, char *filename, int cutoff)
             save_connected_weights(*(l.input_layer), fp);
             save_connected_weights(*(l.self_layer), fp);
             save_connected_weights(*(l.output_layer), fp);
-        } if(l.type == GRU){
-            save_connected_weights(*(l.input_z_layer), fp);
-            save_connected_weights(*(l.input_r_layer), fp);
-            save_connected_weights(*(l.input_h_layer), fp);
-            save_connected_weights(*(l.state_z_layer), fp);
-            save_connected_weights(*(l.state_r_layer), fp);
-            save_connected_weights(*(l.state_h_layer), fp);
-        } if(l.type == CRNN){
+        } if (l.type == LSTM) {
+            save_connected_weights(*(l.wi), fp);
+            save_connected_weights(*(l.wf), fp);
+            save_connected_weights(*(l.wo), fp);
+            save_connected_weights(*(l.wg), fp);
+            save_connected_weights(*(l.ui), fp);
+            save_connected_weights(*(l.uf), fp);
+            save_connected_weights(*(l.uo), fp);
+            save_connected_weights(*(l.ug), fp);
+        } if (l.type == GRU) {
+            if(1){
+                save_connected_weights(*(l.wz), fp);
+                save_connected_weights(*(l.wr), fp);
+                save_connected_weights(*(l.wh), fp);
+                save_connected_weights(*(l.uz), fp);
+                save_connected_weights(*(l.ur), fp);
+                save_connected_weights(*(l.uh), fp);
+            }else{
+                save_connected_weights(*(l.reset_layer), fp);
+                save_connected_weights(*(l.update_layer), fp);
+                save_connected_weights(*(l.state_layer), fp);
+            }
+        }  if(l.type == CRNN){
             save_convolutional_weights(*(l.input_layer), fp);
             save_convolutional_weights(*(l.self_layer), fp);
             save_convolutional_weights(*(l.output_layer), fp);
@@ -875,9 +1077,9 @@ void save_weights_upto(network net, char *filename, int cutoff)
     }
     fclose(fp);
 }
-void save_weights(network net, char *filename)
+void save_weights(network *net, char *filename)
 {
-    save_weights_upto(net, filename, net.n);
+    save_weights_upto(net, filename, net->n);
 }
 
 void transpose_matrix(float *a, int rows, int cols)
@@ -965,7 +1167,8 @@ void load_convolutional_weights(layer l, FILE *fp)
         //load_convolutional_weights_binary(l, fp);
         //return;
     }
-    int num = l.n*l.c*l.size*l.size;
+    if(l.numload) l.n = l.numload;
+    int num = l.c/l.groups*l.n*l.size*l.size;
     fread(l.biases, sizeof(float), l.n, fp);
     if (l.batch_normalize && (!l.dontloadscales)){
         fread(l.scales, sizeof(float), l.n, fp);
@@ -986,12 +1189,19 @@ void load_convolutional_weights(layer l, FILE *fp)
             fill_cpu(l.n, 0, l.rolling_mean, 1);
             fill_cpu(l.n, 0, l.rolling_variance, 1);
         }
+        if(0){
+            int i;
+            for(i = 0; i < l.n; ++i){
+                printf("%g, ", l.rolling_mean[i]);
+            }
+            printf("\n");
+            for(i = 0; i < l.n; ++i){
+                printf("%g, ", l.rolling_variance[i]);
+            }
+            printf("\n");
+        }
     }
     fread(l.weights, sizeof(float), num, fp);
-    if(l.adam){
-        fread(l.m, sizeof(float), num, fp);
-        fread(l.v, sizeof(float), num, fp);
-    }
     //if(l.c == 3) scal_cpu(num, 1./256, l.weights, 1);
     if (l.flipped) {
         transpose_matrix(l.weights, l.c*l.size*l.size, l.n);
@@ -1005,7 +1215,7 @@ void load_convolutional_weights(layer l, FILE *fp)
 }
 
 
-void load_weights_upto(network *net, char *filename, int cutoff)
+void load_weights_upto(network *net, char *filename, int start, int cutoff)
 {
 #ifdef GPU
     if(net->gpu_index >= 0){
@@ -1023,14 +1233,20 @@ void load_weights_upto(network *net, char *filename, int cutoff)
     fread(&major, sizeof(int), 1, fp);
     fread(&minor, sizeof(int), 1, fp);
     fread(&revision, sizeof(int), 1, fp);
-    fread(net->seen, sizeof(int), 1, fp);
+    if ((major*10 + minor) >= 2 && major < 1000 && minor < 1000){
+        fread(net->seen, sizeof(size_t), 1, fp);
+    } else {
+        int iseen = 0;
+        fread(&iseen, sizeof(int), 1, fp);
+        *net->seen = iseen;
+    }
     int transpose = (major > 1000) || (minor > 1000);
 
     int i;
-    for(i = 0; i < net->n && i < cutoff; ++i){
+    for(i = start; i < net->n && i < cutoff; ++i){
         layer l = net->layers[i];
         if (l.dontload) continue;
-        if(l.type == CONVOLUTIONAL){
+        if(l.type == CONVOLUTIONAL || l.type == DECONVOLUTIONAL){
             load_convolutional_weights(l, fp);
         }
         if(l.type == CONNECTED){
@@ -1049,13 +1265,29 @@ void load_weights_upto(network *net, char *filename, int cutoff)
             load_connected_weights(*(l.self_layer), fp, transpose);
             load_connected_weights(*(l.output_layer), fp, transpose);
         }
-        if(l.type == GRU){
-            load_connected_weights(*(l.input_z_layer), fp, transpose);
-            load_connected_weights(*(l.input_r_layer), fp, transpose);
-            load_connected_weights(*(l.input_h_layer), fp, transpose);
-            load_connected_weights(*(l.state_z_layer), fp, transpose);
-            load_connected_weights(*(l.state_r_layer), fp, transpose);
-            load_connected_weights(*(l.state_h_layer), fp, transpose);
+        if (l.type == LSTM) {
+            load_connected_weights(*(l.wi), fp, transpose);
+            load_connected_weights(*(l.wf), fp, transpose);
+            load_connected_weights(*(l.wo), fp, transpose);
+            load_connected_weights(*(l.wg), fp, transpose);
+            load_connected_weights(*(l.ui), fp, transpose);
+            load_connected_weights(*(l.uf), fp, transpose);
+            load_connected_weights(*(l.uo), fp, transpose);
+            load_connected_weights(*(l.ug), fp, transpose);
+        }
+        if (l.type == GRU) {
+            if(1){
+                load_connected_weights(*(l.wz), fp, transpose);
+                load_connected_weights(*(l.wr), fp, transpose);
+                load_connected_weights(*(l.wh), fp, transpose);
+                load_connected_weights(*(l.uz), fp, transpose);
+                load_connected_weights(*(l.ur), fp, transpose);
+                load_connected_weights(*(l.uh), fp, transpose);
+            }else{
+                load_connected_weights(*(l.reset_layer), fp, transpose);
+                load_connected_weights(*(l.update_layer), fp, transpose);
+                load_connected_weights(*(l.state_layer), fp, transpose);
+            }
         }
         if(l.type == LOCAL){
             int locations = l.out_w*l.out_h;
@@ -1075,6 +1307,6 @@ void load_weights_upto(network *net, char *filename, int cutoff)
 
 void load_weights(network *net, char *filename)
 {
-    load_weights_upto(net, filename, net->n);
+    load_weights_upto(net, filename, 0, net->n);
 }
 
diff --git a/image.darknet/inst/include/darknet/src/parser.h b/image.darknet/inst/include/darknet/src/parser.h
index 6cff4fb..81aef2c 100644
--- a/image.darknet/inst/include/darknet/src/parser.h
+++ b/image.darknet/inst/include/darknet/src/parser.h
@@ -1,13 +1,9 @@
 #ifndef PARSER_H
 #define PARSER_H
+#include "darknet.h"
 #include "network.h"
 
-network parse_network_cfg(char *filename);
 void save_network(network net, char *filename);
-void save_weights(network net, char *filename);
-void save_weights_upto(network net, char *filename, int cutoff);
 void save_weights_double(network net, char *filename);
-void load_weights(network *net, char *filename);
-void load_weights_upto(network *net, char *filename, int cutoff);
 
 #endif
diff --git a/image.darknet/inst/include/darknet/src/region_layer.c b/image.darknet/inst/include/darknet/src/region_layer.c
index f5522c3..179f5e3 100644
--- a/image.darknet/inst/include/darknet/src/region_layer.c
+++ b/image.darknet/inst/include/darknet/src/region_layer.c
@@ -4,6 +4,7 @@
 #include "box.h"
 #include "cuda.h"
 #include "utils.h"
+
 #include <stdio.h>
 #include <assert.h>
 #include <string.h>
@@ -18,6 +19,10 @@ layer make_region_layer(int batch, int w, int h, int n, int classes, int coords)
     l.batch = batch;
     l.h = h;
     l.w = w;
+    l.c = n*(classes + coords + 1);
+    l.out_w = l.w;
+    l.out_h = l.h;
+    l.out_c = l.c;
     l.classes = classes;
     l.coords = coords;
     l.cost = calloc(1, sizeof(float));
@@ -25,7 +30,7 @@ layer make_region_layer(int batch, int w, int h, int n, int classes, int coords)
     l.bias_updates = calloc(n*2, sizeof(float));
     l.outputs = h*w*n*(classes + coords + 1);
     l.inputs = l.outputs;
-    l.truths = 30*(5);
+    l.truths = 30*(l.coords + 1);
     l.delta = calloc(batch*l.outputs, sizeof(float));
     l.output = calloc(batch*l.outputs, sizeof(float));
     int i;
@@ -68,19 +73,19 @@ void resize_region_layer(layer *l, int w, int h)
 #endif
 }
 
-box get_region_box(float *x, float *biases, int n, int index, int i, int j, int w, int h)
+box get_region_box(float *x, float *biases, int n, int index, int i, int j, int w, int h, int stride)
 {
     box b;
-    b.x = (i + logistic_activate(x[index + 0])) / w;
-    b.y = (j + logistic_activate(x[index + 1])) / h;
-    b.w = exp(x[index + 2]) * biases[2*n]   / w;
-    b.h = exp(x[index + 3]) * biases[2*n+1] / h;
+    b.x = (i + x[index + 0*stride]) / w;
+    b.y = (j + x[index + 1*stride]) / h;
+    b.w = exp(x[index + 2*stride]) * biases[2*n]   / w;
+    b.h = exp(x[index + 3*stride]) * biases[2*n+1] / h;
     return b;
 }
 
-float delta_region_box(box truth, float *x, float *biases, int n, int index, int i, int j, int w, int h, float *delta, float scale)
+float delta_region_box(box truth, float *x, float *biases, int n, int index, int i, int j, int w, int h, float *delta, float scale, int stride)
 {
-    box pred = get_region_box(x, biases, n, index, i, j, w, h);
+    box pred = get_region_box(x, biases, n, index, i, j, w, h, stride);
     float iou = box_iou(pred, truth);
 
     float tx = (truth.x*w - i);
@@ -88,34 +93,47 @@ float delta_region_box(box truth, float *x, float *biases, int n, int index, int
     float tw = log(truth.w*w / biases[2*n]);
     float th = log(truth.h*h / biases[2*n + 1]);
 
-    delta[index + 0] = scale * (tx - logistic_activate(x[index + 0])) * logistic_gradient(logistic_activate(x[index + 0]));
-    delta[index + 1] = scale * (ty - logistic_activate(x[index + 1])) * logistic_gradient(logistic_activate(x[index + 1]));
-    delta[index + 2] = scale * (tw - x[index + 2]);
-    delta[index + 3] = scale * (th - x[index + 3]);
+    delta[index + 0*stride] = scale * (tx - x[index + 0*stride]);
+    delta[index + 1*stride] = scale * (ty - x[index + 1*stride]);
+    delta[index + 2*stride] = scale * (tw - x[index + 2*stride]);
+    delta[index + 3*stride] = scale * (th - x[index + 3*stride]);
     return iou;
 }
 
-void delta_region_class(float *output, float *delta, int index, int class, int classes, tree *hier, float scale, float *avg_cat)
+void delta_region_mask(float *truth, float *x, int n, int index, float *delta, int stride, int scale)
+{
+    int i;
+    for(i = 0; i < n; ++i){
+        delta[index + i*stride] = scale*(truth[i] - x[index + i*stride]);
+    }
+}
+
+
+void delta_region_class(float *output, float *delta, int index, int class, int classes, tree *hier, float scale, int stride, float *avg_cat, int tag)
 {
     int i, n;
     if(hier){
         float pred = 1;
         while(class >= 0){
-            pred *= output[index + class];
+            pred *= output[index + stride*class];
             int g = hier->group[class];
             int offset = hier->group_offset[g];
             for(i = 0; i < hier->group_size[g]; ++i){
-                delta[index + offset + i] = scale * (0 - output[index + offset + i]);
+                delta[index + stride*(offset + i)] = scale * (0 - output[index + stride*(offset + i)]);
             }
-            delta[index + class] = scale * (1 - output[index + class]);
+            delta[index + stride*class] = scale * (1 - output[index + stride*class]);
 
             class = hier->parent[class];
         }
         *avg_cat += pred;
     } else {
+        if (delta[index] && tag){
+            delta[index + stride*class] = scale * (1 - output[index + stride*class]);
+            return;
+        }
         for(n = 0; n < classes; ++n){
-            delta[index + n] = scale * (((n == class)?1 : 0) - output[index + n]);
-            if(n == class) *avg_cat += output[index + n];
+            delta[index + stride*n] = scale * (((n == class)?1 : 0) - output[index + stride*n]);
+            if(n == class) *avg_cat += output[index + stride*n];
         }
     }
 }
@@ -130,42 +148,45 @@ float tisnan(float x)
     return (x != x);
 }
 
-void softmax_tree(float *input, int batch, int inputs, float temp, tree *hierarchy, float *output);
-void forward_region_layer(const layer l, network_state state)
+int entry_index(layer l, int batch, int location, int entry)
+{
+    int n =   location / (l.w*l.h);
+    int loc = location % (l.w*l.h);
+    return batch*l.outputs + n*l.w*l.h*(l.coords+l.classes+1) + entry*l.w*l.h + loc;
+}
+
+void forward_region_layer(const layer l, network net)
 {
     int i,j,b,t,n;
-    int size = l.coords + l.classes + 1;
-    memcpy(l.output, state.input, l.outputs*l.batch*sizeof(float));
+    memcpy(l.output, net.input, l.outputs*l.batch*sizeof(float));
+
 #ifndef GPU
-    flatten(l.output, l.w*l.h, size*l.n, l.batch, 1);
-#endif
     for (b = 0; b < l.batch; ++b){
-        for(i = 0; i < l.h*l.w*l.n; ++i){
-            int index = size*i + b*l.outputs;
-            l.output[index + 4] = logistic_activate(l.output[index + 4]);
+        for(n = 0; n < l.n; ++n){
+            int index = entry_index(l, b, n*l.w*l.h, 0);
+            activate_array(l.output + index, 2*l.w*l.h, LOGISTIC);
+            index = entry_index(l, b, n*l.w*l.h, l.coords);
+            if(!l.background) activate_array(l.output + index,   l.w*l.h, LOGISTIC);
+            index = entry_index(l, b, n*l.w*l.h, l.coords + 1);
+            if(!l.softmax && !l.softmax_tree) activate_array(l.output + index, l.classes*l.w*l.h, LOGISTIC);
         }
     }
-
-
-#ifndef GPU
     if (l.softmax_tree){
-        for (b = 0; b < l.batch; ++b){
-            for(i = 0; i < l.h*l.w*l.n; ++i){
-                int index = size*i + b*l.outputs;
-                softmax_tree(l.output + index + 5, 1, 0, 1, l.softmax_tree, l.output + index + 5);
-            }
+        int i;
+        int count = l.coords + 1;
+        for (i = 0; i < l.softmax_tree->groups; ++i) {
+            int group_size = l.softmax_tree->group_size[i];
+            softmax_cpu(net.input + count, group_size, l.batch, l.inputs, l.n*l.w*l.h, 1, l.n*l.w*l.h, l.temperature, l.output + count);
+            count += group_size;
         }
     } else if (l.softmax){
-        for (b = 0; b < l.batch; ++b){
-            for(i = 0; i < l.h*l.w*l.n; ++i){
-                int index = size*i + b*l.outputs;
-                softmax(l.output + index + 5, l.classes, 1, l.output + index + 5);
-            }
-        }
+        int index = entry_index(l, 0, 0, l.coords + !l.background);
+        softmax_cpu(net.input + index, l.classes + l.background, l.batch*l.n, l.inputs/l.n, l.w*l.h, 1, l.w*l.h, 1, l.output + index);
     }
 #endif
-    if(!state.train) return;
+
     memset(l.delta, 0, l.outputs * l.batch * sizeof(float));
+    if(!net.train) return;
     float avg_iou = 0;
     float recall = 0;
     float avg_cat = 0;
@@ -178,26 +199,29 @@ void forward_region_layer(const layer l, network_state state)
         if(l.softmax_tree){
             int onlyclass = 0;
             for(t = 0; t < 30; ++t){
-                box truth = float_to_box(state.truth + t*5 + b*l.truths);
+                box truth = float_to_box(net.truth + t*(l.coords + 1) + b*l.truths, 1);
                 if(!truth.x) break;
-                int class = state.truth[t*5 + b*l.truths + 4];
+                int class = net.truth[t*(l.coords + 1) + b*l.truths + l.coords];
                 float maxp = 0;
                 int maxi = 0;
                 if(truth.x > 100000 && truth.y > 100000){
                     for(n = 0; n < l.n*l.w*l.h; ++n){
-                        int index = size*n + b*l.outputs + 5;
-                        float scale =  l.output[index-1];
-                        l.delta[index - 1] = l.noobject_scale * ((0 - l.output[index - 1]) * logistic_gradient(l.output[index - 1]));
-                        float p = scale*get_hierarchy_probability(l.output + index, l.softmax_tree, class);
+                        int class_index = entry_index(l, b, n, l.coords + 1);
+                        int obj_index = entry_index(l, b, n, l.coords);
+                        float scale =  l.output[obj_index];
+                        l.delta[obj_index] = l.noobject_scale * (0 - l.output[obj_index]);
+                        float p = scale*get_hierarchy_probability(l.output + class_index, l.softmax_tree, class, l.w*l.h);
                         if(p > maxp){
                             maxp = p;
                             maxi = n;
                         }
                     }
-                    int index = size*maxi + b*l.outputs + 5;
-                    delta_region_class(l.output, l.delta, index, class, l.classes, l.softmax_tree, l.class_scale, &avg_cat);
-                    if(l.output[index - 1] < .3) l.delta[index - 1] = l.object_scale * ((.3 - l.output[index - 1]) * logistic_gradient(l.output[index - 1]));
-                    else  l.delta[index - 1] = 0;
+                    int class_index = entry_index(l, b, maxi, l.coords + 1);
+                    int obj_index = entry_index(l, b, maxi, l.coords);
+                    delta_region_class(l.output, l.delta, class_index, class, l.classes, l.softmax_tree, l.class_scale, l.w*l.h, &avg_cat, !l.softmax);
+                    if(l.output[obj_index] < .3) l.delta[obj_index] = l.object_scale * (.3 - l.output[obj_index]);
+                    else  l.delta[obj_index] = 0;
+                    l.delta[obj_index] = 0;
                     ++class_count;
                     onlyclass = 1;
                     break;
@@ -208,190 +232,276 @@ void forward_region_layer(const layer l, network_state state)
         for (j = 0; j < l.h; ++j) {
             for (i = 0; i < l.w; ++i) {
                 for (n = 0; n < l.n; ++n) {
-                    int index = size*(j*l.w*l.n + i*l.n + n) + b*l.outputs;
-                    box pred = get_region_box(l.output, l.biases, n, index, i, j, l.w, l.h);
+                    int box_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 0);
+                    box pred = get_region_box(l.output, l.biases, n, box_index, i, j, l.w, l.h, l.w*l.h);
                     float best_iou = 0;
                     for(t = 0; t < 30; ++t){
-                        box truth = float_to_box(state.truth + t*5 + b*l.truths);
+                        box truth = float_to_box(net.truth + t*(l.coords + 1) + b*l.truths, 1);
                         if(!truth.x) break;
                         float iou = box_iou(pred, truth);
                         if (iou > best_iou) {
                             best_iou = iou;
                         }
                     }
-                    avg_anyobj += l.output[index + 4];
-                    l.delta[index + 4] = l.noobject_scale * ((0 - l.output[index + 4]) * logistic_gradient(l.output[index + 4]));
+                    int obj_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, l.coords);
+                    avg_anyobj += l.output[obj_index];
+                    l.delta[obj_index] = l.noobject_scale * (0 - l.output[obj_index]);
+                    if(l.background) l.delta[obj_index] = l.noobject_scale * (1 - l.output[obj_index]);
                     if (best_iou > l.thresh) {
-                        l.delta[index + 4] = 0;
+                        l.delta[obj_index] = 0;
                     }
 
-                    if(*(state.net.seen) < 12800){
+                    if(*(net.seen) < 12800){
                         box truth = {0};
                         truth.x = (i + .5)/l.w;
                         truth.y = (j + .5)/l.h;
                         truth.w = l.biases[2*n]/l.w;
                         truth.h = l.biases[2*n+1]/l.h;
-                        delta_region_box(truth, l.output, l.biases, n, index, i, j, l.w, l.h, l.delta, .01);
+                        delta_region_box(truth, l.output, l.biases, n, box_index, i, j, l.w, l.h, l.delta, .01, l.w*l.h);
                     }
                 }
             }
         }
         for(t = 0; t < 30; ++t){
-            box truth = float_to_box(state.truth + t*5 + b*l.truths);
+            box truth = float_to_box(net.truth + t*(l.coords + 1) + b*l.truths, 1);
 
             if(!truth.x) break;
             float best_iou = 0;
-            int best_index = 0;
             int best_n = 0;
             i = (truth.x * l.w);
             j = (truth.y * l.h);
-            //printf("%d %f %d %f\n", i, truth.x*l.w, j, truth.y*l.h);
             box truth_shift = truth;
             truth_shift.x = 0;
             truth_shift.y = 0;
-            //printf("index %d %d\n",i, j);
             for(n = 0; n < l.n; ++n){
-                int index = size*(j*l.w*l.n + i*l.n + n) + b*l.outputs;
-                box pred = get_region_box(l.output, l.biases, n, index, i, j, l.w, l.h);
+                int box_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 0);
+                box pred = get_region_box(l.output, l.biases, n, box_index, i, j, l.w, l.h, l.w*l.h);
                 if(l.bias_match){
                     pred.w = l.biases[2*n]/l.w;
                     pred.h = l.biases[2*n+1]/l.h;
                 }
-                //printf("pred: (%f, %f) %f x %f\n", pred.x, pred.y, pred.w, pred.h);
                 pred.x = 0;
                 pred.y = 0;
                 float iou = box_iou(pred, truth_shift);
                 if (iou > best_iou){
-                    best_index = index;
                     best_iou = iou;
                     best_n = n;
                 }
             }
-            //printf("%d %f (%f, %f) %f x %f\n", best_n, best_iou, truth.x, truth.y, truth.w, truth.h);
 
-            float iou = delta_region_box(truth, l.output, l.biases, best_n, best_index, i, j, l.w, l.h, l.delta, l.coord_scale);
+            int box_index = entry_index(l, b, best_n*l.w*l.h + j*l.w + i, 0);
+            float iou = delta_region_box(truth, l.output, l.biases, best_n, box_index, i, j, l.w, l.h, l.delta, l.coord_scale *  (2 - truth.w*truth.h), l.w*l.h);
+            if(l.coords > 4){
+                int mask_index = entry_index(l, b, best_n*l.w*l.h + j*l.w + i, 4);
+                delta_region_mask(net.truth + t*(l.coords + 1) + b*l.truths + 5, l.output, l.coords - 4, mask_index, l.delta, l.w*l.h, l.mask_scale);
+            }
             if(iou > .5) recall += 1;
             avg_iou += iou;
 
-            //l.delta[best_index + 4] = iou - l.output[best_index + 4];
-            avg_obj += l.output[best_index + 4];
-            l.delta[best_index + 4] = l.object_scale * (1 - l.output[best_index + 4]) * logistic_gradient(l.output[best_index + 4]);
+            int obj_index = entry_index(l, b, best_n*l.w*l.h + j*l.w + i, l.coords);
+            avg_obj += l.output[obj_index];
+            l.delta[obj_index] = l.object_scale * (1 - l.output[obj_index]);
             if (l.rescore) {
-                l.delta[best_index + 4] = l.object_scale * (iou - l.output[best_index + 4]) * logistic_gradient(l.output[best_index + 4]);
+                l.delta[obj_index] = l.object_scale * (iou - l.output[obj_index]);
+            }
+            if(l.background){
+                l.delta[obj_index] = l.object_scale * (0 - l.output[obj_index]);
             }
 
-
-            int class = state.truth[t*5 + b*l.truths + 4];
+            int class = net.truth[t*(l.coords + 1) + b*l.truths + l.coords];
             if (l.map) class = l.map[class];
-            delta_region_class(l.output, l.delta, best_index + 5, class, l.classes, l.softmax_tree, l.class_scale, &avg_cat);
+            int class_index = entry_index(l, b, best_n*l.w*l.h + j*l.w + i, l.coords + 1);
+            delta_region_class(l.output, l.delta, class_index, class, l.classes, l.softmax_tree, l.class_scale, l.w*l.h, &avg_cat, !l.softmax);
             ++count;
             ++class_count;
         }
     }
-    //printf("\n");
-#ifndef GPU
-    flatten(l.delta, l.w*l.h, size*l.n, l.batch, 0);
-#endif
     *(l.cost) = pow(mag_array(l.delta, l.outputs * l.batch), 2);
     printf("Region Avg IOU: %f, Class: %f, Obj: %f, No Obj: %f, Avg Recall: %f,  count: %d\n", avg_iou/count, avg_cat/class_count, avg_obj/count, avg_anyobj/(l.w*l.h*l.n*l.batch), recall/count, count);
 }
 
-void backward_region_layer(const layer l, network_state state)
+void backward_region_layer(const layer l, network net)
 {
-    axpy_cpu(l.batch*l.inputs, 1, l.delta, 1, state.delta, 1);
+    /*
+       int b;
+       int size = l.coords + l.classes + 1;
+       for (b = 0; b < l.batch*l.n; ++b){
+       int index = (b*size + 4)*l.w*l.h;
+       gradient_array(l.output + index, l.w*l.h, LOGISTIC, l.delta + index);
+       }
+       axpy_cpu(l.batch*l.inputs, 1, l.delta, 1, net.delta, 1);
+     */
+}
+
+void correct_region_boxes(detection *dets, int n, int w, int h, int netw, int neth, int relative)
+{
+    int i;
+    int new_w=0;
+    int new_h=0;
+    if (((float)netw/w) < ((float)neth/h)) {
+        new_w = netw;
+        new_h = (h * netw)/w;
+    } else {
+        new_h = neth;
+        new_w = (w * neth)/h;
+    }
+    for (i = 0; i < n; ++i){
+        box b = dets[i].bbox;
+        b.x =  (b.x - (netw - new_w)/2./netw) / ((float)new_w/netw); 
+        b.y =  (b.y - (neth - new_h)/2./neth) / ((float)new_h/neth); 
+        b.w *= (float)netw/new_w;
+        b.h *= (float)neth/new_h;
+        if(!relative){
+            b.x *= w;
+            b.w *= w;
+            b.y *= h;
+            b.h *= h;
+        }
+        dets[i].bbox = b;
+    }
 }
 
-void get_region_boxes(layer l, int w, int h, float thresh, float **probs, box *boxes, int only_objectness, int *map, float tree_thresh)
+void get_region_detections(layer l, int w, int h, int netw, int neth, float thresh, int *map, float tree_thresh, int relative, detection *dets)
 {
-    int i,j,n;
+    int i,j,n,z;
     float *predictions = l.output;
+    if (l.batch == 2) {
+        float *flip = l.output + l.outputs;
+        for (j = 0; j < l.h; ++j) {
+            for (i = 0; i < l.w/2; ++i) {
+                for (n = 0; n < l.n; ++n) {
+                    for(z = 0; z < l.classes + l.coords + 1; ++z){
+                        int i1 = z*l.w*l.h*l.n + n*l.w*l.h + j*l.w + i;
+                        int i2 = z*l.w*l.h*l.n + n*l.w*l.h + j*l.w + (l.w - i - 1);
+                        float swap = flip[i1];
+                        flip[i1] = flip[i2];
+                        flip[i2] = swap;
+                        if(z == 0){
+                            flip[i1] = -flip[i1];
+                            flip[i2] = -flip[i2];
+                        }
+                    }
+                }
+            }
+        }
+        for(i = 0; i < l.outputs; ++i){
+            l.output[i] = (l.output[i] + flip[i])/2.;
+        }
+    }
     for (i = 0; i < l.w*l.h; ++i){
         int row = i / l.w;
         int col = i % l.w;
         for(n = 0; n < l.n; ++n){
-            int index = i*l.n + n;
-            int p_index = index * (l.classes + 5) + 4;
-            float scale = predictions[p_index];
-            int box_index = index * (l.classes + 5);
-            boxes[index] = get_region_box(predictions, l.biases, n, box_index, col, row, l.w, l.h);
-            boxes[index].x *= w;
-            boxes[index].y *= h;
-            boxes[index].w *= w;
-            boxes[index].h *= h;
-
-            int class_index = index * (l.classes + 5) + 5;
+            int index = n*l.w*l.h + i;
+            for(j = 0; j < l.classes; ++j){
+                dets[index].prob[j] = 0;
+            }
+            int obj_index  = entry_index(l, 0, n*l.w*l.h + i, l.coords);
+            int box_index  = entry_index(l, 0, n*l.w*l.h + i, 0);
+            int mask_index = entry_index(l, 0, n*l.w*l.h + i, 4);
+            float scale = l.background ? 1 : predictions[obj_index];
+            dets[index].bbox = get_region_box(predictions, l.biases, n, box_index, col, row, l.w, l.h, l.w*l.h);
+            dets[index].objectness = scale > thresh ? scale : 0;
+            if(dets[index].mask){
+                for(j = 0; j < l.coords - 4; ++j){
+                    dets[index].mask[j] = l.output[mask_index + j*l.w*l.h];
+                }
+            }
+
+            int class_index = entry_index(l, 0, n*l.w*l.h + i, l.coords + !l.background);
             if(l.softmax_tree){
 
-                hierarchy_predictions(predictions + class_index, l.classes, l.softmax_tree, 0);
+                hierarchy_predictions(predictions + class_index, l.classes, l.softmax_tree, 0, l.w*l.h);
                 if(map){
                     for(j = 0; j < 200; ++j){
-                        float prob = scale*predictions[class_index+map[j]];
-                        probs[index][j] = (prob > thresh) ? prob : 0;
+                        int class_index = entry_index(l, 0, n*l.w*l.h + i, l.coords + 1 + map[j]);
+                        float prob = scale*predictions[class_index];
+                        dets[index].prob[j] = (prob > thresh) ? prob : 0;
                     }
                 } else {
-                    int j =  hierarchy_top_prediction(predictions + class_index, l.softmax_tree, tree_thresh);
-                    probs[index][j] = (scale > thresh) ? scale : 0;
-                    probs[index][l.classes] = scale;
+                    int j =  hierarchy_top_prediction(predictions + class_index, l.softmax_tree, tree_thresh, l.w*l.h);
+                    dets[index].prob[j] = (scale > thresh) ? scale : 0;
                 }
             } else {
-                for(j = 0; j < l.classes; ++j){
-                    float prob = scale*predictions[class_index+j];
-                    probs[index][j] = (prob > thresh) ? prob : 0;
+                if(dets[index].objectness){
+                    for(j = 0; j < l.classes; ++j){
+                        int class_index = entry_index(l, 0, n*l.w*l.h + i, l.coords + 1 + j);
+                        float prob = scale*predictions[class_index];
+                        dets[index].prob[j] = (prob > thresh) ? prob : 0;
+                    }
                 }
             }
-            if(only_objectness){
-                probs[index][0] = scale;
-            }
         }
     }
+    correct_region_boxes(dets, l.w*l.h*l.n, w, h, netw, neth, relative);
 }
 
 #ifdef GPU
 
-void forward_region_layer_gpu(const layer l, network_state state)
+void forward_region_layer_gpu(const layer l, network net)  // GPU forward: activations on device; the CPU forward runs only when training
 {
-    /*
-       if(!state.train){
-       copy_ongpu(l.batch*l.inputs, state.input, 1, l.output_gpu, 1);
-       return;
-       }
-     */
-    flatten_ongpu(state.input, l.h*l.w, l.n*(l.coords + l.classes + 1), l.batch, 1, l.output_gpu);
-    if(l.softmax_tree){
-        int i;
-        int count = 5;
-        for (i = 0; i < l.softmax_tree->groups; ++i) {
-            int group_size = l.softmax_tree->group_size[i];
-            softmax_gpu(l.output_gpu+count, group_size, l.classes + 5, l.w*l.h*l.n*l.batch, 1, l.output_gpu + count);
-            count += group_size;
+    copy_gpu(l.batch*l.inputs, net.input_gpu, 1, l.output_gpu, 1);  // presumably seeds output_gpu with the raw input (BLAS-style copy) - TODO confirm
+    int b, n;
+    for (b = 0; b < l.batch; ++b){
+        for(n = 0; n < l.n; ++n){
+            int index = entry_index(l, b, n*l.w*l.h, 0);
+            activate_array_gpu(l.output_gpu + index, 2*l.w*l.h, LOGISTIC);  // logistic over 2*w*h values starting at entry 0
+            if(l.coords > 4){  // only when there are entries beyond the first four
+                index = entry_index(l, b, n*l.w*l.h, 4);
+                activate_array_gpu(l.output_gpu + index, (l.coords - 4)*l.w*l.h, LOGISTIC);
+            }
+            index = entry_index(l, b, n*l.w*l.h, l.coords);
+            if(!l.background) activate_array_gpu(l.output_gpu + index,   l.w*l.h, LOGISTIC);  // entry at offset l.coords; skipped when l.background is set
+            index = entry_index(l, b, n*l.w*l.h, l.coords + 1);
+            if(!l.softmax && !l.softmax_tree) activate_array_gpu(l.output_gpu + index, l.classes*l.w*l.h, LOGISTIC);  // per-class logistic only when neither softmax variant is configured
         }
-    }else if (l.softmax){
-        softmax_gpu(l.output_gpu+5, l.classes, l.classes + 5, l.w*l.h*l.n*l.batch, 1, l.output_gpu + 5);
     }
-
-    float *in_cpu = calloc(l.batch*l.inputs, sizeof(float));
-    float *truth_cpu = 0;
-    if(state.truth){
-        int num_truth = l.batch*l.truths;
-        truth_cpu = calloc(num_truth, sizeof(float));
-        cuda_pull_array(state.truth, truth_cpu, num_truth);
+    if (l.softmax_tree){  // both softmax variants read net.input_gpu and write l.output_gpu
+        int index = entry_index(l, 0, 0, l.coords + 1);
+        softmax_tree(net.input_gpu + index, l.w*l.h, l.batch*l.n, l.inputs/l.n, 1, l.output_gpu + index, *l.softmax_tree);
+    } else if (l.softmax) {
+        int index = entry_index(l, 0, 0, l.coords + !l.background);
+        softmax_gpu(net.input_gpu + index, l.classes + l.background, l.batch*l.n, l.inputs/l.n, l.w*l.h, 1, l.w*l.h, 1, l.output_gpu + index);
     }
-    cuda_pull_array(l.output_gpu, in_cpu, l.batch*l.inputs);
-    network_state cpu_state = state;
-    cpu_state.train = state.train;
-    cpu_state.truth = truth_cpu;
-    cpu_state.input = in_cpu;
-    forward_region_layer(l, cpu_state);
+    if(!net.train || l.onlyforward){  // not training, or forward-only layer: pull the result to the host and stop
+        cuda_pull_array(l.output_gpu, l.output, l.batch*l.outputs);
+        return;
+    }
+
+    cuda_pull_array(l.output_gpu, net.input, l.batch*l.inputs);  // the device output is pulled into net.input, which the CPU forward below receives via net
+    forward_region_layer(l, net);
     //cuda_push_array(l.output_gpu, l.output, l.batch*l.outputs);
-    free(cpu_state.input);
-    if(!state.train) return;
+    if(!net.train) return;  // NOTE(review): cannot fire here; the !net.train case already returned above
     cuda_push_array(l.delta_gpu, l.delta, l.batch*l.outputs);
-    if(cpu_state.truth) free(cpu_state.truth);
 }
 
-void backward_region_layer_gpu(layer l, network_state state)
+void backward_region_layer_gpu(const layer l, network net)  // GPU backward: logistic gradients on l.delta_gpu, then add it into net.delta_gpu
 {
-    flatten_ongpu(l.delta_gpu, l.h*l.w, l.n*(l.coords + l.classes + 1), l.batch, 0, state.delta);
+    int b, n;
+    for (b = 0; b < l.batch; ++b){
+        for(n = 0; n < l.n; ++n){
+            int index = entry_index(l, b, n*l.w*l.h, 0);
+            gradient_array_gpu(l.output_gpu + index, 2*l.w*l.h, LOGISTIC, l.delta_gpu + index);  // same 2*w*h range the forward pass activates
+            if(l.coords > 4){  // entries beyond the first four, when present
+                index = entry_index(l, b, n*l.w*l.h, 4);
+                gradient_array_gpu(l.output_gpu + index, (l.coords - 4)*l.w*l.h, LOGISTIC, l.delta_gpu + index);
+            }
+            index = entry_index(l, b, n*l.w*l.h, l.coords);
+            if(!l.background) gradient_array_gpu(l.output_gpu + index,   l.w*l.h, LOGISTIC, l.delta_gpu + index);  // entry at offset l.coords; skipped when l.background is set
+        }
+    }
+    axpy_gpu(l.batch*l.inputs, 1, l.delta_gpu, 1, net.delta_gpu, 1);  // presumably net.delta_gpu += 1 * l.delta_gpu (axpy with alpha 1) - TODO confirm
 }
 #endif
 
+void zero_objectness(layer l)
+{
+    // Zero the l.output entry at offset l.coords for every n < l.n and every location < l.w*l.h (batch argument fixed at 0).
+    int cell, anchor, cells = l.w*l.h;
+    for (anchor = 0; anchor < l.n; ++anchor){
+        for (cell = 0; cell < cells; ++cell){
+            l.output[entry_index(l, 0, anchor*cells + cell, l.coords)] = 0;
+        }
+    }
+}
+
diff --git a/image.darknet/inst/include/darknet/src/region_layer.h b/image.darknet/inst/include/darknet/src/region_layer.h
index 9a3b7cd..9f12fd1 100644
--- a/image.darknet/inst/include/darknet/src/region_layer.h
+++ b/image.darknet/inst/include/darknet/src/region_layer.h
@@ -1,18 +1,18 @@
 #ifndef REGION_LAYER_H
 #define REGION_LAYER_H
 
+#include "darknet.h"
 #include "layer.h"
 #include "network.h"
 
-layer make_region_layer(int batch, int h, int w, int n, int classes, int coords);
-void forward_region_layer(const layer l, network_state state);
-void backward_region_layer(const layer l, network_state state);
-void get_region_boxes(layer l, int w, int h, float thresh, float **probs, box *boxes, int only_objectness, int *map, float tree_thresh);
+layer make_region_layer(int batch, int w, int h, int n, int classes, int coords);
+void forward_region_layer(const layer l, network net);
+void backward_region_layer(const layer l, network net);
 void resize_region_layer(layer *l, int w, int h);
 
 #ifdef GPU
-void forward_region_layer_gpu(const layer l, network_state state);
-void backward_region_layer_gpu(layer l, network_state state);
+void forward_region_layer_gpu(const layer l, network net);  // GPU forward pass of the region layer
+void backward_region_layer_gpu(const layer l, network net);  // const-qualified to match the definition in region_layer.c
 #endif
 
 #endif
diff --git a/image.darknet/inst/include/darknet/src/reorg_layer.c b/image.darknet/inst/include/darknet/src/reorg_layer.c
index 2abca8f..31d6b84 100644
--- a/image.darknet/inst/include/darknet/src/reorg_layer.c
+++ b/image.darknet/inst/include/darknet/src/reorg_layer.c
@@ -1,18 +1,21 @@
 #include "reorg_layer.h"
 #include "cuda.h"
 #include "blas.h"
+
 #include <stdio.h>
 
 
-layer make_reorg_layer(int batch, int w, int h, int c, int stride, int reverse)
+layer make_reorg_layer(int batch, int w, int h, int c, int stride, int reverse, int flatten, int extra)
 {
     layer l = {0};
     l.type = REORG;
     l.batch = batch;
     l.stride = stride;
+    l.extra = extra;
     l.h = h;
     l.w = w;
     l.c = c;
+    l.flatten = flatten;
     if(reverse){
         l.out_w = w*stride;
         l.out_h = h*stride;
@@ -23,10 +26,20 @@ layer make_reorg_layer(int batch, int w, int h, int c, int stride, int reverse)
         l.out_c = c*(stride*stride);
     }
     l.reverse = reverse;
-    fprintf(stderr, "reorg              /%2d  %4d x%4d x%4d   ->  %4d x%4d x%4d\n",  stride, w, h, c, l.out_w, l.out_h, l.out_c);
+
     l.outputs = l.out_h * l.out_w * l.out_c;
     l.inputs = h*w*c;
-    int output_size = l.out_h * l.out_w * l.out_c * batch;
+    if(l.extra){
+        l.out_w = l.out_h = l.out_c = 0;
+        l.outputs = l.inputs + l.extra;
+    }
+
+    if(extra){
+        fprintf(stderr, "reorg              %4d   ->  %4d\n",  l.inputs, l.outputs);
+    } else {
+        fprintf(stderr, "reorg              /%2d  %4d x%4d x%4d   ->  %4d x%4d x%4d\n",  stride, w, h, c, l.out_w, l.out_h, l.out_c);
+    }
+    int output_size = l.outputs * batch;
     l.output =  calloc(output_size, sizeof(float));
     l.delta =   calloc(output_size, sizeof(float));
 
@@ -75,40 +88,86 @@ void resize_reorg_layer(layer *l, int w, int h)
 #endif
 }
 
-void forward_reorg_layer(const layer l, network_state state)
+void forward_reorg_layer(const layer l, network net)  // CPU forward; branch priority is flatten, extra, reverse, plain reorg
 {
-    if(l.reverse){
-        reorg_cpu(state.input, l.w, l.h, l.c, l.batch, l.stride, 1, l.output);
-    }else {
-        reorg_cpu(state.input, l.w, l.h, l.c, l.batch, l.stride, 0, l.output);
+    int i;
+    if(l.flatten){
+        memcpy(l.output, net.input, l.outputs*l.batch*sizeof(float));  // copy first; flatten() below is then applied to l.output itself
+        if(l.reverse){
+            flatten(l.output, l.w*l.h, l.c, l.batch, 0);
+        }else{
+            flatten(l.output, l.w*l.h, l.c, l.batch, 1);
+        }
+    } else if (l.extra) {
+        for(i = 0; i < l.batch; ++i){
+            copy_cpu(l.inputs, net.input + i*l.inputs, 1, l.output + i*l.outputs, 1);  // per sample: l.inputs values copied, output stride is l.outputs; the remaining tail is not written here
+        }
+    } else if (l.reverse){
+        reorg_cpu(net.input, l.w, l.h, l.c, l.batch, l.stride, 1, l.output);
+    } else {
+        reorg_cpu(net.input, l.w, l.h, l.c, l.batch, l.stride, 0, l.output);
     }
 }
 
-void backward_reorg_layer(const layer l, network_state state)
+void backward_reorg_layer(const layer l, network net)  // CPU backward: writes l.delta into net.delta, inverting the transform chosen by the forward pass
 {
-    if(l.reverse){
-        reorg_cpu(l.delta, l.w, l.h, l.c, l.batch, l.stride, 0, state.delta);
+    int i;
+    if(l.flatten){
+        memcpy(net.delta, l.delta, l.outputs*l.batch*sizeof(float));  // copy first; flatten() below is then applied to net.delta itself
+        if(l.reverse){
+            flatten(net.delta, l.w*l.h, l.c, l.batch, 1);
+        }else{
+            flatten(net.delta, l.w*l.h, l.c, l.batch, 0);
+        }
+    } else if (l.extra) {  // tested before l.reverse so the branch priority matches forward_reorg_layer and backward_reorg_layer_gpu
+        for(i = 0; i < l.batch; ++i){
+            copy_cpu(l.inputs, l.delta + i*l.outputs, 1, net.delta + i*l.inputs, 1);  // per sample: l.inputs values copied, source stride is l.outputs
+        }
+    } else if(l.reverse){
+        reorg_cpu(l.delta, l.w, l.h, l.c, l.batch, l.stride, 0, net.delta);  // direction flag is the opposite of the forward call
     }else{
-        reorg_cpu(l.delta, l.w, l.h, l.c, l.batch, l.stride, 1, state.delta);
+        reorg_cpu(l.delta, l.w, l.h, l.c, l.batch, l.stride, 1, net.delta);
     }
 }
 
 #ifdef GPU
-void forward_reorg_layer_gpu(layer l, network_state state)
+void forward_reorg_layer_gpu(layer l, network net)  // GPU forward; branch priority is flatten, extra, reverse, plain reorg
 {
-    if(l.reverse){
-        reorg_ongpu(state.input, l.w, l.h, l.c, l.batch, l.stride, 1, l.output_gpu);
+    int i;
+    if(l.flatten){
+        if(l.reverse){
+            flatten_gpu(net.input_gpu, l.w*l.h, l.c, l.batch, 0, l.output_gpu);  // flatten_gpu takes separate source and destination, unlike the CPU path's memcpy + flatten
+        }else{
+            flatten_gpu(net.input_gpu, l.w*l.h, l.c, l.batch, 1, l.output_gpu);
+        }
+    } else if (l.extra) {
+        for(i = 0; i < l.batch; ++i){
+            copy_gpu(l.inputs, net.input_gpu + i*l.inputs, 1, l.output_gpu + i*l.outputs, 1);  // per sample: l.inputs values copied, output stride is l.outputs
+        }
+    } else if (l.reverse) {
+        reorg_gpu(net.input_gpu, l.w, l.h, l.c, l.batch, l.stride, 1, l.output_gpu);
     }else {
-        reorg_ongpu(state.input, l.w, l.h, l.c, l.batch, l.stride, 0, l.output_gpu);
+        reorg_gpu(net.input_gpu, l.w, l.h, l.c, l.batch, l.stride, 0, l.output_gpu);
     }
 }
 
-void backward_reorg_layer_gpu(layer l, network_state state)
+void backward_reorg_layer_gpu(layer l, network net)  // GPU backward: l.delta_gpu -> net.delta_gpu, same branch priority as the forward passes
 {
-    if(l.reverse){
-        reorg_ongpu(l.delta_gpu, l.w, l.h, l.c, l.batch, l.stride, 0, state.delta);
-    }else{
-        reorg_ongpu(l.delta_gpu, l.w, l.h, l.c, l.batch, l.stride, 1, state.delta);
+    if(l.flatten){
+        if(l.reverse){
+            flatten_gpu(l.delta_gpu, l.w*l.h, l.c, l.batch, 1, net.delta_gpu);  // direction flag is the opposite of the forward call
+        }else{
+            flatten_gpu(l.delta_gpu, l.w*l.h, l.c, l.batch, 0, net.delta_gpu);
+        }
+    } else if (l.extra) {
+        int i;
+        for(i = 0; i < l.batch; ++i){
+            copy_gpu(l.inputs, l.delta_gpu + i*l.outputs, 1, net.delta_gpu + i*l.inputs, 1);  // per sample: l.inputs values copied, source stride is l.outputs
+        }
+    } else if(l.reverse){
+        reorg_gpu(l.delta_gpu, l.w, l.h, l.c, l.batch, l.stride, 0, net.delta_gpu);
+    } else {
+        reorg_gpu(l.delta_gpu, l.w, l.h, l.c, l.batch, l.stride, 1, net.delta_gpu);
     }
 }
 #endif
diff --git a/image.darknet/inst/include/darknet/src/reorg_layer.h b/image.darknet/inst/include/darknet/src/reorg_layer.h
index 21c22cd..e6513a5 100644
--- a/image.darknet/inst/include/darknet/src/reorg_layer.h
+++ b/image.darknet/inst/include/darknet/src/reorg_layer.h
@@ -6,14 +6,14 @@
 #include "layer.h"
 #include "network.h"
 
-layer make_reorg_layer(int batch, int h, int w, int c, int stride, int reverse);
+layer make_reorg_layer(int batch, int w, int h, int c, int stride, int reverse, int flatten, int extra);
 void resize_reorg_layer(layer *l, int w, int h);
-void forward_reorg_layer(const layer l, network_state state);
-void backward_reorg_layer(const layer l, network_state state);
+void forward_reorg_layer(const layer l, network net);
+void backward_reorg_layer(const layer l, network net);
 
 #ifdef GPU
-void forward_reorg_layer_gpu(layer l, network_state state);
-void backward_reorg_layer_gpu(layer l, network_state state);
+void forward_reorg_layer_gpu(layer l, network net);
+void backward_reorg_layer_gpu(layer l, network net);
 #endif
 
 #endif
diff --git a/image.darknet/inst/include/darknet/src/rnn.c b/image.darknet/inst/include/darknet/src/rnn.c
deleted file mode 100644
index eca6f55..0000000
--- a/image.darknet/inst/include/darknet/src/rnn.c
+++ /dev/null
@@ -1,492 +0,0 @@
-#include "network.h"
-#include "cost_layer.h"
-#include "utils.h"
-#include "blas.h"
-#include "parser.h"
-
-#ifdef OPENCV
-#include "opencv2/highgui/highgui_c.h"
-#endif
-
-typedef struct {
-    float *x;
-    float *y;
-} float_pair;
-
-int *read_tokenized_data(char *filename, size_t *read)
-{
-    size_t size = 512;
-    size_t count = 0;
-    FILE *fp = fopen(filename, "r");
-    int *d = calloc(size, sizeof(int));
-    int n, one;
-    one = fscanf(fp, "%d", &n);
-    while(one == 1){
-        ++count;
-        if(count > size){
-            size = size*2;
-            d = realloc(d, size*sizeof(int));
-        }
-        d[count-1] = n;
-        one = fscanf(fp, "%d", &n);
-    }
-    fclose(fp);
-    d = realloc(d, count*sizeof(int));
-    *read = count;
-    return d;
-}
-
-char **read_tokens(char *filename, size_t *read)
-{
-    size_t size = 512;
-    size_t count = 0;
-    FILE *fp = fopen(filename, "r");
-    char **d = calloc(size, sizeof(char *));
-    char *line;
-    while((line=fgetl(fp)) != 0){
-        ++count;
-        if(count > size){
-            size = size*2;
-            d = realloc(d, size*sizeof(char *));
-        }
-        d[count-1] = line;
-    }
-    fclose(fp);
-    d = realloc(d, count*sizeof(char *));
-    *read = count;
-    return d;
-}
-
-float_pair get_rnn_token_data(int *tokens, size_t *offsets, int characters, size_t len, int batch, int steps)
-{
-    float *x = calloc(batch * steps * characters, sizeof(float));
-    float *y = calloc(batch * steps * characters, sizeof(float));
-    int i,j;
-    for(i = 0; i < batch; ++i){
-        for(j = 0; j < steps; ++j){
-            int curr = tokens[(offsets[i])%len];
-            int next = tokens[(offsets[i] + 1)%len];
-
-            x[(j*batch + i)*characters + curr] = 1;
-            y[(j*batch + i)*characters + next] = 1;
-
-            offsets[i] = (offsets[i] + 1) % len;
-
-            if(curr >= characters || curr < 0 || next >= characters || next < 0){
-                error("Bad char");
-            }
-        }
-    }
-    float_pair p;
-    p.x = x;
-    p.y = y;
-    return p;
-}
-
-float_pair get_rnn_data(unsigned char *text, size_t *offsets, int characters, size_t len, int batch, int steps)
-{
-    float *x = calloc(batch * steps * characters, sizeof(float));
-    float *y = calloc(batch * steps * characters, sizeof(float));
-    int i,j;
-    for(i = 0; i < batch; ++i){
-        for(j = 0; j < steps; ++j){
-            unsigned char curr = text[(offsets[i])%len];
-            unsigned char next = text[(offsets[i] + 1)%len];
-
-            x[(j*batch + i)*characters + curr] = 1;
-            y[(j*batch + i)*characters + next] = 1;
-
-            offsets[i] = (offsets[i] + 1) % len;
-
-            if(curr > 255 || curr <= 0 || next > 255 || next <= 0){
-                /*text[(index+j+2)%len] = 0;
-                printf("%ld %d %d %d %d\n", index, j, len, (int)text[index+j], (int)text[index+j+1]);
-                printf("%s", text+index);
-                */
-                error("Bad char");
-            }
-        }
-    }
-    float_pair p;
-    p.x = x;
-    p.y = y;
-    return p;
-}
-
-void reset_rnn_state(network net, int b)
-{
-    int i;
-    for (i = 0; i < net.n; ++i) {
-        #ifdef GPU
-        layer l = net.layers[i];
-        if(l.state_gpu){
-            fill_ongpu(l.outputs, 0, l.state_gpu + l.outputs*b, 1);
-        }
-        #endif
-    }
-}
-
-void train_char_rnn(char *cfgfile, char *weightfile, char *filename, int clear, int tokenized)
-{
-    srand(time(0));
-    unsigned char *text = 0;
-    int *tokens = 0;
-    size_t size;
-    if(tokenized){
-        tokens = read_tokenized_data(filename, &size);
-    } else {
-        FILE *fp = fopen(filename, "rb");
-
-        fseek(fp, 0, SEEK_END); 
-        size = ftell(fp);
-        fseek(fp, 0, SEEK_SET); 
-
-        text = calloc(size+1, sizeof(char));
-        fread(text, 1, size, fp);
-        fclose(fp);
-    }
-
-    char *backup_directory = "/home/pjreddie/backup/";
-    char *base = basecfg(cfgfile);
-    fprintf(stderr, "%s\n", base);
-    float avg_loss = -1;
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-
-    int inputs = get_network_input_size(net);
-    fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
-    int batch = net.batch;
-    int steps = net.time_steps;
-    if(clear) *net.seen = 0;
-    int i = (*net.seen)/net.batch;
-
-    int streams = batch/steps;
-    size_t *offsets = calloc(streams, sizeof(size_t));
-    int j;
-    for(j = 0; j < streams; ++j){
-        offsets[j] = rand_size_t()%size;
-    }
-
-    clock_t time;
-    while(get_current_batch(net) < net.max_batches){
-        i += 1;
-        time=clock();
-        float_pair p;
-        if(tokenized){
-            p = get_rnn_token_data(tokens, offsets, inputs, size, streams, steps);
-        }else{
-            p = get_rnn_data(text, offsets, inputs, size, streams, steps);
-        }
-
-        float loss = train_network_datum(net, p.x, p.y) / (batch);
-        free(p.x);
-        free(p.y);
-        if (avg_loss < 0) avg_loss = loss;
-        avg_loss = avg_loss*.9 + loss*.1;
-
-        int chars = get_current_batch(net)*batch;
-        fprintf(stderr, "%d: %f, %f avg, %f rate, %lf seconds, %f epochs\n", i, loss, avg_loss, get_current_rate(net), sec(clock()-time), (float) chars/size);
-
-        for(j = 0; j < streams; ++j){
-            //printf("%d\n", j);
-            if(rand()%10 == 0){
-                //fprintf(stderr, "Reset\n");
-                offsets[j] = rand_size_t()%size;
-                reset_rnn_state(net, j);
-            }
-        }
-
-        if(i%1000==0){
-            char buff[256];
-            sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i);
-            save_weights(net, buff);
-        }
-        if(i%10==0){
-            char buff[256];
-            sprintf(buff, "%s/%s.backup", backup_directory, base);
-            save_weights(net, buff);
-        }
-    }
-    char buff[256];
-    sprintf(buff, "%s/%s_final.weights", backup_directory, base);
-    save_weights(net, buff);
-}
-
-void print_symbol(int n, char **tokens){
-    if(tokens){
-        printf("%s ", tokens[n]);
-    } else {
-        printf("%c", n);
-    }
-}
-
-void test_char_rnn(char *cfgfile, char *weightfile, int num, char *seed, float temp, int rseed, char *token_file)
-{
-    char **tokens = 0;
-    if(token_file){
-        size_t n;
-        tokens = read_tokens(token_file, &n);
-    }
-
-    srand(rseed);
-    char *base = basecfg(cfgfile);
-    fprintf(stderr, "%s\n", base);
-
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    int inputs = get_network_input_size(net);
-
-    int i, j;
-    for(i = 0; i < net.n; ++i) net.layers[i].temperature = temp;
-    int c = 0;
-    int len = strlen(seed);
-    float *input = calloc(inputs, sizeof(float));
-
-    /*
-       fill_cpu(inputs, 0, input, 1);
-       for(i = 0; i < 10; ++i){
-       network_predict(net, input);
-       }
-       fill_cpu(inputs, 0, input, 1);
-     */
-
-    for(i = 0; i < len-1; ++i){
-        c = seed[i];
-        input[c] = 1;
-        network_predict(net, input);
-        input[c] = 0;
-        print_symbol(c, tokens);
-    }
-    if(len) c = seed[len-1];
-    print_symbol(c, tokens);
-    for(i = 0; i < num; ++i){
-        input[c] = 1;
-        float *out = network_predict(net, input);
-        input[c] = 0;
-        for(j = 32; j < 127; ++j){
-            //printf("%d %c %f\n",j, j, out[j]);
-        }
-        for(j = 0; j < inputs; ++j){
-            if (out[j] < .0001) out[j] = 0;
-        }
-        c = sample_array(out, inputs);
-        print_symbol(c, tokens);
-    }
-    printf("\n");
-}
-
-void test_tactic_rnn(char *cfgfile, char *weightfile, int num, float temp, int rseed, char *token_file)
-{
-    char **tokens = 0;
-    if(token_file){
-        size_t n;
-        tokens = read_tokens(token_file, &n);
-    }
-
-    srand(rseed);
-    char *base = basecfg(cfgfile);
-    fprintf(stderr, "%s\n", base);
-
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    int inputs = get_network_input_size(net);
-
-    int i, j;
-    for(i = 0; i < net.n; ++i) net.layers[i].temperature = temp;
-    int c = 0;
-    float *input = calloc(inputs, sizeof(float));
-    float *out = 0;
-
-    while((c = getc(stdin)) != EOF){
-        input[c] = 1;
-        out = network_predict(net, input);
-        input[c] = 0;
-    }
-    for(i = 0; i < num; ++i){
-        for(j = 0; j < inputs; ++j){
-            if (out[j] < .0001) out[j] = 0;
-        }
-        int next = sample_array(out, inputs);
-        if(c == '.' && next == '\n') break;
-        c = next;
-        print_symbol(c, tokens);
-
-        input[c] = 1;
-        out = network_predict(net, input);
-        input[c] = 0;
-    }
-    printf("\n");
-}
-
-void valid_tactic_rnn(char *cfgfile, char *weightfile, char *seed)
-{
-    char *base = basecfg(cfgfile);
-    fprintf(stderr, "%s\n", base);
-
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    int inputs = get_network_input_size(net);
-
-    int count = 0;
-    int words = 1;
-    int c;
-    int len = strlen(seed);
-    float *input = calloc(inputs, sizeof(float));
-    int i;
-    for(i = 0; i < len; ++i){
-        c = seed[i];
-        input[(int)c] = 1;
-        network_predict(net, input);
-        input[(int)c] = 0;
-    }
-    float sum = 0;
-    c = getc(stdin);
-    float log2 = log(2);
-    int in = 0;
-    while(c != EOF){
-        int next = getc(stdin);
-        if(next == EOF) break;
-        if(next < 0 || next >= 255) error("Out of range character");
-
-        input[c] = 1;
-        float *out = network_predict(net, input);
-        input[c] = 0;
-
-        if(c == '.' && next == '\n') in = 0;
-        if(!in) {
-            if(c == '>' && next == '>'){
-                in = 1;
-                ++words;
-            }
-            c = next;
-            continue;
-        }
-        ++count;
-        sum += log(out[next])/log2;
-        c = next;
-        printf("%d %d Perplexity: %4.4f    Word Perplexity: %4.4f\n", count, words, pow(2, -sum/count), pow(2, -sum/words));
-    }
-}
-
-void valid_char_rnn(char *cfgfile, char *weightfile, char *seed)
-{
-    char *base = basecfg(cfgfile);
-    fprintf(stderr, "%s\n", base);
-
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    int inputs = get_network_input_size(net);
-
-    int count = 0;
-    int words = 1;
-    int c;
-    int len = strlen(seed);
-    float *input = calloc(inputs, sizeof(float));
-    int i;
-    for(i = 0; i < len; ++i){
-        c = seed[i];
-        input[(int)c] = 1;
-        network_predict(net, input);
-        input[(int)c] = 0;
-    }
-    float sum = 0;
-    c = getc(stdin);
-    float log2 = log(2);
-    while(c != EOF){
-        int next = getc(stdin);
-        if(next == EOF) break;
-        if(next < 0 || next >= 255) error("Out of range character");
-        ++count;
-        if(next == ' ' || next == '\n' || next == '\t') ++words;
-        input[c] = 1;
-        float *out = network_predict(net, input);
-        input[c] = 0;
-        sum += log(out[next])/log2;
-        c = next;
-        printf("%d Perplexity: %4.4f    Word Perplexity: %4.4f\n", count, pow(2, -sum/count), pow(2, -sum/words));
-    }
-}
-
-void vec_char_rnn(char *cfgfile, char *weightfile, char *seed)
-{
-    char *base = basecfg(cfgfile);
-    fprintf(stderr, "%s\n", base);
-
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    int inputs = get_network_input_size(net);
-
-    int c;
-    int seed_len = strlen(seed);
-    float *input = calloc(inputs, sizeof(float));
-    int i;
-    char *line;
-    while((line=fgetl(stdin)) != 0){
-        reset_rnn_state(net, 0);
-        for(i = 0; i < seed_len; ++i){
-            c = seed[i];
-            input[(int)c] = 1;
-            network_predict(net, input);
-            input[(int)c] = 0;
-        }
-        strip(line);
-        int str_len = strlen(line);
-        for(i = 0; i < str_len; ++i){
-            c = line[i];
-            input[(int)c] = 1;
-            network_predict(net, input);
-            input[(int)c] = 0;
-        }
-        c = ' ';
-        input[(int)c] = 1;
-        network_predict(net, input);
-        input[(int)c] = 0;
-
-        layer l = net.layers[0];
-        #ifdef GPU
-        cuda_pull_array(l.output_gpu, l.output, l.outputs);
-        #endif
-        printf("%s", line);
-        for(i = 0; i < l.outputs; ++i){
-            printf(",%g", l.output[i]);
-        }
-        printf("\n");
-    }
-}
-
-void run_char_rnn(int argc, char **argv)
-{
-    if(argc < 4){
-        fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]);
-        return;
-    }
-    char *filename = find_char_arg(argc, argv, "-file", "data/shakespeare.txt");
-    char *seed = find_char_arg(argc, argv, "-seed", "\n\n");
-    int len = find_int_arg(argc, argv, "-len", 1000);
-    float temp = find_float_arg(argc, argv, "-temp", .7);
-    int rseed = find_int_arg(argc, argv, "-srand", time(0));
-    int clear = find_arg(argc, argv, "-clear");
-    int tokenized = find_arg(argc, argv, "-tokenized");
-    char *tokens = find_char_arg(argc, argv, "-tokens", 0);
-
-    char *cfg = argv[3];
-    char *weights = (argc > 4) ? argv[4] : 0;
-    if(0==strcmp(argv[2], "train")) train_char_rnn(cfg, weights, filename, clear, tokenized);
-    else if(0==strcmp(argv[2], "valid")) valid_char_rnn(cfg, weights, seed);
-    else if(0==strcmp(argv[2], "validtactic")) valid_tactic_rnn(cfg, weights, seed);
-    else if(0==strcmp(argv[2], "vec")) vec_char_rnn(cfg, weights, seed);
-    else if(0==strcmp(argv[2], "generate")) test_char_rnn(cfg, weights, len, seed, temp, rseed, tokens);
-    else if(0==strcmp(argv[2], "generatetactic")) test_tactic_rnn(cfg, weights, len, temp, rseed, tokens);
-}
diff --git a/image.darknet/inst/include/darknet/src/rnn_layer.c b/image.darknet/inst/include/darknet/src/rnn_layer.c
index 83fda13..8c9b457 100644
--- a/image.darknet/inst/include/darknet/src/rnn_layer.c
+++ b/image.darknet/inst/include/darknet/src/rnn_layer.c
@@ -26,7 +26,7 @@ static void increment_layer(layer *l, int steps)
 #endif
 }
 
-layer make_rnn_layer(int batch, int inputs, int hidden, int outputs, int steps, ACTIVATION activation, int batch_normalize, int log)
+layer make_rnn_layer(int batch, int inputs, int outputs, int steps, ACTIVATION activation, int batch_normalize, int adam)
 {
     fprintf(stderr, "RNN Layer: %d inputs, %d outputs\n", inputs, outputs);
     batch = batch / steps;
@@ -34,24 +34,24 @@ layer make_rnn_layer(int batch, int inputs, int hidden, int outputs, int steps,
     l.batch = batch;
     l.type = RNN;
     l.steps = steps;
-    l.hidden = hidden;
     l.inputs = inputs;
 
-    l.state = calloc(batch*hidden*(steps+1), sizeof(float));
+    l.state = calloc(batch*outputs, sizeof(float));
+    l.prev_state = calloc(batch*outputs, sizeof(float));
 
     l.input_layer = malloc(sizeof(layer));
     fprintf(stderr, "\t\t");
-    *(l.input_layer) = make_connected_layer(batch*steps, inputs, hidden, activation, batch_normalize);
+    *(l.input_layer) = make_connected_layer(batch*steps, inputs, outputs, activation, batch_normalize, adam);
     l.input_layer->batch = batch;
 
     l.self_layer = malloc(sizeof(layer));
     fprintf(stderr, "\t\t");
-    *(l.self_layer) = make_connected_layer(batch*steps, hidden, hidden, (log==2)?LOGGY:(log==1?LOGISTIC:activation), batch_normalize);
+    *(l.self_layer) = make_connected_layer(batch*steps, outputs, outputs, activation, batch_normalize, adam);
     l.self_layer->batch = batch;
 
     l.output_layer = malloc(sizeof(layer));
     fprintf(stderr, "\t\t");
-    *(l.output_layer) = make_connected_layer(batch*steps, hidden, outputs, activation, batch_normalize);
+    *(l.output_layer) = make_connected_layer(batch*steps, outputs, outputs, activation, batch_normalize, adam);
     l.output_layer->batch = batch;
 
     l.outputs = outputs;
@@ -65,66 +65,72 @@ layer make_rnn_layer(int batch, int inputs, int hidden, int outputs, int steps,
     l.forward_gpu = forward_rnn_layer_gpu;
     l.backward_gpu = backward_rnn_layer_gpu;
     l.update_gpu = update_rnn_layer_gpu;
-    l.state_gpu = cuda_make_array(l.state, batch*hidden*(steps+1));
+    l.state_gpu = cuda_make_array(0, batch*outputs);
+    l.prev_state_gpu = cuda_make_array(0, batch*outputs);
     l.output_gpu = l.output_layer->output_gpu;
     l.delta_gpu = l.output_layer->delta_gpu;
+#ifdef CUDNN
+    cudnnSetTensor4dDescriptor(l.input_layer->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.input_layer->out_c, l.input_layer->out_h, l.input_layer->out_w); 
+    cudnnSetTensor4dDescriptor(l.self_layer->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.self_layer->out_c, l.self_layer->out_h, l.self_layer->out_w); 
+    cudnnSetTensor4dDescriptor(l.output_layer->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.output_layer->out_c, l.output_layer->out_h, l.output_layer->out_w); 
+#endif
 #endif
 
     return l;
 }
 
-void update_rnn_layer(layer l, int batch, float learning_rate, float momentum, float decay)
+void update_rnn_layer(layer l, update_args a)
 {
-    update_connected_layer(*(l.input_layer), batch, learning_rate, momentum, decay);
-    update_connected_layer(*(l.self_layer), batch, learning_rate, momentum, decay);
-    update_connected_layer(*(l.output_layer), batch, learning_rate, momentum, decay);
+    update_connected_layer(*(l.input_layer),  a);
+    update_connected_layer(*(l.self_layer),   a);
+    update_connected_layer(*(l.output_layer), a);
 }
 
-void forward_rnn_layer(layer l, network_state state)
+void forward_rnn_layer(layer l, network net)
 {
-    network_state s = {0};
-    s.train = state.train;
+    network s = net;
+    s.train = net.train;
     int i;
     layer input_layer = *(l.input_layer);
     layer self_layer = *(l.self_layer);
     layer output_layer = *(l.output_layer);
 
     fill_cpu(l.outputs * l.batch * l.steps, 0, output_layer.delta, 1);
-    fill_cpu(l.hidden * l.batch * l.steps, 0, self_layer.delta, 1);
-    fill_cpu(l.hidden * l.batch * l.steps, 0, input_layer.delta, 1);
-    if(state.train) fill_cpu(l.hidden * l.batch, 0, l.state, 1);
+    fill_cpu(l.outputs * l.batch * l.steps, 0, self_layer.delta, 1);
+    fill_cpu(l.outputs * l.batch * l.steps, 0, input_layer.delta, 1);
+    if(net.train) fill_cpu(l.outputs * l.batch, 0, l.state, 1);
 
     for (i = 0; i < l.steps; ++i) {
-        s.input = state.input;
+        s.input = net.input;
         forward_connected_layer(input_layer, s);
 
         s.input = l.state;
         forward_connected_layer(self_layer, s);
 
         float *old_state = l.state;
-        if(state.train) l.state += l.hidden*l.batch;
+        if(net.train) l.state += l.outputs*l.batch;
         if(l.shortcut){
-            copy_cpu(l.hidden * l.batch, old_state, 1, l.state, 1);
+            copy_cpu(l.outputs * l.batch, old_state, 1, l.state, 1);
         }else{
-            fill_cpu(l.hidden * l.batch, 0, l.state, 1);
+            fill_cpu(l.outputs * l.batch, 0, l.state, 1);
         }
-        axpy_cpu(l.hidden * l.batch, 1, input_layer.output, 1, l.state, 1);
-        axpy_cpu(l.hidden * l.batch, 1, self_layer.output, 1, l.state, 1);
+        axpy_cpu(l.outputs * l.batch, 1, input_layer.output, 1, l.state, 1);
+        axpy_cpu(l.outputs * l.batch, 1, self_layer.output, 1, l.state, 1);
 
         s.input = l.state;
         forward_connected_layer(output_layer, s);
 
-        state.input += l.inputs*l.batch;
+        net.input += l.inputs*l.batch;
         increment_layer(&input_layer, 1);
         increment_layer(&self_layer, 1);
         increment_layer(&output_layer, 1);
     }
 }
 
-void backward_rnn_layer(layer l, network_state state)
+void backward_rnn_layer(layer l, network net)
 {
-    network_state s = {0};
-    s.train = state.train;
+    network s = net;
+    s.train = net.train;
     int i;
     layer input_layer = *(l.input_layer);
     layer self_layer = *(l.self_layer);
@@ -134,34 +140,34 @@ void backward_rnn_layer(layer l, network_state state)
     increment_layer(&self_layer, l.steps-1);
     increment_layer(&output_layer, l.steps-1);
 
-    l.state += l.hidden*l.batch*l.steps;
+    l.state += l.outputs*l.batch*l.steps;
     for (i = l.steps-1; i >= 0; --i) {
-        copy_cpu(l.hidden * l.batch, input_layer.output, 1, l.state, 1);
-        axpy_cpu(l.hidden * l.batch, 1, self_layer.output, 1, l.state, 1);
+        copy_cpu(l.outputs * l.batch, input_layer.output, 1, l.state, 1);
+        axpy_cpu(l.outputs * l.batch, 1, self_layer.output, 1, l.state, 1);
 
         s.input = l.state;
         s.delta = self_layer.delta;
         backward_connected_layer(output_layer, s);
 
-        l.state -= l.hidden*l.batch;
+        l.state -= l.outputs*l.batch;
         /*
            if(i > 0){
-           copy_cpu(l.hidden * l.batch, input_layer.output - l.hidden*l.batch, 1, l.state, 1);
-           axpy_cpu(l.hidden * l.batch, 1, self_layer.output - l.hidden*l.batch, 1, l.state, 1);
+           copy_cpu(l.outputs * l.batch, input_layer.output - l.outputs*l.batch, 1, l.state, 1);
+           axpy_cpu(l.outputs * l.batch, 1, self_layer.output - l.outputs*l.batch, 1, l.state, 1);
            }else{
-           fill_cpu(l.hidden * l.batch, 0, l.state, 1);
+           fill_cpu(l.outputs * l.batch, 0, l.state, 1);
            }
          */
 
         s.input = l.state;
-        s.delta = self_layer.delta - l.hidden*l.batch;
+        s.delta = self_layer.delta - l.outputs*l.batch;
         if (i == 0) s.delta = 0;
         backward_connected_layer(self_layer, s);
 
-        copy_cpu(l.hidden*l.batch, self_layer.delta, 1, input_layer.delta, 1);
-        if (i > 0 && l.shortcut) axpy_cpu(l.hidden*l.batch, 1, self_layer.delta, 1, self_layer.delta - l.hidden*l.batch, 1);
-        s.input = state.input + i*l.inputs*l.batch;
-        if(state.delta) s.delta = state.delta + i*l.inputs*l.batch;
+        copy_cpu(l.outputs*l.batch, self_layer.delta, 1, input_layer.delta, 1);
+        if (i > 0 && l.shortcut) axpy_cpu(l.outputs*l.batch, 1, self_layer.delta, 1, self_layer.delta - l.outputs*l.batch, 1);
+        s.input = net.input + i*l.inputs*l.batch;
+        if(net.delta) s.delta = net.delta + i*l.inputs*l.batch;
         else s.delta = 0;
         backward_connected_layer(input_layer, s);
 
@@ -187,58 +193,56 @@ void push_rnn_layer(layer l)
     push_connected_layer(*(l.output_layer));
 }
 
-void update_rnn_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay)
+void update_rnn_layer_gpu(layer l, update_args a)
 {
-    update_connected_layer_gpu(*(l.input_layer), batch, learning_rate, momentum, decay);
-    update_connected_layer_gpu(*(l.self_layer), batch, learning_rate, momentum, decay);
-    update_connected_layer_gpu(*(l.output_layer), batch, learning_rate, momentum, decay);
+    update_connected_layer_gpu(*(l.input_layer),  a);
+    update_connected_layer_gpu(*(l.self_layer),   a);
+    update_connected_layer_gpu(*(l.output_layer), a);
 }
 
-void forward_rnn_layer_gpu(layer l, network_state state)
+void forward_rnn_layer_gpu(layer l, network net)
 {
-    network_state s = {0};
-    s.train = state.train;
+    network s = {0};
+    s.train = net.train;
     int i;
     layer input_layer = *(l.input_layer);
     layer self_layer = *(l.self_layer);
     layer output_layer = *(l.output_layer);
 
-    fill_ongpu(l.outputs * l.batch * l.steps, 0, output_layer.delta_gpu, 1);
-    fill_ongpu(l.hidden * l.batch * l.steps, 0, self_layer.delta_gpu, 1);
-    fill_ongpu(l.hidden * l.batch * l.steps, 0, input_layer.delta_gpu, 1);
-    if(state.train) fill_ongpu(l.hidden * l.batch, 0, l.state_gpu, 1);
+    fill_gpu(l.outputs * l.batch * l.steps, 0, output_layer.delta_gpu, 1);
+    fill_gpu(l.outputs * l.batch * l.steps, 0, self_layer.delta_gpu, 1);
+    fill_gpu(l.outputs * l.batch * l.steps, 0, input_layer.delta_gpu, 1);
+
+    if(net.train) {
+        fill_gpu(l.outputs * l.batch * l.steps, 0, l.delta_gpu, 1);
+        copy_gpu(l.outputs*l.batch, l.state_gpu, 1, l.prev_state_gpu, 1);
+    }
 
     for (i = 0; i < l.steps; ++i) {
-        s.input = state.input;
+        s.input_gpu = net.input_gpu;
         forward_connected_layer_gpu(input_layer, s);
 
-        s.input = l.state_gpu;
+        s.input_gpu = l.state_gpu;
         forward_connected_layer_gpu(self_layer, s);
 
-        float *old_state = l.state_gpu;
-        if(state.train) l.state_gpu += l.hidden*l.batch;
-        if(l.shortcut){
-            copy_ongpu(l.hidden * l.batch, old_state, 1, l.state_gpu, 1);
-        }else{
-            fill_ongpu(l.hidden * l.batch, 0, l.state_gpu, 1);
-        }
-        axpy_ongpu(l.hidden * l.batch, 1, input_layer.output_gpu, 1, l.state_gpu, 1);
-        axpy_ongpu(l.hidden * l.batch, 1, self_layer.output_gpu, 1, l.state_gpu, 1);
+        fill_gpu(l.outputs * l.batch, 0, l.state_gpu, 1);
+        axpy_gpu(l.outputs * l.batch, 1, input_layer.output_gpu, 1, l.state_gpu, 1);
+        axpy_gpu(l.outputs * l.batch, 1, self_layer.output_gpu, 1, l.state_gpu, 1);
 
-        s.input = l.state_gpu;
+        s.input_gpu = l.state_gpu;
         forward_connected_layer_gpu(output_layer, s);
 
-        state.input += l.inputs*l.batch;
+        net.input_gpu += l.inputs*l.batch;
         increment_layer(&input_layer, 1);
         increment_layer(&self_layer, 1);
         increment_layer(&output_layer, 1);
     }
 }
 
-void backward_rnn_layer_gpu(layer l, network_state state)
+void backward_rnn_layer_gpu(layer l, network net)
 {
-    network_state s = {0};
-    s.train = state.train;
+    network s = {0};
+    s.train = net.train;
     int i;
     layer input_layer = *(l.input_layer);
     layer self_layer = *(l.self_layer);
@@ -246,32 +250,43 @@ void backward_rnn_layer_gpu(layer l, network_state state)
     increment_layer(&input_layer,  l.steps - 1);
     increment_layer(&self_layer,   l.steps - 1);
     increment_layer(&output_layer, l.steps - 1);
-    l.state_gpu += l.hidden*l.batch*l.steps;
+    float *last_input = input_layer.output_gpu;
+    float *last_self = self_layer.output_gpu;
     for (i = l.steps-1; i >= 0; --i) {
+        fill_gpu(l.outputs * l.batch, 0, l.state_gpu, 1);
+        axpy_gpu(l.outputs * l.batch, 1, input_layer.output_gpu, 1, l.state_gpu, 1);
+        axpy_gpu(l.outputs * l.batch, 1, self_layer.output_gpu, 1, l.state_gpu, 1);
 
-        s.input = l.state_gpu;
-        s.delta = self_layer.delta_gpu;
+        s.input_gpu = l.state_gpu;
+        s.delta_gpu = self_layer.delta_gpu;
         backward_connected_layer_gpu(output_layer, s);
 
-        l.state_gpu -= l.hidden*l.batch;
+        if(i != 0) {
+            fill_gpu(l.outputs * l.batch, 0, l.state_gpu, 1);
+            axpy_gpu(l.outputs * l.batch, 1, input_layer.output_gpu - l.outputs*l.batch, 1, l.state_gpu, 1);
+            axpy_gpu(l.outputs * l.batch, 1, self_layer.output_gpu - l.outputs*l.batch, 1, l.state_gpu, 1);
+        }else {
+            copy_gpu(l.outputs*l.batch, l.prev_state_gpu, 1, l.state_gpu, 1);
+        }
 
-        copy_ongpu(l.hidden*l.batch, self_layer.delta_gpu, 1, input_layer.delta_gpu, 1);
+        copy_gpu(l.outputs*l.batch, self_layer.delta_gpu, 1, input_layer.delta_gpu, 1);
 
-        s.input = l.state_gpu;
-        s.delta = self_layer.delta_gpu - l.hidden*l.batch;
-        if (i == 0) s.delta = 0;
+        s.input_gpu = l.state_gpu;
+        s.delta_gpu = (i > 0) ? self_layer.delta_gpu - l.outputs*l.batch : 0;
+        if (i == 0) s.delta_gpu = 0;
         backward_connected_layer_gpu(self_layer, s);
 
-        //copy_ongpu(l.hidden*l.batch, self_layer.delta_gpu, 1, input_layer.delta_gpu, 1);
-        if (i > 0 && l.shortcut) axpy_ongpu(l.hidden*l.batch, 1, self_layer.delta_gpu, 1, self_layer.delta_gpu - l.hidden*l.batch, 1);
-        s.input = state.input + i*l.inputs*l.batch;
-        if(state.delta) s.delta = state.delta + i*l.inputs*l.batch;
-        else s.delta = 0;
+        s.input_gpu = net.input_gpu + i*l.inputs*l.batch;
+        if(net.delta_gpu) s.delta_gpu = net.delta_gpu + i*l.inputs*l.batch;
+        else s.delta_gpu = 0;
         backward_connected_layer_gpu(input_layer, s);
 
         increment_layer(&input_layer,  -1);
         increment_layer(&self_layer,   -1);
         increment_layer(&output_layer, -1);
     }
+    fill_gpu(l.outputs * l.batch, 0, l.state_gpu, 1);
+    axpy_gpu(l.outputs * l.batch, 1, last_input, 1, l.state_gpu, 1);
+    axpy_gpu(l.outputs * l.batch, 1, last_self, 1, l.state_gpu, 1);
 }
 #endif
diff --git a/image.darknet/inst/include/darknet/src/rnn_layer.h b/image.darknet/inst/include/darknet/src/rnn_layer.h
index bb9478b..270a63f 100644
--- a/image.darknet/inst/include/darknet/src/rnn_layer.h
+++ b/image.darknet/inst/include/darknet/src/rnn_layer.h
@@ -7,16 +7,16 @@
 #include "network.h"
 #define USET
 
-layer make_rnn_layer(int batch, int inputs, int hidden, int outputs, int steps, ACTIVATION activation, int batch_normalize, int log);
+layer make_rnn_layer(int batch, int inputs, int outputs, int steps, ACTIVATION activation, int batch_normalize, int adam);
 
-void forward_rnn_layer(layer l, network_state state);
-void backward_rnn_layer(layer l, network_state state);
-void update_rnn_layer(layer l, int batch, float learning_rate, float momentum, float decay);
+void forward_rnn_layer(layer l, network net);
+void backward_rnn_layer(layer l, network net);
+void update_rnn_layer(layer l, update_args a);
 
 #ifdef GPU
-void forward_rnn_layer_gpu(layer l, network_state state);
-void backward_rnn_layer_gpu(layer l, network_state state);
-void update_rnn_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay);
+void forward_rnn_layer_gpu(layer l, network net);
+void backward_rnn_layer_gpu(layer l, network net);
+void update_rnn_layer_gpu(layer l, update_args a);
 void push_rnn_layer(layer l);
 void pull_rnn_layer(layer l);
 #endif
diff --git a/image.darknet/inst/include/darknet/src/route_layer.c b/image.darknet/inst/include/darknet/src/route_layer.c
index dce7118..a8970a4 100644
--- a/image.darknet/inst/include/darknet/src/route_layer.c
+++ b/image.darknet/inst/include/darknet/src/route_layer.c
@@ -1,6 +1,7 @@
 #include "route_layer.h"
 #include "cuda.h"
 #include "blas.h"
+
 #include <stdio.h>
 
 route_layer make_route_layer(int batch, int n, int *input_layers, int *input_sizes)
@@ -70,13 +71,13 @@ void resize_route_layer(route_layer *l, network *net)
     
 }
 
-void forward_route_layer(const route_layer l, network_state state)
+void forward_route_layer(const route_layer l, network net)
 {
     int i, j;
     int offset = 0;
     for(i = 0; i < l.n; ++i){
         int index = l.input_layers[i];
-        float *input = state.net.layers[index].output;
+        float *input = net.layers[index].output;
         int input_size = l.input_sizes[i];
         for(j = 0; j < l.batch; ++j){
             copy_cpu(input_size, input + j*input_size, 1, l.output + offset + j*l.outputs, 1);
@@ -85,13 +86,13 @@ void forward_route_layer(const route_layer l, network_state state)
     }
 }
 
-void backward_route_layer(const route_layer l, network_state state)
+void backward_route_layer(const route_layer l, network net)
 {
     int i, j;
     int offset = 0;
     for(i = 0; i < l.n; ++i){
         int index = l.input_layers[i];
-        float *delta = state.net.layers[index].delta;
+        float *delta = net.layers[index].delta;
         int input_size = l.input_sizes[i];
         for(j = 0; j < l.batch; ++j){
             axpy_cpu(input_size, 1, l.delta + offset + j*l.outputs, 1, delta + j*input_size, 1);
@@ -101,31 +102,31 @@ void backward_route_layer(const route_layer l, network_state state)
 }
 
 #ifdef GPU
-void forward_route_layer_gpu(const route_layer l, network_state state)
+void forward_route_layer_gpu(const route_layer l, network net)
 {
     int i, j;
     int offset = 0;
     for(i = 0; i < l.n; ++i){
         int index = l.input_layers[i];
-        float *input = state.net.layers[index].output_gpu;
+        float *input = net.layers[index].output_gpu;
         int input_size = l.input_sizes[i];
         for(j = 0; j < l.batch; ++j){
-            copy_ongpu(input_size, input + j*input_size, 1, l.output_gpu + offset + j*l.outputs, 1);
+            copy_gpu(input_size, input + j*input_size, 1, l.output_gpu + offset + j*l.outputs, 1);
         }
         offset += input_size;
     }
 }
 
-void backward_route_layer_gpu(const route_layer l, network_state state)
+void backward_route_layer_gpu(const route_layer l, network net)
 {
     int i, j;
     int offset = 0;
     for(i = 0; i < l.n; ++i){
         int index = l.input_layers[i];
-        float *delta = state.net.layers[index].delta_gpu;
+        float *delta = net.layers[index].delta_gpu;
         int input_size = l.input_sizes[i];
         for(j = 0; j < l.batch; ++j){
-            axpy_ongpu(input_size, 1, l.delta_gpu + offset + j*l.outputs, 1, delta + j*input_size, 1);
+            axpy_gpu(input_size, 1, l.delta_gpu + offset + j*l.outputs, 1, delta + j*input_size, 1);
         }
         offset += input_size;
     }
diff --git a/image.darknet/inst/include/darknet/src/route_layer.h b/image.darknet/inst/include/darknet/src/route_layer.h
index 45467d9..1d40330 100644
--- a/image.darknet/inst/include/darknet/src/route_layer.h
+++ b/image.darknet/inst/include/darknet/src/route_layer.h
@@ -6,13 +6,13 @@
 typedef layer route_layer;
 
 route_layer make_route_layer(int batch, int n, int *input_layers, int *input_size);
-void forward_route_layer(const route_layer l, network_state state);
-void backward_route_layer(const route_layer l, network_state state);
+void forward_route_layer(const route_layer l, network net);
+void backward_route_layer(const route_layer l, network net);
 void resize_route_layer(route_layer *l, network *net);
 
 #ifdef GPU
-void forward_route_layer_gpu(const route_layer l, network_state state);
-void backward_route_layer_gpu(const route_layer l, network_state state);
+void forward_route_layer_gpu(const route_layer l, network net);
+void backward_route_layer_gpu(const route_layer l, network net);
 #endif
 
 #endif
diff --git a/image.darknet/inst/include/darknet/src/shortcut_layer.c b/image.darknet/inst/include/darknet/src/shortcut_layer.c
index 8bca50f..49d17f5 100644
--- a/image.darknet/inst/include/darknet/src/shortcut_layer.c
+++ b/image.darknet/inst/include/darknet/src/shortcut_layer.c
@@ -1,12 +1,14 @@
 #include "shortcut_layer.h"
 #include "cuda.h"
 #include "blas.h"
+#include "activations.h"
+
 #include <stdio.h>
 #include <assert.h>
 
 layer make_shortcut_layer(int batch, int index, int w, int h, int c, int w2, int h2, int c2)
 {
-    fprintf(stderr,"Shortcut Layer: %d\n", index);
+    fprintf(stderr, "res  %3d                %4d x%4d x%4d   ->  %4d x%4d x%4d\n",index, w2,h2,c2, w,h,c);
     layer l = {0};
     l.type = SHORTCUT;
     l.batch = batch;
@@ -36,32 +38,53 @@ layer make_shortcut_layer(int batch, int index, int w, int h, int c, int w2, int
     return l;
 }
 
-void forward_shortcut_layer(const layer l, network_state state)
+void resize_shortcut_layer(layer *l, int w, int h)
+{
+    assert(l->w == l->out_w);
+    assert(l->h == l->out_h);
+    l->w = l->out_w = w;
+    l->h = l->out_h = h;
+    l->outputs = w*h*l->out_c;
+    l->inputs = l->outputs;
+    l->delta =  realloc(l->delta, l->outputs*l->batch*sizeof(float));
+    l->output = realloc(l->output, l->outputs*l->batch*sizeof(float));
+
+#ifdef GPU
+    cuda_free(l->output_gpu);
+    cuda_free(l->delta_gpu);
+    l->output_gpu  = cuda_make_array(l->output, l->outputs*l->batch);
+    l->delta_gpu   = cuda_make_array(l->delta,  l->outputs*l->batch);
+#endif
+    
+}
+
+
+void forward_shortcut_layer(const layer l, network net)
 {
-    copy_cpu(l.outputs*l.batch, state.input, 1, l.output, 1);
-    shortcut_cpu(l.batch, l.w, l.h, l.c, state.net.layers[l.index].output, l.out_w, l.out_h, l.out_c, l.output);
+    copy_cpu(l.outputs*l.batch, net.input, 1, l.output, 1);
+    shortcut_cpu(l.batch, l.w, l.h, l.c, net.layers[l.index].output, l.out_w, l.out_h, l.out_c, l.alpha, l.beta, l.output);
     activate_array(l.output, l.outputs*l.batch, l.activation);
 }
 
-void backward_shortcut_layer(const layer l, network_state state)
+void backward_shortcut_layer(const layer l, network net)
 {
     gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);
-    axpy_cpu(l.outputs*l.batch, 1, l.delta, 1, state.delta, 1);
-    shortcut_cpu(l.batch, l.out_w, l.out_h, l.out_c, l.delta, l.w, l.h, l.c, state.net.layers[l.index].delta);
+    axpy_cpu(l.outputs*l.batch, l.alpha, l.delta, 1, net.delta, 1);
+    shortcut_cpu(l.batch, l.out_w, l.out_h, l.out_c, l.delta, l.w, l.h, l.c, 1, l.beta, net.layers[l.index].delta);
 }
 
 #ifdef GPU
-void forward_shortcut_layer_gpu(const layer l, network_state state)
+void forward_shortcut_layer_gpu(const layer l, network net)
 {
-    copy_ongpu(l.outputs*l.batch, state.input, 1, l.output_gpu, 1);
-    shortcut_gpu(l.batch, l.w, l.h, l.c, state.net.layers[l.index].output_gpu, l.out_w, l.out_h, l.out_c, l.output_gpu);
-    activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
+    copy_gpu(l.outputs*l.batch, net.input_gpu, 1, l.output_gpu, 1);
+    shortcut_gpu(l.batch, l.w, l.h, l.c, net.layers[l.index].output_gpu, l.out_w, l.out_h, l.out_c, l.alpha, l.beta, l.output_gpu);
+    activate_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation);
 }
 
-void backward_shortcut_layer_gpu(const layer l, network_state state)
+void backward_shortcut_layer_gpu(const layer l, network net)
 {
-    gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
-    axpy_ongpu(l.outputs*l.batch, 1, l.delta_gpu, 1, state.delta, 1);
-    shortcut_gpu(l.batch, l.out_w, l.out_h, l.out_c, l.delta_gpu, l.w, l.h, l.c, state.net.layers[l.index].delta_gpu);
+    gradient_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
+    axpy_gpu(l.outputs*l.batch, l.alpha, l.delta_gpu, 1, net.delta_gpu, 1);
+    shortcut_gpu(l.batch, l.out_w, l.out_h, l.out_c, l.delta_gpu, l.w, l.h, l.c, 1, l.beta, net.layers[l.index].delta_gpu);
 }
 #endif
diff --git a/image.darknet/inst/include/darknet/src/shortcut_layer.h b/image.darknet/inst/include/darknet/src/shortcut_layer.h
index c09a809..5f684fc 100644
--- a/image.darknet/inst/include/darknet/src/shortcut_layer.h
+++ b/image.darknet/inst/include/darknet/src/shortcut_layer.h
@@ -5,12 +5,13 @@
 #include "network.h"
 
 layer make_shortcut_layer(int batch, int index, int w, int h, int c, int w2, int h2, int c2);
-void forward_shortcut_layer(const layer l, network_state state);
-void backward_shortcut_layer(const layer l, network_state state);
+void forward_shortcut_layer(const layer l, network net);
+void backward_shortcut_layer(const layer l, network net);
+void resize_shortcut_layer(layer *l, int w, int h);
 
 #ifdef GPU
-void forward_shortcut_layer_gpu(const layer l, network_state state);
-void backward_shortcut_layer_gpu(const layer l, network_state state);
+void forward_shortcut_layer_gpu(const layer l, network net);
+void backward_shortcut_layer_gpu(const layer l, network net);
 #endif
 
 #endif
diff --git a/image.darknet/inst/include/darknet/src/softmax_layer.c b/image.darknet/inst/include/darknet/src/softmax_layer.c
index 5d15314..9cbc6be 100644
--- a/image.darknet/inst/include/darknet/src/softmax_layer.c
+++ b/image.darknet/inst/include/darknet/src/softmax_layer.c
@@ -1,6 +1,7 @@
 #include "softmax_layer.h"
 #include "blas.h"
 #include "cuda.h"
+
 #include <float.h>
 #include <math.h>
 #include <stdlib.h>
@@ -17,8 +18,10 @@ softmax_layer make_softmax_layer(int batch, int inputs, int groups)
     l.groups = groups;
     l.inputs = inputs;
     l.outputs = inputs;
+    l.loss = calloc(inputs*batch, sizeof(float));
     l.output = calloc(inputs*batch, sizeof(float));
     l.delta = calloc(inputs*batch, sizeof(float));
+    l.cost = calloc(1, sizeof(float));
 
     l.forward = forward_softmax_layer;
     l.backward = backward_softmax_layer;
@@ -27,45 +30,35 @@ softmax_layer make_softmax_layer(int batch, int inputs, int groups)
     l.backward_gpu = backward_softmax_layer_gpu;
 
     l.output_gpu = cuda_make_array(l.output, inputs*batch); 
+    l.loss_gpu = cuda_make_array(l.loss, inputs*batch); 
     l.delta_gpu = cuda_make_array(l.delta, inputs*batch); 
     #endif
     return l;
 }
 
-void softmax_tree(float *input, int batch, int inputs, float temp, tree *hierarchy, float *output)
+void forward_softmax_layer(const softmax_layer l, network net)
 {
-    int b;
-    for(b = 0; b < batch; ++b){
+    if(l.softmax_tree){
         int i;
         int count = 0;
-        for(i = 0; i < hierarchy->groups; ++i){
-            int group_size = hierarchy->group_size[i];
-            softmax(input+b*inputs + count, group_size, temp, output+b*inputs + count);
+        for (i = 0; i < l.softmax_tree->groups; ++i) {
+            int group_size = l.softmax_tree->group_size[i];
+            softmax_cpu(net.input + count, group_size, l.batch, l.inputs, 1, 0, 1, l.temperature, l.output + count);
             count += group_size;
         }
+    } else {
+        softmax_cpu(net.input, l.inputs/l.groups, l.batch, l.inputs, l.groups, l.inputs/l.groups, 1, l.temperature, l.output);
     }
-}
 
-void forward_softmax_layer(const softmax_layer l, network_state state)
-{
-    int b;
-    int inputs = l.inputs / l.groups;
-    int batch = l.batch * l.groups;
-    if(l.softmax_tree){
-        softmax_tree(state.input, batch, inputs, l.temperature, l.softmax_tree, l.output);
-    } else {
-        for(b = 0; b < batch; ++b){
-            softmax(state.input+b*inputs, inputs, l.temperature, l.output+b*inputs);
-        }
+    if(net.truth && !l.noloss){
+        softmax_x_ent_cpu(l.batch*l.inputs, l.output, net.truth, l.delta, l.loss);
+        l.cost[0] = sum_array(l.loss, l.batch*l.inputs);
     }
 }
 
-void backward_softmax_layer(const softmax_layer l, network_state state)
+void backward_softmax_layer(const softmax_layer l, network net)
 {
-    int i;
-    for(i = 0; i < l.inputs*l.batch; ++i){
-        state.delta[i] += l.delta[i];
-    }
+    axpy_cpu(l.inputs*l.batch, 1, l.delta, 1, net.delta, 1);
 }
 
 #ifdef GPU
@@ -75,26 +68,40 @@ void pull_softmax_layer_output(const softmax_layer layer)
     cuda_pull_array(layer.output_gpu, layer.output, layer.inputs*layer.batch);
 }
 
-void forward_softmax_layer_gpu(const softmax_layer l, network_state state)
+void forward_softmax_layer_gpu(const softmax_layer l, network net)
 {
-    int inputs = l.inputs / l.groups;
-    int batch = l.batch * l.groups;
     if(l.softmax_tree){
+        softmax_tree(net.input_gpu, 1, l.batch, l.inputs, l.temperature, l.output_gpu, *l.softmax_tree);
+        /*
         int i;
         int count = 0;
         for (i = 0; i < l.softmax_tree->groups; ++i) {
             int group_size = l.softmax_tree->group_size[i];
-            softmax_gpu(state.input+count, group_size, inputs, batch, l.temperature, l.output_gpu + count);
+            softmax_gpu(net.input_gpu + count, group_size, l.batch, l.inputs, 1, 0, 1, l.temperature, l.output_gpu + count);
             count += group_size;
         }
+        */
     } else {
-        softmax_gpu(state.input, inputs, inputs, batch, l.temperature, l.output_gpu);
+        if(l.spatial){
+            softmax_gpu(net.input_gpu, l.c, l.batch*l.c, l.inputs/l.c, l.w*l.h, 1, l.w*l.h, 1, l.output_gpu);
+        }else{
+            softmax_gpu(net.input_gpu, l.inputs/l.groups, l.batch, l.inputs, l.groups, l.inputs/l.groups, 1, l.temperature, l.output_gpu);
+        }
+    }
+    if(net.truth && !l.noloss){
+        softmax_x_ent_gpu(l.batch*l.inputs, l.output_gpu, net.truth_gpu, l.delta_gpu, l.loss_gpu);
+        if(l.softmax_tree){
+            mask_gpu(l.batch*l.inputs, l.delta_gpu, SECRET_NUM, net.truth_gpu, 0);
+            mask_gpu(l.batch*l.inputs, l.loss_gpu, SECRET_NUM, net.truth_gpu, 0);
+        }
+        cuda_pull_array(l.loss_gpu, l.loss, l.batch*l.inputs);
+        l.cost[0] = sum_array(l.loss, l.batch*l.inputs);
     }
 }
 
-void backward_softmax_layer_gpu(const softmax_layer layer, network_state state)
+void backward_softmax_layer_gpu(const softmax_layer layer, network net)
 {
-    axpy_ongpu(layer.batch*layer.inputs, 1, layer.delta_gpu, 1, state.delta, 1);
+    axpy_gpu(layer.batch*layer.inputs, 1, layer.delta_gpu, 1, net.delta_gpu, 1);
 }
 
 #endif
diff --git a/image.darknet/inst/include/darknet/src/softmax_layer.h b/image.darknet/inst/include/darknet/src/softmax_layer.h
index 821a8dd..2e3ffe0 100644
--- a/image.darknet/inst/include/darknet/src/softmax_layer.h
+++ b/image.darknet/inst/include/darknet/src/softmax_layer.h
@@ -7,13 +7,13 @@ typedef layer softmax_layer;
 
 void softmax_array(float *input, int n, float temp, float *output);
 softmax_layer make_softmax_layer(int batch, int inputs, int groups);
-void forward_softmax_layer(const softmax_layer l, network_state state);
-void backward_softmax_layer(const softmax_layer l, network_state state);
+void forward_softmax_layer(const softmax_layer l, network net);
+void backward_softmax_layer(const softmax_layer l, network net);
 
 #ifdef GPU
 void pull_softmax_layer_output(const softmax_layer l);
-void forward_softmax_layer_gpu(const softmax_layer l, network_state state);
-void backward_softmax_layer_gpu(const softmax_layer l, network_state state);
+void forward_softmax_layer_gpu(const softmax_layer l, network net);
+void backward_softmax_layer_gpu(const softmax_layer l, network net);
 #endif
 
 #endif
diff --git a/image.darknet/inst/include/darknet/src/stb_image.h b/image.darknet/inst/include/darknet/src/stb_image.h
index d0fa9c2..d9c21bc 100644
--- a/image.darknet/inst/include/darknet/src/stb_image.h
+++ b/image.darknet/inst/include/darknet/src/stb_image.h
@@ -1,5 +1,5 @@
-/* stb_image - v2.06 - public domain image loader - http://nothings.org/stb_image.h
-                                     no warranty implied; use at your own risk
+/* stb_image - v2.19 - public domain image loader - http://nothings.org/stb
+                                  no warranty implied; use at your own risk
 
    Do this:
       #define STB_IMAGE_IMPLEMENTATION
@@ -21,17 +21,20 @@
           avoid problematic images and only need the trivial interface
 
       JPEG baseline & progressive (12 bpc/arithmetic not supported, same as stock IJG lib)
-      PNG 1/2/4/8-bit-per-channel (16 bpc not supported)
+      PNG 1/2/4/8/16-bit-per-channel
 
       TGA (not sure what subset, if a subset)
       BMP non-1bpp, non-RLE
-      PSD (composited view only, no extra channels)
+      PSD (composited view only, no extra channels, 8/16 bit-per-channel)
 
       GIF (*comp always reports as 4-channel)
       HDR (radiance rgbE format)
       PIC (Softimage PIC)
       PNM (PPM and PGM binary only)
 
+      Animated GIF still needs a proper API, but here's one way to do it:
+          http://gist.github.com/urraka/685d9a6340b26b830d49
+
       - decode from memory or through FILE (define STBI_NO_STDIO to remove code)
       - decode from arbitrary I/O callbacks
       - SIMD acceleration on x86/x64 (SSE2) and ARM (NEON)
@@ -39,176 +42,65 @@
    Full documentation under "DOCUMENTATION" below.
 
 
-   Revision 2.00 release notes:
-
-      - Progressive JPEG is now supported.
-
-      - PPM and PGM binary formats are now supported, thanks to Ken Miller.
-
-      - x86 platforms now make use of SSE2 SIMD instructions for
-        JPEG decoding, and ARM platforms can use NEON SIMD if requested.
-        This work was done by Fabian "ryg" Giesen. SSE2 is used by
-        default, but NEON must be enabled explicitly; see docs.
-
-        With other JPEG optimizations included in this version, we see
-        2x speedup on a JPEG on an x86 machine, and a 1.5x speedup
-        on a JPEG on an ARM machine, relative to previous versions of this
-        library. The same results will not obtain for all JPGs and for all
-        x86/ARM machines. (Note that progressive JPEGs are significantly
-        slower to decode than regular JPEGs.) This doesn't mean that this
-        is the fastest JPEG decoder in the land; rather, it brings it
-        closer to parity with standard libraries. If you want the fastest
-        decode, look elsewhere. (See "Philosophy" section of docs below.)
-
-        See final bullet items below for more info on SIMD.
-
-      - Added STBI_MALLOC, STBI_REALLOC, and STBI_FREE macros for replacing
-        the memory allocator. Unlike other STBI libraries, these macros don't
-        support a context parameter, so if you need to pass a context in to
-        the allocator, you'll have to store it in a global or a thread-local
-        variable.
-
-      - Split existing STBI_NO_HDR flag into two flags, STBI_NO_HDR and
-        STBI_NO_LINEAR.
-            STBI_NO_HDR:     suppress implementation of .hdr reader format
-            STBI_NO_LINEAR:  suppress high-dynamic-range light-linear float API
-
-      - You can suppress implementation of any of the decoders to reduce
-        your code footprint by #defining one or more of the following
-        symbols before creating the implementation.
-
-            STBI_NO_JPEG
-            STBI_NO_PNG
-            STBI_NO_BMP
-            STBI_NO_PSD
-            STBI_NO_TGA
-            STBI_NO_GIF
-            STBI_NO_HDR
-            STBI_NO_PIC
-            STBI_NO_PNM   (.ppm and .pgm)
-
-      - You can request *only* certain decoders and suppress all other ones
-        (this will be more forward-compatible, as addition of new decoders
-        doesn't require you to disable them explicitly):
-
-            STBI_ONLY_JPEG
-            STBI_ONLY_PNG
-            STBI_ONLY_BMP
-            STBI_ONLY_PSD
-            STBI_ONLY_TGA
-            STBI_ONLY_GIF
-            STBI_ONLY_HDR
-            STBI_ONLY_PIC
-            STBI_ONLY_PNM   (.ppm and .pgm)
-
-         Note that you can define multiples of these, and you will get all
-         of them ("only x" and "only y" is interpreted to mean "only x&y").
-
-       - If you use STBI_NO_PNG (or _ONLY_ without PNG), and you still
-         want the zlib decoder to be available, #define STBI_SUPPORT_ZLIB
-
-      - Compilation of all SIMD code can be suppressed with
-            #define STBI_NO_SIMD
-        It should not be necessary to disable SIMD unless you have issues
-        compiling (e.g. using an x86 compiler which doesn't support SSE
-        intrinsics or that doesn't support the method used to detect
-        SSE2 support at run-time), and even those can be reported as
-        bugs so I can refine the built-in compile-time checking to be
-        smarter.
-
-      - The old STBI_SIMD system which allowed installing a user-defined
-        IDCT etc. has been removed. If you need this, don't upgrade. My
-        assumption is that almost nobody was doing this, and those who
-        were will find the built-in SIMD more satisfactory anyway.
-
-      - RGB values computed for JPEG images are slightly different from
-        previous versions of stb_image. (This is due to using less
-        integer precision in SIMD.) The C code has been adjusted so
-        that the same RGB values will be computed regardless of whether
-        SIMD support is available, so your app should always produce
-        consistent results. But these results are slightly different from
-        previous versions. (Specifically, about 3% of available YCbCr values
-        will compute different RGB results from pre-1.49 versions by +-1;
-        most of the deviating values are one smaller in the G channel.)
-
-      - If you must produce consistent results with previous versions of
-        stb_image, #define STBI_JPEG_OLD and you will get the same results
-        you used to; however, you will not get the SIMD speedups for
-        the YCbCr-to-RGB conversion step (although you should still see
-        significant JPEG speedup from the other changes).
-
-        Please note that STBI_JPEG_OLD is a temporary feature; it will be
-        removed in future versions of the library. It is only intended for
-        near-term back-compatibility use.
-
-
-   Latest revision history:
-      2.06  (2015-04-19) fix bug where PSD returns wrong '*comp' value
-      2.05  (2015-04-19) fix bug in progressive JPEG handling, fix warning
-      2.04  (2015-04-15) try to re-enable SIMD on MinGW 64-bit
-      2.03  (2015-04-12) additional corruption checking
-                         stbi_set_flip_vertically_on_load
-                         fix NEON support; fix mingw support
-      2.02  (2015-01-19) fix incorrect assert, fix warning
-      2.01  (2015-01-17) fix various warnings
-      2.00b (2014-12-25) fix STBI_MALLOC in progressive JPEG
-      2.00  (2014-12-25) optimize JPEG, including x86 SSE2 & ARM NEON SIMD
-                         progressive JPEG
-                         PGM/PPM support
-                         STBI_MALLOC,STBI_REALLOC,STBI_FREE
-                         STBI_NO_*, STBI_ONLY_*
-                         GIF bugfix
-      1.48  (2014-12-14) fix incorrectly-named assert()
-      1.47  (2014-12-14) 1/2/4-bit PNG support (both grayscale and paletted)
-                         optimize PNG
-                         fix bug in interlaced PNG with user-specified channel count
+LICENSE
+
+  See end of file for license information.
+
+RECENT REVISION HISTORY:
+
+      2.19  (2018-02-11) fix warning
+      2.18  (2018-01-30) fix warnings
+      2.17  (2018-01-29) bugfix, 1-bit BMP, 16-bitness query, fix warnings
+      2.16  (2017-07-23) all functions have 16-bit variants; optimizations; bugfixes
+      2.15  (2017-03-18) fix png-1,2,4; all Imagenet JPGs; no runtime SSE detection on GCC
+      2.14  (2017-03-03) remove deprecated STBI_JPEG_OLD; fixes for Imagenet JPGs
+      2.13  (2016-12-04) experimental 16-bit API, only for PNG so far; fixes
+      2.12  (2016-04-02) fix typo in 2.11 PSD fix that caused crashes
+      2.11  (2016-04-02) 16-bit PNGS; enable SSE2 in non-gcc x64
+                         RGB-format JPEG; remove white matting in PSD;
+                         allocate large structures on the stack;
+                         correct channel count for PNG & BMP
+      2.10  (2016-01-22) avoid warning introduced in 2.09
+      2.09  (2016-01-16) 16-bit TGA; comments in PNM files; STBI_REALLOC_SIZED
 
    See end of file for full revision history.
 
 
  ============================    Contributors    =========================
 
- Image formats                                Bug fixes & warning fixes
-    Sean Barrett (jpeg, png, bmp)                Marc LeBlanc
-    Nicolas Schulz (hdr, psd)                    Christpher Lloyd
-    Jonathan Dummer (tga)                        Dave Moore
-    Jean-Marc Lienher (gif)                      Won Chun
-    Tom Seddon (pic)                             the Horde3D community
-    Thatcher Ulrich (psd)                        Janez Zemva
-    Ken Miller (pgm, ppm)                        Jonathan Blow
-                                                 Laurent Gomila
-                                                 Aruelien Pocheville
- Extensions, features                            Ryamond Barbiero
-    Jetro Lauha (stbi_info)                      David Woo
-    Martin "SpartanJ" Golini (stbi_info)         Martin Golini
-    James "moose2000" Brown (iPhone PNG)         Roy Eltham
-    Ben "Disch" Wenger (io callbacks)            Luke Graham
-    Omar Cornut (1/2/4-bit PNG)                  Thomas Ruf
-    Nicolas Guillemot (vertical flip)            John Bartholomew
-                                                 Ken Hamada
- Optimizations & bugfixes                        Cort Stratton
-    Fabian "ryg" Giesen                          Blazej Dariusz Roszkowski
-    Arseny Kapoulkine                            Thibault Reuille
-                                                 Paul Du Bois
-                                                 Guillaume George
-  If your name should be here but                Jerry Jansson
-  isn't, let Sean know.                          Hayaki Saito
-                                                 Johan Duparc
-                                                 Ronny Chevalier
-                                                 Michal Cichon
-                                                 Tero Hanninen
-                                                 Sergio Gonzalez
-                                                 Cass Everitt
-                                                 Engin Manap
-                                                 Martins Mozeiko
-                                                 Joseph Thomson
-                                                 Phil Jordan
-
-License:
-   This software is in the public domain. Where that dedication is not
-   recognized, you are granted a perpetual, irrevocable license to copy
-   and modify this file however you want.
-
+ Image formats                          Extensions, features
+    Sean Barrett (jpeg, png, bmp)          Jetro Lauha (stbi_info)
+    Nicolas Schulz (hdr, psd)              Martin "SpartanJ" Golini (stbi_info)
+    Jonathan Dummer (tga)                  James "moose2000" Brown (iPhone PNG)
+    Jean-Marc Lienher (gif)                Ben "Disch" Wenger (io callbacks)
+    Tom Seddon (pic)                       Omar Cornut (1/2/4-bit PNG)
+    Thatcher Ulrich (psd)                  Nicolas Guillemot (vertical flip)
+    Ken Miller (pgm, ppm)                  Richard Mitton (16-bit PSD)
+    github:urraka (animated gif)           Junggon Kim (PNM comments)
+    Christopher Forseth (animated gif)     Daniel Gibson (16-bit TGA)
+                                           socks-the-fox (16-bit PNG)
+                                           Jeremy Sawicki (handle all ImageNet JPGs)
+ Optimizations & bugfixes                  Mikhail Morozov (1-bit BMP)
+    Fabian "ryg" Giesen                    Anael Seghezzi (is-16-bit query)
+    Arseny Kapoulkine
+    John-Mark Allen
+
+ Bug & warning fixes
+    Marc LeBlanc            David Woo          Guillaume George   Martins Mozeiko
+    Christpher Lloyd        Jerry Jansson      Joseph Thomson     Phil Jordan
+    Dave Moore              Roy Eltham         Hayaki Saito       Nathan Reed
+    Won Chun                Luke Graham        Johan Duparc       Nick Verigakis
+    the Horde3D community   Thomas Ruf         Ronny Chevalier    github:rlyeh
+    Janez Zemva             John Bartholomew   Michal Cichon      github:romigrou
+    Jonathan Blow           Ken Hamada         Tero Hanninen      github:svdijk
+    Laurent Gomila          Cort Stratton      Sergio Gonzalez    github:snagar
+    Aruelien Pocheville     Thibault Reuille   Cass Everitt       github:Zelex
+    Ryamond Barbiero        Paul Du Bois       Engin Manap        github:grim210
+    Aldo Culquicondor       Philipp Wiesemann  Dale Weiler        github:sammyhw
+    Oriol Ferrer Mesia      Josh Tobin         Matthew Gregan     github:phprus
+    Julian Raschke          Gregory Mullen     Baldur Karlsson    github:poppolopoppo
+    Christian Floisand      Kevin Schmidt                         github:darealshinji
+    Blazej Dariusz Roszkowski                                     github:Michaelangel007
 */
 
 #ifndef STBI_INCLUDE_STB_IMAGE_H
@@ -217,10 +109,8 @@
 // DOCUMENTATION
 //
 // Limitations:
-//    - no 16-bit-per-channel PNG
 //    - no 12-bit-per-channel JPEG
 //    - no JPEGs with arithmetic coding
-//    - no 1-bit BMP
 //    - GIF always returns *comp=4
 //
 // Basic usage (see HDR discussion below for HDR usage):
@@ -233,10 +123,10 @@
 //    stbi_image_free(data)
 //
 // Standard parameters:
-//    int *x       -- outputs image width in pixels
-//    int *y       -- outputs image height in pixels
-//    int *comp    -- outputs # of image components in image file
-//    int req_comp -- if non-zero, # of image components requested in result
+//    int *x                 -- outputs image width in pixels
+//    int *y                 -- outputs image height in pixels
+//    int *channels_in_file  -- outputs # of image components in image file
+//    int desired_channels   -- if non-zero, # of image components requested in result
 //
 // The return value from an image loader is an 'unsigned char *' which points
 // to the pixel data, or NULL on an allocation failure or if the image is
@@ -244,11 +134,12 @@
 // with each pixel consisting of N interleaved 8-bit components; the first
 // pixel pointed to is top-left-most in the image. There is no padding between
 // image scanlines or between pixels, regardless of format. The number of
-// components N is 'req_comp' if req_comp is non-zero, or *comp otherwise.
-// If req_comp is non-zero, *comp has the number of components that _would_
-// have been output otherwise. E.g. if you set req_comp to 4, you will always
-// get RGBA output, but you can check *comp to see if it's trivially opaque
-// because e.g. there were only 3 channels in the source image.
+// components N is 'desired_channels' if desired_channels is non-zero, or
+// *channels_in_file otherwise. If desired_channels is non-zero,
+// *channels_in_file has the number of components that _would_ have been
+// output otherwise. E.g. if you set desired_channels to 4, you will always
+// get RGBA output, but you can check *channels_in_file to see if it's trivially
+// opaque because e.g. there were only 3 channels in the source image.
 //
 // An output image with N components has the following components interleaved
 // in this order in each pixel:
@@ -260,10 +151,10 @@
 //       4           red, green, blue, alpha
 //
 // If image loading fails for any reason, the return value will be NULL,
-// and *x, *y, *comp will be unchanged. The function stbi_failure_reason()
-// can be queried for an extremely brief, end-user unfriendly explanation
-// of why the load failed. Define STBI_NO_FAILURE_STRINGS to avoid
-// compiling these strings at all, and STBI_FAILURE_USERMSG to get slightly
+// and *x, *y, *channels_in_file will be unchanged. The function
+// stbi_failure_reason() can be queried for an extremely brief, end-user
+// unfriendly explanation of why the load failed. Define STBI_NO_FAILURE_STRINGS
+// to avoid compiling these strings at all, and STBI_FAILURE_USERMSG to get slightly
 // more user-friendly ones.
 //
 // Paletted PNG, BMP, GIF, and PIC images are automatically depalettized.
@@ -282,13 +173,13 @@
 // and for best performance I may provide less-easy-to-use APIs that give higher
 // performance, in addition to the easy to use ones. Nevertheless, it's important
 // to keep in mind that from the standpoint of you, a client of this library,
-// all you care about is #1 and #3, and stb libraries do not emphasize #3 above all.
+// all you care about is #1 and #3, and stb libraries DO NOT emphasize #3 above all.
 //
 // Some secondary priorities arise directly from the first two, some of which
 // make more explicit reasons why performance can't be emphasized.
 //
 //    - Portable ("ease of use")
-//    - Small footprint ("easy to maintain")
+//    - Small source code footprint ("easy to maintain")
 //    - No dependencies ("ease of use")
 //
 // ===========================================================================
@@ -320,13 +211,6 @@
 // (at least this is true for iOS and Android). Therefore, the NEON support is
 // toggled by a build flag: define STBI_NEON to get NEON loops.
 //
-// The output of the JPEG decoder is slightly different from versions where
-// SIMD support was introduced (that is, for versions before 1.49). The
-// difference is only +-1 in the 8-bit RGB channels, and only on a small
-// fraction of pixels. You can force the pre-1.49 behavior by defining
-// STBI_JPEG_OLD, but this will disable some of the SIMD decoding path
-// and hence cost some performance.
-//
 // If for some reason you do not want to use any of SIMD code, or if
 // you have issues compiling it, you can disable it entirely by
 // defining STBI_NO_SIMD.
@@ -382,6 +266,41 @@
 // says there's premultiplied data (currently only happens in iPhone images,
 // and only if iPhone convert-to-rgb processing is on).
 //
+// ===========================================================================
+//
+// ADDITIONAL CONFIGURATION
+//
+//  - You can suppress implementation of any of the decoders to reduce
+//    your code footprint by #defining one or more of the following
+//    symbols before creating the implementation.
+//
+//        STBI_NO_JPEG
+//        STBI_NO_PNG
+//        STBI_NO_BMP
+//        STBI_NO_PSD
+//        STBI_NO_TGA
+//        STBI_NO_GIF
+//        STBI_NO_HDR
+//        STBI_NO_PIC
+//        STBI_NO_PNM   (.ppm and .pgm)
+//
+//  - You can request *only* certain decoders and suppress all other ones
+//    (this will be more forward-compatible, as addition of new decoders
+//    doesn't require you to disable them explicitly):
+//
+//        STBI_ONLY_JPEG
+//        STBI_ONLY_PNG
+//        STBI_ONLY_BMP
+//        STBI_ONLY_PSD
+//        STBI_ONLY_TGA
+//        STBI_ONLY_GIF
+//        STBI_ONLY_HDR
+//        STBI_ONLY_PIC
+//        STBI_ONLY_PNM   (.ppm and .pgm)
+//
+//  - If you use STBI_NO_PNG (or _ONLY_ without PNG), and you still
+//    want the zlib decoder to be available, #define STBI_SUPPORT_ZLIB
+//
 
 
 #ifndef STBI_NO_STDIO
@@ -392,7 +311,7 @@
 
 enum
 {
-   STBI_default = 0, // only used for req_comp
+   STBI_default = 0, // only used for desired_channels
 
    STBI_grey       = 1,
    STBI_grey_alpha = 2,
@@ -401,6 +320,7 @@ enum
 };
 
 typedef unsigned char stbi_uc;
+typedef unsigned short stbi_us;
 
 #ifdef __cplusplus
 extern "C" {
@@ -428,34 +348,60 @@ typedef struct
    int      (*eof)   (void *user);                       // returns nonzero if we are at end of file/data
 } stbi_io_callbacks;
 
-STBIDEF stbi_uc *stbi_load               (char              const *filename,           int *x, int *y, int *comp, int req_comp);
-STBIDEF stbi_uc *stbi_load_from_memory   (stbi_uc           const *buffer, int len   , int *x, int *y, int *comp, int req_comp);
-STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk  , void *user, int *x, int *y, int *comp, int req_comp);
+////////////////////////////////////
+//
+// 8-bits-per-channel interface
+//
+
+STBIDEF stbi_uc *stbi_load_from_memory   (stbi_uc           const *buffer, int len   , int *x, int *y, int *channels_in_file, int desired_channels);
+STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk  , void *user, int *x, int *y, int *channels_in_file, int desired_channels);
+#ifndef STBI_NO_GIF
+STBIDEF stbi_uc *stbi_load_gif_from_memory(stbi_uc const *buffer, int len, int **delays, int *x, int *y, int *z, int *comp, int req_comp);
+#endif
+
 
 #ifndef STBI_NO_STDIO
-STBIDEF stbi_uc *stbi_load_from_file  (FILE *f,                  int *x, int *y, int *comp, int req_comp);
+STBIDEF stbi_uc *stbi_load            (char const *filename, int *x, int *y, int *channels_in_file, int desired_channels);
+STBIDEF stbi_uc *stbi_load_from_file  (FILE *f, int *x, int *y, int *channels_in_file, int desired_channels);
 // for stbi_load_from_file, file pointer is left pointing immediately after image
 #endif
 
+////////////////////////////////////
+//
+// 16-bits-per-channel interface
+//
+
+STBIDEF stbi_us *stbi_load_16_from_memory   (stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels);
+STBIDEF stbi_us *stbi_load_16_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *channels_in_file, int desired_channels);
+
+#ifndef STBI_NO_STDIO
+STBIDEF stbi_us *stbi_load_16          (char const *filename, int *x, int *y, int *channels_in_file, int desired_channels);
+STBIDEF stbi_us *stbi_load_from_file_16(FILE *f, int *x, int *y, int *channels_in_file, int desired_channels);
+#endif
+
+////////////////////////////////////
+//
+// float-per-channel interface
+//
 #ifndef STBI_NO_LINEAR
-   STBIDEF float *stbi_loadf                 (char const *filename,           int *x, int *y, int *comp, int req_comp);
-   STBIDEF float *stbi_loadf_from_memory     (stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp);
-   STBIDEF float *stbi_loadf_from_callbacks  (stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp);
+   STBIDEF float *stbi_loadf_from_memory     (stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels);
+   STBIDEF float *stbi_loadf_from_callbacks  (stbi_io_callbacks const *clbk, void *user, int *x, int *y,  int *channels_in_file, int desired_channels);
 
    #ifndef STBI_NO_STDIO
-   STBIDEF float *stbi_loadf_from_file  (FILE *f,                int *x, int *y, int *comp, int req_comp);
+   STBIDEF float *stbi_loadf            (char const *filename, int *x, int *y, int *channels_in_file, int desired_channels);
+   STBIDEF float *stbi_loadf_from_file  (FILE *f, int *x, int *y, int *channels_in_file, int desired_channels);
    #endif
 #endif
 
 #ifndef STBI_NO_HDR
    STBIDEF void   stbi_hdr_to_ldr_gamma(float gamma);
    STBIDEF void   stbi_hdr_to_ldr_scale(float scale);
-#endif
+#endif // STBI_NO_HDR
 
 #ifndef STBI_NO_LINEAR
    STBIDEF void   stbi_ldr_to_hdr_gamma(float gamma);
    STBIDEF void   stbi_ldr_to_hdr_scale(float scale);
-#endif // STBI_NO_HDR
+#endif // STBI_NO_LINEAR
 
 // stbi_is_hdr is always defined, but always returns false if STBI_NO_HDR
 STBIDEF int    stbi_is_hdr_from_callbacks(stbi_io_callbacks const *clbk, void *user);
@@ -476,11 +422,14 @@ STBIDEF void     stbi_image_free      (void *retval_from_stbi_load);
 // get image dimensions & components without fully decoding
 STBIDEF int      stbi_info_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp);
 STBIDEF int      stbi_info_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp);
+STBIDEF int      stbi_is_16_bit_from_memory(stbi_uc const *buffer, int len);
+STBIDEF int      stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *clbk, void *user);
 
 #ifndef STBI_NO_STDIO
-STBIDEF int      stbi_info            (char const *filename,     int *x, int *y, int *comp);
-STBIDEF int      stbi_info_from_file  (FILE *f,                  int *x, int *y, int *comp);
-
+STBIDEF int      stbi_info               (char const *filename,     int *x, int *y, int *comp);
+STBIDEF int      stbi_info_from_file     (FILE *f,                  int *x, int *y, int *comp);
+STBIDEF int      stbi_is_16_bit          (char const *filename);
+STBIDEF int      stbi_is_16_bit_from_file(FILE *f);
 #endif
 
 
@@ -561,9 +510,10 @@ STBIDEF int   stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const ch
 #include <stddef.h> // ptrdiff_t on osx
 #include <stdlib.h>
 #include <string.h>
+#include <limits.h>
 
 #if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR)
-#include <math.h>  // ldexp
+#include <math.h>  // ldexp, pow
 #endif
 
 #ifndef STBI_NO_STDIO
@@ -619,18 +569,22 @@ typedef unsigned char validate_uint32[sizeof(stbi__uint32)==4 ? 1 : -1];
    #define stbi_lrot(x,y)  (((x) << (y)) | ((x) >> (32 - (y))))
 #endif
 
-#if defined(STBI_MALLOC) && defined(STBI_FREE) && defined(STBI_REALLOC)
+#if defined(STBI_MALLOC) && defined(STBI_FREE) && (defined(STBI_REALLOC) || defined(STBI_REALLOC_SIZED))
 // ok
-#elif !defined(STBI_MALLOC) && !defined(STBI_FREE) && !defined(STBI_REALLOC)
+#elif !defined(STBI_MALLOC) && !defined(STBI_FREE) && !defined(STBI_REALLOC) && !defined(STBI_REALLOC_SIZED)
 // ok
 #else
-#error "Must define all or none of STBI_MALLOC, STBI_FREE, and STBI_REALLOC."
+#error "Must define all or none of STBI_MALLOC, STBI_FREE, and STBI_REALLOC (or STBI_REALLOC_SIZED)."
 #endif
 
 #ifndef STBI_MALLOC
-#define STBI_MALLOC(sz)    malloc(sz)
-#define STBI_REALLOC(p,sz) realloc(p,sz)
-#define STBI_FREE(p)       free(p)
+#define STBI_MALLOC(sz)           malloc(sz)
+#define STBI_REALLOC(p,newsz)     realloc(p,newsz)
+#define STBI_FREE(p)              free(p)
+#endif
+
+#ifndef STBI_REALLOC_SIZED
+#define STBI_REALLOC_SIZED(p,oldsz,newsz) STBI_REALLOC(p,newsz)
 #endif
 
 // x86/x64 detection
@@ -640,12 +594,14 @@ typedef unsigned char validate_uint32[sizeof(stbi__uint32)==4 ? 1 : -1];
 #define STBI__X86_TARGET
 #endif
 
-#if defined(__GNUC__) && (defined(STBI__X86_TARGET) || defined(STBI__X64_TARGET)) && !defined(__SSE2__) && !defined(STBI_NO_SIMD)
-// NOTE: not clear do we actually need this for the 64-bit path?
+#if defined(__GNUC__) && defined(STBI__X86_TARGET) && !defined(__SSE2__) && !defined(STBI_NO_SIMD)
 // gcc doesn't support sse2 intrinsics unless you compile with -msse2,
-// (but compiling with -msse2 allows the compiler to use SSE2 everywhere;
-// this is just broken and gcc are jerks for not fixing it properly
-// http://www.virtualdub.org/blog/pivot/entry.php?id=363 )
+// which in turn means it gets to use SSE2 everywhere. This is unfortunate,
+// but previous attempts to provide the SSE2 functions with runtime
+// detection caused numerous issues. The way architecture extensions are
+// exposed in GCC/Clang is, sadly, not really suited for one-file libs.
+// New behavior: if compiled with -msse2, we use SSE2 without any
+// detection; if not, we don't use it at all.
 #define STBI_NO_SIMD
 #endif
 
@@ -664,7 +620,7 @@ typedef unsigned char validate_uint32[sizeof(stbi__uint32)==4 ? 1 : -1];
 #define STBI_NO_SIMD
 #endif
 
-#if !defined(STBI_NO_SIMD) && defined(STBI__X86_TARGET)
+#if !defined(STBI_NO_SIMD) && (defined(STBI__X86_TARGET) || defined(STBI__X64_TARGET))
 #define STBI_SSE2
 #include <emmintrin.h>
 
@@ -693,7 +649,7 @@ static int stbi__cpuid3(void)
 
 #define STBI_SIMD_ALIGN(type, name) __declspec(align(16)) type name
 
-static int stbi__sse2_available()
+static int stbi__sse2_available(void)
 {
    int info3 = stbi__cpuid3();
    return ((info3 >> 26) & 1) != 0;
@@ -701,16 +657,12 @@ static int stbi__sse2_available()
 #else // assume GCC-style if not VC++
 #define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
 
-static int stbi__sse2_available()
+static int stbi__sse2_available(void)
 {
-#if defined(__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__) >= 408 // GCC 4.8 or later
-   // GCC 4.8+ has a nice way to do this
-   return __builtin_cpu_supports("sse2");
-#else
-   // portable way to do this, preferably without using GCC inline ASM?
-   // just bail for now.
-   return 0;
-#endif
+   // If we're even attempting to compile this on GCC/Clang, that means
+   // -msse2 is on, which means the compiler is allowed to use SSE2
+   // instructions at will, and so are we.
+   return 1;
 }
 #endif
 #endif
@@ -749,7 +701,7 @@ typedef struct
    stbi_uc buffer_start[128];
 
    stbi_uc *img_buffer, *img_buffer_end;
-   stbi_uc *img_buffer_original;
+   stbi_uc *img_buffer_original, *img_buffer_original_end;
 } stbi__context;
 
 
@@ -761,7 +713,7 @@ static void stbi__start_mem(stbi__context *s, stbi_uc const *buffer, int len)
    s->io.read = NULL;
    s->read_from_callbacks = 0;
    s->img_buffer = s->img_buffer_original = (stbi_uc *) buffer;
-   s->img_buffer_end = (stbi_uc *) buffer+len;
+   s->img_buffer_end = s->img_buffer_original_end = (stbi_uc *) buffer+len;
 }
 
 // initialize a callback-based context
@@ -773,6 +725,7 @@ static void stbi__start_callbacks(stbi__context *s, stbi_io_callbacks *c, void *
    s->read_from_callbacks = 1;
    s->img_buffer_original = s->buffer_start;
    stbi__refill_buffer(s);
+   s->img_buffer_original_end = s->img_buffer_end;
 }
 
 #ifndef STBI_NO_STDIO
@@ -814,59 +767,76 @@ static void stbi__rewind(stbi__context *s)
    // but we just rewind to the beginning of the initial buffer, because
    // we only use it after doing 'test', which only ever looks at at most 92 bytes
    s->img_buffer = s->img_buffer_original;
+   s->img_buffer_end = s->img_buffer_original_end;
 }
 
+enum // channel-order tags; presumably the values stored in stbi__result_info.channel_order (confirm)
+{
+   STBI_ORDER_RGB,   // first enumerator, so its value is 0
+   STBI_ORDER_BGR    // value is 1
+};
+
+typedef struct // handed by pointer (parameter 'ri') to the stbi__*_load decoders declared below
+{
+   int bits_per_channel;   // presumably 8 or 16 -- TODO confirm against the decoders
+   int num_channels;       // NOTE(review): not clear from here whether this is file or output channel count
+   int channel_order;      // presumably STBI_ORDER_RGB or STBI_ORDER_BGR -- TODO confirm
+} stbi__result_info;
+
 #ifndef STBI_NO_JPEG
 static int      stbi__jpeg_test(stbi__context *s);
-static stbi_uc *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static void    *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
 static int      stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp);
 #endif
 
 #ifndef STBI_NO_PNG
 static int      stbi__png_test(stbi__context *s);
-static stbi_uc *stbi__png_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static void    *stbi__png_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
 static int      stbi__png_info(stbi__context *s, int *x, int *y, int *comp);
+static int      stbi__png_is16(stbi__context *s);
 #endif
 
 #ifndef STBI_NO_BMP
 static int      stbi__bmp_test(stbi__context *s);
-static stbi_uc *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static void    *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
 static int      stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp);
 #endif
 
 #ifndef STBI_NO_TGA
 static int      stbi__tga_test(stbi__context *s);
-static stbi_uc *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static void    *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
 static int      stbi__tga_info(stbi__context *s, int *x, int *y, int *comp);
 #endif
 
 #ifndef STBI_NO_PSD
 static int      stbi__psd_test(stbi__context *s);
-static stbi_uc *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static void    *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc);
 static int      stbi__psd_info(stbi__context *s, int *x, int *y, int *comp);
+static int      stbi__psd_is16(stbi__context *s);
 #endif
 
 #ifndef STBI_NO_HDR
 static int      stbi__hdr_test(stbi__context *s);
-static float   *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static float   *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
 static int      stbi__hdr_info(stbi__context *s, int *x, int *y, int *comp);
 #endif
 
 #ifndef STBI_NO_PIC
 static int      stbi__pic_test(stbi__context *s);
-static stbi_uc *stbi__pic_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static void    *stbi__pic_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
 static int      stbi__pic_info(stbi__context *s, int *x, int *y, int *comp);
 #endif
 
 #ifndef STBI_NO_GIF
 static int      stbi__gif_test(stbi__context *s);
-static stbi_uc *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static void    *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
+static void    *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y, int *z, int *comp, int req_comp);
 static int      stbi__gif_info(stbi__context *s, int *x, int *y, int *comp);
 #endif
 
 #ifndef STBI_NO_PNM
 static int      stbi__pnm_test(stbi__context *s);
-static stbi_uc *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static void    *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
 static int      stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp);
 #endif
 
@@ -889,6 +859,81 @@ static void *stbi__malloc(size_t size)
     return STBI_MALLOC(size);
 }
 
+// stb_image uses ints pervasively, including for offset calculations.
+// therefore the largest decoded image size we can support with the
+// current code, even on 64-bit targets, is INT_MAX. this is not a
+// significant limitation for the intended use case.
+//
+// we do, however, need to make sure our size calculations don't
+// overflow. hence a few helper functions for size calculations that
+// multiply integers together, making sure that they're non-negative
+// and no overflow occurs.
+
+// return 1 if the sum a + b is valid, 0 on overflow or if b is negative.
+// only b is range-checked; the callers below pass an already-validated product as a.
+static int stbi__addsizes_valid(int a, int b)
+{
+   if (b < 0) return 0;
+   // now 0 <= b <= INT_MAX, hence also
+   // 0 <= INT_MAX - b <= INT_MAX.
+   // And "a + b <= INT_MAX" (which might overflow) is the
+   // same as a <= INT_MAX - b (no overflow)
+   return a <= INT_MAX - b;
+}
+
+// overflow guard for an int product: 1 when a*b is representable, 0 otherwise.
+// a negative factor is always rejected.
+static int stbi__mul2sizes_valid(int a, int b)
+{
+   if (b < 0 || a < 0) return 0;
+   // dividing by b needs b != 0; a zero factor can never overflow anyway
+   if (b != 0 && a > INT_MAX/b) return 0;
+   return 1;
+}
+
+// validates the size expression a*b + add: 1 if every step fits in an int, else 0
+static int stbi__mad2sizes_valid(int a, int b, int add)
+{
+   return !stbi__mul2sizes_valid(a, b) ? 0 : stbi__addsizes_valid(a*b, add);
+}
+
+// validates a*b*c + add step by step: 1 when nothing is negative or overflows, else 0
+static int stbi__mad3sizes_valid(int a, int b, int c, int add)
+{
+   if (!stbi__mul2sizes_valid(a, b) || !stbi__mul2sizes_valid(a*b, c)) return 0;
+   return stbi__addsizes_valid(a*b*c, add);
+}
+
+// validates a*b*c*d + add step by step: 1 when nothing is negative or overflows, else 0
+#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR)
+static int stbi__mad4sizes_valid(int a, int b, int c, int d, int add)
+{
+   if (!stbi__mul2sizes_valid(a, b) || !stbi__mul2sizes_valid(a*b, c)) return 0;
+   return stbi__mul2sizes_valid(a*b*c, d) && stbi__addsizes_valid(a*b*c*d, add);
+}
+#endif
+
+// mallocs with size overflow checking
+// allocates a*b + add bytes; NULL when that size has a negative term or overflows int
+static void *stbi__malloc_mad2(int a, int b, int add)
+{
+   return stbi__mad2sizes_valid(a, b, add) ? stbi__malloc(a*b + add) : NULL;
+}
+
+// allocates a*b*c + add bytes; NULL when that size has a negative term or overflows int
+static void *stbi__malloc_mad3(int a, int b, int c, int add)
+{
+   return stbi__mad3sizes_valid(a, b, c, add) ? stbi__malloc(a*b*c + add) : NULL;
+}
+
+#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR)
+// allocates a*b*c*d + add bytes; NULL when that size has a negative term or overflows int
+static void *stbi__malloc_mad4(int a, int b, int c, int d, int add)
+{
+   return stbi__mad4sizes_valid(a, b, c, d, add) ? stbi__malloc(a*b*c*d + add) : NULL;
+}
+#endif
+
 // stbi__err - error
 // stbi__errpf - error returning pointer to float
 // stbi__errpuc - error returning pointer to unsigned char
@@ -901,8 +946,8 @@ static void *stbi__malloc(size_t size)
    #define stbi__err(x,y)  stbi__err(x)
 #endif
 
-#define stbi__errpf(x,y)   ((float *) (stbi__err(x,y)?NULL:NULL))
-#define stbi__errpuc(x,y)  ((unsigned char *) (stbi__err(x,y)?NULL:NULL))
+#define stbi__errpf(x,y)   ((float *)(size_t) (stbi__err(x,y)?NULL:NULL))
+#define stbi__errpuc(x,y)  ((unsigned char *)(size_t) (stbi__err(x,y)?NULL:NULL))
 
 STBIDEF void stbi_image_free(void *retval_from_stbi_load)
 {
@@ -924,33 +969,38 @@ STBIDEF void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip)
     stbi__vertically_flip_on_load = flag_true_if_should_flip;
 }
 
-static unsigned char *stbi__load_main(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+static void *stbi__load_main(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc)
 {
+   memset(ri, 0, sizeof(*ri)); // make sure it's initialized if we add new fields
+   ri->bits_per_channel = 8; // default is 8 so most paths don't have to be changed
+   ri->channel_order = STBI_ORDER_RGB; // all current input & output are this, but this is here so we can add BGR order
+   ri->num_channels = 0;
+
    #ifndef STBI_NO_JPEG
-   if (stbi__jpeg_test(s)) return stbi__jpeg_load(s,x,y,comp,req_comp);
+   if (stbi__jpeg_test(s)) return stbi__jpeg_load(s,x,y,comp,req_comp, ri);
    #endif
    #ifndef STBI_NO_PNG
-   if (stbi__png_test(s))  return stbi__png_load(s,x,y,comp,req_comp);
+   if (stbi__png_test(s))  return stbi__png_load(s,x,y,comp,req_comp, ri);
    #endif
    #ifndef STBI_NO_BMP
-   if (stbi__bmp_test(s))  return stbi__bmp_load(s,x,y,comp,req_comp);
+   if (stbi__bmp_test(s))  return stbi__bmp_load(s,x,y,comp,req_comp, ri);
    #endif
    #ifndef STBI_NO_GIF
-   if (stbi__gif_test(s))  return stbi__gif_load(s,x,y,comp,req_comp);
+   if (stbi__gif_test(s))  return stbi__gif_load(s,x,y,comp,req_comp, ri);
    #endif
    #ifndef STBI_NO_PSD
-   if (stbi__psd_test(s))  return stbi__psd_load(s,x,y,comp,req_comp);
+   if (stbi__psd_test(s))  return stbi__psd_load(s,x,y,comp,req_comp, ri, bpc);
    #endif
    #ifndef STBI_NO_PIC
-   if (stbi__pic_test(s))  return stbi__pic_load(s,x,y,comp,req_comp);
+   if (stbi__pic_test(s))  return stbi__pic_load(s,x,y,comp,req_comp, ri);
    #endif
    #ifndef STBI_NO_PNM
-   if (stbi__pnm_test(s))  return stbi__pnm_load(s,x,y,comp,req_comp);
+   if (stbi__pnm_test(s))  return stbi__pnm_load(s,x,y,comp,req_comp, ri);
    #endif
 
    #ifndef STBI_NO_HDR
    if (stbi__hdr_test(s)) {
-      float *hdr = stbi__hdr_load(s, x,y,comp,req_comp);
+      float *hdr = stbi__hdr_load(s, x,y,comp,req_comp, ri);
       return stbi__hdr_to_ldr(hdr, *x, *y, req_comp ? req_comp : *comp);
    }
    #endif
@@ -958,58 +1008,138 @@ static unsigned char *stbi__load_main(stbi__context *s, int *x, int *y, int *com
    #ifndef STBI_NO_TGA
    // test tga last because it's a crappy test!
    if (stbi__tga_test(s))
-      return stbi__tga_load(s,x,y,comp,req_comp);
+      return stbi__tga_load(s,x,y,comp,req_comp, ri);
    #endif
 
    return stbi__errpuc("unknown image type", "Image not of any known type, or corrupt");
 }
 
-static unsigned char *stbi__load_flip(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+// narrows a 16-bit-per-channel image to 8 bits; orig is freed on success and on failure
+static stbi_uc *stbi__convert_16_to_8(stbi__uint16 *orig, int w, int h, int channels)
 {
-   unsigned char *result = stbi__load_main(s, x, y, comp, req_comp);
+   int i, img_len = w * h * channels;
+   stbi_uc *reduced;
 
-   if (stbi__vertically_flip_on_load && result != NULL) {
-      int w = *x, h = *y;
-      int depth = req_comp ? req_comp : *comp;
-      int row,col,z;
-      stbi_uc temp;
-
-      // @OPTIMIZE: use a bigger temp buffer and memcpy multiple pixels at once
-      for (row = 0; row < (h>>1); row++) {
-         for (col = 0; col < w; col++) {
-            for (z = 0; z < depth; z++) {
-               temp = result[(row * w + col) * depth + z];
-               result[(row * w + col) * depth + z] = result[((h - row - 1) * w + col) * depth + z];
-               result[((h - row - 1) * w + col) * depth + z] = temp;
-            }
-         }
+   reduced = (stbi_uc *) stbi__malloc(img_len);
+   if (reduced == NULL) { STBI_FREE(orig); return stbi__errpuc("outofmem", "Out of memory"); }
+
+   for (i = 0; i < img_len; ++i)
+      reduced[i] = (stbi_uc)((orig[i] >> 8) & 0xFF); // top byte of each 16-bit sample is sufficient approx of 16->8 bit scaling
+
+   STBI_FREE(orig);
+   return reduced;
+}
+
+// widens an 8-bit-per-channel image to 16 bits; orig is freed on success and on failure
+static stbi__uint16 *stbi__convert_8_to_16(stbi_uc *orig, int w, int h, int channels)
+{
+   int i, img_len = w * h * channels;
+   stbi__uint16 *enlarged;
+
+   enlarged = (stbi__uint16 *) stbi__malloc(img_len*2);
+   if (enlarged == NULL) { STBI_FREE(orig); return (stbi__uint16 *) stbi__errpuc("outofmem", "Out of memory"); }
+
+   for (i = 0; i < img_len; ++i)
+      enlarged[i] = (stbi__uint16)((orig[i] << 8) + orig[i]); // replicate to high and low byte, maps 0->0, 255->0xffff
+
+   STBI_FREE(orig);
+   return enlarged;
+}
+
+// mirrors the image top-to-bottom in place by swapping whole rows through a fixed stack buffer
+static void stbi__vertical_flip(void *image, int w, int h, int bytes_per_pixel)
+{
+   int top;
+   size_t stride = (size_t)w * bytes_per_pixel;
+   stbi_uc scratch[2048];
+   stbi_uc *base = (stbi_uc *)image;
+
+   for (top = 0; top < (h>>1); top++) {
+      stbi_uc *upper = base + top*stride;
+      stbi_uc *lower = base + (h - top - 1)*stride;
+      size_t remaining = stride;
+      while (remaining) { // exchange the two rows one scratch-sized piece at a time
+         size_t chunk = (remaining < sizeof(scratch)) ? remaining : sizeof(scratch);
+         memcpy(scratch, upper, chunk);
+         memcpy(upper, lower, chunk);
+         memcpy(lower, scratch, chunk);
+         upper += chunk;
+         lower += chunk;
+         remaining -= chunk;
       }
    }
+}
 
-   return result;
+// flips each of the z consecutive w*h frames stored in image vertically, one frame at a time
+static void stbi__vertical_flip_slices(void *image, int w, int h, int z, int bytes_per_pixel)
+{
+   int frame_bytes = w * h * bytes_per_pixel;
+   stbi_uc *frame = (stbi_uc *)image;
+   int remaining;
+
+   for (remaining = z; remaining > 0; --remaining, frame += frame_bytes) {
+      stbi__vertical_flip(frame, w, h, bytes_per_pixel);
+   }
+}
+
+// decodes via stbi__load_main and returns 8-bit-per-channel pixels, flipped if requested; NULL on failure
+static unsigned char *stbi__load_and_postprocess_8bit(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__result_info ri;
+   void *result = stbi__load_main(s, x, y, comp, req_comp, &ri, 8);
+
+   if (result == NULL)
+      return NULL;
+
+   if (ri.bits_per_channel != 8) {
+      STBI_ASSERT(ri.bits_per_channel == 16);
+      result = stbi__convert_16_to_8((stbi__uint16 *) result, *x, *y, req_comp == 0 ? *comp : req_comp);
+      ri.bits_per_channel = 8;
+   }
+
+   // @TODO: move stbi__convert_format to here
+
+   if (result != NULL && stbi__vertically_flip_on_load) { // result is NULL if the narrowing above ran out of memory
+      int channels = req_comp ? req_comp : *comp;
+      stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi_uc));
+   }
+   return (unsigned char *) result;
 }
 
+// decodes via stbi__load_main and returns 16-bit-per-channel pixels, flipped if requested; NULL on failure
+static stbi__uint16 *stbi__load_and_postprocess_16bit(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__result_info ri;
+   void *result = stbi__load_main(s, x, y, comp, req_comp, &ri, 16);
+
+   if (result == NULL)
+      return NULL;
+
+   if (ri.bits_per_channel != 16) {
+      STBI_ASSERT(ri.bits_per_channel == 8);
+      result = stbi__convert_8_to_16((stbi_uc *) result, *x, *y, req_comp == 0 ? *comp : req_comp);
+      ri.bits_per_channel = 16;
+   }
+
+   // @TODO: move stbi__convert_format16 to here
+   // @TODO: special case RGB-to-Y (and RGBA-to-YA) for 8-bit-to-16-bit case to keep more precision
+
+   if (result != NULL && stbi__vertically_flip_on_load) { // result is NULL if the widening above ran out of memory
+      int channels = req_comp ? req_comp : *comp;
+      stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi__uint16));
+   }
+   return (stbi__uint16 *) result;
+}
+
+#if !defined(STBI_NO_HDR) || !defined(STBI_NO_LINEAR)
 static void stbi__float_postprocess(float *result, int *x, int *y, int *comp, int req_comp)
 {
    if (stbi__vertically_flip_on_load && result != NULL) {
-      int w = *x, h = *y;
-      int depth = req_comp ? req_comp : *comp;
-      int row,col,z;
-      float temp;
-
-      // @OPTIMIZE: use a bigger temp buffer and memcpy multiple pixels at once
-      for (row = 0; row < (h>>1); row++) {
-         for (col = 0; col < w; col++) {
-            for (z = 0; z < depth; z++) {
-               temp = result[(row * w + col) * depth + z];
-               result[(row * w + col) * depth + z] = result[((h - row - 1) * w + col) * depth + z];
-               result[((h - row - 1) * w + col) * depth + z] = temp;
-            }
-         }
-      }
+      int channels = req_comp ? req_comp : *comp;
+      stbi__vertical_flip(result, *x, *y, channels * sizeof(float));
    }
 }
-
+#endif
 
 #ifndef STBI_NO_STDIO
 
@@ -1041,28 +1171,83 @@ STBIDEF stbi_uc *stbi_load_from_file(FILE *f, int *x, int *y, int *comp, int req
    unsigned char *result;
    stbi__context s;
    stbi__start_file(&s,f);
-   result = stbi__load_flip(&s,x,y,comp,req_comp);
+   result = stbi__load_and_postprocess_8bit(&s,x,y,comp,req_comp);
    if (result) {
       // need to 'unget' all the characters in the IO buffer
       fseek(f, - (int) (s.img_buffer_end - s.img_buffer), SEEK_CUR);
    }
    return result;
 }
+
+// 16-bit decode from an open FILE; on success the file position is moved back over any buffered-but-unread bytes
+STBIDEF stbi__uint16 *stbi_load_from_file_16(FILE *f, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__context ctx;
+   stbi__uint16 *pixels;
+   stbi__start_file(&ctx,f);
+   pixels = stbi__load_and_postprocess_16bit(&ctx,x,y,comp,req_comp);
+   if (pixels == NULL) return NULL;
+   // give back ("unget") whatever is still sitting unread in the IO buffer
+   fseek(f, - (int) (ctx.img_buffer_end - ctx.img_buffer), SEEK_CUR);
+   return pixels;
+}
+
+// opens filename in binary mode, decodes it to 16 bits per channel and closes the file again
+STBIDEF stbi_us *stbi_load_16(char const *filename, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__uint16 *pixels;
+   FILE *fp = stbi__fopen(filename, "rb");
+   if (fp == NULL) return (stbi_us *) stbi__errpuc("can't fopen", "Unable to open file");
+   pixels = stbi_load_from_file_16(fp,x,y,comp,req_comp);
+   fclose(fp);
+   return pixels;
+}
+
 #endif //!STBI_NO_STDIO
 
+STBIDEF stbi_us *stbi_load_16_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels)
+{
+   stbi__context ctx; // reader set up over the caller's buffer of len bytes
+   stbi__start_mem(&ctx,buffer,len);
+   return stbi__load_and_postprocess_16bit(&ctx,x,y,channels_in_file,desired_channels);
+}
+
+STBIDEF stbi_us *stbi_load_16_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *channels_in_file, int desired_channels)
+{
+   stbi__context ctx; // reader set up over the caller-supplied IO callbacks and user pointer
+   stbi__start_callbacks(&ctx, (stbi_io_callbacks *)clbk, user);
+   return stbi__load_and_postprocess_16bit(&ctx,x,y,channels_in_file,desired_channels);
+}
+
 STBIDEF stbi_uc *stbi_load_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp)
 {
    stbi__context s;
    stbi__start_mem(&s,buffer,len);
-   return stbi__load_flip(&s,x,y,comp,req_comp);
+   return stbi__load_and_postprocess_8bit(&s,x,y,comp,req_comp);
 }
 
 STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp)
 {
    stbi__context s;
    stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user);
-   return stbi__load_flip(&s,x,y,comp,req_comp);
+   return stbi__load_and_postprocess_8bit(&s,x,y,comp,req_comp);
+}
+
+#ifndef STBI_NO_GIF
+STBIDEF stbi_uc *stbi_load_gif_from_memory(stbi_uc const *buffer, int len, int **delays, int *x, int *y, int *z, int *comp, int req_comp)
+{
+   unsigned char *result;
+   stbi__context s;
+   stbi__start_mem(&s,buffer,len);
+
+   result = (unsigned char*) stbi__load_gif_main(&s, delays, x, y, z, comp, req_comp);
+   // skip the flip when decoding failed; pixel size follows req_comp when set, as in the other loaders
+   if (result != NULL && stbi__vertically_flip_on_load)
+      stbi__vertical_flip_slices( result, *x, *y, *z, req_comp ? req_comp : *comp );
+
+   return result;
 }
+#endif
 
 #ifndef STBI_NO_LINEAR
 static float *stbi__loadf_main(stbi__context *s, int *x, int *y, int *comp, int req_comp)
@@ -1070,13 +1255,14 @@ static float *stbi__loadf_main(stbi__context *s, int *x, int *y, int *comp, int
    unsigned char *data;
    #ifndef STBI_NO_HDR
    if (stbi__hdr_test(s)) {
-      float *hdr_data = stbi__hdr_load(s,x,y,comp,req_comp);
+      stbi__result_info ri;
+      float *hdr_data = stbi__hdr_load(s,x,y,comp,req_comp, &ri);
       if (hdr_data)
          stbi__float_postprocess(hdr_data,x,y,comp,req_comp);
       return hdr_data;
    }
    #endif
-   data = stbi__load_flip(s, x, y, comp, req_comp);
+   data = stbi__load_and_postprocess_8bit(s, x, y, comp, req_comp);
    if (data)
       return stbi__ldr_to_hdr(data, *x, *y, req_comp ? req_comp : *comp);
    return stbi__errpf("unknown image type", "Image not of any known type, or corrupt");
@@ -1146,13 +1332,18 @@ STBIDEF int      stbi_is_hdr          (char const *filename)
    return result;
 }
 
-STBIDEF int      stbi_is_hdr_from_file(FILE *f)
+STBIDEF int stbi_is_hdr_from_file(FILE *f)
 {
    #ifndef STBI_NO_HDR
+   long pos = ftell(f);
+   int res;
    stbi__context s;
    stbi__start_file(&s,f);
-   return stbi__hdr_test(&s);
+   res = stbi__hdr_test(&s);
+   fseek(f, pos, SEEK_SET);
+   return res;
    #else
+   STBI_NOTUSED(f);
    return 0;
    #endif
 }
@@ -1165,18 +1356,21 @@ STBIDEF int      stbi_is_hdr_from_callbacks(stbi_io_callbacks const *clbk, void
    stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user);
    return stbi__hdr_test(&s);
    #else
+   STBI_NOTUSED(clbk);
+   STBI_NOTUSED(user);
    return 0;
    #endif
 }
 
-static float stbi__h2l_gamma_i=1.0f/2.2f, stbi__h2l_scale_i=1.0f;
+#ifndef STBI_NO_LINEAR
 static float stbi__l2h_gamma=2.2f, stbi__l2h_scale=1.0f;
 
-#ifndef STBI_NO_LINEAR
 STBIDEF void   stbi_ldr_to_hdr_gamma(float gamma) { stbi__l2h_gamma = gamma; }
 STBIDEF void   stbi_ldr_to_hdr_scale(float scale) { stbi__l2h_scale = scale; }
 #endif
 
+static float stbi__h2l_gamma_i=1.0f/2.2f, stbi__h2l_scale_i=1.0f;
+
 STBIDEF void   stbi_hdr_to_ldr_gamma(float gamma) { stbi__h2l_gamma_i = 1/gamma; }
 STBIDEF void   stbi_hdr_to_ldr_scale(float scale) { stbi__h2l_scale_i = 1/scale; }
 
@@ -1285,17 +1479,23 @@ static stbi__uint32 stbi__get32be(stbi__context *s)
    return (z << 16) + stbi__get16be(s);
 }
 
+#if defined(STBI_NO_BMP) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF)
+// nothing
+#else
 static int stbi__get16le(stbi__context *s)
 {
    int z = stbi__get8(s);
    return z + (stbi__get8(s) << 8);
 }
+#endif
 
+#ifndef STBI_NO_BMP
 static stbi__uint32 stbi__get32le(stbi__context *s)
 {
    stbi__uint32 z = stbi__get16le(s);
    return z + (stbi__get16le(s) << 16);
 }
+#endif
 
 #define STBI__BYTECAST(x)  ((stbi_uc) ((x) & 255))  // truncate int to byte without warnings
 
@@ -1324,7 +1524,7 @@ static unsigned char *stbi__convert_format(unsigned char *data, int img_n, int r
    if (req_comp == img_n) return data;
    STBI_ASSERT(req_comp >= 1 && req_comp <= 4);
 
-   good = (unsigned char *) stbi__malloc(req_comp * x * y);
+   good = (unsigned char *) stbi__malloc_mad3(req_comp, x, y, 0);
    if (good == NULL) {
       STBI_FREE(data);
       return stbi__errpuc("outofmem", "Out of memory");
@@ -1334,26 +1534,75 @@ static unsigned char *stbi__convert_format(unsigned char *data, int img_n, int r
       unsigned char *src  = data + j * x * img_n   ;
       unsigned char *dest = good + j * x * req_comp;
 
-      #define COMBO(a,b)  ((a)*8+(b))
-      #define CASE(a,b)   case COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b)
+      #define STBI__COMBO(a,b)  ((a)*8+(b))
+      #define STBI__CASE(a,b)   case STBI__COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b)
+      // convert source image with img_n components to one with req_comp components;
+      // avoid switch per pixel, so use switch per scanline and massive macros
+      switch (STBI__COMBO(img_n, req_comp)) {
+         STBI__CASE(1,2) { dest[0]=src[0], dest[1]=255;                                     } break;
+         STBI__CASE(1,3) { dest[0]=dest[1]=dest[2]=src[0];                                  } break;
+         STBI__CASE(1,4) { dest[0]=dest[1]=dest[2]=src[0], dest[3]=255;                     } break;
+         STBI__CASE(2,1) { dest[0]=src[0];                                                  } break;
+         STBI__CASE(2,3) { dest[0]=dest[1]=dest[2]=src[0];                                  } break;
+         STBI__CASE(2,4) { dest[0]=dest[1]=dest[2]=src[0], dest[3]=src[1];                  } break;
+         STBI__CASE(3,4) { dest[0]=src[0],dest[1]=src[1],dest[2]=src[2],dest[3]=255;        } break;
+         STBI__CASE(3,1) { dest[0]=stbi__compute_y(src[0],src[1],src[2]);                   } break;
+         STBI__CASE(3,2) { dest[0]=stbi__compute_y(src[0],src[1],src[2]), dest[1] = 255;    } break;
+         STBI__CASE(4,1) { dest[0]=stbi__compute_y(src[0],src[1],src[2]);                   } break;
+         STBI__CASE(4,2) { dest[0]=stbi__compute_y(src[0],src[1],src[2]), dest[1] = src[3]; } break;
+         STBI__CASE(4,3) { dest[0]=src[0],dest[1]=src[1],dest[2]=src[2];                    } break;
+         default: STBI_ASSERT(0);
+      }
+      #undef STBI__CASE
+   }
+
+   STBI_FREE(data);
+   return good;
+}
+
+static stbi__uint16 stbi__compute_y_16(int r, int g, int b)
+{
+   return (stbi__uint16) ((77*r + 150*g + 29*b) >> 8); // integer luma weights (they sum to 256), then drop the 8 fraction bits
+}
+
+static stbi__uint16 *stbi__convert_format16(stbi__uint16 *data, int img_n, int req_comp, unsigned int x, unsigned int y)
+{
+   int i,j;
+   stbi__uint16 *good;
+
+   if (req_comp == img_n) return data;
+   STBI_ASSERT(req_comp >= 1 && req_comp <= 4);
+
+   good = (stbi__uint16 *) stbi__malloc(req_comp * x * y * 2);
+   if (good == NULL) {
+      STBI_FREE(data);
+      return (stbi__uint16 *) stbi__errpuc("outofmem", "Out of memory");
+   }
+
+   for (j=0; j < (int) y; ++j) {
+      stbi__uint16 *src  = data + j * x * img_n   ;
+      stbi__uint16 *dest = good + j * x * req_comp;
+
+      #define STBI__COMBO(a,b)  ((a)*8+(b))
+      #define STBI__CASE(a,b)   case STBI__COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b)
       // convert source image with img_n components to one with req_comp components;
       // avoid switch per pixel, so use switch per scanline and massive macros
-      switch (COMBO(img_n, req_comp)) {
-         CASE(1,2) dest[0]=src[0], dest[1]=255; break;
-         CASE(1,3) dest[0]=dest[1]=dest[2]=src[0]; break;
-         CASE(1,4) dest[0]=dest[1]=dest[2]=src[0], dest[3]=255; break;
-         CASE(2,1) dest[0]=src[0]; break;
-         CASE(2,3) dest[0]=dest[1]=dest[2]=src[0]; break;
-         CASE(2,4) dest[0]=dest[1]=dest[2]=src[0], dest[3]=src[1]; break;
-         CASE(3,4) dest[0]=src[0],dest[1]=src[1],dest[2]=src[2],dest[3]=255; break;
-         CASE(3,1) dest[0]=stbi__compute_y(src[0],src[1],src[2]); break;
-         CASE(3,2) dest[0]=stbi__compute_y(src[0],src[1],src[2]), dest[1] = 255; break;
-         CASE(4,1) dest[0]=stbi__compute_y(src[0],src[1],src[2]); break;
-         CASE(4,2) dest[0]=stbi__compute_y(src[0],src[1],src[2]), dest[1] = src[3]; break;
-         CASE(4,3) dest[0]=src[0],dest[1]=src[1],dest[2]=src[2]; break;
+      switch (STBI__COMBO(img_n, req_comp)) {
+         STBI__CASE(1,2) { dest[0]=src[0], dest[1]=0xffff;                                     } break;
+         STBI__CASE(1,3) { dest[0]=dest[1]=dest[2]=src[0];                                     } break;
+         STBI__CASE(1,4) { dest[0]=dest[1]=dest[2]=src[0], dest[3]=0xffff;                     } break;
+         STBI__CASE(2,1) { dest[0]=src[0];                                                     } break;
+         STBI__CASE(2,3) { dest[0]=dest[1]=dest[2]=src[0];                                     } break;
+         STBI__CASE(2,4) { dest[0]=dest[1]=dest[2]=src[0], dest[3]=src[1];                     } break;
+         STBI__CASE(3,4) { dest[0]=src[0],dest[1]=src[1],dest[2]=src[2],dest[3]=0xffff;        } break;
+         STBI__CASE(3,1) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]);                   } break;
+         STBI__CASE(3,2) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]), dest[1] = 0xffff; } break;
+         STBI__CASE(4,1) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]);                   } break;
+         STBI__CASE(4,2) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]), dest[1] = src[3]; } break;
+         STBI__CASE(4,3) { dest[0]=src[0],dest[1]=src[1],dest[2]=src[2];                       } break;
          default: STBI_ASSERT(0);
       }
-      #undef CASE
+      #undef STBI__CASE
    }
 
    STBI_FREE(data);
@@ -1364,7 +1613,9 @@ static unsigned char *stbi__convert_format(unsigned char *data, int img_n, int r
 static float   *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp)
 {
    int i,k,n;
-   float *output = (float *) stbi__malloc(x * y * comp * sizeof(float));
+   float *output;
+   if (!data) return NULL;
+   output = (float *) stbi__malloc_mad4(x, y, comp, sizeof(float), 0);
    if (output == NULL) { STBI_FREE(data); return stbi__errpf("outofmem", "Out of memory"); }
    // compute number of non-alpha components
    if (comp & 1) n = comp; else n = comp-1;
@@ -1384,7 +1635,9 @@ static float   *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp)
 static stbi_uc *stbi__hdr_to_ldr(float   *data, int x, int y, int comp)
 {
    int i,k,n;
-   stbi_uc *output = (stbi_uc *) stbi__malloc(x * y * comp);
+   stbi_uc *output;
+   if (!data) return NULL;
+   output = (stbi_uc *) stbi__malloc_mad3(x, y, comp, 0);
    if (output == NULL) { STBI_FREE(data); return stbi__errpuc("outofmem", "Out of memory"); }
    // compute number of non-alpha components
    if (comp & 1) n = comp; else n = comp-1;
@@ -1449,7 +1702,7 @@ typedef struct
    stbi__context *s;
    stbi__huffman huff_dc[4];
    stbi__huffman huff_ac[4];
-   stbi_uc dequant[4][64];
+   stbi__uint16 dequant[4][64];
    stbi__int16 fast_ac[4][1 << FAST_BITS];
 
 // sizes for components, interleaved MCUs
@@ -1485,6 +1738,9 @@ typedef struct
    int            succ_high;
    int            succ_low;
    int            eob_run;
+   int            jfif;
+   int            app14_color_transform; // Adobe APP14 tag
+   int            rgb;
 
    int scan_n, order[4];
    int restart_interval, todo;
@@ -1497,7 +1753,8 @@ typedef struct
 
 static int stbi__build_huffman(stbi__huffman *h, int *count)
 {
-   int i,j,k=0,code;
+   int i,j,k=0;
+   unsigned int code;
    // build size list for each symbol (from JPEG spec)
    for (i=0; i < 16; ++i)
       for (j=0; j < count[i]; ++j)
@@ -1513,7 +1770,7 @@ static int stbi__build_huffman(stbi__huffman *h, int *count)
       if (h->size[k] == j) {
          while (h->size[k] == j)
             h->code[k++] = (stbi__uint16) (code++);
-         if (code-1 >= (1 << j)) return stbi__err("bad code lengths","Corrupt JPEG");
+         if (code-1 >= (1u << j)) return stbi__err("bad code lengths","Corrupt JPEG");
       }
       // compute largest code + 1 for this size, preshifted as needed later
       h->maxcode[j] = code << (16-j);
@@ -1554,10 +1811,10 @@ static void stbi__build_fast_ac(stbi__int16 *fast_ac, stbi__huffman *h)
             // magnitude code followed by receive_extend code
             int k = ((i << len) & ((1 << FAST_BITS) - 1)) >> (FAST_BITS - magbits);
             int m = 1 << (magbits - 1);
-            if (k < m) k += (-1 << magbits) + 1;
+            if (k < m) k += (~0U << magbits) + 1;
             // if the result is small enough, we can fit it in fast_ac table
             if (k >= -128 && k <= 127)
-               fast_ac[i] = (stbi__int16) ((k << 8) + (run << 4) + (len + magbits));
+               fast_ac[i] = (stbi__int16) ((k * 256) + (run * 16) + (len + magbits));
          }
       }
    }
@@ -1566,9 +1823,10 @@ static void stbi__build_fast_ac(stbi__int16 *fast_ac, stbi__huffman *h)
 static void stbi__grow_buffer_unsafe(stbi__jpeg *j)
 {
    do {
-      int b = j->nomore ? 0 : stbi__get8(j->s);
+      unsigned int b = j->nomore ? 0 : stbi__get8(j->s);
       if (b == 0xff) {
          int c = stbi__get8(j->s);
+         while (c == 0xff) c = stbi__get8(j->s); // consume fill bytes
          if (c != 0) {
             j->marker = (unsigned char) c;
             j->nomore = 1;
@@ -1581,7 +1839,7 @@ static void stbi__grow_buffer_unsafe(stbi__jpeg *j)
 }
 
 // (1 << n) - 1
-static stbi__uint32 stbi__bmask[17]={0,1,3,7,15,31,63,127,255,511,1023,2047,4095,8191,16383,32767,65535};
+static const stbi__uint32 stbi__bmask[17]={0,1,3,7,15,31,63,127,255,511,1023,2047,4095,8191,16383,32767,65535};
 
 // decode a jpeg huffman value from the bitstream
 stbi_inline static int stbi__jpeg_huff_decode(stbi__jpeg *j, stbi__huffman *h)
@@ -1634,7 +1892,7 @@ stbi_inline static int stbi__jpeg_huff_decode(stbi__jpeg *j, stbi__huffman *h)
 }
 
 // bias[n] = (-1<<n) + 1
-static int const stbi__jbias[16] = {0,-1,-3,-7,-15,-31,-63,-127,-255,-511,-1023,-2047,-4095,-8191,-16383,-32767};
+static const int stbi__jbias[16] = {0,-1,-3,-7,-15,-31,-63,-127,-255,-511,-1023,-2047,-4095,-8191,-16383,-32767};
 
 // combined JPEG 'receive' and JPEG 'extend', since baseline
 // always extends everything it receives.
@@ -1677,7 +1935,7 @@ stbi_inline static int stbi__jpeg_get_bit(stbi__jpeg *j)
 
 // given a value that's at position X in the zigzag stream,
 // where does it appear in the 8x8 matrix coded as row-major?
-static stbi_uc stbi__jpeg_dezigzag[64+15] =
+static const stbi_uc stbi__jpeg_dezigzag[64+15] =
 {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
@@ -1693,7 +1951,7 @@ static stbi_uc stbi__jpeg_dezigzag[64+15] =
 };
 
 // decode one 64-entry block--
-static int stbi__jpeg_decode_block(stbi__jpeg *j, short data[64], stbi__huffman *hdc, stbi__huffman *hac, stbi__int16 *fac, int b, stbi_uc *dequant)
+static int stbi__jpeg_decode_block(stbi__jpeg *j, short data[64], stbi__huffman *hdc, stbi__huffman *hac, stbi__int16 *fac, int b, stbi__uint16 *dequant)
 {
    int diff,dc,k;
    int t;
@@ -1903,7 +2161,7 @@ stbi_inline static stbi_uc stbi__clamp(int x)
 }
 
 #define stbi__f2f(x)  ((int) (((x) * 4096 + 0.5)))
-#define stbi__fsh(x)  ((x) << 12)
+#define stbi__fsh(x)  ((x) * 4096)
 
 // derived from jidctint -- DCT_ISLOW
 #define STBI__IDCT_1D(s0,s1,s2,s3,s4,s5,s6,s7) \
@@ -1958,7 +2216,7 @@ static void stbi__idct_block(stbi_uc *out, int out_stride, short data[64])
          //    (1|2|3|4|5|6|7)==0          0     seconds
          //    all separate               -0.047 seconds
          //    1 && 2|3 && 4|5 && 6|7:    -0.047 seconds
-         int dcterm = d[0] << 2;
+         int dcterm = d[0]*4;
          v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm;
       } else {
          STBI__IDCT_1D(d[ 0],d[ 8],d[16],d[24],d[32],d[40],d[48],d[56])
@@ -2402,7 +2660,7 @@ static stbi_uc stbi__get_marker(stbi__jpeg *j)
    x = stbi__get8(j->s);
    if (x != 0xff) return STBI__MARKER_none;
    while (x == 0xff)
-      x = stbi__get8(j->s);
+      x = stbi__get8(j->s); // consume repeated 0xff fill bytes
    return x;
 }
 
@@ -2417,7 +2675,7 @@ static void stbi__jpeg_reset(stbi__jpeg *j)
    j->code_bits = 0;
    j->code_buffer = 0;
    j->nomore = 0;
-   j->img_comp[0].dc_pred = j->img_comp[1].dc_pred = j->img_comp[2].dc_pred = 0;
+   j->img_comp[0].dc_pred = j->img_comp[1].dc_pred = j->img_comp[2].dc_pred = j->img_comp[3].dc_pred = 0;
    j->marker = STBI__MARKER_none;
    j->todo = j->restart_interval ? j->restart_interval : 0x7fffffff;
    j->eob_run = 0;
@@ -2549,7 +2807,7 @@ static int stbi__parse_entropy_coded_data(stbi__jpeg *z)
    }
 }
 
-static void stbi__jpeg_dequantize(short *data, stbi_uc *dequant)
+static void stbi__jpeg_dequantize(short *data, stbi__uint16 *dequant)
 {
    int i;
    for (i=0; i < 64; ++i)
@@ -2591,13 +2849,14 @@ static int stbi__process_marker(stbi__jpeg *z, int m)
          L = stbi__get16be(z->s)-2;
          while (L > 0) {
             int q = stbi__get8(z->s);
-            int p = q >> 4;
+            int p = q >> 4, sixteen = (p != 0);
             int t = q & 15,i;
-            if (p != 0) return stbi__err("bad DQT type","Corrupt JPEG");
+            if (p != 0 && p != 1) return stbi__err("bad DQT type","Corrupt JPEG");
             if (t > 3) return stbi__err("bad DQT table","Corrupt JPEG");
+
             for (i=0; i < 64; ++i)
-               z->dequant[t][stbi__jpeg_dezigzag[i]] = stbi__get8(z->s);
-            L -= 65;
+               z->dequant[t][stbi__jpeg_dezigzag[i]] = (stbi__uint16)(sixteen ? stbi__get16be(z->s) : stbi__get8(z->s));
+            L -= (sixteen ? 129 : 65);
          }
          return L==0;
 
@@ -2630,12 +2889,50 @@ static int stbi__process_marker(stbi__jpeg *z, int m)
          }
          return L==0;
    }
+
    // check for comment block or APP blocks
    if ((m >= 0xE0 && m <= 0xEF) || m == 0xFE) {
-      stbi__skip(z->s, stbi__get16be(z->s)-2);
+      L = stbi__get16be(z->s);
+      if (L < 2) {
+         if (m == 0xFE)
+            return stbi__err("bad COM len","Corrupt JPEG");
+         else
+            return stbi__err("bad APP len","Corrupt JPEG");
+      }
+      L -= 2;
+
+      if (m == 0xE0 && L >= 5) { // JFIF APP0 segment
+         static const unsigned char tag[5] = {'J','F','I','F','\0'};
+         int ok = 1;
+         int i;
+         for (i=0; i < 5; ++i)
+            if (stbi__get8(z->s) != tag[i])
+               ok = 0;
+         L -= 5;
+         if (ok)
+            z->jfif = 1;
+      } else if (m == 0xEE && L >= 12) { // Adobe APP14 segment
+         static const unsigned char tag[6] = {'A','d','o','b','e','\0'};
+         int ok = 1;
+         int i;
+         for (i=0; i < 6; ++i)
+            if (stbi__get8(z->s) != tag[i])
+               ok = 0;
+         L -= 6;
+         if (ok) {
+            stbi__get8(z->s); // version
+            stbi__get16be(z->s); // flags0
+            stbi__get16be(z->s); // flags1
+            z->app14_color_transform = stbi__get8(z->s); // color transform
+            L -= 6;
+         }
+      }
+
+      stbi__skip(z->s, L);
       return 1;
    }
-   return 0;
+
+   return stbi__err("unknown marker","Corrupt JPEG");
 }
 
 // after we see SOS
@@ -2678,6 +2975,33 @@ static int stbi__process_scan_header(stbi__jpeg *z)
    return 1;
 }
 
+// Free the raw_data, raw_coeff and linebuf allocations of components
+// 0..ncomp-1, resetting the freed pointers (and the data/coeff pointers
+// that accompany them), then return 'why' unchanged so a caller can
+// release everything and propagate a status in a single return statement.
+static int stbi__free_jpeg_components(stbi__jpeg *z, int ncomp, int why)
+{
+   int c = 0;
+   while (c < ncomp) {
+      if (z->img_comp[c].raw_data != NULL) {
+         STBI_FREE(z->img_comp[c].raw_data);
+         z->img_comp[c].raw_data = NULL;
+         z->img_comp[c].data = NULL;
+      }
+      if (z->img_comp[c].raw_coeff != NULL) {
+         STBI_FREE(z->img_comp[c].raw_coeff);
+         z->img_comp[c].raw_coeff = 0;
+         z->img_comp[c].coeff = 0;
+      }
+      if (z->img_comp[c].linebuf != NULL) {
+         STBI_FREE(z->img_comp[c].linebuf);
+         z->img_comp[c].linebuf = NULL;
+      }
+      ++c;
+   }
+   return why;
+}
+
 static int stbi__process_frame_header(stbi__jpeg *z, int scan)
 {
    stbi__context *s = z->s;
@@ -2687,7 +3006,7 @@ static int stbi__process_frame_header(stbi__jpeg *z, int scan)
    s->img_y = stbi__get16be(s);   if (s->img_y == 0) return stbi__err("no header height", "JPEG format not supported: delayed height"); // Legal, but we don't handle it--but neither does IJG
    s->img_x = stbi__get16be(s);   if (s->img_x == 0) return stbi__err("0 width","Corrupt JPEG"); // JPEG requires
    c = stbi__get8(s);
-   if (c != 3 && c != 1) return stbi__err("bad component count","Corrupt JPEG");    // JFIF requires
+   if (c != 3 && c != 1 && c != 4) return stbi__err("bad component count","Corrupt JPEG");
    s->img_n = c;
    for (i=0; i < c; ++i) {
       z->img_comp[i].data = NULL;
@@ -2696,11 +3015,12 @@ static int stbi__process_frame_header(stbi__jpeg *z, int scan)
 
    if (Lf != 8+3*s->img_n) return stbi__err("bad SOF len","Corrupt JPEG");
 
+   z->rgb = 0;
    for (i=0; i < s->img_n; ++i) {
+      static const unsigned char rgb[3] = { 'R', 'G', 'B' };
       z->img_comp[i].id = stbi__get8(s);
-      if (z->img_comp[i].id != i+1)   // JFIF requires
-         if (z->img_comp[i].id != i)  // some version of jpegtran outputs non-JFIF-compliant files!
-            return stbi__err("bad component ID","Corrupt JPEG");
+      if (s->img_n == 3 && z->img_comp[i].id == rgb[i])
+         ++z->rgb;
       q = stbi__get8(s);
       z->img_comp[i].h = (q >> 4);  if (!z->img_comp[i].h || z->img_comp[i].h > 4) return stbi__err("bad H","Corrupt JPEG");
       z->img_comp[i].v = q & 15;    if (!z->img_comp[i].v || z->img_comp[i].v > 4) return stbi__err("bad V","Corrupt JPEG");
@@ -2709,7 +3029,7 @@ static int stbi__process_frame_header(stbi__jpeg *z, int scan)
 
    if (scan != STBI__SCAN_load) return 1;
 
-   if ((1 << 30) / s->img_x / s->img_n < s->img_y) return stbi__err("too large", "Image too large to decode");
+   if (!stbi__mad3sizes_valid(s->img_x, s->img_y, s->img_n, 0)) return stbi__err("too large", "Image too large to decode");
 
    for (i=0; i < s->img_n; ++i) {
       if (z->img_comp[i].h > h_max) h_max = z->img_comp[i].h;
@@ -2721,6 +3041,7 @@ static int stbi__process_frame_header(stbi__jpeg *z, int scan)
    z->img_v_max = v_max;
    z->img_mcu_w = h_max * 8;
    z->img_mcu_h = v_max * 8;
+   // these sizes can't be more than 17 bits
    z->img_mcu_x = (s->img_x + z->img_mcu_w-1) / z->img_mcu_w;
    z->img_mcu_y = (s->img_y + z->img_mcu_h-1) / z->img_mcu_h;
 
@@ -2732,28 +3053,27 @@ static int stbi__process_frame_header(stbi__jpeg *z, int scan)
       // the bogus oversized data from using interleaved MCUs and their
       // big blocks (e.g. a 16x16 iMCU on an image of width 33); we won't
       // discard the extra data until colorspace conversion
+      //
+      // img_mcu_x, img_mcu_y: <=17 bits; comp[i].h and .v are <=4 (checked earlier)
+      // so these muls can't overflow with 32-bit ints (which we require)
       z->img_comp[i].w2 = z->img_mcu_x * z->img_comp[i].h * 8;
       z->img_comp[i].h2 = z->img_mcu_y * z->img_comp[i].v * 8;
-      z->img_comp[i].raw_data = stbi__malloc(z->img_comp[i].w2 * z->img_comp[i].h2+15);
-
-      if (z->img_comp[i].raw_data == NULL) {
-         for(--i; i >= 0; --i) {
-            STBI_FREE(z->img_comp[i].raw_data);
-            z->img_comp[i].data = NULL;
-         }
-         return stbi__err("outofmem", "Out of memory");
-      }
+      z->img_comp[i].coeff = 0;
+      z->img_comp[i].raw_coeff = 0;
+      z->img_comp[i].linebuf = NULL;
+      z->img_comp[i].raw_data = stbi__malloc_mad2(z->img_comp[i].w2, z->img_comp[i].h2, 15);
+      if (z->img_comp[i].raw_data == NULL)
+         return stbi__free_jpeg_components(z, i+1, stbi__err("outofmem", "Out of memory"));
       // align blocks for idct using mmx/sse
       z->img_comp[i].data = (stbi_uc*) (((size_t) z->img_comp[i].raw_data + 15) & ~15);
-      z->img_comp[i].linebuf = NULL;
       if (z->progressive) {
-         z->img_comp[i].coeff_w = (z->img_comp[i].w2 + 7) >> 3;
-         z->img_comp[i].coeff_h = (z->img_comp[i].h2 + 7) >> 3;
-         z->img_comp[i].raw_coeff = STBI_MALLOC(z->img_comp[i].coeff_w * z->img_comp[i].coeff_h * 64 * sizeof(short) + 15);
+         // w2, h2 are multiples of 8 (see above)
+         z->img_comp[i].coeff_w = z->img_comp[i].w2 / 8;
+         z->img_comp[i].coeff_h = z->img_comp[i].h2 / 8;
+         z->img_comp[i].raw_coeff = stbi__malloc_mad3(z->img_comp[i].w2, z->img_comp[i].h2, sizeof(short), 15);
+         if (z->img_comp[i].raw_coeff == NULL)
+            return stbi__free_jpeg_components(z, i+1, stbi__err("outofmem", "Out of memory"));
          z->img_comp[i].coeff = (short*) (((size_t) z->img_comp[i].raw_coeff + 15) & ~15);
-      } else {
-         z->img_comp[i].coeff = 0;
-         z->img_comp[i].raw_coeff = 0;
       }
    }
 
@@ -2772,6 +3092,8 @@ static int stbi__process_frame_header(stbi__jpeg *z, int scan)
 static int stbi__decode_jpeg_header(stbi__jpeg *z, int scan)
 {
    int m;
+   z->jfif = 0;
+   z->app14_color_transform = -1; // valid values are 0,1,2
    z->marker = STBI__MARKER_none; // initialize cached marker to empty
    m = stbi__get_marker(z);
    if (!stbi__SOI(m)) return stbi__err("no SOI","Corrupt JPEG");
@@ -2813,12 +3135,15 @@ static int stbi__decode_jpeg_image(stbi__jpeg *j)
                if (x == 255) {
                   j->marker = stbi__get8(j->s);
                   break;
-               } else if (x != 0) {
-                  return stbi__err("junk before marker", "Corrupt JPEG");
                }
             }
             // if we reach eof without hitting a marker, stbi__get_marker() below will fail and we'll eventually return 0
          }
+      } else if (stbi__DNL(m)) {
+         int Ld = stbi__get16be(j->s);
+         stbi__uint32 NL = stbi__get16be(j->s);
+         if (Ld != 4) return stbi__err("bad DNL len", "Corrupt JPEG");
+         if (NL != j->s->img_y) return stbi__err("bad DNL height", "Corrupt JPEG");
       } else {
          if (!stbi__process_marker(j, m)) return 0;
       }
@@ -3037,38 +3362,9 @@ static stbi_uc *stbi__resample_row_generic(stbi_uc *out, stbi_uc *in_near, stbi_
    return out;
 }
 
-#ifdef STBI_JPEG_OLD
-// this is the same YCbCr-to-RGB calculation that stb_image has used
-// historically before the algorithm changes in 1.49
-#define float2fixed(x)  ((int) ((x) * 65536 + 0.5))
-static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y, const stbi_uc *pcb, const stbi_uc *pcr, int count, int step)
-{
-   int i;
-   for (i=0; i < count; ++i) {
-      int y_fixed = (y[i] << 16) + 32768; // rounding
-      int r,g,b;
-      int cr = pcr[i] - 128;
-      int cb = pcb[i] - 128;
-      r = y_fixed + cr*float2fixed(1.40200f);
-      g = y_fixed - cr*float2fixed(0.71414f) - cb*float2fixed(0.34414f);
-      b = y_fixed                            + cb*float2fixed(1.77200f);
-      r >>= 16;
-      g >>= 16;
-      b >>= 16;
-      if ((unsigned) r > 255) { if (r < 0) r = 0; else r = 255; }
-      if ((unsigned) g > 255) { if (g < 0) g = 0; else g = 255; }
-      if ((unsigned) b > 255) { if (b < 0) b = 0; else b = 255; }
-      out[0] = (stbi_uc)r;
-      out[1] = (stbi_uc)g;
-      out[2] = (stbi_uc)b;
-      out[3] = 255;
-      out += step;
-   }
-}
-#else
 // this is a reduced-precision calculation of YCbCr-to-RGB introduced
 // to make sure the code produces the same results in both SIMD and scalar
-#define float2fixed(x)  (((int) ((x) * 4096.0f + 0.5f)) << 8)
+#define stbi__float2fixed(x)  (((int) ((x) * 4096.0f + 0.5f)) << 8)
 static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y, const stbi_uc *pcb, const stbi_uc *pcr, int count, int step)
 {
    int i;
@@ -3077,9 +3373,9 @@ static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y, const stbi_uc
       int r,g,b;
       int cr = pcr[i] - 128;
       int cb = pcb[i] - 128;
-      r = y_fixed +  cr* float2fixed(1.40200f);
-      g = y_fixed + (cr*-float2fixed(0.71414f)) + ((cb*-float2fixed(0.34414f)) & 0xffff0000);
-      b = y_fixed                               +   cb* float2fixed(1.77200f);
+      r = y_fixed +  cr* stbi__float2fixed(1.40200f);
+      g = y_fixed + (cr*-stbi__float2fixed(0.71414f)) + ((cb*-stbi__float2fixed(0.34414f)) & 0xffff0000);
+      b = y_fixed                                     +   cb* stbi__float2fixed(1.77200f);
       r >>= 20;
       g >>= 20;
       b >>= 20;
@@ -3093,7 +3389,6 @@ static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y, const stbi_uc
       out += step;
    }
 }
-#endif
 
 #if defined(STBI_SSE2) || defined(STBI_NEON)
 static void stbi__YCbCr_to_RGB_simd(stbi_uc *out, stbi_uc const *y, stbi_uc const *pcb, stbi_uc const *pcr, int count, int step)
@@ -3212,9 +3507,9 @@ static void stbi__YCbCr_to_RGB_simd(stbi_uc *out, stbi_uc const *y, stbi_uc cons
       int r,g,b;
       int cr = pcr[i] - 128;
       int cb = pcb[i] - 128;
-      r = y_fixed + cr* float2fixed(1.40200f);
-      g = y_fixed + cr*-float2fixed(0.71414f) + ((cb*-float2fixed(0.34414f)) & 0xffff0000);
-      b = y_fixed                             +   cb* float2fixed(1.77200f);
+      r = y_fixed + cr* stbi__float2fixed(1.40200f);
+      g = y_fixed + cr*-stbi__float2fixed(0.71414f) + ((cb*-stbi__float2fixed(0.34414f)) & 0xffff0000);
+      b = y_fixed                                   +   cb* stbi__float2fixed(1.77200f);
       r >>= 20;
       g >>= 20;
       b >>= 20;
@@ -3240,18 +3535,14 @@ static void stbi__setup_jpeg(stbi__jpeg *j)
 #ifdef STBI_SSE2
    if (stbi__sse2_available()) {
       j->idct_block_kernel = stbi__idct_simd;
-      #ifndef STBI_JPEG_OLD
       j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd;
-      #endif
       j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd;
    }
 #endif
 
 #ifdef STBI_NEON
    j->idct_block_kernel = stbi__idct_simd;
-   #ifndef STBI_JPEG_OLD
    j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd;
-   #endif
    j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd;
 #endif
 }
@@ -3259,23 +3550,7 @@ static void stbi__setup_jpeg(stbi__jpeg *j)
 // clean up the temporary component buffers
 static void stbi__cleanup_jpeg(stbi__jpeg *j)
 {
-   int i;
-   for (i=0; i < j->s->img_n; ++i) {
-      if (j->img_comp[i].raw_data) {
-         STBI_FREE(j->img_comp[i].raw_data);
-         j->img_comp[i].raw_data = NULL;
-         j->img_comp[i].data = NULL;
-      }
-      if (j->img_comp[i].raw_coeff) {
-         STBI_FREE(j->img_comp[i].raw_coeff);
-         j->img_comp[i].raw_coeff = 0;
-         j->img_comp[i].coeff = 0;
-      }
-      if (j->img_comp[i].linebuf) {
-         STBI_FREE(j->img_comp[i].linebuf);
-         j->img_comp[i].linebuf = NULL;
-      }
-   }
+   stbi__free_jpeg_components(j, j->s->img_n, 0);
 }
 
 typedef struct
@@ -3288,9 +3563,16 @@ typedef struct
    int ypos;    // which pre-expansion row we're on
 } stbi__resample;
 
+// rounded (x*y)/255 for two 8-bit operands, done with shifts instead of a divide
+static stbi_uc stbi__blinn_8x8(stbi_uc x, stbi_uc y)
+{
+   unsigned int prod = (unsigned int) x * y + 128u;
+   return (stbi_uc) ((prod + (prod >> 8)) >> 8);
+}
+
 static stbi_uc *load_jpeg_image(stbi__jpeg *z, int *out_x, int *out_y, int *comp, int req_comp)
 {
-   int n, decode_n;
+   int n, decode_n, is_rgb;
    z->s->img_n = 0; // make stbi__cleanup_jpeg safe
 
    // validate req_comp
@@ -3300,9 +3582,11 @@ static stbi_uc *load_jpeg_image(stbi__jpeg *z, int *out_x, int *out_y, int *comp
    if (!stbi__decode_jpeg_image(z)) { stbi__cleanup_jpeg(z); return NULL; }
 
    // determine actual number of components to generate
-   n = req_comp ? req_comp : z->s->img_n;
+   n = req_comp ? req_comp : z->s->img_n >= 3 ? 3 : 1;
+
+   is_rgb = z->s->img_n == 3 && (z->rgb == 3 || (z->app14_color_transform == 0 && !z->jfif));
 
-   if (z->s->img_n == 3 && n < 3)
+   if (z->s->img_n == 3 && n < 3 && !is_rgb)
       decode_n = 1;
    else
       decode_n = z->s->img_n;
@@ -3339,7 +3623,7 @@ static stbi_uc *load_jpeg_image(stbi__jpeg *z, int *out_x, int *out_y, int *comp
       }
 
       // can't error after this so, this is safe
-      output = (stbi_uc *) stbi__malloc(n * z->s->img_x * z->s->img_y + 1);
+      output = (stbi_uc *) stbi__malloc_mad3(n, z->s->img_x, z->s->img_y, 1);
       if (!output) { stbi__cleanup_jpeg(z); return stbi__errpuc("outofmem", "Out of memory"); }
 
       // now go ahead and resample
@@ -3362,7 +3646,39 @@ static stbi_uc *load_jpeg_image(stbi__jpeg *z, int *out_x, int *out_y, int *comp
          if (n >= 3) {
             stbi_uc *y = coutput[0];
             if (z->s->img_n == 3) {
-               z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
+               if (is_rgb) {
+                  for (i=0; i < z->s->img_x; ++i) {
+                     out[0] = y[i];
+                     out[1] = coutput[1][i];
+                     out[2] = coutput[2][i];
+                     out[3] = 255;
+                     out += n;
+                  }
+               } else {
+                  z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
+               }
+            } else if (z->s->img_n == 4) {
+               if (z->app14_color_transform == 0) { // CMYK
+                  for (i=0; i < z->s->img_x; ++i) {
+                     stbi_uc m = coutput[3][i];
+                     out[0] = stbi__blinn_8x8(coutput[0][i], m);
+                     out[1] = stbi__blinn_8x8(coutput[1][i], m);
+                     out[2] = stbi__blinn_8x8(coutput[2][i], m);
+                     out[3] = 255;
+                     out += n;
+                  }
+               } else if (z->app14_color_transform == 2) { // YCCK
+                  z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
+                  for (i=0; i < z->s->img_x; ++i) {
+                     stbi_uc m = coutput[3][i];
+                     out[0] = stbi__blinn_8x8(255 - out[0], m);
+                     out[1] = stbi__blinn_8x8(255 - out[1], m);
+                     out[2] = stbi__blinn_8x8(255 - out[2], m);
+                     out += n;
+                  }
+               } else { // YCbCr + alpha?  Ignore the fourth channel for now
+                  z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
+               }
             } else
                for (i=0; i < z->s->img_x; ++i) {
                   out[0] = out[1] = out[2] = y[i];
@@ -3370,37 +3686,78 @@ static stbi_uc *load_jpeg_image(stbi__jpeg *z, int *out_x, int *out_y, int *comp
                   out += n;
                }
          } else {
-            stbi_uc *y = coutput[0];
-            if (n == 1)
-               for (i=0; i < z->s->img_x; ++i) out[i] = y[i];
-            else
-               for (i=0; i < z->s->img_x; ++i) *out++ = y[i], *out++ = 255;
+            if (is_rgb) {
+               if (n == 1)
+                  for (i=0; i < z->s->img_x; ++i)
+                     *out++ = stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]);
+               else {
+                  for (i=0; i < z->s->img_x; ++i, out += 2) {
+                     out[0] = stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]);
+                     out[1] = 255;
+                  }
+               }
+            } else if (z->s->img_n == 4 && z->app14_color_transform == 0) {
+               for (i=0; i < z->s->img_x; ++i) {
+                  stbi_uc m = coutput[3][i];
+                  stbi_uc r = stbi__blinn_8x8(coutput[0][i], m);
+                  stbi_uc g = stbi__blinn_8x8(coutput[1][i], m);
+                  stbi_uc b = stbi__blinn_8x8(coutput[2][i], m);
+                  out[0] = stbi__compute_y(r, g, b);
+                  out[1] = 255;
+                  out += n;
+               }
+            } else if (z->s->img_n == 4 && z->app14_color_transform == 2) {
+               for (i=0; i < z->s->img_x; ++i) {
+                  out[0] = stbi__blinn_8x8(255 - coutput[0][i], coutput[3][i]);
+                  out[1] = 255;
+                  out += n;
+               }
+            } else {
+               stbi_uc *y = coutput[0];
+               if (n == 1)
+                  for (i=0; i < z->s->img_x; ++i) out[i] = y[i];
+               else
+                  for (i=0; i < z->s->img_x; ++i) *out++ = y[i], *out++ = 255;
+            }
          }
       }
       stbi__cleanup_jpeg(z);
       *out_x = z->s->img_x;
       *out_y = z->s->img_y;
-      if (comp) *comp  = z->s->img_n; // report original components, not output
+      if (comp) *comp = z->s->img_n >= 3 ? 3 : 1; // report original components, not output
       return output;
    }
 }
 
-static unsigned char *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+// Decode the JPEG in 's' via load_jpeg_image() and return its result (NULL on failure).
+// 'ri' is unused. The decoder state lives on the heap and is released before returning.
+static void *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
 {
-   stbi__jpeg j;
-   j.s = s;
-   stbi__setup_jpeg(&j);
-   return load_jpeg_image(&j, x,y,comp,req_comp);
+   unsigned char* result;
+   stbi__jpeg* j = (stbi__jpeg*) stbi__malloc(sizeof(stbi__jpeg));
+   STBI_NOTUSED(ri);
+   if (!j) return stbi__errpuc("outofmem", "Out of memory"); // never dereference a failed allocation
+   memset(j, 0, sizeof(stbi__jpeg)); // cleanup tests the per-component pointers, so they must not be garbage
+   j->s = s;
+   stbi__setup_jpeg(j);
+   result = load_jpeg_image(j, x,y,comp,req_comp);
+   STBI_FREE(j);
+   return result;
 }
 
+// Probe whether 's' begins with a decodable JPEG header, then rewind 's'.
+// Returns 0 with an "outofmem" error if the scratch decoder state cannot be allocated.
 static int stbi__jpeg_test(stbi__context *s)
 {
    int r;
-   stbi__jpeg j;
-   j.s = s;
-   stbi__setup_jpeg(&j);
-   r = stbi__decode_jpeg_header(&j, STBI__SCAN_type);
+   stbi__jpeg* j = (stbi__jpeg*)stbi__malloc(sizeof(stbi__jpeg));
+   if (!j) return stbi__err("outofmem", "Out of memory");
+   memset(j, 0, sizeof(stbi__jpeg)); // start from a fully zeroed decoder state
+   j->s = s;
+   stbi__setup_jpeg(j);
+   r = stbi__decode_jpeg_header(j, STBI__SCAN_type);
    stbi__rewind(s);
+   STBI_FREE(j);
    return r;
 }
 
@@ -3412,15 +3761,22 @@ static int stbi__jpeg_info_raw(stbi__jpeg *j, int *x, int *y, int *comp)
    }
    if (x) *x = j->s->img_x;
    if (y) *y = j->s->img_y;
-   if (comp) *comp = j->s->img_n;
+   if (comp) *comp = j->s->img_n >= 3 ? 3 : 1;
    return 1;
 }
 
+// Run stbi__jpeg_info_raw() on a temporary heap-allocated decoder state for 's' and
+// return its result; returns 0 with an "outofmem" error if that state cannot be allocated.
 static int stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp)
 {
-   stbi__jpeg j;
-   j.s = s;
-   return stbi__jpeg_info_raw(&j, x, y, comp);
+   int result;
+   stbi__jpeg* j = (stbi__jpeg*) (stbi__malloc(sizeof(stbi__jpeg)));
+   if (!j) return stbi__err("outofmem", "Out of memory"); // never dereference a failed allocation
+   memset(j, 0, sizeof(stbi__jpeg)); // start from a fully zeroed decoder state
+   j->s = s;
+   result = stbi__jpeg_info_raw(j, x, y, comp);
+   STBI_FREE(j);
+   return result;
 }
 #endif
 
@@ -3466,7 +3818,7 @@ stbi_inline static int stbi__bit_reverse(int v, int bits)
    return stbi__bitreverse16(v) >> (16-bits);
 }
 
-static int stbi__zbuild_huffman(stbi__zhuffman *z, stbi_uc *sizelist, int num)
+static int stbi__zbuild_huffman(stbi__zhuffman *z, const stbi_uc *sizelist, int num)
 {
    int i,k=0;
    int code, next_code[16], sizes[17];
@@ -3501,10 +3853,10 @@ static int stbi__zbuild_huffman(stbi__zhuffman *z, stbi_uc *sizelist, int num)
          z->size [c] = (stbi_uc     ) s;
          z->value[c] = (stbi__uint16) i;
          if (s <= STBI__ZFAST_BITS) {
-            int k = stbi__bit_reverse(next_code[s],s);
-            while (k < (1 << STBI__ZFAST_BITS)) {
-               z->fast[k] = fastv;
-               k += (1 << s);
+            int j = stbi__bit_reverse(next_code[s],s);
+            while (j < (1 << STBI__ZFAST_BITS)) {
+               z->fast[j] = fastv;
+               j += (1 << s);
             }
          }
          ++next_code[s];
@@ -3543,7 +3895,7 @@ static void stbi__fill_bits(stbi__zbuf *z)
 {
    do {
       STBI_ASSERT(z->code_buffer < (1U << z->num_bits));
-      z->code_buffer |= stbi__zget8(z) << z->num_bits;
+      z->code_buffer |= (unsigned int) stbi__zget8(z) << z->num_bits;
       z->num_bits += 8;
    } while (z->num_bits <= 24);
 }
@@ -3593,14 +3945,15 @@ stbi_inline static int stbi__zhuffman_decode(stbi__zbuf *a, stbi__zhuffman *z)
 static int stbi__zexpand(stbi__zbuf *z, char *zout, int n)  // need to make room for n bytes
 {
    char *q;
-   int cur, limit;
+   int cur, limit, old_limit;
    z->zout = zout;
    if (!z->z_expandable) return stbi__err("output buffer limit","Corrupt PNG");
    cur   = (int) (z->zout     - z->zout_start);
-   limit = (int) (z->zout_end - z->zout_start);
+   limit = old_limit = (int) (z->zout_end - z->zout_start);
    while (cur + n > limit)
       limit *= 2;
-   q = (char *) STBI_REALLOC(z->zout_start, limit);
+   q = (char *) STBI_REALLOC_SIZED(z->zout_start, old_limit, limit);
+   STBI_NOTUSED(old_limit);
    if (q == NULL) return stbi__err("outofmem", "Out of memory");
    z->zout_start = q;
    z->zout       = q + cur;
@@ -3608,18 +3961,18 @@ static int stbi__zexpand(stbi__zbuf *z, char *zout, int n)  // need to make room
    return 1;
 }
 
-static int stbi__zlength_base[31] = {
+static const int stbi__zlength_base[31] = {
    3,4,5,6,7,8,9,10,11,13,
    15,17,19,23,27,31,35,43,51,59,
    67,83,99,115,131,163,195,227,258,0,0 };
 
-static int stbi__zlength_extra[31]=
+static const int stbi__zlength_extra[31]=
 { 0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,0,0,0 };
 
-static int stbi__zdist_base[32] = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193,
+static const int stbi__zdist_base[32] = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193,
 257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577,0,0};
 
-static int stbi__zdist_extra[32] =
+static const int stbi__zdist_extra[32] =
 { 0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13};
 
 static int stbi__parse_huffman_block(stbi__zbuf *a)
@@ -3666,7 +4019,7 @@ static int stbi__parse_huffman_block(stbi__zbuf *a)
 
 static int stbi__compute_huffman_codes(stbi__zbuf *a)
 {
-   static stbi_uc length_dezigzag[19] = { 16,17,18,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15 };
+   static const stbi_uc length_dezigzag[19] = { 16,17,18,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15 };
    stbi__zhuffman z_codelength;
    stbi_uc lencodes[286+32+137];//padding for maximum single op
    stbi_uc codelength_sizes[19];
@@ -3675,6 +4028,7 @@ static int stbi__compute_huffman_codes(stbi__zbuf *a)
    int hlit  = stbi__zreceive(a,5) + 257;
    int hdist = stbi__zreceive(a,5) + 1;
    int hclen = stbi__zreceive(a,4) + 4;
+   int ntot  = hlit + hdist;
 
    memset(codelength_sizes, 0, sizeof(codelength_sizes));
    for (i=0; i < hclen; ++i) {
@@ -3684,33 +4038,35 @@ static int stbi__compute_huffman_codes(stbi__zbuf *a)
    if (!stbi__zbuild_huffman(&z_codelength, codelength_sizes, 19)) return 0;
 
    n = 0;
-   while (n < hlit + hdist) {
+   while (n < ntot) {
       int c = stbi__zhuffman_decode(a, &z_codelength);
       if (c < 0 || c >= 19) return stbi__err("bad codelengths", "Corrupt PNG");
       if (c < 16)
          lencodes[n++] = (stbi_uc) c;
-      else if (c == 16) {
-         c = stbi__zreceive(a,2)+3;
-         memset(lencodes+n, lencodes[n-1], c);
-         n += c;
-      } else if (c == 17) {
-         c = stbi__zreceive(a,3)+3;
-         memset(lencodes+n, 0, c);
-         n += c;
-      } else {
-         STBI_ASSERT(c == 18);
-         c = stbi__zreceive(a,7)+11;
-         memset(lencodes+n, 0, c);
+      else {
+         stbi_uc fill = 0;
+         if (c == 16) {
+            c = stbi__zreceive(a,2)+3;
+            if (n == 0) return stbi__err("bad codelengths", "Corrupt PNG");
+            fill = lencodes[n-1];
+         } else if (c == 17)
+            c = stbi__zreceive(a,3)+3;
+         else {
+            STBI_ASSERT(c == 18);
+            c = stbi__zreceive(a,7)+11;
+         }
+         if (ntot - n < c) return stbi__err("bad codelengths", "Corrupt PNG");
+         memset(lencodes+n, fill, c);
          n += c;
       }
    }
-   if (n != hlit+hdist) return stbi__err("bad codelengths","Corrupt PNG");
+   if (n != ntot) return stbi__err("bad codelengths","Corrupt PNG");
    if (!stbi__zbuild_huffman(&a->z_length, lencodes, hlit)) return 0;
    if (!stbi__zbuild_huffman(&a->z_distance, lencodes+hlit, hdist)) return 0;
    return 1;
 }
 
-static int stbi__parse_uncomperssed_block(stbi__zbuf *a)
+static int stbi__parse_uncompressed_block(stbi__zbuf *a)
 {
    stbi_uc header[4];
    int len,nlen,k;
@@ -3752,9 +4108,24 @@ static int stbi__parse_zlib_header(stbi__zbuf *a)
    return 1;
 }
 
-// @TODO: should statically initialize these for optimal thread safety
-static stbi_uc stbi__zdefault_length[288], stbi__zdefault_distance[32];
-static void stbi__init_zdefaults(void)
+static const stbi_uc stbi__zdefault_length[288] =
+{
+   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+   9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+   9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+   9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8
+};
+static const stbi_uc stbi__zdefault_distance[32] =
+{
+   5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
+};
+/*
+Init algorithm:
 {
    int i;   // use <= to match clearly with spec
    for (i=0; i <= 143; ++i)     stbi__zdefault_length[i]   = 8;
@@ -3764,6 +4135,7 @@ static void stbi__init_zdefaults(void)
 
    for (i=0; i <=  31; ++i)     stbi__zdefault_distance[i] = 5;
 }
+*/
 
 static int stbi__parse_zlib(stbi__zbuf *a, int parse_header)
 {
@@ -3776,13 +4148,12 @@ static int stbi__parse_zlib(stbi__zbuf *a, int parse_header)
       final = stbi__zreceive(a,1);
       type = stbi__zreceive(a,2);
       if (type == 0) {
-         if (!stbi__parse_uncomperssed_block(a)) return 0;
+         if (!stbi__parse_uncompressed_block(a)) return 0;
       } else if (type == 3) {
          return 0;
       } else {
          if (type == 1) {
             // use fixed code lengths
-            if (!stbi__zdefault_distance[31]) stbi__init_zdefaults();
             if (!stbi__zbuild_huffman(&a->z_length  , stbi__zdefault_length  , 288)) return 0;
             if (!stbi__zbuild_huffman(&a->z_distance, stbi__zdefault_distance,  32)) return 0;
          } else {
@@ -3907,7 +4278,7 @@ static stbi__pngchunk stbi__get_chunk_header(stbi__context *s)
 
 static int stbi__check_png_header(stbi__context *s)
 {
-   static stbi_uc png_sig[8] = { 137,80,78,71,13,10,26,10 };
+   static const stbi_uc png_sig[8] = { 137,80,78,71,13,10,26,10 };
    int i;
    for (i=0; i < 8; ++i)
       if (stbi__get8(s) != png_sig[i]) return stbi__err("bad png sig","Not a PNG");
@@ -3918,6 +4289,7 @@ typedef struct
 {
    stbi__context *s;
    stbi_uc *idata, *expanded, *out;
+   int depth;
 } stbi__png;
 
 
@@ -3952,35 +4324,40 @@ static int stbi__paeth(int a, int b, int c)
    return c;
 }
 
-static stbi_uc stbi__depth_scale_table[9] = { 0, 0xff, 0x55, 0, 0x11, 0,0,0, 0x01 };
+static const stbi_uc stbi__depth_scale_table[9] = { 0, 0xff, 0x55, 0, 0x11, 0,0,0, 0x01 };
 
 // create the png data from post-deflated data
 static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 raw_len, int out_n, stbi__uint32 x, stbi__uint32 y, int depth, int color)
 {
+   int bytes = (depth == 16? 2 : 1);
    stbi__context *s = a->s;
-   stbi__uint32 i,j,stride = x*out_n;
+   stbi__uint32 i,j,stride = x*out_n*bytes;
    stbi__uint32 img_len, img_width_bytes;
    int k;
    int img_n = s->img_n; // copy it into a local for later
 
+   int output_bytes = out_n*bytes;
+   int filter_bytes = img_n*bytes;
+   int width = x;
+
    STBI_ASSERT(out_n == s->img_n || out_n == s->img_n+1);
-   a->out = (stbi_uc *) stbi__malloc(x * y * out_n); // extra bytes to write off the end into
+   a->out = (stbi_uc *) stbi__malloc_mad3(x, y, output_bytes, 0); // extra bytes to write off the end into
    if (!a->out) return stbi__err("outofmem", "Out of memory");
 
+   if (!stbi__mad3sizes_valid(img_n, x, depth, 7)) return stbi__err("too large", "Corrupt PNG");
    img_width_bytes = (((img_n * x * depth) + 7) >> 3);
    img_len = (img_width_bytes + 1) * y;
-   if (s->img_x == x && s->img_y == y) {
-      if (raw_len != img_len) return stbi__err("not enough pixels","Corrupt PNG");
-   } else { // interlaced:
-      if (raw_len < img_len) return stbi__err("not enough pixels","Corrupt PNG");
-   }
+
+   // we used to check for exact match between raw_len and img_len on non-interlaced PNGs,
+   // but issue #276 reported a PNG in the wild that had extra data at the end (all zeros),
+   // so just check for raw_len < img_len always.
+   if (raw_len < img_len) return stbi__err("not enough pixels","Corrupt PNG");
 
    for (j=0; j < y; ++j) {
       stbi_uc *cur = a->out + stride*j;
-      stbi_uc *prior = cur - stride;
+      stbi_uc *prior;
       int filter = *raw++;
-      int filter_bytes = img_n;
-      int width = x;
+
       if (filter > 4)
          return stbi__err("invalid filter","Corrupt PNG");
 
@@ -3990,6 +4367,7 @@ static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 r
          filter_bytes = 1;
          width = img_width_bytes;
       }
+      prior = cur - stride; // bugfix: need to compute this after 'cur +=' computation above
 
       // if first row, use special filter that doesn't sample previous row
       if (j == 0) filter = first_row_filter[filter];
@@ -4013,6 +4391,14 @@ static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 r
          raw += img_n;
          cur += out_n;
          prior += out_n;
+      } else if (depth == 16) {
+         if (img_n != out_n) {
+            cur[filter_bytes]   = 255; // first pixel top byte
+            cur[filter_bytes+1] = 255; // first pixel bottom byte
+         }
+         raw += filter_bytes;
+         cur += output_bytes;
+         prior += output_bytes;
       } else {
          raw += 1;
          cur += 1;
@@ -4021,38 +4407,47 @@ static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 r
 
       // this is a little gross, so that we don't switch per-pixel or per-component
       if (depth < 8 || img_n == out_n) {
-         int nk = (width - 1)*img_n;
-         #define CASE(f) \
+         int nk = (width - 1)*filter_bytes;
+         #define STBI__CASE(f) \
              case f:     \
                 for (k=0; k < nk; ++k)
          switch (filter) {
             // "none" filter turns into a memcpy here; make that explicit.
             case STBI__F_none:         memcpy(cur, raw, nk); break;
-            CASE(STBI__F_sub)          cur[k] = STBI__BYTECAST(raw[k] + cur[k-filter_bytes]); break;
-            CASE(STBI__F_up)           cur[k] = STBI__BYTECAST(raw[k] + prior[k]); break;
-            CASE(STBI__F_avg)          cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k-filter_bytes])>>1)); break;
-            CASE(STBI__F_paeth)        cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes],prior[k],prior[k-filter_bytes])); break;
-            CASE(STBI__F_avg_first)    cur[k] = STBI__BYTECAST(raw[k] + (cur[k-filter_bytes] >> 1)); break;
-            CASE(STBI__F_paeth_first)  cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes],0,0)); break;
+            STBI__CASE(STBI__F_sub)          { cur[k] = STBI__BYTECAST(raw[k] + cur[k-filter_bytes]); } break;
+            STBI__CASE(STBI__F_up)           { cur[k] = STBI__BYTECAST(raw[k] + prior[k]); } break;
+            STBI__CASE(STBI__F_avg)          { cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k-filter_bytes])>>1)); } break;
+            STBI__CASE(STBI__F_paeth)        { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes],prior[k],prior[k-filter_bytes])); } break;
+            STBI__CASE(STBI__F_avg_first)    { cur[k] = STBI__BYTECAST(raw[k] + (cur[k-filter_bytes] >> 1)); } break;
+            STBI__CASE(STBI__F_paeth_first)  { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes],0,0)); } break;
          }
-         #undef CASE
+         #undef STBI__CASE
          raw += nk;
       } else {
          STBI_ASSERT(img_n+1 == out_n);
-         #define CASE(f) \
+         #define STBI__CASE(f) \
              case f:     \
-                for (i=x-1; i >= 1; --i, cur[img_n]=255,raw+=img_n,cur+=out_n,prior+=out_n) \
-                   for (k=0; k < img_n; ++k)
+                for (i=x-1; i >= 1; --i, cur[filter_bytes]=255,raw+=filter_bytes,cur+=output_bytes,prior+=output_bytes) \
+                   for (k=0; k < filter_bytes; ++k)
          switch (filter) {
-            CASE(STBI__F_none)         cur[k] = raw[k]; break;
-            CASE(STBI__F_sub)          cur[k] = STBI__BYTECAST(raw[k] + cur[k-out_n]); break;
-            CASE(STBI__F_up)           cur[k] = STBI__BYTECAST(raw[k] + prior[k]); break;
-            CASE(STBI__F_avg)          cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k-out_n])>>1)); break;
-            CASE(STBI__F_paeth)        cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-out_n],prior[k],prior[k-out_n])); break;
-            CASE(STBI__F_avg_first)    cur[k] = STBI__BYTECAST(raw[k] + (cur[k-out_n] >> 1)); break;
-            CASE(STBI__F_paeth_first)  cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-out_n],0,0)); break;
+            STBI__CASE(STBI__F_none)         { cur[k] = raw[k]; } break;
+            STBI__CASE(STBI__F_sub)          { cur[k] = STBI__BYTECAST(raw[k] + cur[k- output_bytes]); } break;
+            STBI__CASE(STBI__F_up)           { cur[k] = STBI__BYTECAST(raw[k] + prior[k]); } break;
+            STBI__CASE(STBI__F_avg)          { cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k- output_bytes])>>1)); } break;
+            STBI__CASE(STBI__F_paeth)        { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k- output_bytes],prior[k],prior[k- output_bytes])); } break;
+            STBI__CASE(STBI__F_avg_first)    { cur[k] = STBI__BYTECAST(raw[k] + (cur[k- output_bytes] >> 1)); } break;
+            STBI__CASE(STBI__F_paeth_first)  { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k- output_bytes],0,0)); } break;
+         }
+         #undef STBI__CASE
+
+         // the loop above sets the high byte of the pixels' alpha, but for
+         // 16 bit png files we also need the low byte set. we'll do that here.
+         if (depth == 16) {
+            cur = a->out + stride*j; // start at the beginning of the row again
+            for (i=0; i < x; ++i,cur+=output_bytes) {
+               cur[filter_bytes+1] = 255;
+            }
          }
-         #undef CASE
       }
    }
 
@@ -4109,25 +4504,36 @@ static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 r
             if (k > 6) *cur++ = scale * ((*in >> 1) & 0x01);
          }
          if (img_n != out_n) {
+            int q;
             // insert alpha = 255
-            stbi_uc *cur = a->out + stride*j;
-            int i;
+            cur = a->out + stride*j;
             if (img_n == 1) {
-               for (i=x-1; i >= 0; --i) {
-                  cur[i*2+1] = 255;
-                  cur[i*2+0] = cur[i];
+               for (q=x-1; q >= 0; --q) {
+                  cur[q*2+1] = 255;
+                  cur[q*2+0] = cur[q];
                }
             } else {
                STBI_ASSERT(img_n == 3);
-               for (i=x-1; i >= 0; --i) {
-                  cur[i*4+3] = 255;
-                  cur[i*4+2] = cur[i*3+2];
-                  cur[i*4+1] = cur[i*3+1];
-                  cur[i*4+0] = cur[i*3+0];
+               for (q=x-1; q >= 0; --q) {
+                  cur[q*4+3] = 255;
+                  cur[q*4+2] = cur[q*3+2];
+                  cur[q*4+1] = cur[q*3+1];
+                  cur[q*4+0] = cur[q*3+0];
                }
             }
          }
       }
+   } else if (depth == 16) {
+      // force the image data from big-endian to platform-native.
+      // this is done in a separate pass due to the decoding relying
+      // on the data being untouched, but could probably be done
+      // per-line during decode if care is taken.
+      stbi_uc *cur = a->out;
+      stbi__uint16 *cur16 = (stbi__uint16*)cur;
+
+      for(i=0; i < x*y*out_n; ++i,cur16++,cur+=2) {
+         *cur16 = (cur[0] << 8) | cur[1];
+      }
    }
 
    return 1;
@@ -4135,13 +4541,15 @@ static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 r
 
 static int stbi__create_png_image(stbi__png *a, stbi_uc *image_data, stbi__uint32 image_data_len, int out_n, int depth, int color, int interlaced)
 {
+   int bytes = (depth == 16 ? 2 : 1);
+   int out_bytes = out_n * bytes;
    stbi_uc *final;
    int p;
    if (!interlaced)
       return stbi__create_png_image_raw(a, image_data, image_data_len, out_n, a->s->img_x, a->s->img_y, depth, color);
 
    // de-interlacing
-   final = (stbi_uc *) stbi__malloc(a->s->img_x * a->s->img_y * out_n);
+   final = (stbi_uc *) stbi__malloc_mad3(a->s->img_x, a->s->img_y, out_bytes, 0);
    for (p=0; p < 7; ++p) {
       int xorig[] = { 0,4,0,2,0,1,0 };
       int yorig[] = { 0,0,4,0,2,0,1 };
@@ -4161,8 +4569,8 @@ static int stbi__create_png_image(stbi__png *a, stbi_uc *image_data, stbi__uint3
             for (i=0; i < x; ++i) {
                int out_y = j*yspc[p]+yorig[p];
                int out_x = i*xspc[p]+xorig[p];
-               memcpy(final + out_y*a->s->img_x*out_n + out_x*out_n,
-                      a->out + (j*x+i)*out_n, out_n);
+               memcpy(final + out_y*a->s->img_x*out_bytes + out_x*out_bytes,
+                      a->out + (j*x+i)*out_bytes, out_bytes);
             }
          }
          STBI_FREE(a->out);
@@ -4200,12 +4608,37 @@ static int stbi__compute_transparency(stbi__png *z, stbi_uc tc[3], int out_n)
    return 1;
 }
 
+static int stbi__compute_transparency16(stbi__png *z, stbi__uint16 tc[3], int out_n)
+{
+   stbi__context *s = z->s;
+   stbi__uint32 i, pixel_count = s->img_x * s->img_y;
+   stbi__uint16 *p = (stbi__uint16*) z->out; // output is walked as 16-bit samples
+
+   // color-key transparency for 16-bit output: pixels whose color equals tc[] get alpha 0.
+   // the 4-channel path leaves other pixels' alpha untouched, so it must already be 65535.
+   STBI_ASSERT(out_n == 2 || out_n == 4);
+
+   if (out_n == 2) { // 2 samples per pixel: value, alpha
+      for (i = 0; i < pixel_count; ++i) {
+         p[1] = (p[0] == tc[0] ? 0 : 65535); // alpha is rewritten for every pixel
+         p += 2;
+      }
+   } else { // 4 samples per pixel: three color samples, alpha
+      for (i = 0; i < pixel_count; ++i) {
+         if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2])
+            p[3] = 0; // only matching pixels are touched
+         p += 4;
+      }
+   }
+   return 1; // always reports success
+}
+
 static int stbi__expand_png_palette(stbi__png *a, stbi_uc *palette, int len, int pal_img_n)
 {
    stbi__uint32 i, pixel_count = a->s->img_x * a->s->img_y;
    stbi_uc *p, *temp_out, *orig = a->out;
 
-   p = (stbi_uc *) stbi__malloc(pixel_count * pal_img_n);
+   p = (stbi_uc *) stbi__malloc_mad2(pixel_count, pal_img_n, 0);
    if (p == NULL) return stbi__err("outofmem", "Out of memory");
 
    // between here and free(out) below, exitting would leak
@@ -4271,9 +4704,10 @@ static void stbi__de_iphone(stbi__png *z)
             stbi_uc a = p[3];
             stbi_uc t = p[0];
             if (a) {
-               p[0] = p[2] * 255 / a;
-               p[1] = p[1] * 255 / a;
-               p[2] =  t   * 255 / a;
+               stbi_uc half = a / 2;
+               p[0] = (p[2] * 255 + half) / a;
+               p[1] = (p[1] * 255 + half) / a;
+               p[2] = ( t   * 255 + half) / a;
             } else {
                p[0] = p[2];
                p[2] = t;
@@ -4292,14 +4726,15 @@ static void stbi__de_iphone(stbi__png *z)
    }
 }
 
-#define STBI__PNG_TYPE(a,b,c,d)  (((a) << 24) + ((b) << 16) + ((c) << 8) + (d))
+#define STBI__PNG_TYPE(a,b,c,d)  (((unsigned) (a) << 24) + ((unsigned) (b) << 16) + ((unsigned) (c) << 8) + (unsigned) (d))
 
 static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
 {
    stbi_uc palette[1024], pal_img_n=0;
    stbi_uc has_trans=0, tc[3];
+   stbi__uint16 tc16[3];
    stbi__uint32 ioff=0, idata_limit=0, i, pal_len=0;
-   int first=1,k,interlace=0, color=0, depth=0, is_iphone=0;
+   int first=1,k,interlace=0, color=0, is_iphone=0;
    stbi__context *s = z->s;
 
    z->expanded = NULL;
@@ -4324,8 +4759,9 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
             if (c.length != 13) return stbi__err("bad IHDR len","Corrupt PNG");
             s->img_x = stbi__get32be(s); if (s->img_x > (1 << 24)) return stbi__err("too large","Very large image (corrupt?)");
             s->img_y = stbi__get32be(s); if (s->img_y > (1 << 24)) return stbi__err("too large","Very large image (corrupt?)");
-            depth = stbi__get8(s);  if (depth != 1 && depth != 2 && depth != 4 && depth != 8)  return stbi__err("1/2/4/8-bit only","PNG not supported: 1/2/4/8-bit only");
+            z->depth = stbi__get8(s);  if (z->depth != 1 && z->depth != 2 && z->depth != 4 && z->depth != 8 && z->depth != 16)  return stbi__err("1/2/4/8/16-bit only","PNG not supported: 1/2/4/8/16-bit only");
             color = stbi__get8(s);  if (color > 6)         return stbi__err("bad ctype","Corrupt PNG");
+            if (color == 3 && z->depth == 16)                  return stbi__err("bad ctype","Corrupt PNG");
             if (color == 3) pal_img_n = 3; else if (color & 1) return stbi__err("bad ctype","Corrupt PNG");
             comp  = stbi__get8(s);  if (comp) return stbi__err("bad comp method","Corrupt PNG");
             filter= stbi__get8(s);  if (filter) return stbi__err("bad filter method","Corrupt PNG");
@@ -4373,8 +4809,11 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
                if (!(s->img_n & 1)) return stbi__err("tRNS with alpha","Corrupt PNG");
                if (c.length != (stbi__uint32) s->img_n*2) return stbi__err("bad tRNS len","Corrupt PNG");
                has_trans = 1;
-               for (k=0; k < s->img_n; ++k)
-                  tc[k] = (stbi_uc) (stbi__get16be(s) & 255) * stbi__depth_scale_table[depth]; // non 8-bit images will be larger
+               if (z->depth == 16) {
+                  for (k = 0; k < s->img_n; ++k) tc16[k] = (stbi__uint16)stbi__get16be(s); // copy the values as-is
+               } else {
+                  for (k = 0; k < s->img_n; ++k) tc[k] = (stbi_uc)(stbi__get16be(s) & 255) * stbi__depth_scale_table[z->depth]; // non 8-bit images will be larger
+               }
             }
             break;
          }
@@ -4385,11 +4824,13 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
             if (scan == STBI__SCAN_header) { s->img_n = pal_img_n; return 1; }
             if ((int)(ioff + c.length) < (int)ioff) return 0;
             if (ioff + c.length > idata_limit) {
+               stbi__uint32 idata_limit_old = idata_limit;
                stbi_uc *p;
                if (idata_limit == 0) idata_limit = c.length > 4096 ? c.length : 4096;
                while (ioff + c.length > idata_limit)
                   idata_limit *= 2;
-               p = (stbi_uc *) STBI_REALLOC(z->idata, idata_limit); if (p == NULL) return stbi__err("outofmem", "Out of memory");
+               STBI_NOTUSED(idata_limit_old);
+               p = (stbi_uc *) STBI_REALLOC_SIZED(z->idata, idata_limit_old, idata_limit); if (p == NULL) return stbi__err("outofmem", "Out of memory");
                z->idata = p;
             }
             if (!stbi__getn(s, z->idata+ioff,c.length)) return stbi__err("outofdata","Corrupt PNG");
@@ -4403,7 +4844,7 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
             if (scan != STBI__SCAN_load) return 1;
             if (z->idata == NULL) return stbi__err("no IDAT","Corrupt PNG");
             // initial guess for decoded data size to avoid unnecessary reallocs
-            bpl = (s->img_x * depth + 7) / 8; // bytes per line, per component
+            bpl = (s->img_x * z->depth + 7) / 8; // bytes per line, per component
             raw_len = bpl * s->img_y * s->img_n /* pixels */ + s->img_y /* filter mode per row */;
             z->expanded = (stbi_uc *) stbi_zlib_decode_malloc_guesssize_headerflag((char *) z->idata, ioff, raw_len, (int *) &raw_len, !is_iphone);
             if (z->expanded == NULL) return 0; // zlib should set error
@@ -4412,9 +4853,14 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
                s->img_out_n = s->img_n+1;
             else
                s->img_out_n = s->img_n;
-            if (!stbi__create_png_image(z, z->expanded, raw_len, s->img_out_n, depth, color, interlace)) return 0;
-            if (has_trans)
-               if (!stbi__compute_transparency(z, tc, s->img_out_n)) return 0;
+            if (!stbi__create_png_image(z, z->expanded, raw_len, s->img_out_n, z->depth, color, interlace)) return 0;
+            if (has_trans) {
+               if (z->depth == 16) {
+                  if (!stbi__compute_transparency16(z, tc16, s->img_out_n)) return 0;
+               } else {
+                  if (!stbi__compute_transparency(z, tc, s->img_out_n)) return 0;
+               }
+            }
             if (is_iphone && stbi__de_iphone_flag && s->img_out_n > 2)
                stbi__de_iphone(z);
             if (pal_img_n) {
@@ -4424,6 +4870,9 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
                if (req_comp >= 3) s->img_out_n = req_comp;
                if (!stbi__expand_png_palette(z, palette, pal_len, s->img_out_n))
                   return 0;
+            } else if (has_trans) {
+               // non-paletted image with tRNS -> source image has (constant) alpha
+               ++s->img_n;
             }
             STBI_FREE(z->expanded); z->expanded = NULL;
             return 1;
@@ -4451,21 +4900,28 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
    }
 }
 
-static unsigned char *stbi__do_png(stbi__png *p, int *x, int *y, int *n, int req_comp)
+static void *stbi__do_png(stbi__png *p, int *x, int *y, int *n, int req_comp, stbi__result_info *ri)
 {
-   unsigned char *result=NULL;
+   void *result=NULL;
    if (req_comp < 0 || req_comp > 4) return stbi__errpuc("bad req_comp", "Internal error");
    if (stbi__parse_png_file(p, STBI__SCAN_load, req_comp)) {
+      if (p->depth < 8)
+         ri->bits_per_channel = 8;
+      else
+         ri->bits_per_channel = p->depth;
       result = p->out;
       p->out = NULL;
       if (req_comp && req_comp != p->s->img_out_n) {
-         result = stbi__convert_format(result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y);
+         if (ri->bits_per_channel == 8)
+            result = stbi__convert_format((unsigned char *) result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y);
+         else
+            result = stbi__convert_format16((stbi__uint16 *) result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y);
          p->s->img_out_n = req_comp;
          if (result == NULL) return result;
       }
       *x = p->s->img_x;
       *y = p->s->img_y;
-      if (n) *n = p->s->img_out_n;
+      if (n) *n = p->s->img_n;
    }
    STBI_FREE(p->out);      p->out      = NULL;
    STBI_FREE(p->expanded); p->expanded = NULL;
@@ -4474,11 +4930,11 @@ static unsigned char *stbi__do_png(stbi__png *p, int *x, int *y, int *n, int req
    return result;
 }
 
-static unsigned char *stbi__png_load(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+static void *stbi__png_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
 {
    stbi__png p;
    p.s = s;
-   return stbi__do_png(&p, x,y,comp,req_comp);
+   return stbi__do_png(&p, x,y,comp,req_comp, ri);
 }
 
 static int stbi__png_test(stbi__context *s)
@@ -4507,6 +4963,19 @@ static int stbi__png_info(stbi__context *s, int *x, int *y, int *comp)
    p.s = s;
    return stbi__png_info_raw(&p, x, y, comp);
 }
+
+static int stbi__png_is16(stbi__context *s) // 1 if the PNG's depth field is 16, otherwise 0
+{
+   stbi__png p;
+   p.s = s;
+   if (!stbi__png_info_raw(&p, NULL, NULL, NULL)) // x/y/comp outputs are not requested
+	   return 0;
+   if (p.depth != 16) {
+      stbi__rewind(p.s); // not 16-bit: rewind the stream before reporting 0
+      return 0;
+   }
+   return 1; // NOTE(review): no rewind on this path; presumably callers do not need one -- confirm
+}
 #endif
 
 // Microsoft/Windows BMP image
@@ -4558,36 +5027,46 @@ static int stbi__bitcount(unsigned int a)
    return a & 0xff;
 }
 
+// extract an arbitrarily-aligned N-bit value (N=bits)
+// from v, and then make it 8-bits long and fractionally
+// extend it to the full range.
 static int stbi__shiftsigned(int v, int shift, int bits)
 {
-   int result;
-   int z=0;
-
-   if (shift < 0) v <<= -shift;
-   else v >>= shift;
-   result = v;
-
-   z = bits;
-   while (z < 8) {
-      result += v >> z;
-      z += bits;
-   }
-   return result;
+   static unsigned int mul_table[9] = {
+      0,
+      0xff/*0b11111111*/, 0x55/*0b01010101*/, 0x49/*0b01001001*/, 0x11/*0b00010001*/,
+      0x21/*0b00100001*/, 0x41/*0b01000001*/, 0x81/*0b10000001*/, 0x01/*0b00000001*/,
+   };
+   static unsigned int shift_table[9] = {
+      0, 0,0,1,0,2,4,6,0,
+   };
+   if (shift < 0)
+      v <<= -shift;
+   else
+      v >>= shift;
+   STBI_ASSERT(v >= 0 && v < 256);
+   v >>= (8-bits);
+   STBI_ASSERT(bits >= 0 && bits <= 8);
+   return (int) ((unsigned) v * mul_table[bits]) >> shift_table[bits];
 }
 
-static stbi_uc *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+typedef struct
 {
-   stbi_uc *out;
-   unsigned int mr=0,mg=0,mb=0,ma=0, fake_a=0;
-   stbi_uc pal[256][4];
-   int psize=0,i,j,compress=0,width;
-   int bpp, flip_vertically, pad, target, offset, hsz;
+   int bpp, offset, hsz;
+   unsigned int mr,mg,mb,ma, all_a;
+} stbi__bmp_data;
+
+static void *stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info)
+{
+   int hsz;
    if (stbi__get8(s) != 'B' || stbi__get8(s) != 'M') return stbi__errpuc("not BMP", "Corrupt BMP");
    stbi__get32le(s); // discard filesize
    stbi__get16le(s); // discard reserved
    stbi__get16le(s); // discard reserved
-   offset = stbi__get32le(s);
-   hsz = stbi__get32le(s);
+   info->offset = stbi__get32le(s);
+   info->hsz = hsz = stbi__get32le(s);
+   info->mr = info->mg = info->mb = info->ma = 0;
+
    if (hsz != 12 && hsz != 40 && hsz != 56 && hsz != 108 && hsz != 124) return stbi__errpuc("unknown BMP", "BMP type not supported: unknown");
    if (hsz == 12) {
       s->img_x = stbi__get16le(s);
@@ -4597,15 +5076,9 @@ static stbi_uc *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int
       s->img_y = stbi__get32le(s);
    }
    if (stbi__get16le(s) != 1) return stbi__errpuc("bad BMP", "bad BMP");
-   bpp = stbi__get16le(s);
-   if (bpp == 1) return stbi__errpuc("monochrome", "BMP type not supported: 1-bit");
-   flip_vertically = ((int) s->img_y) > 0;
-   s->img_y = abs((int) s->img_y);
-   if (hsz == 12) {
-      if (bpp < 24)
-         psize = (offset - 14 - 24) / 3;
-   } else {
-      compress = stbi__get32le(s);
+   info->bpp = stbi__get16le(s);
+   if (hsz != 12) {
+      int compress = stbi__get32le(s);
       if (compress == 1 || compress == 2) return stbi__errpuc("BMP RLE", "BMP type not supported: RLE");
       stbi__get32le(s); // discard sizeof
       stbi__get32le(s); // discard hres
@@ -4619,27 +5092,25 @@ static stbi_uc *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int
             stbi__get32le(s);
             stbi__get32le(s);
          }
-         if (bpp == 16 || bpp == 32) {
-            mr = mg = mb = 0;
+         if (info->bpp == 16 || info->bpp == 32) {
             if (compress == 0) {
-               if (bpp == 32) {
-                  mr = 0xffu << 16;
-                  mg = 0xffu <<  8;
-                  mb = 0xffu <<  0;
-                  ma = 0xffu << 24;
-                  fake_a = 1; // @TODO: check for cases like alpha value is all 0 and switch it to 255
-                  STBI_NOTUSED(fake_a);
+               if (info->bpp == 32) {
+                  info->mr = 0xffu << 16;
+                  info->mg = 0xffu <<  8;
+                  info->mb = 0xffu <<  0;
+                  info->ma = 0xffu << 24;
+                  info->all_a = 0; // if all_a is 0 at end, then we loaded alpha channel but it was all 0
                } else {
-                  mr = 31u << 10;
-                  mg = 31u <<  5;
-                  mb = 31u <<  0;
+                  info->mr = 31u << 10;
+                  info->mg = 31u <<  5;
+                  info->mb = 31u <<  0;
                }
             } else if (compress == 3) {
-               mr = stbi__get32le(s);
-               mg = stbi__get32le(s);
-               mb = stbi__get32le(s);
+               info->mr = stbi__get32le(s);
+               info->mg = stbi__get32le(s);
+               info->mb = stbi__get32le(s);
                // not documented, but generated by photoshop and handled by mspaint
-               if (mr == mg && mg == mb) {
+               if (info->mr == info->mg && info->mg == info->mb) {
                   // ?!?!?
                   return stbi__errpuc("bad BMP", "bad BMP");
                }
@@ -4647,11 +5118,13 @@ static stbi_uc *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int
                return stbi__errpuc("bad BMP", "bad BMP");
          }
       } else {
-         STBI_ASSERT(hsz == 108 || hsz == 124);
-         mr = stbi__get32le(s);
-         mg = stbi__get32le(s);
-         mb = stbi__get32le(s);
-         ma = stbi__get32le(s);
+         int i;
+         if (hsz != 108 && hsz != 124)
+            return stbi__errpuc("bad BMP", "bad BMP");
+         info->mr = stbi__get32le(s);
+         info->mg = stbi__get32le(s);
+         info->mb = stbi__get32le(s);
+         info->ma = stbi__get32le(s);
          stbi__get32le(s); // discard color space
          for (i=0; i < 12; ++i)
             stbi__get32le(s); // discard color space parameters
@@ -4662,63 +5135,119 @@ static stbi_uc *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int
             stbi__get32le(s); // discard reserved
          }
       }
-      if (bpp < 16)
-         psize = (offset - 14 - hsz) >> 2;
    }
+   return (void *) 1;
+}
+
+
+static void *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
+{
+   stbi_uc *out;
+   unsigned int mr=0,mg=0,mb=0,ma=0, all_a;
+   stbi_uc pal[256][4];
+   int psize=0,i,j,width;
+   int flip_vertically, pad, target;
+   stbi__bmp_data info;
+   STBI_NOTUSED(ri);
+
+   info.all_a = 255;
+   if (stbi__bmp_parse_header(s, &info) == NULL)
+      return NULL; // error code already set
+
+   flip_vertically = ((int) s->img_y) > 0;
+   s->img_y = abs((int) s->img_y);
+
+   mr = info.mr;
+   mg = info.mg;
+   mb = info.mb;
+   ma = info.ma;
+   all_a = info.all_a;
+
+   if (info.hsz == 12) {
+      if (info.bpp < 24)
+         psize = (info.offset - 14 - 24) / 3;
+   } else {
+      if (info.bpp < 16)
+         psize = (info.offset - 14 - info.hsz) >> 2;
+   }
+
    s->img_n = ma ? 4 : 3;
    if (req_comp && req_comp >= 3) // we can directly decode 3 or 4
       target = req_comp;
    else
       target = s->img_n; // if they want monochrome, we'll post-convert
-   out = (stbi_uc *) stbi__malloc(target * s->img_x * s->img_y);
+
+   // sanity-check size
+   if (!stbi__mad3sizes_valid(target, s->img_x, s->img_y, 0))
+      return stbi__errpuc("too large", "Corrupt BMP");
+
+   out = (stbi_uc *) stbi__malloc_mad3(target, s->img_x, s->img_y, 0);
    if (!out) return stbi__errpuc("outofmem", "Out of memory");
-   if (bpp < 16) {
+   if (info.bpp < 16) {
       int z=0;
       if (psize == 0 || psize > 256) { STBI_FREE(out); return stbi__errpuc("invalid", "Corrupt BMP"); }
       for (i=0; i < psize; ++i) {
          pal[i][2] = stbi__get8(s);
          pal[i][1] = stbi__get8(s);
          pal[i][0] = stbi__get8(s);
-         if (hsz != 12) stbi__get8(s);
+         if (info.hsz != 12) stbi__get8(s);
          pal[i][3] = 255;
       }
-      stbi__skip(s, offset - 14 - hsz - psize * (hsz == 12 ? 3 : 4));
-      if (bpp == 4) width = (s->img_x + 1) >> 1;
-      else if (bpp == 8) width = s->img_x;
+      stbi__skip(s, info.offset - 14 - info.hsz - psize * (info.hsz == 12 ? 3 : 4));
+      if (info.bpp == 1) width = (s->img_x + 7) >> 3;
+      else if (info.bpp == 4) width = (s->img_x + 1) >> 1;
+      else if (info.bpp == 8) width = s->img_x;
       else { STBI_FREE(out); return stbi__errpuc("bad bpp", "Corrupt BMP"); }
       pad = (-width)&3;
-      for (j=0; j < (int) s->img_y; ++j) {
-         for (i=0; i < (int) s->img_x; i += 2) {
-            int v=stbi__get8(s),v2=0;
-            if (bpp == 4) {
-               v2 = v & 15;
-               v >>= 4;
+      if (info.bpp == 1) {
+         for (j=0; j < (int) s->img_y; ++j) {
+            int bit_offset = 7, v = stbi__get8(s);
+            for (i=0; i < (int) s->img_x; ++i) {
+               int color = (v>>bit_offset)&0x1;
+               out[z++] = pal[color][0];
+               out[z++] = pal[color][1];
+               out[z++] = pal[color][2];
+               if((--bit_offset) < 0) {
+                  bit_offset = 7;
+                  v = stbi__get8(s);
+               }
             }
-            out[z++] = pal[v][0];
-            out[z++] = pal[v][1];
-            out[z++] = pal[v][2];
-            if (target == 4) out[z++] = 255;
-            if (i+1 == (int) s->img_x) break;
-            v = (bpp == 8) ? stbi__get8(s) : v2;
-            out[z++] = pal[v][0];
-            out[z++] = pal[v][1];
-            out[z++] = pal[v][2];
-            if (target == 4) out[z++] = 255;
+            stbi__skip(s, pad);
+         }
+      } else {
+         for (j=0; j < (int) s->img_y; ++j) {
+            for (i=0; i < (int) s->img_x; i += 2) {
+               int v=stbi__get8(s),v2=0;
+               if (info.bpp == 4) {
+                  v2 = v & 15;
+                  v >>= 4;
+               }
+               out[z++] = pal[v][0];
+               out[z++] = pal[v][1];
+               out[z++] = pal[v][2];
+               if (target == 4) out[z++] = 255;
+               if (i+1 == (int) s->img_x) break;
+               v = (info.bpp == 8) ? stbi__get8(s) : v2;
+               out[z++] = pal[v][0];
+               out[z++] = pal[v][1];
+               out[z++] = pal[v][2];
+               if (target == 4) out[z++] = 255;
+            }
+            stbi__skip(s, pad);
          }
-         stbi__skip(s, pad);
       }
    } else {
       int rshift=0,gshift=0,bshift=0,ashift=0,rcount=0,gcount=0,bcount=0,acount=0;
       int z = 0;
       int easy=0;
-      stbi__skip(s, offset - 14 - hsz);
-      if (bpp == 24) width = 3 * s->img_x;
-      else if (bpp == 16) width = 2*s->img_x;
+      stbi__skip(s, info.offset - 14 - info.hsz);
+      if (info.bpp == 24) width = 3 * s->img_x;
+      else if (info.bpp == 16) width = 2*s->img_x;
       else /* bpp = 32 and pad = 0 */ width=0;
       pad = (-width) & 3;
-      if (bpp == 24) {
+      if (info.bpp == 24) {
          easy = 1;
-      } else if (bpp == 32) {
+      } else if (info.bpp == 32) {
          if (mb == 0xff && mg == 0xff00 && mr == 0x00ff0000 && ma == 0xff000000)
             easy = 2;
       }
@@ -4739,22 +5268,31 @@ static stbi_uc *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int
                out[z+0] = stbi__get8(s);
                z += 3;
                a = (easy == 2 ? stbi__get8(s) : 255);
+               all_a |= a;
                if (target == 4) out[z++] = a;
             }
          } else {
+            int bpp = info.bpp;
             for (i=0; i < (int) s->img_x; ++i) {
                stbi__uint32 v = (bpp == 16 ? (stbi__uint32) stbi__get16le(s) : stbi__get32le(s));
-               int a;
+               unsigned int a;
                out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mr, rshift, rcount));
                out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mg, gshift, gcount));
                out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mb, bshift, bcount));
                a = (ma ? stbi__shiftsigned(v & ma, ashift, acount) : 255);
+               all_a |= a;
                if (target == 4) out[z++] = STBI__BYTECAST(a);
             }
          }
          stbi__skip(s, pad);
       }
    }
+
+   // if alpha channel is all 0s, replace with all 255s
+   if (target == 4 && all_a == 0)
+      for (i=4*s->img_x*s->img_y-1; i >= 0; i -= 4)
+         out[i] = 255;
+
    if (flip_vertically) {
       stbi_uc t;
       for (j=0; j < (int) s->img_y>>1; ++j) {
@@ -4781,20 +5319,55 @@ static stbi_uc *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int
 // Targa Truevision - TGA
 // by Jonathan Dummer
 #ifndef STBI_NO_TGA
+// maps a TGA pixel size in bits to an STBI component count; returns 0 if the size is unsupported
+static int stbi__tga_get_comp(int bits_per_pixel, int is_grey, int* is_rgb16)
+{
+   // only RGB or RGBA (incl. 16bit) or grey allowed; is_rgb16 is optional and may be NULL
+   if (is_rgb16) *is_rgb16 = 0;
+   switch(bits_per_pixel) {
+      case 8:  return STBI_grey;
+      case 16: if(is_grey) return STBI_grey_alpha;
+               // fallthrough
+      case 15: if(is_rgb16) *is_rgb16 = 1; // also reached for non-grey 16-bit; flags 15/16-bit RGB for the caller
+               return STBI_rgb;
+      case 24: // fallthrough
+      case 32: return bits_per_pixel/8; // 3 or 4 components
+      default: return 0;
+   }
+}
+
 static int stbi__tga_info(stbi__context *s, int *x, int *y, int *comp)
 {
-    int tga_w, tga_h, tga_comp;
-    int sz;
+    int tga_w, tga_h, tga_comp, tga_image_type, tga_bits_per_pixel, tga_colormap_bpp;
+    int sz, tga_colormap_type;
     stbi__get8(s);                   // discard Offset
-    sz = stbi__get8(s);              // color type
-    if( sz > 1 ) {
+    tga_colormap_type = stbi__get8(s); // colormap type
+    if( tga_colormap_type > 1 ) {
         stbi__rewind(s);
         return 0;      // only RGB or indexed allowed
     }
-    sz = stbi__get8(s);              // image type
-    // only RGB or grey allowed, +/- RLE
-    if ((sz != 1) && (sz != 2) && (sz != 3) && (sz != 9) && (sz != 10) && (sz != 11)) return 0;
-    stbi__skip(s,9);
+    tga_image_type = stbi__get8(s); // image type
+    if ( tga_colormap_type == 1 ) { // colormapped (paletted) image
+        if (tga_image_type != 1 && tga_image_type != 9) {
+            stbi__rewind(s);
+            return 0;
+        }
+        stbi__skip(s,4);       // skip index of first colormap entry and number of entries
+        sz = stbi__get8(s);    //   check bits per palette color entry
+        if ( (sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32) ) {
+            stbi__rewind(s);
+            return 0;
+        }
+        stbi__skip(s,4);       // skip image x and y origin
+        tga_colormap_bpp = sz;
+    } else { // "normal" image w/o colormap - only RGB or grey allowed, +/- RLE
+        if ( (tga_image_type != 2) && (tga_image_type != 3) && (tga_image_type != 10) && (tga_image_type != 11) ) {
+            stbi__rewind(s);
+            return 0; // only RGB or grey allowed, +/- RLE
+        }
+        stbi__skip(s,9); // skip colormap specification and image x/y origin
+        tga_colormap_bpp = 0;
+    }
     tga_w = stbi__get16le(s);
     if( tga_w < 1 ) {
         stbi__rewind(s);
@@ -4805,45 +5378,81 @@ static int stbi__tga_info(stbi__context *s, int *x, int *y, int *comp)
         stbi__rewind(s);
         return 0;   // test height
     }
-    sz = stbi__get8(s);               // bits per pixel
-    // only RGB or RGBA or grey allowed
-    if ((sz != 8) && (sz != 16) && (sz != 24) && (sz != 32)) {
-        stbi__rewind(s);
-        return 0;
+    tga_bits_per_pixel = stbi__get8(s); // bits per pixel
+    stbi__get8(s); // ignore alpha bits
+    if (tga_colormap_bpp != 0) {
+        if((tga_bits_per_pixel != 8) && (tga_bits_per_pixel != 16)) {
+            // when using a colormap, tga_bits_per_pixel is the size of the indexes
+            // I don't think anything but 8 or 16bit indexes makes sense
+            stbi__rewind(s);
+            return 0;
+        }
+        tga_comp = stbi__tga_get_comp(tga_colormap_bpp, 0, NULL);
+    } else {
+        tga_comp = stbi__tga_get_comp(tga_bits_per_pixel, (tga_image_type == 3) || (tga_image_type == 11), NULL);
+    }
+    if(!tga_comp) {
+      stbi__rewind(s);
+      return 0;
     }
-    tga_comp = sz;
     if (x) *x = tga_w;
     if (y) *y = tga_h;
-    if (comp) *comp = tga_comp / 8;
+    if (comp) *comp = tga_comp;
     return 1;                   // seems to have passed everything
 }
 
 static int stbi__tga_test(stbi__context *s)
 {
-   int res;
-   int sz;
+   int res = 0;
+   int sz, tga_color_type;
    stbi__get8(s);      //   discard Offset
-   sz = stbi__get8(s);   //   color type
-   if ( sz > 1 ) return 0;   //   only RGB or indexed allowed
+   tga_color_type = stbi__get8(s);   //   color type
+   if ( tga_color_type > 1 ) goto errorEnd;   //   only RGB or indexed allowed
    sz = stbi__get8(s);   //   image type
-   if ( (sz != 1) && (sz != 2) && (sz != 3) && (sz != 9) && (sz != 10) && (sz != 11) ) return 0;   //   only RGB or grey allowed, +/- RLE
-   stbi__get16be(s);      //   discard palette start
-   stbi__get16be(s);      //   discard palette length
-   stbi__get8(s);         //   discard bits per palette color entry
-   stbi__get16be(s);      //   discard x origin
-   stbi__get16be(s);      //   discard y origin
-   if ( stbi__get16be(s) < 1 ) return 0;      //   test width
-   if ( stbi__get16be(s) < 1 ) return 0;      //   test height
+   if ( tga_color_type == 1 ) { // colormapped (paletted) image
+      if (sz != 1 && sz != 9) goto errorEnd; // colortype 1 demands image type 1 or 9
+      stbi__skip(s,4);       // skip index of first colormap entry and number of entries
+      sz = stbi__get8(s);    //   check bits per palette color entry
+      if ( (sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32) ) goto errorEnd;
+      stbi__skip(s,4);       // skip image x and y origin
+   } else { // "normal" image w/o colormap
+      if ( (sz != 2) && (sz != 3) && (sz != 10) && (sz != 11) ) goto errorEnd; // only RGB or grey allowed, +/- RLE
+      stbi__skip(s,9); // skip colormap specification and image x/y origin
+   }
+   if ( stbi__get16le(s) < 1 ) goto errorEnd;      //   test width
+   if ( stbi__get16le(s) < 1 ) goto errorEnd;      //   test height
    sz = stbi__get8(s);   //   bits per pixel
-   if ( (sz != 8) && (sz != 16) && (sz != 24) && (sz != 32) )
-      res = 0;
-   else
-      res = 1;
+   if ( (tga_color_type == 1) && (sz != 8) && (sz != 16) ) goto errorEnd; // for colormapped images, bpp is size of an index
+   if ( (sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32) ) goto errorEnd;
+
+   res = 1; // if we got this far, everything's good and we can return 1 instead of 0
+
+errorEnd:
    stbi__rewind(s);
    return res;
 }
 
-static stbi_uc *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+// read 16bit value and convert to 24bit RGB
+static void stbi__tga_read_rgb16(stbi__context *s, stbi_uc* out)
+{
+   stbi__uint16 px = (stbi__uint16)stbi__get16le(s);
+   stbi__uint16 fiveBitMask = 31;
+   // we have 3 channels with 5bits each
+   int r = (px >> 10) & fiveBitMask;
+   int g = (px >> 5) & fiveBitMask;
+   int b = px & fiveBitMask;
+   // Note that this saves the data in RGB(A) order, so it doesn't need to be swapped later
+   out[0] = (stbi_uc)((r * 255)/31);
+   out[1] = (stbi_uc)((g * 255)/31);
+   out[2] = (stbi_uc)((b * 255)/31);
+
+   // some people claim that the most significant bit might be used for alpha
+   // (possibly if an alpha-bit is set in the "image descriptor byte")
+   // but that only made 16bit test images completely translucent..
+   // so let's treat all 15 and 16bit TGAs as RGB with no alpha.
+}
+
+static void *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
 {
    //   read in the TGA header stuff
    int tga_offset = stbi__get8(s);
@@ -4858,16 +5467,18 @@ static stbi_uc *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int
    int tga_width = stbi__get16le(s);
    int tga_height = stbi__get16le(s);
    int tga_bits_per_pixel = stbi__get8(s);
-   int tga_comp = tga_bits_per_pixel / 8;
+   int tga_comp, tga_rgb16=0;
    int tga_inverted = stbi__get8(s);
+   // int tga_alpha_bits = tga_inverted & 15; // the 4 lowest bits - unused (useless?)
    //   image data
    unsigned char *tga_data;
    unsigned char *tga_palette = NULL;
    int i, j;
-   unsigned char raw_data[4];
+   unsigned char raw_data[4] = {0};
    int RLE_count = 0;
    int RLE_repeating = 0;
    int read_next_pixel = 1;
+   STBI_NOTUSED(ri);
 
    //   do a tiny bit of precessing
    if ( tga_image_type >= 8 )
@@ -4875,41 +5486,33 @@ static stbi_uc *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int
       tga_image_type -= 8;
       tga_is_RLE = 1;
    }
-   /* int tga_alpha_bits = tga_inverted & 15; */
    tga_inverted = 1 - ((tga_inverted >> 5) & 1);
 
-   //   error check
-   if ( //(tga_indexed) ||
-      (tga_width < 1) || (tga_height < 1) ||
-      (tga_image_type < 1) || (tga_image_type > 3) ||
-      ((tga_bits_per_pixel != 8) && (tga_bits_per_pixel != 16) &&
-      (tga_bits_per_pixel != 24) && (tga_bits_per_pixel != 32))
-      )
-   {
-      return NULL; // we don't report this as a bad TGA because we don't even know if it's TGA
-   }
-
    //   If I'm paletted, then I'll use the number of bits from the palette
-   if ( tga_indexed )
-   {
-      tga_comp = tga_palette_bits / 8;
-   }
+   if ( tga_indexed ) tga_comp = stbi__tga_get_comp(tga_palette_bits, 0, &tga_rgb16);
+   else tga_comp = stbi__tga_get_comp(tga_bits_per_pixel, (tga_image_type == 3), &tga_rgb16);
+
+   if(!tga_comp) // shouldn't really happen, stbi__tga_test() should have ensured basic consistency
+      return stbi__errpuc("bad format", "Can't find out TGA pixelformat");
 
    //   tga info
    *x = tga_width;
    *y = tga_height;
    if (comp) *comp = tga_comp;
 
-   tga_data = (unsigned char*)stbi__malloc( (size_t)tga_width * tga_height * tga_comp );
+   if (!stbi__mad3sizes_valid(tga_width, tga_height, tga_comp, 0))
+      return stbi__errpuc("too large", "Corrupt TGA");
+
+   tga_data = (unsigned char*)stbi__malloc_mad3(tga_width, tga_height, tga_comp, 0);
    if (!tga_data) return stbi__errpuc("outofmem", "Out of memory");
 
    // skip to the data's starting position (offset usually = 0)
    stbi__skip(s, tga_offset );
 
-   if ( !tga_indexed && !tga_is_RLE) {
+   if ( !tga_indexed && !tga_is_RLE && !tga_rgb16 ) {
       for (i=0; i < tga_height; ++i) {
-         int y = tga_inverted ? tga_height -i - 1 : i;
-         stbi_uc *tga_row = tga_data + y*tga_width*tga_comp;
+         int row = tga_inverted ? tga_height -i - 1 : i;
+         stbi_uc *tga_row = tga_data + row*tga_width*tga_comp;
          stbi__getn(s, tga_row, tga_width * tga_comp);
       }
    } else  {
@@ -4919,15 +5522,22 @@ static stbi_uc *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int
          //   any data to skip? (offset usually = 0)
          stbi__skip(s, tga_palette_start );
          //   load the palette
-         tga_palette = (unsigned char*)stbi__malloc( tga_palette_len * tga_palette_bits / 8 );
+         tga_palette = (unsigned char*)stbi__malloc_mad2(tga_palette_len, tga_comp, 0);
          if (!tga_palette) {
             STBI_FREE(tga_data);
             return stbi__errpuc("outofmem", "Out of memory");
          }
-         if (!stbi__getn(s, tga_palette, tga_palette_len * tga_palette_bits / 8 )) {
-            STBI_FREE(tga_data);
-            STBI_FREE(tga_palette);
-            return stbi__errpuc("bad palette", "Corrupt TGA");
+         if (tga_rgb16) {
+            stbi_uc *pal_entry = tga_palette;
+            STBI_ASSERT(tga_comp == STBI_rgb);
+            for (i=0; i < tga_palette_len; ++i) {
+               stbi__tga_read_rgb16(s, pal_entry);
+               pal_entry += tga_comp;
+            }
+         } else if (!stbi__getn(s, tga_palette, tga_palette_len * tga_comp)) {
+               STBI_FREE(tga_data);
+               STBI_FREE(tga_palette);
+               return stbi__errpuc("bad palette", "Corrupt TGA");
          }
       }
       //   load the data
@@ -4957,23 +5567,22 @@ static stbi_uc *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int
             //   load however much data we did have
             if ( tga_indexed )
             {
-               //   read in 1 byte, then perform the lookup
-               int pal_idx = stbi__get8(s);
-               if ( pal_idx >= tga_palette_len )
-               {
-                  //   invalid index
+               // read in index, then perform the lookup
+               int pal_idx = (tga_bits_per_pixel == 8) ? stbi__get8(s) : stbi__get16le(s);
+               if ( pal_idx >= tga_palette_len ) {
+                  // invalid index
                   pal_idx = 0;
                }
-               pal_idx *= tga_bits_per_pixel / 8;
-               for (j = 0; j*8 < tga_bits_per_pixel; ++j)
-               {
+               pal_idx *= tga_comp;
+               for (j = 0; j < tga_comp; ++j) {
                   raw_data[j] = tga_palette[pal_idx+j];
                }
-            } else
-            {
+            } else if(tga_rgb16) {
+               STBI_ASSERT(tga_comp == STBI_rgb);
+               stbi__tga_read_rgb16(s, raw_data);
+            } else {
                //   read in the data raw
-               for (j = 0; j*8 < tga_bits_per_pixel; ++j)
-               {
+               for (j = 0; j < tga_comp; ++j) {
                   raw_data[j] = stbi__get8(s);
                }
             }
@@ -5012,8 +5621,8 @@ static stbi_uc *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int
       }
    }
 
-   // swap RGB
-   if (tga_comp >= 3)
+   // swap RGB - if the source data was RGB16, it already is in the right order
+   if (tga_comp >= 3 && !tga_rgb16)
    {
       unsigned char* tga_pixel = tga_data;
       for (i=0; i < tga_width * tga_height; ++i)
@@ -5049,13 +5658,53 @@ static int stbi__psd_test(stbi__context *s)
    return r;
 }
 
-static stbi_uc *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+static int stbi__psd_decode_rle(stbi__context *s, stbi_uc *p, int pixelCount)
 {
-   int   pixelCount;
+   int count, nleft, len;
+
+   count = 0;
+   while ((nleft = pixelCount - count) > 0) {
+      len = stbi__get8(s);
+      if (len == 128) {
+         // No-op.
+      } else if (len < 128) {
+         // Copy next len+1 bytes literally.
+         len++;
+         if (len > nleft) return 0; // corrupt data
+         count += len;
+         while (len) {
+            *p = stbi__get8(s);
+            p += 4;
+            len--;
+         }
+      } else if (len > 128) {
+         stbi_uc   val;
+         // Next -len+1 bytes in the dest are replicated from next source byte.
+         // (Interpret len as a negative 8-bit int.)
+         len = 257 - len;
+         if (len > nleft) return 0; // corrupt data
+         val = stbi__get8(s);
+         count += len;
+         while (len) {
+            *p = val;
+            p += 4;
+            len--;
+         }
+      }
+   }
+
+   return 1;
+}
+
+static void *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc)
+{
+   int pixelCount;
    int channelCount, compression;
-   int channel, i, count, len;
+   int channel, i;
+   int bitdepth;
    int w,h;
    stbi_uc *out;
+   STBI_NOTUSED(ri);
 
    // Check identifier
    if (stbi__get32be(s) != 0x38425053)   // "8BPS"
@@ -5078,8 +5727,9 @@ static stbi_uc *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int
    w = stbi__get32be(s);
 
    // Make sure the depth is 8 bits.
-   if (stbi__get16be(s) != 8)
-      return stbi__errpuc("unsupported bit depth", "PSD bit depth is not 8 bit");
+   bitdepth = stbi__get16be(s);
+   if (bitdepth != 8 && bitdepth != 16)
+      return stbi__errpuc("unsupported bit depth", "PSD bit depth is not 8 or 16 bit");
 
    // Make sure the color mode is RGB.
    // Valid options are:
@@ -5111,8 +5761,18 @@ static stbi_uc *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int
    if (compression > 1)
       return stbi__errpuc("bad compression", "PSD has an unknown compression format");
 
+   // Check size
+   if (!stbi__mad3sizes_valid(4, w, h, 0))
+      return stbi__errpuc("too large", "Corrupt PSD");
+
    // Create the destination image.
-   out = (stbi_uc *) stbi__malloc(4 * w*h);
+
+   if (!compression && bitdepth == 16 && bpc == 16) {
+      out = (stbi_uc *) stbi__malloc_mad3(8, w, h, 0);
+      ri->bits_per_channel = 16;
+   } else
+      out = (stbi_uc *) stbi__malloc(4 * w*h);
+
    if (!out) return stbi__errpuc("outofmem", "Out of memory");
    pixelCount = w*h;
 
@@ -5144,61 +5804,86 @@ static stbi_uc *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int
                *p = (channel == 3 ? 255 : 0);
          } else {
             // Read the RLE data.
-            count = 0;
-            while (count < pixelCount) {
-               len = stbi__get8(s);
-               if (len == 128) {
-                  // No-op.
-               } else if (len < 128) {
-                  // Copy next len+1 bytes literally.
-                  len++;
-                  count += len;
-                  while (len) {
-                     *p = stbi__get8(s);
-                     p += 4;
-                     len--;
-                  }
-               } else if (len > 128) {
-                  stbi_uc   val;
-                  // Next -len+1 bytes in the dest are replicated from next source byte.
-                  // (Interpret len as a negative 8-bit int.)
-                  len ^= 0x0FF;
-                  len += 2;
-                  val = stbi__get8(s);
-                  count += len;
-                  while (len) {
-                     *p = val;
-                     p += 4;
-                     len--;
-                  }
-               }
+            if (!stbi__psd_decode_rle(s, p, pixelCount)) {
+               STBI_FREE(out);
+               return stbi__errpuc("corrupt", "bad RLE data");
             }
          }
       }
 
    } else {
       // We're at the raw image data.  It's each channel in order (Red, Green, Blue, Alpha, ...)
-      // where each channel consists of an 8-bit value for each pixel in the image.
+      // where each channel consists of an 8-bit (or 16-bit) value for each pixel in the image.
 
       // Read the data by channel.
       for (channel = 0; channel < 4; channel++) {
-         stbi_uc *p;
-
-         p = out + channel;
-         if (channel > channelCount) {
+         if (channel >= channelCount) {
             // Fill this channel with default data.
-            for (i = 0; i < pixelCount; i++, p += 4)
-               *p = channel == 3 ? 255 : 0;
+            if (bitdepth == 16 && bpc == 16) {
+               stbi__uint16 *q = ((stbi__uint16 *) out) + channel;
+               stbi__uint16 val = channel == 3 ? 65535 : 0;
+               for (i = 0; i < pixelCount; i++, q += 4)
+                  *q = val;
+            } else {
+               stbi_uc *p = out+channel;
+               stbi_uc val = channel == 3 ? 255 : 0;
+               for (i = 0; i < pixelCount; i++, p += 4)
+                  *p = val;
+            }
          } else {
-            // Read the data.
-            for (i = 0; i < pixelCount; i++, p += 4)
-               *p = stbi__get8(s);
+            if (ri->bits_per_channel == 16) {    // output bpc
+               stbi__uint16 *q = ((stbi__uint16 *) out) + channel;
+               for (i = 0; i < pixelCount; i++, q += 4)
+                  *q = (stbi__uint16) stbi__get16be(s);
+            } else {
+               stbi_uc *p = out+channel;
+               if (bitdepth == 16) {  // input bpc
+                  for (i = 0; i < pixelCount; i++, p += 4)
+                     *p = (stbi_uc) (stbi__get16be(s) >> 8);
+               } else {
+                  for (i = 0; i < pixelCount; i++, p += 4)
+                     *p = stbi__get8(s);
+               }
+            }
+         }
+      }
+   }
+
+   // remove weird white matte from PSD
+   if (channelCount >= 4) {
+      if (ri->bits_per_channel == 16) {
+         for (i=0; i < w*h; ++i) {
+            stbi__uint16 *pixel = (stbi__uint16 *) out + 4*i;
+            if (pixel[3] != 0 && pixel[3] != 65535) {
+               float a = pixel[3] / 65535.0f;
+               float ra = 1.0f / a;
+               float inv_a = 65535.0f * (1 - ra);
+               pixel[0] = (stbi__uint16) (pixel[0]*ra + inv_a);
+               pixel[1] = (stbi__uint16) (pixel[1]*ra + inv_a);
+               pixel[2] = (stbi__uint16) (pixel[2]*ra + inv_a);
+            }
+         }
+      } else {
+         for (i=0; i < w*h; ++i) {
+            unsigned char *pixel = out + 4*i;
+            if (pixel[3] != 0 && pixel[3] != 255) {
+               float a = pixel[3] / 255.0f;
+               float ra = 1.0f / a;
+               float inv_a = 255.0f * (1 - ra);
+               pixel[0] = (unsigned char) (pixel[0]*ra + inv_a);
+               pixel[1] = (unsigned char) (pixel[1]*ra + inv_a);
+               pixel[2] = (unsigned char) (pixel[2]*ra + inv_a);
+            }
          }
       }
    }
 
+   // convert to desired output format
    if (req_comp && req_comp != 4) {
-      out = stbi__convert_format(out, 4, req_comp, w, h);
+      if (ri->bits_per_channel == 16)
+         out = (stbi_uc *) stbi__convert_format16((stbi__uint16 *) out, 4, req_comp, w, h);
+      else
+         out = stbi__convert_format(out, 4, req_comp, w, h);
       if (out == NULL) return out; // stbi__convert_format frees input on failure
    }
 
@@ -5350,7 +6035,6 @@ static stbi_uc *stbi__pic_load_core(stbi__context *s,int width,int height,int *c
 
                   if (count >= 128) { // Repeated
                      stbi_uc value[4];
-                     int i;
 
                      if (count==128)
                         count = stbi__get16be(s);
@@ -5383,10 +6067,13 @@ static stbi_uc *stbi__pic_load_core(stbi__context *s,int width,int height,int *c
    return result;
 }
 
-static stbi_uc *stbi__pic_load(stbi__context *s,int *px,int *py,int *comp,int req_comp)
+static void *stbi__pic_load(stbi__context *s,int *px,int *py,int *comp,int req_comp, stbi__result_info *ri)
 {
    stbi_uc *result;
-   int i, x,y;
+   int i, x,y, internal_comp;
+   STBI_NOTUSED(ri);
+
+   if (!comp) comp = &internal_comp;
 
    for (i=0; i<92; ++i)
       stbi__get8(s);
@@ -5394,14 +6081,14 @@ static stbi_uc *stbi__pic_load(stbi__context *s,int *px,int *py,int *comp,int re
    x = stbi__get16be(s);
    y = stbi__get16be(s);
    if (stbi__at_eof(s))  return stbi__errpuc("bad file","file too short (pic header)");
-   if ((1 << 28) / x < y) return stbi__errpuc("too large", "Image too large to decode");
+   if (!stbi__mad3sizes_valid(x, y, 4, 0)) return stbi__errpuc("too large", "PIC image too large to decode");
 
    stbi__get32be(s); //skip `ratio'
    stbi__get16be(s); //skip `fields'
    stbi__get16be(s); //skip `pad'
 
    // intermediate buffer is RGBA
-   result = (stbi_uc *) stbi__malloc(x*y*4);
+   result = (stbi_uc *) stbi__malloc_mad3(x, y, 4, 0);
    memset(result, 0xff, x*y*4);
 
    if (!stbi__pic_load_core(s,x,y,comp, result)) {
@@ -5439,10 +6126,12 @@ typedef struct
 {
    int w,h;
    stbi_uc *out;                 // output buffer (always 4 components)
+   stbi_uc *background;          // The current "background" as far as a gif is concerned
+   stbi_uc *history; 
    int flags, bgindex, ratio, transparent, eflags;
    stbi_uc  pal[256][4];
    stbi_uc lpal[256][4];
-   stbi__gif_lzw codes[4096];
+   stbi__gif_lzw codes[8192];
    stbi_uc *color_table;
    int parse, step;
    int lflags;
@@ -5450,6 +6139,7 @@ typedef struct
    int max_x, max_y;
    int cur_x, cur_y;
    int line_size;
+   int delay;
 } stbi__gif;
 
 static int stbi__gif_test_raw(stbi__context *s)
@@ -5510,19 +6200,22 @@ static int stbi__gif_header(stbi__context *s, stbi__gif *g, int *comp, int is_in
 
 static int stbi__gif_info_raw(stbi__context *s, int *x, int *y, int *comp)
 {
-   stbi__gif g;
-   if (!stbi__gif_header(s, &g, comp, 1)) {
+   stbi__gif* g = (stbi__gif*) stbi__malloc(sizeof(stbi__gif));
+   if (!stbi__gif_header(s, g, comp, 1)) {
+      STBI_FREE(g);
       stbi__rewind( s );
       return 0;
    }
-   if (x) *x = g.w;
-   if (y) *y = g.h;
+   if (x) *x = g->w;
+   if (y) *y = g->h;
+   STBI_FREE(g);
    return 1;
 }
 
 static void stbi__out_gif_code(stbi__gif *g, stbi__uint16 code)
 {
    stbi_uc *p, *c;
+   int idx; 
 
    // recurse to decode the prefixes, since the linked-list is backwards,
    // and working backwards through an interleaved image would be nasty
@@ -5531,10 +6224,12 @@ static void stbi__out_gif_code(stbi__gif *g, stbi__uint16 code)
 
    if (g->cur_y >= g->max_y) return;
 
-   p = &g->out[g->cur_x + g->cur_y];
-   c = &g->color_table[g->codes[code].suffix * 4];
+   idx = g->cur_x + g->cur_y; 
+   p = &g->out[idx];
+   g->history[idx / 4] = 1;  
 
-   if (c[3] >= 128) {
+   c = &g->color_table[g->codes[code].suffix * 4];
+   if (c[3] > 128) { // don't render transparent pixels; 
       p[0] = c[2];
       p[1] = c[1];
       p[2] = c[0];
@@ -5557,7 +6252,7 @@ static void stbi__out_gif_code(stbi__gif *g, stbi__uint16 code)
 static stbi_uc *stbi__process_gif_raster(stbi__context *s, stbi__gif *g)
 {
    stbi_uc lzw_cs;
-   stbi__int32 len, code;
+   stbi__int32 len, init_code;
    stbi__uint32 first;
    stbi__int32 codesize, codemask, avail, oldcode, bits, valid_bits, clear;
    stbi__gif_lzw *p;
@@ -5570,10 +6265,10 @@ static stbi_uc *stbi__process_gif_raster(stbi__context *s, stbi__gif *g)
    codemask = (1 << codesize) - 1;
    bits = 0;
    valid_bits = 0;
-   for (code = 0; code < clear; code++) {
-      g->codes[code].prefix = -1;
-      g->codes[code].first = (stbi_uc) code;
-      g->codes[code].suffix = (stbi_uc) code;
+   for (init_code = 0; init_code < clear; init_code++) {
+      g->codes[init_code].prefix = -1;
+      g->codes[init_code].first = (stbi_uc) init_code;
+      g->codes[init_code].suffix = (stbi_uc) init_code;
    }
 
    // support no starting clear code
@@ -5608,11 +6303,16 @@ static stbi_uc *stbi__process_gif_raster(stbi__context *s, stbi__gif *g)
                stbi__skip(s,len);
             return g->out;
          } else if (code <= avail) {
-            if (first) return stbi__errpuc("no clear code", "Corrupt GIF");
+            if (first) {
+               return stbi__errpuc("no clear code", "Corrupt GIF");
+            }
 
             if (oldcode >= 0) {
                p = &g->codes[avail++];
-               if (avail > 4096)        return stbi__errpuc("too many codes", "Corrupt GIF");
+               if (avail > 8192) {
+                  return stbi__errpuc("too many codes", "Corrupt GIF");
+               }
+
                p->prefix = (stbi__int16) oldcode;
                p->first = g->codes[oldcode].first;
                p->suffix = (code == avail) ? p->first : g->codes[code].first;
@@ -5634,43 +6334,70 @@ static stbi_uc *stbi__process_gif_raster(stbi__context *s, stbi__gif *g)
    }
 }
 
-static void stbi__fill_gif_background(stbi__gif *g)
-{
-   int i;
-   stbi_uc *c = g->pal[g->bgindex];
-   // @OPTIMIZE: write a dword at a time
-   for (i = 0; i < g->w * g->h * 4; i += 4) {
-      stbi_uc *p  = &g->out[i];
-      p[0] = c[2];
-      p[1] = c[1];
-      p[2] = c[0];
-      p[3] = c[3];
-   }
-}
-
 // this function is designed to support animated gifs, although stb_image doesn't support it
-static stbi_uc *stbi__gif_load_next(stbi__context *s, stbi__gif *g, int *comp, int req_comp)
+// two back is the image from two frames ago, used for a very specific disposal format
+static stbi_uc *stbi__gif_load_next(stbi__context *s, stbi__gif *g, int *comp, int req_comp, stbi_uc *two_back)
 {
-   int i;
-   stbi_uc *old_out = 0;
+   int dispose; 
+   int first_frame; 
+   int pi; 
+   int pcount; 
 
+   // on first frame, any non-written pixels get the background colour (non-transparent)
+   first_frame = 0; 
    if (g->out == 0) {
       if (!stbi__gif_header(s, g, comp,0))     return 0; // stbi__g_failure_reason set by stbi__gif_header
       g->out = (stbi_uc *) stbi__malloc(4 * g->w * g->h);
+      g->background = (stbi_uc *) stbi__malloc(4 * g->w * g->h); 
+      g->history = (stbi_uc *) stbi__malloc(g->w * g->h); 
       if (g->out == 0)                      return stbi__errpuc("outofmem", "Out of memory");
-      stbi__fill_gif_background(g);
+
+      // image is treated as "transparent" at the start - ie, nothing overwrites the current background;
+      // background colour is only used for pixels that are not rendered first frame, after that "background"
+      // color refers to the color that was there the previous frame.
+      memset( g->out, 0x00, 4 * g->w * g->h ); 
+      memset( g->background, 0x00, 4 * g->w * g->h ); // state of the background (starts transparent)
+      memset( g->history, 0x00, g->w * g->h );        // pixels that were affected previous frame
+      first_frame = 1; 
    } else {
-      // animated-gif-only path
-      if (((g->eflags & 0x1C) >> 2) == 3) {
-         old_out = g->out;
-         g->out = (stbi_uc *) stbi__malloc(4 * g->w * g->h);
-         if (g->out == 0)                   return stbi__errpuc("outofmem", "Out of memory");
-         memcpy(g->out, old_out, g->w*g->h*4);
+      // second frame - how do we dispose of the previous one?
+      dispose = (g->eflags & 0x1C) >> 2; 
+      pcount = g->w * g->h; 
+
+      if ((dispose == 3) && (two_back == 0)) {
+         dispose = 2; // if I don't have an image to revert back to, default to the old background
       }
+
+      if (dispose == 3) { // use previous graphic
+         for (pi = 0; pi < pcount; ++pi) {
+            if (g->history[pi]) {
+               memcpy( &g->out[pi * 4], &two_back[pi * 4], 4 ); 
+            }
+         }
+      } else if (dispose == 2) { 
+         // restore what was changed last frame to background before that frame; 
+         for (pi = 0; pi < pcount; ++pi) {
+            if (g->history[pi]) {
+               memcpy( &g->out[pi * 4], &g->background[pi * 4], 4 ); 
+            }
+         }
+      } else {
+         // This is a non-disposal case either way, so just
+         // leave the pixels as is, and they will become the new background
+         // 1: do not dispose
+         // 0:  not specified.
+      }
+
+      // background is what out is after the undoing of the previous frame;
+      memcpy( g->background, g->out, 4 * g->w * g->h ); 
    }
 
+   // clear my history; 
+   memset( g->history, 0x00, g->w * g->h );        // pixels that were affected previous frame
+
    for (;;) {
-      switch (stbi__get8(s)) {
+      int tag = stbi__get8(s); 
+      switch (tag) {
          case 0x2C: /* Image Descriptor */
          {
             stbi__int32 x, y, w, h;
@@ -5705,38 +6432,60 @@ static stbi_uc *stbi__gif_load_next(stbi__context *s, stbi__gif *g, int *comp, i
                stbi__gif_parse_colortable(s,g->lpal, 2 << (g->lflags & 7), g->eflags & 0x01 ? g->transparent : -1);
                g->color_table = (stbi_uc *) g->lpal;
             } else if (g->flags & 0x80) {
-               for (i=0; i < 256; ++i)  // @OPTIMIZE: stbi__jpeg_reset only the previous transparent
-                  g->pal[i][3] = 255;
-               if (g->transparent >= 0 && (g->eflags & 0x01))
-                  g->pal[g->transparent][3] = 0;
                g->color_table = (stbi_uc *) g->pal;
             } else
-               return stbi__errpuc("missing color table", "Corrupt GIF");
-
+               return stbi__errpuc("missing color table", "Corrupt GIF");            
+            
             o = stbi__process_gif_raster(s, g);
             if (o == NULL) return NULL;
 
-            if (req_comp && req_comp != 4)
-               o = stbi__convert_format(o, 4, req_comp, g->w, g->h);
+            // if this was the first frame, fill in the pixels the raster never drew
+            pcount = g->w * g->h; 
+            if (first_frame && (g->bgindex > 0)) {
+               // if first frame, any pixel not drawn to gets the background color
+               for (pi = 0; pi < pcount; ++pi) {
+                  if (g->history[pi] == 0) {
+                     g->pal[g->bgindex][3] = 255; // just in case it was made transparent, undo that; It will be reset next frame if need be; 
+                     memcpy( &g->out[pi * 4], &g->pal[g->bgindex], 4 ); 
+                  }
+               }
+            }
+
             return o;
          }
 
          case 0x21: // Comment Extension.
          {
             int len;
-            if (stbi__get8(s) == 0xF9) { // Graphic Control Extension.
+            int ext = stbi__get8(s); 
+            if (ext == 0xF9) { // Graphic Control Extension.
                len = stbi__get8(s);
                if (len == 4) {
                   g->eflags = stbi__get8(s);
-                  stbi__get16le(s); // delay
-                  g->transparent = stbi__get8(s);
+                  g->delay = 10 * stbi__get16le(s); // delay - 1/100th of a second, saving as 1/1000ths.
+
+                  // unset old transparent
+                  if (g->transparent >= 0) {
+                     g->pal[g->transparent][3] = 255; 
+                  } 
+                  if (g->eflags & 0x01) {
+                     g->transparent = stbi__get8(s);
+                     if (g->transparent >= 0) {
+                        g->pal[g->transparent][3] = 0; 
+                     }
+                  } else {
+                     // don't need transparent
+                     stbi__skip(s, 1); 
+                     g->transparent = -1; 
+                  }
                } else {
                   stbi__skip(s, len);
                   break;
                }
-            }
-            while ((len = stbi__get8(s)) != 0)
+            } 
+            while ((len = stbi__get8(s)) != 0) {
                stbi__skip(s, len);
+            }
             break;
          }
 
@@ -5749,19 +6498,90 @@ static stbi_uc *stbi__gif_load_next(stbi__context *s, stbi__gif *g, int *comp, i
    }
 }
 
-static stbi_uc *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+static void *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y, int *z, int *comp, int req_comp)
+{  // Decodes every frame of a GIF into one buffer, frames stored back to back; writes the frame count to *z and, when 'delays' is non-NULL, a per-frame g.delay array to *delays.
+   if (stbi__gif_test(s)) {
+      int layers = 0;          // frames stored in 'out' so far
+      stbi_uc *u = 0;          // frame returned by the most recent decode call
+      stbi_uc *out = 0;        // growing buffer holding all frames at 4 bytes per pixel
+      stbi_uc *two_back = 0;   // frame preceding the previously decoded one; stays 0 until two frames exist
+      stbi__gif g;
+      int stride;              // bytes per stored frame
+      memset(&g, 0, sizeof(g));
+      if (delays) {
+         *delays = 0; 
+      }
+      // decode frame after frame until the decoder returns 0 or the end marker
+      do {
+         u = stbi__gif_load_next(s, &g, comp, req_comp, two_back);
+         if (u == (stbi_uc *) s) u = 0;  // end of animated gif marker
+
+         if (u) {
+            *x = g.w;
+            *y = g.h;
+            ++layers; 
+            stride = g.w * g.h * 4; 
+            // grow the frame buffer (and the delay array, if requested) by one slot
+            if (out) {
+               out = (stbi_uc*) STBI_REALLOC( out, layers * stride ); 
+               if (delays) {
+                  *delays = (int*) STBI_REALLOC( *delays, sizeof(int) * layers ); 
+               }
+            } else {
+               out = (stbi_uc*)stbi__malloc( layers * stride ); 
+               if (delays) {
+                  *delays = (int*) stbi__malloc( layers * sizeof(int) ); 
+               }
+            }
+            memcpy( out + ((layers - 1) * stride), u, stride ); 
+            if (layers >= 2) {
+               two_back = out + (layers - 2) * stride; // frame before the one just stored; must stay inside 'out' and is recomputed here because realloc may move it
+            }
+
+            if (delays) {
+               (*delays)[layers - 1U] = g.delay; 
+            }
+         }
+      } while (u != 0); 
+
+      // free temp buffer; 
+      STBI_FREE(g.out); 
+      STBI_FREE(g.history); 
+      STBI_FREE(g.background); 
+
+      // do the final conversion after loading everything; nothing to convert if no frame was decoded
+      if (out && req_comp && req_comp != 4)
+         out = stbi__convert_format(out, 4, req_comp, layers * g.w, g.h);
+
+      *z = layers; 
+      return out;
+   } else {
+      return stbi__errpuc("not GIF", "Image was not as a gif type."); 
+   }
+}
+
+static void *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri) // decodes a single GIF frame with one stbi__gif_load_next call; 'ri' is not referenced in this body
 {
    stbi_uc *u = 0;
    stbi__gif g;
    memset(&g, 0, sizeof(g));
 
-   u = stbi__gif_load_next(s, &g, comp, req_comp);
+   u = stbi__gif_load_next(s, &g, comp, req_comp, 0); // last argument 0: no earlier-frame buffer is supplied for a single-frame load
    if (u == (stbi_uc *) s) u = 0;  // end of animated gif marker
    if (u) {
       *x = g.w;
       *y = g.h;
+
+      // conversion happens only after a successful load, mirroring the
+      // multi-frame loader, which converts once all frames are decoded.
+      if (req_comp && req_comp != 4)
+         u = stbi__convert_format(u, 4, req_comp, g.w, g.h);
    }
 
+   // free the scratch buffers that exist only to support multi-frame decoding
+   STBI_FREE(g.history);
+   STBI_FREE(g.background); 
+
    return u;
 }
 
@@ -5775,20 +6595,24 @@ static int stbi__gif_info(stbi__context *s, int *x, int *y, int *comp)
 // Radiance RGBE HDR loader
 // originally by Nicolas Schulz
 #ifndef STBI_NO_HDR
-static int stbi__hdr_test_core(stbi__context *s)
+static int stbi__hdr_test_core(stbi__context *s, const char *signature) // returns 1 if the next bytes read from 's' equal 'signature', else 0
 {
-   const char *signature = "#?RADIANCE\n";
    int i;
    for (i=0; signature[i]; ++i)
       if (stbi__get8(s) != signature[i])
-         return 0;
+          return 0; // mismatch: returns without rewinding here
+   stbi__rewind(s); // full match: rewind before reporting success
    return 1;
 }
 
 static int stbi__hdr_test(stbi__context* s)
 {
-   int r = stbi__hdr_test_core(s);
+   int r = stbi__hdr_test_core(s, "#?RADIANCE\n"); // try the "#?RADIANCE" signature first
    stbi__rewind(s);
+   if(!r) { // otherwise fall back to the "#?RGBE" signature
+       r = stbi__hdr_test_core(s, "#?RGBE\n");
+       stbi__rewind(s); // rewind again whatever the second probe returned
+   }
    return r;
 }
 
@@ -5842,7 +6666,7 @@ static void stbi__hdr_convert(float *output, stbi_uc *input, int req_comp)
    }
 }
 
-static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
 {
    char buffer[STBI__HDR_BUFLEN];
    char *token;
@@ -5853,10 +6677,12 @@ static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int re
    int len;
    unsigned char count, value;
    int i, j, k, c1,c2, z;
-
+   const char *headerToken;
+   STBI_NOTUSED(ri);
 
    // Check identifier
-   if (strcmp(stbi__hdr_gettoken(s,buffer), "#?RADIANCE") != 0)
+   headerToken = stbi__hdr_gettoken(s,buffer);
+   if (strcmp(headerToken, "#?RADIANCE") != 0 && strcmp(headerToken, "#?RGBE") != 0)
       return stbi__errpf("not HDR", "Corrupt HDR image");
 
    // Parse header
@@ -5885,8 +6711,13 @@ static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int re
    if (comp) *comp = 3;
    if (req_comp == 0) req_comp = 3;
 
+   if (!stbi__mad4sizes_valid(width, height, req_comp, sizeof(float), 0))
+      return stbi__errpf("too large", "HDR image is too large");
+
    // Read data
-   hdr_data = (float *) stbi__malloc(height * width * req_comp * sizeof(float));
+   hdr_data = (float *) stbi__malloc_mad4(width, height, req_comp, sizeof(float), 0);
+   if (!hdr_data)
+      return stbi__errpf("outofmem", "Out of memory");
 
    // Load image data
    // image data is stored as some number of sca
@@ -5925,20 +6756,29 @@ static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int re
          len <<= 8;
          len |= stbi__get8(s);
          if (len != width) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("invalid decoded scanline length", "corrupt HDR"); }
-         if (scanline == NULL) scanline = (stbi_uc *) stbi__malloc(width * 4);
+         if (scanline == NULL) {
+            scanline = (stbi_uc *) stbi__malloc_mad2(width, 4, 0);
+            if (!scanline) {
+               STBI_FREE(hdr_data);
+               return stbi__errpf("outofmem", "Out of memory");
+            }
+         }
 
          for (k = 0; k < 4; ++k) {
+            int nleft;
             i = 0;
-            while (i < width) {
+            while ((nleft = width - i) > 0) {
                count = stbi__get8(s);
                if (count > 128) {
                   // Run
                   value = stbi__get8(s);
                   count -= 128;
+                  if (count > nleft) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); }
                   for (z = 0; z < count; ++z)
                      scanline[i++ * 4 + k] = value;
                } else {
                   // Dump
+                  if (count > nleft) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); }
                   for (z = 0; z < count; ++z)
                      scanline[i++ * 4 + k] = stbi__get8(s);
                }
@@ -5947,7 +6787,8 @@ static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int re
          for (i=0; i < width; ++i)
             stbi__hdr_convert(hdr_data+(j*width + i)*req_comp, scanline + i*4, req_comp);
       }
-      STBI_FREE(scanline);
+      if (scanline)
+         STBI_FREE(scanline);
    }
 
    return hdr_data;
@@ -5958,8 +6799,13 @@ static int stbi__hdr_info(stbi__context *s, int *x, int *y, int *comp)
    char buffer[STBI__HDR_BUFLEN];
    char *token;
    int valid = 0;
+   int dummy;
+
+   if (!x) x = &dummy;
+   if (!y) y = &dummy;
+   if (!comp) comp = &dummy;
 
-   if (strcmp(stbi__hdr_gettoken(s,buffer), "#?RADIANCE") != 0) {
+   if (stbi__hdr_test(s) == 0) {
        stbi__rewind( s );
        return 0;
    }
@@ -5996,29 +6842,17 @@ static int stbi__hdr_info(stbi__context *s, int *x, int *y, int *comp)
 #ifndef STBI_NO_BMP
 static int stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp)
 {
-   int hsz;
-   if (stbi__get8(s) != 'B' || stbi__get8(s) != 'M') {
-       stbi__rewind( s );
-       return 0;
-   }
-   stbi__skip(s,12);
-   hsz = stbi__get32le(s);
-   if (hsz != 12 && hsz != 40 && hsz != 56 && hsz != 108 && hsz != 124) {
-       stbi__rewind( s );
-       return 0;
-   }
-   if (hsz == 12) {
-      *x = stbi__get16le(s);
-      *y = stbi__get16le(s);
-   } else {
-      *x = stbi__get32le(s);
-      *y = stbi__get32le(s);
-   }
-   if (stbi__get16le(s) != 1) {
-       stbi__rewind( s );
-       return 0;
-   }
-   *comp = stbi__get16le(s) / 8;
+   void *p;
+   stbi__bmp_data info;
+
+   info.all_a = 255;
+   p = stbi__bmp_parse_header(s, &info); // header parsing is delegated to stbi__bmp_parse_header instead of the removed ad-hoc parse
+   stbi__rewind( s ); // rewind before checking the result, so success and failure both leave the stream rewound
+   if (p == NULL)
+      return 0;
+   if (x) *x = s->img_x; // each output pointer is optional
+   if (y) *y = s->img_y;
+   if (comp) *comp = info.ma ? 4 : 3; // reports 4 components when info.ma is non-zero (presumably the alpha mask -- TODO confirm), else 3
    return 1;
 }
 #endif
@@ -6026,7 +6860,10 @@ static int stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp)
 #ifndef STBI_NO_PSD
 static int stbi__psd_info(stbi__context *s, int *x, int *y, int *comp)
 {
-   int channelCount;
+   int channelCount, dummy, depth;
+   if (!x) x = &dummy;
+   if (!y) y = &dummy;
+   if (!comp) comp = &dummy;
    if (stbi__get32be(s) != 0x38425053) {
        stbi__rewind( s );
        return 0;
@@ -6043,7 +6880,8 @@ static int stbi__psd_info(stbi__context *s, int *x, int *y, int *comp)
    }
    *y = stbi__get32be(s);
    *x = stbi__get32be(s);
-   if (stbi__get16be(s) != 8) {
+   depth = stbi__get16be(s);
+   if (depth != 8 && depth != 16) {
        stbi__rewind( s );
        return 0;
    }
@@ -6054,22 +6892,61 @@ static int stbi__psd_info(stbi__context *s, int *x, int *y, int *comp)
    *comp = 4;
    return 1;
 }
+
+static int stbi__psd_is16(stbi__context *s) // returns 1 only when the header checks pass and the depth field reads 16; otherwise rewinds and returns 0
+{
+   int channelCount, depth;
+   if (stbi__get32be(s) != 0x38425053) { // first 32-bit field must be 0x38425053 (the bytes '8','B','P','S')
+       stbi__rewind( s );
+       return 0;
+   }
+   if (stbi__get16be(s) != 1) { // next 16-bit field must equal 1
+       stbi__rewind( s );
+       return 0;
+   }
+   stbi__skip(s, 6);
+   channelCount = stbi__get16be(s);
+   if (channelCount < 0 || channelCount > 16) {
+       stbi__rewind( s );
+       return 0;
+   }
+   (void) stbi__get32be(s); // two 32-bit fields are read and discarded (stbi__psd_info stores the same two reads in *y and *x)
+   (void) stbi__get32be(s);
+   depth = stbi__get16be(s);
+   if (depth != 16) {
+       stbi__rewind( s );
+       return 0;
+   }
+   return 1; // NOTE(review): the success path does not rewind, unlike every failure path -- confirm callers do not rely on the stream position
+}
 #endif
 
 #ifndef STBI_NO_PIC
 static int stbi__pic_info(stbi__context *s, int *x, int *y, int *comp)
 {
-   int act_comp=0,num_packets=0,chained;
+   int act_comp=0,num_packets=0,chained,dummy;
    stbi__pic_packet packets[10];
 
-   stbi__skip(s, 92);
+   if (!x) x = &dummy;
+   if (!y) y = &dummy;
+   if (!comp) comp = &dummy;
+
+   if (!stbi__pic_is4(s,"\x53\x80\xF6\x34")) {
+      stbi__rewind(s);
+      return 0;
+   }
+
+   stbi__skip(s, 88);
 
    *x = stbi__get16be(s);
    *y = stbi__get16be(s);
-   if (stbi__at_eof(s))  return 0;
+   if (stbi__at_eof(s)) {
+      stbi__rewind( s);
+      return 0;
+   }
    if ( (*x) != 0 && (1 << 28) / (*x) < (*y)) {
-       stbi__rewind( s );
-       return 0;
+      stbi__rewind( s );
+      return 0;
    }
 
    stbi__skip(s, 8);
@@ -6129,16 +7006,22 @@ static int      stbi__pnm_test(stbi__context *s)
    return 1;
 }
 
-static stbi_uc *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+static void *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
 {
    stbi_uc *out;
+   STBI_NOTUSED(ri);
+
    if (!stbi__pnm_info(s, (int *)&s->img_x, (int *)&s->img_y, (int *)&s->img_n))
       return 0;
+
    *x = s->img_x;
    *y = s->img_y;
-   *comp = s->img_n;
+   if (comp) *comp = s->img_n;
+
+   if (!stbi__mad3sizes_valid(s->img_n, s->img_x, s->img_y, 0))
+      return stbi__errpuc("too large", "PNM too large");
 
-   out = (stbi_uc *) stbi__malloc(s->img_n * s->img_x * s->img_y);
+   out = (stbi_uc *) stbi__malloc_mad3(s->img_n, s->img_x, s->img_y, 0);
    if (!out) return stbi__errpuc("outofmem", "Out of memory");
    stbi__getn(s, out, s->img_n * s->img_x * s->img_y);
 
@@ -6156,8 +7039,16 @@ static int      stbi__pnm_isspace(char c)
 
 static void     stbi__pnm_skip_whitespace(stbi__context *s, char *c)
 {
-   while (!stbi__at_eof(s) && stbi__pnm_isspace(*c))
-      *c = (char) stbi__get8(s);
+   for (;;) { // alternate between skipping whitespace and skipping '#' comments until neither applies
+      while (!stbi__at_eof(s) && stbi__pnm_isspace(*c))
+         *c = (char) stbi__get8(s);
+
+      if (stbi__at_eof(s) || *c != '#') // stop at end of input or at the first character that does not start a comment
+         break;
+
+      while (!stbi__at_eof(s) && *c != '\n' && *c != '\r' ) // consume the comment up to a line terminator, then loop to skip the whitespace after it
+         *c = (char) stbi__get8(s);
+   }
 }
 
 static int      stbi__pnm_isdigit(char c)
@@ -6179,16 +7070,20 @@ static int      stbi__pnm_getinteger(stbi__context *s, char *c)
 
 static int      stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp)
 {
-   int maxv;
+   int maxv, dummy;
    char c, p, t;
 
-   stbi__rewind( s );
+   if (!x) x = &dummy;
+   if (!y) y = &dummy;
+   if (!comp) comp = &dummy;
+
+   stbi__rewind(s);
 
    // Get identifier
    p = (char) stbi__get8(s);
    t = (char) stbi__get8(s);
    if (p != 'P' || (t != '5' && t != '6')) {
-       stbi__rewind( s );
+       stbi__rewind(s);
        return 0;
    }
 
@@ -6254,6 +7149,19 @@ static int stbi__info_main(stbi__context *s, int *x, int *y, int *comp)
    return stbi__err("unknown image type", "Image not of any known type, or corrupt");
 }
 
+static int stbi__is_16_main(stbi__context *s) // returns 1 if any compiled-in probe (PNG, then PSD) reports 16-bit data, else 0
+{
+   #ifndef STBI_NO_PNG
+   if (stbi__png_is16(s))  return 1;
+   #endif
+
+   #ifndef STBI_NO_PSD
+   if (stbi__psd_is16(s))  return 1;
+   #endif
+
+   return 0; // no enabled probe matched
+}
+
 #ifndef STBI_NO_STDIO
 STBIDEF int stbi_info(char const *filename, int *x, int *y, int *comp)
 {
@@ -6275,6 +7183,27 @@ STBIDEF int stbi_info_from_file(FILE *f, int *x, int *y, int *comp)
    fseek(f,pos,SEEK_SET);
    return r;
 }
+
+STBIDEF int stbi_is_16_bit(char const *filename) // opens 'filename' in "rb" mode and returns stbi_is_16_bit_from_file() for it
+{
+    FILE *f = stbi__fopen(filename, "rb");
+    int result;
+    if (!f) return stbi__err("can't fopen", "Unable to open file"); // open failure: the return value is whatever stbi__err yields
+    result = stbi_is_16_bit_from_file(f);
+    fclose(f); // the file is closed before returning on this path
+    return result;
+}
+
+STBIDEF int stbi_is_16_bit_from_file(FILE *f) // runs the 16-bit probe on 'f', then seeks back to the offset 'f' had on entry
+{
+   int r;
+   stbi__context s;
+   long pos = ftell(f); // remember the caller's file position
+   stbi__start_file(&s, f);
+   r = stbi__is_16_main(&s);
+   fseek(f,pos,SEEK_SET); // restore that position; the ftell/fseek results are not checked
+   return r;
+}
 #endif // !STBI_NO_STDIO
 
 STBIDEF int stbi_info_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp)
@@ -6291,10 +7220,63 @@ STBIDEF int stbi_info_from_callbacks(stbi_io_callbacks const *c, void *user, int
    return stbi__info_main(&s,x,y,comp);
 }
 
+STBIDEF int stbi_is_16_bit_from_memory(stbi_uc const *buffer, int len) // runs the 16-bit probe on the 'len'-byte buffer at 'buffer'
+{
+   stbi__context s;
+   stbi__start_mem(&s,buffer,len); // set up a context over the caller's buffer
+   return stbi__is_16_main(&s);
+}
+
+STBIDEF int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *c, void *user) // runs the 16-bit probe on input supplied through callbacks 'c' with user pointer 'user'
+{
+   stbi__context s;
+   stbi__start_callbacks(&s, (stbi_io_callbacks *) c, user); // the const qualifier on 'c' is cast away for stbi__start_callbacks
+   return stbi__is_16_main(&s);
+}
+
 #endif // STB_IMAGE_IMPLEMENTATION
 
 /*
    revision history:
+      2.19  (2018-02-11) fix warning
+      2.18  (2018-01-30) fix warnings
+      2.17  (2018-01-29) change stbi__shiftsigned to avoid clang -O2 bug
+                         1-bit BMP
+                         *_is_16_bit api
+                         avoid warnings
+      2.16  (2017-07-23) all functions have 16-bit variants;
+                         STBI_NO_STDIO works again;
+                         compilation fixes;
+                         fix rounding in unpremultiply;
+                         optimize vertical flip;
+                         disable raw_len validation;
+                         documentation fixes
+      2.15  (2017-03-18) fix png-1,2,4 bug; now all Imagenet JPGs decode;
+                         warning fixes; disable run-time SSE detection on gcc;
+                         uniform handling of optional "return" values;
+                         thread-safe initialization of zlib tables
+      2.14  (2017-03-03) remove deprecated STBI_JPEG_OLD; fixes for Imagenet JPGs
+      2.13  (2016-11-29) add 16-bit API, only supported for PNG right now
+      2.12  (2016-04-02) fix typo in 2.11 PSD fix that caused crashes
+      2.11  (2016-04-02) allocate large structures on the stack
+                         remove white matting for transparent PSD
+                         fix reported channel count for PNG & BMP
+                         re-enable SSE2 in non-gcc 64-bit
+                         support RGB-formatted JPEG
+                         read 16-bit PNGs (only as 8-bit)
+      2.10  (2016-01-22) avoid warning introduced in 2.09 by STBI_REALLOC_SIZED
+      2.09  (2016-01-16) allow comments in PNM files
+                         16-bit-per-pixel TGA (not bit-per-component)
+                         info() for TGA could break due to .hdr handling
+                         info() for BMP to shares code instead of sloppy parse
+                         can use STBI_REALLOC_SIZED if allocator doesn't support realloc
+                         code cleanup
+      2.08  (2015-09-13) fix to 2.07 cleanup, reading RGB PSD as RGBA
+      2.07  (2015-09-13) fix compiler warnings
+                         partial animated GIF support
+                         limited 16-bpc PSD support
+                         #ifdef unused functions
+                         bug with < 92 byte PIC,PNM,HDR,TGA
       2.06  (2015-04-19) fix bug where PSD returns wrong '*comp' value
       2.05  (2015-04-19) fix bug in progressive JPEG handling, fix warning
       2.04  (2015-04-15) try to re-enable SIMD on MinGW 64-bit
@@ -6435,3 +7417,46 @@ STBIDEF int stbi_info_from_callbacks(stbi_io_callbacks const *c, void *user, int
       0.50  (2006-11-19)
               first released version
 */
+
+
+/*
+------------------------------------------------------------------------------
+This software is available under 2 licenses -- choose whichever you prefer.
+------------------------------------------------------------------------------
+ALTERNATIVE A - MIT License
+Copyright (c) 2017 Sean Barrett
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+------------------------------------------------------------------------------
+ALTERNATIVE B - Public Domain (www.unlicense.org)
+This is free and unencumbered software released into the public domain.
+Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
+software, either in source code form or as a compiled binary, for any purpose,
+commercial or non-commercial, and by any means.
+In jurisdictions that recognize copyright laws, the author or authors of this
+software dedicate any and all copyright interest in the software to the public
+domain. We make this dedication for the benefit of the public at large and to
+the detriment of our heirs and successors. We intend this dedication to be an
+overt act of relinquishment in perpetuity of all present and future rights to
+this software under copyright law.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+------------------------------------------------------------------------------
+*/
diff --git a/image.darknet/inst/include/darknet/src/stb_image_write.h b/image.darknet/inst/include/darknet/src/stb_image_write.h
index f5250b3..c05e958 100644
--- a/image.darknet/inst/include/darknet/src/stb_image_write.h
+++ b/image.darknet/inst/include/darknet/src/stb_image_write.h
@@ -1,7 +1,6 @@
-/* stb_image_write - v0.98 - public domain - http://nothings.org/stb/stb_image_write.h
-   writes out PNG/BMP/TGA images to C stdio - Sean Barrett 2010
-                            no warranty implied; use at your own risk
-
+/* stb_image_write - v1.09 - public domain - http://nothings.org/stb/stb_image_write.h
+   writes out PNG/BMP/TGA/JPEG/HDR images to C stdio - Sean Barrett 2010-2015
+                                     no warranty implied; use at your own risk
 
    Before #including,
 
@@ -11,31 +10,67 @@
 
    Will probably not work correctly with strict-aliasing optimizations.
 
+   If using a modern Microsoft Compiler, non-safe versions of CRT calls may cause 
+   compilation warnings or even errors. To avoid this, also before #including,
+
+       #define STBI_MSC_SECURE_CRT
+
 ABOUT:
 
    This header file is a library for writing images to C stdio. It could be
    adapted to write to memory or a general streaming interface; let me know.
 
    The PNG output is not optimal; it is 20-50% larger than the file
-   written by a decent optimizing implementation. This library is designed
-   for source code compactness and simplicitly, not optimal image file size
-   or run-time performance.
+   written by a decent optimizing implementation; though providing a custom
+   zlib compress function (see STBIW_ZLIB_COMPRESS) can mitigate that.
+   This library is designed for source code compactness and simplicity,
+   not optimal image file size or run-time performance.
 
 BUILDING:
 
    You can #define STBIW_ASSERT(x) before the #include to avoid using assert.h.
    You can #define STBIW_MALLOC(), STBIW_REALLOC(), and STBIW_FREE() to replace
    malloc,realloc,free.
-   You can define STBIW_MEMMOVE() to replace memmove()
+   You can #define STBIW_MEMMOVE() to replace memmove()
+   You can #define STBIW_ZLIB_COMPRESS to use a custom zlib-style compress function
+   for PNG compression (instead of the builtin one), it must have the following signature:
+   unsigned char * my_compress(unsigned char *data, int data_len, int *out_len, int quality);
+   The returned data will be freed with STBIW_FREE() (free() by default),
+   so it must be heap allocated with STBIW_MALLOC() (malloc() by default).
 
 USAGE:
 
-   There are four functions, one for each image file format:
+   There are five functions, one for each image file format:
 
      int stbi_write_png(char const *filename, int w, int h, int comp, const void *data, int stride_in_bytes);
      int stbi_write_bmp(char const *filename, int w, int h, int comp, const void *data);
      int stbi_write_tga(char const *filename, int w, int h, int comp, const void *data);
-     int stbi_write_hdr(char const *filename, int w, int h, int comp, const void *data);
+     int stbi_write_jpg(char const *filename, int w, int h, int comp, const void *data, int quality);
+     int stbi_write_hdr(char const *filename, int w, int h, int comp, const float *data);
+
+     void stbi_flip_vertically_on_write(int flag); // flag is non-zero to flip data vertically
+
+   There are also five equivalent functions that use an arbitrary write function. You are
+   expected to open/close your file-equivalent before and after calling these:
+
+     int stbi_write_png_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data, int stride_in_bytes);
+     int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data);
+     int stbi_write_tga_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data);
+     int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const float *data);
+     int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int quality);
+
+   where the callback is:
+      void stbi_write_func(void *context, void *data, int size);
+
+   You can configure it with these global variables:
+      int stbi_write_tga_with_rle;             // defaults to true; set to 0 to disable RLE
+      int stbi_write_png_compression_level;    // defaults to 8; set to higher for more compression
+      int stbi_write_force_png_filter;         // defaults to -1; set to 0..5 to force a filter mode
+
+
+   You can define STBI_WRITE_NO_STDIO to disable the file variant of these
+   functions, so the library will not use stdio.h at all. However, this will
+   also disable HDR writing, because it requires stdio for formatted output.
 
    Each function returns 0 on failure and non-0 on success.
 
@@ -59,63 +94,138 @@
    writer, both because it is in BGR order and because it may have padding
    at the end of the line.)
 
+   PNG allows you to set the deflate compression level by setting the global
+   variable 'stbi_write_png_compression_level' (it defaults to 8).
+
    HDR expects linear float data. Since the format is always 32-bit rgb(e)
    data, alpha (if provided) is discarded, and for monochrome data it is
    replicated across all three channels.
 
+   TGA supports RLE or non-RLE compressed data. To use non-RLE-compressed
+   data, set the global variable 'stbi_write_tga_with_rle' to 0.
+   
+   JPEG ignores alpha channels in input data; quality is between 1 and 100.
+   Higher quality looks better but results in a bigger image.
+   JPEG baseline (no JPEG progressive).
+
 CREDITS:
 
-   PNG/BMP/TGA
-      Sean Barrett
-   HDR
-      Baldur Karlsson
-   TGA monochrome:
-      Jean-Sebastien Guay
-   misc enhancements:
-      Tim Kelsey
+
+   Sean Barrett           -    PNG/BMP/TGA 
+   Baldur Karlsson        -    HDR
+   Jean-Sebastien Guay    -    TGA monochrome
+   Tim Kelsey             -    misc enhancements
+   Alan Hickman           -    TGA RLE
+   Emmanuel Julien        -    initial file IO callback implementation
+   Jon Olick              -    original jo_jpeg.cpp code
+   Daniel Gibson          -    integrate JPEG, allow external zlib
+   Aarni Koskela          -    allow choosing PNG filter
+
    bugfixes:
       github:Chribba
+      Guillaume Chereau
+      github:jry2
+      github:romigrou
+      Sergio Gonzalez
+      Jonas Karlsson
+      Filip Wasil
+      Thatcher Ulrich
+      github:poppolopoppo
+      Patrick Boettcher
+      github:xeekworx
+      Cap Petschulat
+      Simon Rodriguez
+      Ivan Tikhonov
+      github:ignotion
+      Adam Schackart
+
+LICENSE
+
+  See end of file for license information.
+
 */
 
 #ifndef INCLUDE_STB_IMAGE_WRITE_H
 #define INCLUDE_STB_IMAGE_WRITE_H
 
+// if STB_IMAGE_WRITE_STATIC causes problems, try defining STBIWDEF to 'inline' or 'static inline'
+#ifndef STBIWDEF
+#ifdef STB_IMAGE_WRITE_STATIC
+#define STBIWDEF  static
+#else
 #ifdef __cplusplus
-extern "C" {
+#define STBIWDEF  extern "C"
+#else
+#define STBIWDEF  extern
+#endif
+#endif
 #endif
 
-extern int stbi_write_png(char const *filename, int w, int h, int comp, const void  *data, int stride_in_bytes);
-extern int stbi_write_bmp(char const *filename, int w, int h, int comp, const void  *data);
-extern int stbi_write_tga(char const *filename, int w, int h, int comp, const void  *data);
-extern int stbi_write_hdr(char const *filename, int w, int h, int comp, const float *data);
+#ifndef STB_IMAGE_WRITE_STATIC  // C++ forbids static forward declarations
+extern int stbi_write_tga_with_rle;
+extern int stbi_write_png_compression_level;
+extern int stbi_write_force_png_filter;
+#endif
 
-#ifdef __cplusplus
-}
+#ifndef STBI_WRITE_NO_STDIO
+STBIWDEF int stbi_write_png(char const *filename, int w, int h, int comp, const void  *data, int stride_in_bytes);
+STBIWDEF int stbi_write_bmp(char const *filename, int w, int h, int comp, const void  *data);
+STBIWDEF int stbi_write_tga(char const *filename, int w, int h, int comp, const void  *data);
+STBIWDEF int stbi_write_hdr(char const *filename, int w, int h, int comp, const float *data);
+STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void  *data, int quality);
 #endif
 
+typedef void stbi_write_func(void *context, void *data, int size);
+
+STBIWDEF int stbi_write_png_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data, int stride_in_bytes);
+STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data);
+STBIWDEF int stbi_write_tga_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data);
+STBIWDEF int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const float *data);
+STBIWDEF int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void  *data, int quality);
+
+STBIWDEF void stbi_flip_vertically_on_write(int flip_boolean);
+
 #endif//INCLUDE_STB_IMAGE_WRITE_H
 
 #ifdef STB_IMAGE_WRITE_IMPLEMENTATION
 
+#ifdef _WIN32
+   #ifndef _CRT_SECURE_NO_WARNINGS
+   #define _CRT_SECURE_NO_WARNINGS
+   #endif
+   #ifndef _CRT_NONSTDC_NO_DEPRECATE
+   #define _CRT_NONSTDC_NO_DEPRECATE
+   #endif
+#endif
+
+#ifndef STBI_WRITE_NO_STDIO
+#include <stdio.h>
+#endif // STBI_WRITE_NO_STDIO
+
 #include <stdarg.h>
 #include <stdlib.h>
-#include <stdio.h>
 #include <string.h>
 #include <math.h>
 
-#if defined(STBIW_MALLOC) && defined(STBIW_FREE) && defined(STBIW_REALLOC)
+#if defined(STBIW_MALLOC) && defined(STBIW_FREE) && (defined(STBIW_REALLOC) || defined(STBIW_REALLOC_SIZED))
 // ok
-#elif !defined(STBIW_MALLOC) && !defined(STBIW_FREE) && !defined(STBIW_REALLOC)
+#elif !defined(STBIW_MALLOC) && !defined(STBIW_FREE) && !defined(STBIW_REALLOC) && !defined(STBIW_REALLOC_SIZED)
 // ok
 #else
-#error "Must define all or none of STBIW_MALLOC, STBIW_FREE, and STBIW_REALLOC."
+#error "Must define all or none of STBIW_MALLOC, STBIW_FREE, and STBIW_REALLOC (or STBIW_REALLOC_SIZED)."
 #endif
 
 #ifndef STBIW_MALLOC
-#define STBIW_MALLOC(sz)    malloc(sz)
-#define STBIW_REALLOC(p,sz) realloc(p,sz)
-#define STBIW_FREE(p)       free(p)
+#define STBIW_MALLOC(sz)        malloc(sz)
+#define STBIW_REALLOC(p,newsz)  realloc(p,newsz)
+#define STBIW_FREE(p)           free(p)
+#endif
+
+#ifndef STBIW_REALLOC_SIZED
+#define STBIW_REALLOC_SIZED(p,oldsz,newsz) STBIW_REALLOC(p,newsz)
 #endif
+
+
 #ifndef STBIW_MEMMOVE
 #define STBIW_MEMMOVE(a,b,sz) memmove(a,b,sz)
 #endif
@@ -126,22 +236,90 @@ extern int stbi_write_hdr(char const *filename, int w, int h, int comp, const fl
 #define STBIW_ASSERT(x) assert(x)
 #endif
 
+#define STBIW_UCHAR(x) (unsigned char) ((x) & 0xff)
+
+#ifdef STB_IMAGE_WRITE_STATIC
+static int stbi__flip_vertically_on_write=0; // assigned by stbi_flip_vertically_on_write()
+static int stbi_write_png_compression_level = 8; // header notes: set higher for more compression
+static int stbi_write_tga_with_rle = 1; // header notes: set to 0 to disable RLE
+static int stbi_write_force_png_filter = -1; // header notes: set to 0..5 to force a filter mode
+#else
+int stbi_write_png_compression_level = 8; // same settings with external linkage when STB_IMAGE_WRITE_STATIC is not defined
+int stbi__flip_vertically_on_write=0;
+int stbi_write_tga_with_rle = 1;
+int stbi_write_force_png_filter = -1;
+#endif
+
+STBIWDEF void stbi_flip_vertically_on_write(int flag) // stores 'flag' in the stbi__flip_vertically_on_write global; per the header notes, non-zero flips data vertically
+{
+   stbi__flip_vertically_on_write = flag;
+}
+
+typedef struct // output sink shared by the writer helpers below
+{
+   stbi_write_func *func; // callback that receives each chunk of output
+   void *context; // passed to 'func' as its first argument on every call
+} stbi__write_context;
+
+// initialize write context 's' so that output goes to callback 'c' with user pointer 'context'
+static void stbi__start_write_callbacks(stbi__write_context *s, stbi_write_func *c, void *context)
+{
+   s->func    = c;
+   s->context = context;
+}
+
+#ifndef STBI_WRITE_NO_STDIO
+
+static void stbi__stdio_write(void *context, void *data, int size) // stbi_write_func that writes 'size' bytes of 'data' to the FILE* passed as 'context'
+{
+   fwrite(data,1,size,(FILE*) context); // the fwrite result is not checked
+}
+
+static int stbi__start_write_file(stbi__write_context *s, const char *filename) // opens 'filename' in "wb" mode and binds it to 's'; returns non-zero when the open succeeded
+{
+   FILE *f;
+#ifdef STBI_MSC_SECURE_CRT
+   if (fopen_s(&f, filename, "wb"))
+      f = NULL; // a non-zero fopen_s result is treated as failure
+#else
+   f = fopen(filename, "wb");
+#endif
+   stbi__start_write_callbacks(s, stbi__stdio_write, (void *) f); // the context is initialized even when f is NULL
+   return f != NULL;
+}
+
+static void stbi__end_write_file(stbi__write_context *s) // closes the FILE* stored in s->context
+{
+   fclose((FILE *)s->context); // the fclose result is not checked
+}
+
+#endif // !STBI_WRITE_NO_STDIO
+
 typedef unsigned int stbiw_uint32;
 typedef int stb_image_write_test[sizeof(stbiw_uint32)==4 ? 1 : -1];
 
-static void writefv(FILE *f, const char *fmt, va_list v)
+static void stbiw__writefv(stbi__write_context *s, const char *fmt, va_list v)
 {
    while (*fmt) {
       switch (*fmt++) {
          case ' ': break;
-         case '1': { unsigned char x = (unsigned char) va_arg(v, int); fputc(x,f); break; }
-         case '2': { int x = va_arg(v,int); unsigned char b[2];
-                     b[0] = (unsigned char) x; b[1] = (unsigned char) (x>>8);
-                     fwrite(b,2,1,f); break; }
-         case '4': { stbiw_uint32 x = va_arg(v,int); unsigned char b[4];
-                     b[0]=(unsigned char)x; b[1]=(unsigned char)(x>>8);
-                     b[2]=(unsigned char)(x>>16); b[3]=(unsigned char)(x>>24);
-                     fwrite(b,4,1,f); break; }
+         case '1': { unsigned char x = STBIW_UCHAR(va_arg(v, int));
+                     s->func(s->context,&x,1);
+                     break; }
+         case '2': { int x = va_arg(v,int);
+                     unsigned char b[2];
+                     b[0] = STBIW_UCHAR(x);
+                     b[1] = STBIW_UCHAR(x>>8);
+                     s->func(s->context,b,2);
+                     break; }
+         case '4': { stbiw_uint32 x = va_arg(v,int);
+                     unsigned char b[4];
+                     b[0]=STBIW_UCHAR(x);
+                     b[1]=STBIW_UCHAR(x>>8);
+                     b[2]=STBIW_UCHAR(x>>16);
+                     b[3]=STBIW_UCHAR(x>>24);
+                     s->func(s->context,b,4);
+                     break; }
          default:
             STBIW_ASSERT(0);
             return;
@@ -149,22 +327,70 @@ static void writefv(FILE *f, const char *fmt, va_list v)
    }
 }
 
-static void write3(FILE *f, unsigned char a, unsigned char b, unsigned char c)
+static void stbiw__writef(stbi__write_context *s, const char *fmt, ...) // variadic front end for stbiw__writefv
+{
+   va_list v;
+   va_start(v, fmt); // fmt is the last named parameter
+   stbiw__writefv(s, fmt, v); // each '1'/'2'/'4' in fmt consumes one int argument and emits that many bytes, low byte first; spaces are skipped
+   va_end(v);
+}
+
+static void stbiw__putc(stbi__write_context *s, unsigned char c) // emits the single byte c through the write callback
+{
+   s->func(s->context, &c, 1); // c is a by-value copy, so taking its address is safe for the duration of the call
+}
+
+static void stbiw__write3(stbi__write_context *s, unsigned char a, unsigned char b, unsigned char c) // emits a, b, c (in that order) with a single callback call
 {
    unsigned char arr[3];
    arr[0] = a, arr[1] = b, arr[2] = c;
-   fwrite(arr, 3, 1, f);
+   s->func(s->context, arr, 3); // replaces the old direct fwrite to a FILE*
 }
 
-static void write_pixels(FILE *f, int rgb_dir, int vdir, int x, int y, int comp, void *data, int write_alpha, int scanline_pad, int expand_mono)
+static void stbiw__write_pixel(stbi__write_context *s, int rgb_dir, int comp, int write_alpha, int expand_mono, unsigned char *d) // writes the single pixel at d (comp bytes) through s->func
 {
    unsigned char bg[3] = { 255, 0, 255}, px[3];
+   int k;
+
+   if (write_alpha < 0) // negative: the last component (alpha) is emitted before the color bytes
+      s->func(s->context, &d[comp - 1], 1);
+
+   switch (comp) {
+      case 2: // 2 channels = mono + alpha, alpha is written separately, so same as 1-channel case
+      case 1:
+         if (expand_mono)
+            stbiw__write3(s, d[0], d[0], d[0]); // monochrome bmp
+         else
+            s->func(s->context, d, 1);  // monochrome TGA
+         break;
+      case 4:
+         if (!write_alpha) {
+            // composite against pink background
+            for (k = 0; k < 3; ++k)
+               px[k] = bg[k] + ((d[k] - bg[k]) * d[3]) / 255; // d[3]==255 keeps d[k]; d[3]==0 yields bg[k]
+            stbiw__write3(s, px[1 - rgb_dir], px[1], px[1 + rgb_dir]);
+            break;
+         }
+         /* FALLTHROUGH */
+      case 3:
+         stbiw__write3(s, d[1 - rgb_dir], d[1], d[1 + rgb_dir]); // rgb_dir=+1 emits d[0],d[1],d[2]; rgb_dir=-1 emits d[2],d[1],d[0]
+         break;
+   }
+   if (write_alpha > 0) // positive: alpha is emitted after the color bytes; zero means no alpha byte at all
+      s->func(s->context, &d[comp - 1], 1);
+}
+
+static void stbiw__write_pixels(stbi__write_context *s, int rgb_dir, int vdir, int x, int y, int comp, void *data, int write_alpha, int scanline_pad, int expand_mono)
+{
    stbiw_uint32 zero = 0;
-   int i,j,k, j_end;
+   int i,j, j_end;
 
    if (y <= 0)
       return;
 
+   if (stbi__flip_vertically_on_write)
+      vdir *= -1;
+
    if (vdir < 0)
       j_end = -1, j = y-1;
    else
@@ -173,73 +399,157 @@ static void write_pixels(FILE *f, int rgb_dir, int vdir, int x, int y, int comp,
    for (; j != j_end; j += vdir) {
       for (i=0; i < x; ++i) {
          unsigned char *d = (unsigned char *) data + (j*x+i)*comp;
-         if (write_alpha < 0)
-            fwrite(&d[comp-1], 1, 1, f);
-         switch (comp) {
-            case 1: fwrite(d, 1, 1, f);
-                    break;
-            case 2: if (expand_mono)
-                       write3(f, d[0],d[0],d[0]); // monochrome bmp
-                    else
-                       fwrite(d, 1, 1, f);  // monochrome TGA
-                    break;
-            case 4:
-               if (!write_alpha) {
-                  // composite against pink background
-                  for (k=0; k < 3; ++k)
-                     px[k] = bg[k] + ((d[k] - bg[k]) * d[3])/255;
-                  write3(f, px[1-rgb_dir],px[1],px[1+rgb_dir]);
-                  break;
-               }
-               /* FALLTHROUGH */
-            case 3:
-               write3(f, d[1-rgb_dir],d[1],d[1+rgb_dir]);
-               break;
-         }
-         if (write_alpha > 0)
-            fwrite(&d[comp-1], 1, 1, f);
+         stbiw__write_pixel(s, rgb_dir, comp, write_alpha, expand_mono, d);
       }
-      fwrite(&zero,scanline_pad,1,f);
+      s->func(s->context, &zero, scanline_pad);
    }
 }
 
-static int outfile(char const *filename, int rgb_dir, int vdir, int x, int y, int comp, int expand_mono, void *data, int alpha, int pad, const char *fmt, ...)
+static int stbiw__outfile(stbi__write_context *s, int rgb_dir, int vdir, int x, int y, int comp, int expand_mono, void *data, int alpha, int pad, const char *fmt, ...) // writes a header described by fmt + varargs, then the pixel rows
 {
-   FILE *f;
-   if (y < 0 || x < 0) return 0;
-   f = fopen(filename, "wb");
-   if (f) {
+   if (y < 0 || x < 0) { // negative dimensions are rejected before anything is written
+      return 0;
+   } else {
       va_list v;
       va_start(v, fmt);
-      writefv(f, fmt, v);
+      stbiw__writefv(s, fmt, v); // header fields, packed according to fmt
       va_end(v);
-      write_pixels(f,rgb_dir,vdir,x,y,comp,data,alpha,pad,expand_mono);
-      fclose(f);
+      stbiw__write_pixels(s,rgb_dir,vdir,x,y,comp,data,alpha,pad, expand_mono); // unlike the old code, nothing is opened or closed here; the caller owns the sink
+      return 1; // always 1 once the dimensions are accepted
    }
-   return f != NULL;
 }
 
-int stbi_write_bmp(char const *filename, int x, int y, int comp, const void *data)
+static int stbi_write_bmp_core(stbi__write_context *s, int x, int y, int comp, const void *data) // writes a 24-bit BMP (14-byte file header + 40-byte bitmap header + rows) to s
 {
    int pad = (-x*3) & 3;
-   return outfile(filename,-1,-1,x,y,comp,1,(void *) data,0,pad,
+   return stbiw__outfile(s,-1,-1,x,y,comp,1,(void *) data,0,pad, // rgb_dir=-1, vdir=-1, expand_mono=1, alpha=0; rows are padded by pad bytes
            "11 4 22 4" "4 44 22 444444",
            'B', 'M', 14+40+(x*3+pad)*y, 0,0, 14+40,  // file header
             40, x,y, 1,24, 0,0,0,0,0,0);             // bitmap header
 }
 
-int stbi_write_tga(char const *filename, int x, int y, int comp, const void *data)
+STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data) // BMP writer that streams bytes to a caller-supplied callback
+{
+   stbi__write_context s;
+   stbi__start_write_callbacks(&s, func, context); // binds func/context into s (helper defined outside this view)
+   return stbi_write_bmp_core(&s, x, y, comp, data); // 1 on success, 0 when x or y is negative
+}
+
+#ifndef STBI_WRITE_NO_STDIO // file-based entry point is only compiled when stdio is allowed
+STBIWDEF int stbi_write_bmp(char const *filename, int x, int y, int comp, const void *data) // writes a BMP to filename; returns 0 if the file cannot be opened or the dimensions are negative
+{
+   stbi__write_context s;
+   if (stbi__start_write_file(&s,filename)) { // nonzero only when the file was opened
+      int r = stbi_write_bmp_core(&s, x, y, comp, data);
+      stbi__end_write_file(&s); // closed regardless of r
+      return r;
+   } else
+      return 0;
+}
+#endif //!STBI_WRITE_NO_STDIO
+
+static int stbi_write_tga_core(stbi__write_context *s, int x, int y, int comp, void *data) // writes a TGA to s; returns 0 for negative dimensions, 1 otherwise
 {
    int has_alpha = (comp == 2 || comp == 4);
    int colorbytes = has_alpha ? comp-1 : comp;
    int format = colorbytes < 2 ? 3 : 2; // 3 color channels (RGB/RGBA) = 2, 1 color channel (Y/YA) = 3
-   return outfile(filename, -1,-1, x, y, comp, 0, (void *) data, has_alpha, 0,
-                  "111 221 2222 11", 0,0,format, 0,0,0, 0,0,x,y, (colorbytes+has_alpha)*8, has_alpha*8);
+
+   if (y < 0 || x < 0)
+      return 0;
+
+   if (!stbi_write_tga_with_rle) { // uncompressed path: header plus raw rows via stbiw__outfile
+      return stbiw__outfile(s, -1, -1, x, y, comp, 0, (void *) data, has_alpha, 0,
+         "111 221 2222 11", 0, 0, format, 0, 0, 0, 0, 0, x, y, (colorbytes + has_alpha) * 8, has_alpha * 8);
+   } else { // run-length-encoded path
+      int i,j,k;
+      int jend, jdir;
+
+      stbiw__writef(s, "111 221 2222 11", 0,0,format+8, 0,0,0, 0,0,x,y, (colorbytes + has_alpha) * 8, has_alpha * 8); // same header as above except the image-type field is format+8
+
+      if (stbi__flip_vertically_on_write) { // flipped: rows are emitted from j=0 upwards
+         j = 0;
+         jend = y;
+         jdir = 1;
+      } else { // default: rows are emitted from j=y-1 down to 0
+         j = y-1;
+         jend = -1;
+         jdir = -1;
+      }
+      for (; j != jend; j += jdir) {
+         unsigned char *row = (unsigned char *) data + j * x * comp;
+         int len;
+
+         for (i = 0; i < x; i += len) { // i advances by the pixel count of each emitted packet
+            unsigned char *begin = row + i * comp;
+            int diff = 1; // nonzero => raw packet, zero => run packet
+            len = 1; // a lone final pixel is emitted as a raw packet of length 1
+
+            if (i < x - 1) {
+               ++len;
+               diff = memcmp(begin, row + (i + 1) * comp, comp);
+               if (diff) { // first two pixels differ: grow a raw packet
+                  const unsigned char *prev = begin;
+                  for (k = i + 2; k < x && len < 128; ++k) { // prev trails k by two pixels
+                     if (memcmp(prev, row + k * comp, comp)) {
+                        prev += comp;
+                        ++len;
+                     } else { // pixel k equals the pixel at prev: shrink the raw packet by one and stop
+                        --len;
+                        break;
+                     }
+                  }
+               } else { // first two pixels are equal: grow a run while pixels keep matching begin
+                  for (k = i + 2; k < x && len < 128; ++k) {
+                     if (!memcmp(begin, row + k * comp, comp)) {
+                        ++len;
+                     } else {
+                        break;
+                     }
+                  }
+               }
+            }
+
+            if (diff) {
+               unsigned char header = STBIW_UCHAR(len - 1); // raw packet header: pixel count minus one
+               s->func(s->context, &header, 1);
+               for (k = 0; k < len; ++k) {
+                  stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin + k * comp);
+               }
+            } else {
+               unsigned char header = STBIW_UCHAR(len - 129); // run packet header; presumably STBIW_UCHAR truncates to 8 bits, giving 128 + (len-1) -- macro is defined outside this view
+               s->func(s->context, &header, 1);
+               stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin); // the repeated pixel is written once
+            }
+         }
+      }
+   }
+   return 1; // reached only from the RLE path; the uncompressed path returns above
+}
+
+STBIWDEF int stbi_write_tga_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data) // TGA writer that streams bytes to a caller-supplied callback
+{
+   stbi__write_context s;
+   stbi__start_write_callbacks(&s, func, context); // binds func/context into s (helper defined outside this view)
+   return stbi_write_tga_core(&s, x, y, comp, (void *) data); // const is cast away only because the core takes void*
+}
+
+#ifndef STBI_WRITE_NO_STDIO // file-based entry point is only compiled when stdio is allowed
+STBIWDEF int stbi_write_tga(char const *filename, int x, int y, int comp, const void *data) // writes a TGA to filename; returns 0 if the file cannot be opened or the dimensions are negative
+{
+   stbi__write_context s;
+   if (stbi__start_write_file(&s,filename)) { // nonzero only when the file was opened
+      int r = stbi_write_tga_core(&s, x, y, comp, (void *) data);
+      stbi__end_write_file(&s); // closed regardless of r
+      return r;
+   } else
+      return 0;
 }
+#endif
 
 // *************************************************************************************************
 // Radiance RGBE HDR writer
 // by Baldur Karlsson
+
 #define stbiw__max(a, b)  ((a) > (b) ? (a) : (b))
 
 void stbiw__linear_to_rgbe(unsigned char *rgbe, float *linear)
@@ -247,7 +557,7 @@ void stbiw__linear_to_rgbe(unsigned char *rgbe, float *linear)
    int exponent;
    float maxcomp = stbiw__max(linear[0], stbiw__max(linear[1], linear[2]));
 
-   if (maxcomp < 1e-32) {
+   if (maxcomp < 1e-32f) {
       rgbe[0] = rgbe[1] = rgbe[2] = rgbe[3] = 0;
    } else {
       float normalize = (float) frexp(maxcomp, &exponent) * 256.0f/maxcomp;
@@ -259,27 +569,27 @@ void stbiw__linear_to_rgbe(unsigned char *rgbe, float *linear)
    }
 }
 
-void stbiw__write_run_data(FILE *f, int length, unsigned char databyte)
+void stbiw__write_run_data(stbi__write_context *s, int length, unsigned char databyte) // emits a two-byte run packet: (length+128), then the repeated byte
 {
-   unsigned char lengthbyte = (unsigned char) (length+128);
+   unsigned char lengthbyte = STBIW_UCHAR(length+128); // the assert below requires length <= 127
    STBIW_ASSERT(length+128 <= 255);
-   fwrite(&lengthbyte, 1, 1, f);
-   fwrite(&databyte, 1, 1, f);
+   s->func(s->context, &lengthbyte, 1);
+   s->func(s->context, &databyte, 1);
 }
 
-void stbiw__write_dump_data(FILE *f, int length, unsigned char *data)
+void stbiw__write_dump_data(stbi__write_context *s, int length, unsigned char *data) // emits a literal packet: one length byte followed by length bytes copied from data
 {
-   unsigned char lengthbyte = (unsigned char )(length & 0xff);
+   unsigned char lengthbyte = STBIW_UCHAR(length); // the assert below requires length <= 128
    STBIW_ASSERT(length <= 128); // inconsistent with spec but consistent with official code
-   fwrite(&lengthbyte, 1, 1, f);
-   fwrite(data, length, 1, f);
+   s->func(s->context, &lengthbyte, 1);
+   s->func(s->context, data, length);
 }
 
-void stbiw__write_hdr_scanline(FILE *f, int width, int comp, unsigned char *scratch, const float *scanline)
+void stbiw__write_hdr_scanline(stbi__write_context *s, int width, int ncomp, unsigned char *scratch, float *scanline)
 {
    unsigned char scanlineheader[4] = { 2, 2, 0, 0 };
    unsigned char rgbe[4];
-   float linear[3] = {0};
+   float linear[3];
    int x;
 
    scanlineheader[2] = (width&0xff00)>>8;
@@ -288,31 +598,31 @@ void stbiw__write_hdr_scanline(FILE *f, int width, int comp, unsigned char *scra
    /* skip RLE for images too small or large */
    if (width < 8 || width >= 32768) {
       for (x=0; x < width; x++) {
-         switch (comp) {
+         switch (ncomp) {
             case 4: /* fallthrough */
-            case 3: linear[2] = scanline[x*comp + 2];
-                    linear[1] = scanline[x*comp + 1];
-                    linear[0] = scanline[x*comp + 0];
+            case 3: linear[2] = scanline[x*ncomp + 2];
+                    linear[1] = scanline[x*ncomp + 1];
+                    linear[0] = scanline[x*ncomp + 0];
                     break;
-            case 2: /* fallthrough */
-            case 1: linear[0] = linear[1] = linear[2] = scanline[x*comp + 0];
+            default:
+                    linear[0] = linear[1] = linear[2] = scanline[x*ncomp + 0];
                     break;
          }
          stbiw__linear_to_rgbe(rgbe, linear);
-         fwrite(rgbe, 4, 1, f);
+         s->func(s->context, rgbe, 4);
       }
    } else {
       int c,r;
       /* encode into scratch buffer */
       for (x=0; x < width; x++) {
-         switch(comp) {
+         switch(ncomp) {
             case 4: /* fallthrough */
-            case 3: linear[2] = scanline[x*comp + 2];
-                    linear[1] = scanline[x*comp + 1];
-                    linear[0] = scanline[x*comp + 0];
+            case 3: linear[2] = scanline[x*ncomp + 2];
+                    linear[1] = scanline[x*ncomp + 1];
+                    linear[0] = scanline[x*ncomp + 0];
                     break;
-            case 2: /* fallthrough */
-            case 1: linear[0] = linear[1] = linear[2] = scanline[x*comp + 0];
+            default:
+                    linear[0] = linear[1] = linear[2] = scanline[x*ncomp + 0];
                     break;
          }
          stbiw__linear_to_rgbe(rgbe, linear);
@@ -322,7 +632,7 @@ void stbiw__write_hdr_scanline(FILE *f, int width, int comp, unsigned char *scra
          scratch[x + width*3] = rgbe[3];
       }
 
-      fwrite(scanlineheader, 4, 1, f);
+      s->func(s->context, scanlineheader, 4);
 
       /* RLE each component separately */
       for (c=0; c < 4; c++) {
@@ -343,7 +653,7 @@ void stbiw__write_hdr_scanline(FILE *f, int width, int comp, unsigned char *scra
             while (x < r) {
                int len = r-x;
                if (len > 128) len = 128;
-               stbiw__write_dump_data(f, len, &comp[x]);
+               stbiw__write_dump_data(s, len, &comp[x]);
                x += len;
             }
             // if there's a run, output it
@@ -355,7 +665,7 @@ void stbiw__write_hdr_scanline(FILE *f, int width, int comp, unsigned char *scra
                while (x < r) {
                   int len = r-x;
                   if (len > 127) len = 127;
-                  stbiw__write_run_data(f, len, comp[x]);
+                  stbiw__write_run_data(s, len, comp[x]);
                   x += len;
                }
             }
@@ -364,28 +674,59 @@ void stbiw__write_hdr_scanline(FILE *f, int width, int comp, unsigned char *scra
    }
 }
 
-int stbi_write_hdr(char const *filename, int x, int y, int comp, const float *data)
+static int stbi_write_hdr_core(stbi__write_context *s, int x, int y, int comp, float *data) // writes a Radiance RGBE image (text header + y scanlines) to s; returns 1 on success, 0 on bad arguments or allocation failure
 {
-   int i;
-   FILE *f;
-   if (y <= 0 || x <= 0 || data == NULL) return 0;
-   f = fopen(filename, "wb");
-   if (f) {
-      /* Each component is stored separately. Allocate scratch space for full output scanline. */
+   if (y <= 0 || x <= 0 || data == NULL)
+      return 0;
+   else {
+      // Each component is stored separately. Allocate scratch space for full output scanline.
       unsigned char *scratch = (unsigned char *) STBIW_MALLOC(x*4);
-      fprintf(f, "#?RADIANCE\n# Written by stb_image_write.h\nFORMAT=32-bit_rle_rgbe\n"      );
-      fprintf(f, "EXPOSURE=          1.0000000000000\n\n-Y %d +X %d\n"                 , y, x);
+      int i, len;
+      char buffer[128];
+      char header[] = "#?RADIANCE\n# Written by stb_image_write.h\nFORMAT=32-bit_rle_rgbe\n";
+      if (scratch == NULL) return 0; // allocation failed: bail out before anything reaches the sink
+      s->func(s->context, header, sizeof(header)-1); // sizeof-1 drops the terminating NUL
+#ifdef STBI_MSC_SECURE_CRT
+      len = sprintf_s(buffer, sizeof(buffer), "EXPOSURE=          1.0000000000000\n\n-Y %d +X %d\n", y, x); // explicit size: the size-less form only exists as a C++ template overload
+#else
+      len = sprintf(buffer, "EXPOSURE=          1.0000000000000\n\n-Y %d +X %d\n", y, x);
+#endif
+      s->func(s->context, buffer, len);
+
       for(i=0; i < y; i++)
-         stbiw__write_hdr_scanline(f, x, comp, scratch, data + comp*i*x);
+         stbiw__write_hdr_scanline(s, x, comp, scratch, data + comp*x*(stbi__flip_vertically_on_write ? y-1-i : i)); // row r starts comp*x floats per row into data; the flip flag reverses the row order
       STBIW_FREE(scratch);
-      fclose(f);
+      return 1;
    }
-   return f != NULL;
 }
 
-/////////////////////////////////////////////////////////
-// PNG
+STBIWDEF int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const float *data) // HDR writer that streams bytes to a caller-supplied callback
+{
+   stbi__write_context s;
+   stbi__start_write_callbacks(&s, func, context); // binds func/context into s (helper defined outside this view)
+   return stbi_write_hdr_core(&s, x, y, comp, (float *) data); // const is cast away only because the core takes float*
+}
+
+#ifndef STBI_WRITE_NO_STDIO // file-based entry point is only compiled when stdio is allowed
+STBIWDEF int stbi_write_hdr(char const *filename, int x, int y, int comp, const float *data) // writes an HDR image to filename; returns 0 if the file cannot be opened or the core writer rejects its arguments
+{
+   stbi__write_context s;
+   if (stbi__start_write_file(&s,filename)) { // nonzero only when the file was opened
+      int r = stbi_write_hdr_core(&s, x, y, comp, (float *) data);
+      stbi__end_write_file(&s); // closed regardless of r
+      return r;
+   } else
+      return 0;
+}
+#endif // STBI_WRITE_NO_STDIO
+
 
+//////////////////////////////////////////////////////////////////////////////
+//
+// PNG writer
+//
+
+#ifndef STBIW_ZLIB_COMPRESS
 // stretchy buffer; stbiw__sbpush() == vector<>::push_back() -- stbiw__sbcount() == vector<>::size()
 #define stbiw__sbraw(a) ((int *) (a) - 2)
 #define stbiw__sbm(a)   stbiw__sbraw(a)[0]
@@ -402,7 +743,7 @@ int stbi_write_hdr(char const *filename, int x, int y, int comp, const float *da
 static void *stbiw__sbgrowf(void **arr, int increment, int itemsize)
 {
    int m = *arr ? 2*stbiw__sbm(*arr)+increment : increment+1;
-   void *p = STBIW_REALLOC(*arr ? stbiw__sbraw(*arr) : 0, itemsize * m + sizeof(int)*2);
+   void *p = STBIW_REALLOC_SIZED(*arr ? stbiw__sbraw(*arr) : 0, *arr ? (stbiw__sbm(*arr)*itemsize + sizeof(int)*2) : 0, itemsize * m + sizeof(int)*2);
    STBIW_ASSERT(p);
    if (p) {
       if (!*arr) ((int *) p)[1] = 0;
@@ -415,7 +756,7 @@ static void *stbiw__sbgrowf(void **arr, int increment, int itemsize)
 static unsigned char *stbiw__zlib_flushf(unsigned char *data, unsigned int *bitbuffer, int *bitcount)
 {
    while (*bitcount >= 8) {
-      stbiw__sbpush(data, (unsigned char) *bitbuffer);
+      stbiw__sbpush(data, STBIW_UCHAR(*bitbuffer));
       *bitbuffer >>= 8;
       *bitcount -= 8;
    }
@@ -466,8 +807,14 @@ static unsigned int stbiw__zhash(unsigned char *data)
 
 #define stbiw__ZHASH   16384
 
+#endif // STBIW_ZLIB_COMPRESS
+
 unsigned char * stbi_zlib_compress(unsigned char *data, int data_len, int *out_len, int quality)
 {
+#ifdef STBIW_ZLIB_COMPRESS
+   // user provided a zlib compress implementation, use that
+   return STBIW_ZLIB_COMPRESS(data, data_len, out_len, quality);
+#else // use builtin
    static unsigned short lengthc[] = { 3,4,5,6,7,8,9,10,11,13,15,17,19,23,27,31,35,43,51,59,67,83,99,115,131,163,195,227,258, 259 };
    static unsigned char  lengtheb[]= { 0,0,0,0,0,0,0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4,  4,  5,  5,  5,  5,  0 };
    static unsigned short distc[]   = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193,257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577, 32768 };
@@ -475,7 +822,9 @@ unsigned char * stbi_zlib_compress(unsigned char *data, int data_len, int *out_l
    unsigned int bitbuf=0;
    int i,j, bitcount=0;
    unsigned char *out = NULL;
-   unsigned char **hash_table[stbiw__ZHASH]; // 64KB on the stack!
+   unsigned char ***hash_table = (unsigned char***) STBIW_MALLOC(stbiw__ZHASH * sizeof(char**));
+   if (hash_table == NULL)
+      return NULL;
    if (quality < 5) quality = 5;
 
    stbiw__sbpush(out, 0x78);   // DEFLATE 32K window
@@ -547,43 +896,77 @@ unsigned char * stbi_zlib_compress(unsigned char *data, int data_len, int *out_l
 
    for (i=0; i < stbiw__ZHASH; ++i)
       (void) stbiw__sbfree(hash_table[i]);
+   STBIW_FREE(hash_table);
 
    {
       // compute adler32 on input
-      unsigned int i=0, s1=1, s2=0, blocklen = data_len % 5552;
-      int j=0;
+      unsigned int s1=1, s2=0;
+      int blocklen = (int) (data_len % 5552);
+      j=0;
       while (j < data_len) {
          for (i=0; i < blocklen; ++i) s1 += data[j+i], s2 += s1;
          s1 %= 65521, s2 %= 65521;
          j += blocklen;
          blocklen = 5552;
       }
-      stbiw__sbpush(out, (unsigned char) (s2 >> 8));
-      stbiw__sbpush(out, (unsigned char) s2);
-      stbiw__sbpush(out, (unsigned char) (s1 >> 8));
-      stbiw__sbpush(out, (unsigned char) s1);
+      stbiw__sbpush(out, STBIW_UCHAR(s2 >> 8));
+      stbiw__sbpush(out, STBIW_UCHAR(s2));
+      stbiw__sbpush(out, STBIW_UCHAR(s1 >> 8));
+      stbiw__sbpush(out, STBIW_UCHAR(s1));
    }
    *out_len = stbiw__sbn(out);
    // make returned pointer freeable
    STBIW_MEMMOVE(stbiw__sbraw(out), out, *out_len);
    return (unsigned char *) stbiw__sbraw(out);
+#endif // STBIW_ZLIB_COMPRESS
 }
 
-unsigned int stbiw__crc32(unsigned char *buffer, int len)
+static unsigned int stbiw__crc32(unsigned char *buffer, int len) // table-driven CRC over buffer[0..len): starts from ~0 and returns the bitwise complement of the running value
 {
-   static unsigned int crc_table[256];
+   static unsigned int crc_table[256] = // precomputed; replaces the table the old code built lazily from polynomial 0xedb88320 (presumably the same values)
+   {
+      0x00000000, 0x77073096, 0xEE0E612C, 0x990951BA, 0x076DC419, 0x706AF48F, 0xE963A535, 0x9E6495A3,
+      0x0eDB8832, 0x79DCB8A4, 0xE0D5E91E, 0x97D2D988, 0x09B64C2B, 0x7EB17CBD, 0xE7B82D07, 0x90BF1D91,
+      0x1DB71064, 0x6AB020F2, 0xF3B97148, 0x84BE41DE, 0x1ADAD47D, 0x6DDDE4EB, 0xF4D4B551, 0x83D385C7,
+      0x136C9856, 0x646BA8C0, 0xFD62F97A, 0x8A65C9EC, 0x14015C4F, 0x63066CD9, 0xFA0F3D63, 0x8D080DF5,
+      0x3B6E20C8, 0x4C69105E, 0xD56041E4, 0xA2677172, 0x3C03E4D1, 0x4B04D447, 0xD20D85FD, 0xA50AB56B,
+      0x35B5A8FA, 0x42B2986C, 0xDBBBC9D6, 0xACBCF940, 0x32D86CE3, 0x45DF5C75, 0xDCD60DCF, 0xABD13D59,
+      0x26D930AC, 0x51DE003A, 0xC8D75180, 0xBFD06116, 0x21B4F4B5, 0x56B3C423, 0xCFBA9599, 0xB8BDA50F,
+      0x2802B89E, 0x5F058808, 0xC60CD9B2, 0xB10BE924, 0x2F6F7C87, 0x58684C11, 0xC1611DAB, 0xB6662D3D,
+      0x76DC4190, 0x01DB7106, 0x98D220BC, 0xEFD5102A, 0x71B18589, 0x06B6B51F, 0x9FBFE4A5, 0xE8B8D433,
+      0x7807C9A2, 0x0F00F934, 0x9609A88E, 0xE10E9818, 0x7F6A0DBB, 0x086D3D2D, 0x91646C97, 0xE6635C01,
+      0x6B6B51F4, 0x1C6C6162, 0x856530D8, 0xF262004E, 0x6C0695ED, 0x1B01A57B, 0x8208F4C1, 0xF50FC457,
+      0x65B0D9C6, 0x12B7E950, 0x8BBEB8EA, 0xFCB9887C, 0x62DD1DDF, 0x15DA2D49, 0x8CD37CF3, 0xFBD44C65,
+      0x4DB26158, 0x3AB551CE, 0xA3BC0074, 0xD4BB30E2, 0x4ADFA541, 0x3DD895D7, 0xA4D1C46D, 0xD3D6F4FB,
+      0x4369E96A, 0x346ED9FC, 0xAD678846, 0xDA60B8D0, 0x44042D73, 0x33031DE5, 0xAA0A4C5F, 0xDD0D7CC9,
+      0x5005713C, 0x270241AA, 0xBE0B1010, 0xC90C2086, 0x5768B525, 0x206F85B3, 0xB966D409, 0xCE61E49F,
+      0x5EDEF90E, 0x29D9C998, 0xB0D09822, 0xC7D7A8B4, 0x59B33D17, 0x2EB40D81, 0xB7BD5C3B, 0xC0BA6CAD,
+      0xEDB88320, 0x9ABFB3B6, 0x03B6E20C, 0x74B1D29A, 0xEAD54739, 0x9DD277AF, 0x04DB2615, 0x73DC1683,
+      0xE3630B12, 0x94643B84, 0x0D6D6A3E, 0x7A6A5AA8, 0xE40ECF0B, 0x9309FF9D, 0x0A00AE27, 0x7D079EB1,
+      0xF00F9344, 0x8708A3D2, 0x1E01F268, 0x6906C2FE, 0xF762575D, 0x806567CB, 0x196C3671, 0x6E6B06E7,
+      0xFED41B76, 0x89D32BE0, 0x10DA7A5A, 0x67DD4ACC, 0xF9B9DF6F, 0x8EBEEFF9, 0x17B7BE43, 0x60B08ED5,
+      0xD6D6A3E8, 0xA1D1937E, 0x38D8C2C4, 0x4FDFF252, 0xD1BB67F1, 0xA6BC5767, 0x3FB506DD, 0x48B2364B,
+      0xD80D2BDA, 0xAF0A1B4C, 0x36034AF6, 0x41047A60, 0xDF60EFC3, 0xA867DF55, 0x316E8EEF, 0x4669BE79,
+      0xCB61B38C, 0xBC66831A, 0x256FD2A0, 0x5268E236, 0xCC0C7795, 0xBB0B4703, 0x220216B9, 0x5505262F,
+      0xC5BA3BBE, 0xB2BD0B28, 0x2BB45A92, 0x5CB36A04, 0xC2D7FFA7, 0xB5D0CF31, 0x2CD99E8B, 0x5BDEAE1D,
+      0x9B64C2B0, 0xEC63F226, 0x756AA39C, 0x026D930A, 0x9C0906A9, 0xEB0E363F, 0x72076785, 0x05005713,
+      0x95BF4A82, 0xE2B87A14, 0x7BB12BAE, 0x0CB61B38, 0x92D28E9B, 0xE5D5BE0D, 0x7CDCEFB7, 0x0BDBDF21,
+      0x86D3D2D4, 0xF1D4E242, 0x68DDB3F8, 0x1FDA836E, 0x81BE16CD, 0xF6B9265B, 0x6FB077E1, 0x18B74777,
+      0x88085AE6, 0xFF0F6A70, 0x66063BCA, 0x11010B5C, 0x8F659EFF, 0xF862AE69, 0x616BFFD3, 0x166CCF45,
+      0xA00AE278, 0xD70DD2EE, 0x4E048354, 0x3903B3C2, 0xA7672661, 0xD06016F7, 0x4969474D, 0x3E6E77DB,
+      0xAED16A4A, 0xD9D65ADC, 0x40DF0B66, 0x37D83BF0, 0xA9BCAE53, 0xDEBB9EC5, 0x47B2CF7F, 0x30B5FFE9,
+      0xBDBDF21C, 0xCABAC28A, 0x53B39330, 0x24B4A3A6, 0xBAD03605, 0xCDD70693, 0x54DE5729, 0x23D967BF,
+      0xB3667A2E, 0xC4614AB8, 0x5D681B02, 0x2A6F2B94, 0xB40BBE37, 0xC30C8EA1, 0x5A05DF1B, 0x2D02EF8D
+   };
+
    unsigned int crc = ~0u;
-   int i,j;
-   if (crc_table[1] == 0)
-      for(i=0; i < 256; i++)
-         for (crc_table[i]=i, j=0; j < 8; ++j)
-            crc_table[i] = (crc_table[i] >> 1) ^ (crc_table[i] & 1 ? 0xedb88320 : 0);
+   int i; // j is no longer needed now that the table is not generated at run time
    for (i=0; i < len; ++i)
       crc = (crc >> 8) ^ crc_table[buffer[i] ^ (crc & 0xff)];
    return ~crc;
 }
 
-#define stbiw__wpng4(o,a,b,c,d) ((o)[0]=(unsigned char)(a),(o)[1]=(unsigned char)(b),(o)[2]=(unsigned char)(c),(o)[3]=(unsigned char)(d),(o)+=4)
+#define stbiw__wpng4(o,a,b,c,d) ((o)[0]=STBIW_UCHAR(a),(o)[1]=STBIW_UCHAR(b),(o)[2]=STBIW_UCHAR(c),(o)[3]=STBIW_UCHAR(d),(o)+=4)
 #define stbiw__wp32(data,v) stbiw__wpng4(data, (v)>>24,(v)>>16,(v)>>8,(v));
 #define stbiw__wptag(data,s) stbiw__wpng4(data, s[0],s[1],s[2],s[3])
 
@@ -596,66 +979,94 @@ static void stbiw__wpcrc(unsigned char **data, int len)
 static unsigned char stbiw__paeth(int a, int b, int c)
 {
    int p = a + b - c, pa = abs(p-a), pb = abs(p-b), pc = abs(p-c);
-   if (pa <= pb && pa <= pc) return (unsigned char) a;
-   if (pb <= pc) return (unsigned char) b;
-   return (unsigned char) c;
+   if (pa <= pb && pa <= pc) return STBIW_UCHAR(a); // returns whichever of a, b, c is closest to p = a + b - c; ties prefer a
+   if (pb <= pc) return STBIW_UCHAR(b); // then b
+   return STBIW_UCHAR(c); // c only when it is strictly closer than b
+}
+
+// @OPTIMIZE: provide an option that always forces left-predict or paeth predict
+static void stbiw__encode_png_line(unsigned char *pixels, int stride_bytes, int width, int height, int y, int n, int filter_type, signed char *line_buffer) // filters output row y (n bytes per pixel) with filter_type 0..4 into line_buffer[0..width*n)
+{
+   static int mapping[] = { 0,1,2,3,4 }; // rows after the first: filter_type is used unchanged
+   static int firstmap[] = { 0,1,0,5,6 }; // first row: 2->0, 3->5, 4->6, variants that never read z[i-signed_stride]
+   int *mymap = (y != 0) ? mapping : firstmap;
+   int i;
+   int type = mymap[filter_type];
+   unsigned char *z = pixels + stride_bytes * (stbi__flip_vertically_on_write ? height-1-y : y); // source row; when flipping, output row y is read from source row height-1-y
+   int signed_stride = stbi__flip_vertically_on_write ? -stride_bytes : stride_bytes; // z[i-signed_stride] is the previously emitted row: one row earlier in memory normally, one row later when flipping
+   for (i = 0; i < n; ++i) { // first pixel: there is no left neighbour, so z[i-n] is never read here
+      switch (type) {
+         case 0: line_buffer[i] = z[i]; break;
+         case 1: line_buffer[i] = z[i]; break;
+         case 2: line_buffer[i] = z[i] - z[i-signed_stride]; break;
+         case 3: line_buffer[i] = z[i] - (z[i-signed_stride]>>1); break;
+         case 4: line_buffer[i] = (signed char) (z[i] - stbiw__paeth(0,z[i-signed_stride],0)); break;
+         case 5: line_buffer[i] = z[i]; break;
+         case 6: line_buffer[i] = z[i]; break;
+      }
+   }
+   for (i=n; i < width*n; ++i) { // remaining bytes: z[i-n] is the same channel of the pixel to the left
+      switch (type) {
+         case 0: line_buffer[i] = z[i]; break;
+         case 1: line_buffer[i] = z[i] - z[i-n]; break;
+         case 2: line_buffer[i] = z[i] - z[i-signed_stride]; break;
+         case 3: line_buffer[i] = z[i] - ((z[i-n] + z[i-signed_stride])>>1); break;
+         case 4: line_buffer[i] = z[i] - stbiw__paeth(z[i-n], z[i-signed_stride], z[i-signed_stride-n]); break;
+         case 5: line_buffer[i] = z[i] - (z[i-n]>>1); break;
+         case 6: line_buffer[i] = z[i] - stbiw__paeth(z[i-n], 0,0); break;
+      }
+   }
 }
 
 unsigned char *stbi_write_png_to_mem(unsigned char *pixels, int stride_bytes, int x, int y, int n, int *out_len)
 {
+   int force_filter = stbi_write_force_png_filter;
    int ctype[5] = { -1, 0, 4, 2, 6 };
    unsigned char sig[8] = { 137,80,78,71,13,10,26,10 };
    unsigned char *out,*o, *filt, *zlib;
    signed char *line_buffer;
-   int i,j,k,p,zlen;
+   int j,zlen;
 
    if (stride_bytes == 0)
       stride_bytes = x * n;
 
+   if (force_filter >= 5) {
+      force_filter = -1;
+   }
+
    filt = (unsigned char *) STBIW_MALLOC((x*n+1) * y); if (!filt) return 0;
    line_buffer = (signed char *) STBIW_MALLOC(x * n); if (!line_buffer) { STBIW_FREE(filt); return 0; }
    for (j=0; j < y; ++j) {
-      static int mapping[] = { 0,1,2,3,4 };
-      static int firstmap[] = { 0,1,0,5,6 };
-      int *mymap = j ? mapping : firstmap;
-      int best = 0, bestval = 0x7fffffff;
-      for (p=0; p < 2; ++p) {
-         for (k= p?best:0; k < 5; ++k) {
-            int type = mymap[k],est=0;
-            unsigned char *z = pixels + stride_bytes*j;
-            for (i=0; i < n; ++i)
-               switch (type) {
-                  case 0: line_buffer[i] = z[i]; break;
-                  case 1: line_buffer[i] = z[i]; break;
-                  case 2: line_buffer[i] = z[i] - z[i-stride_bytes]; break;
-                  case 3: line_buffer[i] = z[i] - (z[i-stride_bytes]>>1); break;
-                  case 4: line_buffer[i] = (signed char) (z[i] - stbiw__paeth(0,z[i-stride_bytes],0)); break;
-                  case 5: line_buffer[i] = z[i]; break;
-                  case 6: line_buffer[i] = z[i]; break;
-               }
-            for (i=n; i < x*n; ++i) {
-               switch (type) {
-                  case 0: line_buffer[i] = z[i]; break;
-                  case 1: line_buffer[i] = z[i] - z[i-n]; break;
-                  case 2: line_buffer[i] = z[i] - z[i-stride_bytes]; break;
-                  case 3: line_buffer[i] = z[i] - ((z[i-n] + z[i-stride_bytes])>>1); break;
-                  case 4: line_buffer[i] = z[i] - stbiw__paeth(z[i-n], z[i-stride_bytes], z[i-stride_bytes-n]); break;
-                  case 5: line_buffer[i] = z[i] - (z[i-n]>>1); break;
-                  case 6: line_buffer[i] = z[i] - stbiw__paeth(z[i-n], 0,0); break;
-               }
-            }
-            if (p) break;
-            for (i=0; i < x*n; ++i)
+      int filter_type;
+      if (force_filter > -1) {
+         filter_type = force_filter;
+         stbiw__encode_png_line(pixels, stride_bytes, x, y, j, n, force_filter, line_buffer);
+      } else { // Estimate the best filter by running through all of them:
+         int best_filter = 0, best_filter_val = 0x7fffffff, est, i;
+         for (filter_type = 0; filter_type < 5; filter_type++) {
+            stbiw__encode_png_line(pixels, stride_bytes, x, y, j, n, filter_type, line_buffer);
+
+            // Estimate the entropy of the line using this filter; the less, the better.
+            est = 0;
+            for (i = 0; i < x*n; ++i) {
                est += abs((signed char) line_buffer[i]);
-            if (est < bestval) { bestval = est; best = k; }
+            }
+            if (est < best_filter_val) {
+               best_filter_val = est;
+               best_filter = filter_type;
+            }
+         }
+         if (filter_type != best_filter) {  // If the last iteration already got us the best filter, don't redo it
+            stbiw__encode_png_line(pixels, stride_bytes, x, y, j, n, best_filter, line_buffer);
+            filter_type = best_filter;
          }
       }
-      // when we get here, best contains the filter type, and line_buffer contains the data
-      filt[j*(x*n+1)] = (unsigned char) best;
+      // when we get here, filter_type contains the filter type, and line_buffer contains the data
+      filt[j*(x*n+1)] = (unsigned char) filter_type;
       STBIW_MEMMOVE(filt+j*(x*n+1)+1, line_buffer, x*n);
    }
    STBIW_FREE(line_buffer);
-   zlib = stbi_zlib_compress(filt, y*( x*n+1), &zlen, 8); // increase 8 to get smaller but use more memory
+   zlib = stbi_zlib_compress(filt, y*( x*n+1), &zlen, stbi_write_png_compression_level);
    STBIW_FREE(filt);
    if (!zlib) return 0;
 
@@ -671,7 +1082,7 @@ unsigned char *stbi_write_png_to_mem(unsigned char *pixels, int stride_bytes, in
    stbiw__wp32(o, x);
    stbiw__wp32(o, y);
    *o++ = 8;
-   *o++ = (unsigned char) ctype[n];
+   *o++ = STBIW_UCHAR(ctype[n]);
    *o++ = 0;
    *o++ = 0;
    *o++ = 0;
@@ -693,22 +1104,407 @@ unsigned char *stbi_write_png_to_mem(unsigned char *pixels, int stride_bytes, in
    return out;
 }
 
-int stbi_write_png(char const *filename, int x, int y, int comp, const void *data, int stride_bytes)
+#ifndef STBI_WRITE_NO_STDIO // file-based PNG writer; left out when stdio support is compiled out
+STBIWDEF int stbi_write_png(char const *filename, int x, int y, int comp, const void *data, int stride_bytes)
 {
    FILE *f;
    int len;
    unsigned char *png = stbi_write_png_to_mem((unsigned char *) data, stride_bytes, x, y, comp, &len);
-   if (!png) return 0;
+   if (png == NULL) return 0; // the in-memory encoder produced no buffer
+#ifdef STBI_MSC_SECURE_CRT
+   if (fopen_s(&f, filename, "wb"))
+      f = NULL; // fopen_s returned nonzero: fold that into the NULL check below
+#else
    f = fopen(filename, "wb");
+#endif // NOTE(review): the fwrite() result below is not checked, so a short write still returns 1
    if (!f) { STBIW_FREE(png); return 0; }
    fwrite(png, 1, len, f);
    fclose(f);
    STBIW_FREE(png);
    return 1;
 }
+#endif // STBI_WRITE_NO_STDIO
+// Callback variant: encodes the image to an in-memory PNG, hands the whole buffer to func(context, png, len) in one call, then frees it. Returns 0 if encoding fails, 1 otherwise.
+STBIWDEF int stbi_write_png_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int stride_bytes)
+{
+   int len;
+   unsigned char *png = stbi_write_png_to_mem((unsigned char *) data, stride_bytes, x, y, comp, &len);
+   if (png == NULL) return 0;
+   func(context, png, len);
+   STBIW_FREE(png);
+   return 1;
+}
+
+
+/* ***************************************************************************
+ *
+ * JPEG writer
+ *
+ * This is based on Jon Olick's jo_jpeg.cpp:
+ * public domain Simple, Minimalistic JPEG writer - http://www.jonolick.com/code.html
+ */
+// Maps a coefficient's row-major index in the 8x8 block to its zig-zag position; the quantize loop stores coefficient i at DU[stbiw__jpg_ZigZag[i]].
+static const unsigned char stbiw__jpg_ZigZag[] = { 0,1,5,6,14,15,27,28,2,4,7,13,16,26,29,42,3,8,12,17,25,30,41,43,9,11,18,
+      24,31,40,44,53,10,19,23,32,39,45,52,54,20,22,33,38,46,51,55,60,21,34,37,47,50,56,59,61,35,36,48,49,57,58,62,63 };
+// Appends the bs[1]-bit code bs[0] to the bit accumulator (*bitBufP / *bitCntP) and flushes every complete byte to s; leftover bits stay in the accumulator for the next call.
+static void stbiw__jpg_writeBits(stbi__write_context *s, int *bitBufP, int *bitCntP, const unsigned short *bs) {
+   int bitBuf = *bitBufP, bitCnt = *bitCntP;
+   bitCnt += bs[1];
+   bitBuf |= bs[0] << (24 - bitCnt); // new bits land directly below the bits already pending in a 24-bit window
+   while(bitCnt >= 8) {
+      unsigned char c = (bitBuf >> 16) & 255; // the oldest pending byte sits in bits 16..23
+      stbiw__putc(s, c);
+      if(c == 255) {
+         stbiw__putc(s, 0); // JPEG byte stuffing: a 0xFF data byte is followed by 0x00
+      }
+      bitBuf <<= 8;
+      bitCnt -= 8;
+   }
+   *bitBufP = bitBuf;
+   *bitCntP = bitCnt;
+}
+// One 8-point forward DCT pass over the eight floats d0p..d7p; results are written back through the same pointers. NOTE(review): outputs look scaled, and the caller seems to compensate via the aasf factors in fdtbl - confirm.
+static void stbiw__jpg_DCT(float *d0p, float *d1p, float *d2p, float *d3p, float *d4p, float *d5p, float *d6p, float *d7p) {
+   float d0 = *d0p, d1 = *d1p, d2 = *d2p, d3 = *d3p, d4 = *d4p, d5 = *d5p, d6 = *d6p, d7 = *d7p;
+   float z1, z2, z3, z4, z5, z11, z13;
+
+   float tmp0 = d0 + d7;
+   float tmp7 = d0 - d7;
+   float tmp1 = d1 + d6;
+   float tmp6 = d1 - d6;
+   float tmp2 = d2 + d5;
+   float tmp5 = d2 - d5;
+   float tmp3 = d3 + d4;
+   float tmp4 = d3 - d4;
+
+   // Even part
+   float tmp10 = tmp0 + tmp3;   // phase 2
+   float tmp13 = tmp0 - tmp3;
+   float tmp11 = tmp1 + tmp2;
+   float tmp12 = tmp1 - tmp2;
+
+   d0 = tmp10 + tmp11;       // phase 3
+   d4 = tmp10 - tmp11;
+
+   z1 = (tmp12 + tmp13) * 0.707106781f; // c4
+   d2 = tmp13 + z1;       // phase 5
+   d6 = tmp13 - z1;
+
+   // Odd part
+   tmp10 = tmp4 + tmp5;       // phase 2
+   tmp11 = tmp5 + tmp6;
+   tmp12 = tmp6 + tmp7;
+
+   // The rotator is modified from fig 4-8 to avoid extra negations.
+   z5 = (tmp10 - tmp12) * 0.382683433f; // c6
+   z2 = tmp10 * 0.541196100f + z5; // c2-c6
+   z4 = tmp12 * 1.306562965f + z5; // c2+c6
+   z3 = tmp11 * 0.707106781f; // c4
+
+   z11 = tmp7 + z3;      // phase 5
+   z13 = tmp7 - z3;
+
+   *d5p = z13 + z2;         // phase 6
+   *d3p = z13 - z2;
+   *d1p = z11 + z4;
+   *d7p = z11 - z4;
+
+   *d0p = d0;  *d2p = d2;  *d4p = d4;  *d6p = d6; // even outputs were kept in locals; store them last
+}
+// Computes the extra bits for val: bits[1] = bit length of |val| (at least 1), bits[0] = the low bits[1] bits of val, or of val-1 when val is negative.
+static void stbiw__jpg_calcBits(int val, unsigned short bits[2]) {
+   int tmp1 = val < 0 ? -val : val;
+   val = val < 0 ? val-1 : val;
+   bits[1] = 1;
+   while(tmp1 >>= 1) {
+      ++bits[1];
+   }
+   bits[0] = val & ((1<<bits[1])-1);
+}
+// Transforms, quantizes and Huffman-encodes one 8x8 block (CDU is overwritten in place). DC is the previous block's quantized DC value; the return value is this block's, for the caller to pass back in.
+static int stbiw__jpg_processDU(stbi__write_context *s, int *bitBuf, int *bitCnt, float *CDU, float *fdtbl, int DC, const unsigned short HTDC[256][2], const unsigned short HTAC[256][2]) {
+   const unsigned short EOB[2] = { HTAC[0x00][0], HTAC[0x00][1] }; // written when only zeros remain
+   const unsigned short M16zeroes[2] = { HTAC[0xF0][0], HTAC[0xF0][1] }; // written once per full run of 16 zeros
+   int dataOff, i, diff, end0pos;
+   int DU[64];
+
+   // DCT rows
+   for(dataOff=0; dataOff<64; dataOff+=8) {
+      stbiw__jpg_DCT(&CDU[dataOff], &CDU[dataOff+1], &CDU[dataOff+2], &CDU[dataOff+3], &CDU[dataOff+4], &CDU[dataOff+5], &CDU[dataOff+6], &CDU[dataOff+7]);
+   }
+   // DCT columns
+   for(dataOff=0; dataOff<8; ++dataOff) {
+      stbiw__jpg_DCT(&CDU[dataOff], &CDU[dataOff+8], &CDU[dataOff+16], &CDU[dataOff+24], &CDU[dataOff+32], &CDU[dataOff+40], &CDU[dataOff+48], &CDU[dataOff+56]);
+   }
+   // Quantize/descale/zigzag the coefficients
+   for(i=0; i<64; ++i) {
+      float v = CDU[i]*fdtbl[i];
+      // DU[stbiw__jpg_ZigZag[i]] = (int)(v < 0 ? ceilf(v - 0.5f) : floorf(v + 0.5f));
+      // ceilf() and floorf() are C99, not C89, but I /think/ they're not needed here anyway?
+      DU[stbiw__jpg_ZigZag[i]] = (int)(v < 0 ? v - 0.5f : v + 0.5f); // the truncating cast after +/-0.5 rounds half away from zero
+   }
+
+   // Encode DC
+   diff = DU[0] - DC;
+   if (diff == 0) {
+      stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTDC[0]);
+   } else {
+      unsigned short bits[2];
+      stbiw__jpg_calcBits(diff, bits);
+      stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTDC[bits[1]]);
+      stbiw__jpg_writeBits(s, bitBuf, bitCnt, bits);
+   }
+   // Encode ACs
+   end0pos = 63;
+   for(; (end0pos>0)&&(DU[end0pos]==0); --end0pos) {
+   }
+   // end0pos = first element in reverse order !=0
+   if(end0pos == 0) {
+      stbiw__jpg_writeBits(s, bitBuf, bitCnt, EOB);
+      return DU[0];
+   }
+   for(i = 1; i <= end0pos; ++i) {
+      int startpos = i;
+      int nrzeroes;
+      unsigned short bits[2];
+      for (; i<=end0pos && DU[i]==0; ++i) { // test the bound before reading DU[i]
+      }
+      nrzeroes = i-startpos;
+      if ( nrzeroes >= 16 ) {
+         int lng = nrzeroes>>4;
+         int nrmarker;
+         for (nrmarker=1; nrmarker <= lng; ++nrmarker)
+            stbiw__jpg_writeBits(s, bitBuf, bitCnt, M16zeroes);
+         nrzeroes &= 15;
+      }
+      stbiw__jpg_calcBits(DU[i], bits);
+      stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTAC[(nrzeroes<<4)+bits[1]]); // table index = (zero run << 4) + bit length
+      stbiw__jpg_writeBits(s, bitBuf, bitCnt, bits);
+   }
+   if(end0pos != 63) {
+      stbiw__jpg_writeBits(s, bitBuf, bitCnt, EOB);
+   }
+   return DU[0];
+}
+// Encodes a width x height image of comp (1..4) interleaved 8-bit channels as a JPEG through s. quality is clamped to 1..100 (0 selects 90). Returns 1 on success, 0 on invalid arguments.
+static int stbi_write_jpg_core(stbi__write_context *s, int width, int height, int comp, const void* data, int quality) {
+   // Constants that don't pollute global namespace
+   static const unsigned char std_dc_luminance_nrcodes[] = {0,0,1,5,1,1,1,1,1,1,0,0,0,0,0,0,0};
+   static const unsigned char std_dc_luminance_values[] = {0,1,2,3,4,5,6,7,8,9,10,11};
+   static const unsigned char std_ac_luminance_nrcodes[] = {0,0,2,1,3,3,2,4,3,5,5,4,4,0,0,1,0x7d};
+   static const unsigned char std_ac_luminance_values[] = {
+      0x01,0x02,0x03,0x00,0x04,0x11,0x05,0x12,0x21,0x31,0x41,0x06,0x13,0x51,0x61,0x07,0x22,0x71,0x14,0x32,0x81,0x91,0xa1,0x08,
+      0x23,0x42,0xb1,0xc1,0x15,0x52,0xd1,0xf0,0x24,0x33,0x62,0x72,0x82,0x09,0x0a,0x16,0x17,0x18,0x19,0x1a,0x25,0x26,0x27,0x28,
+      0x29,0x2a,0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x53,0x54,0x55,0x56,0x57,0x58,0x59,
+      0x5a,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x83,0x84,0x85,0x86,0x87,0x88,0x89,
+      0x8a,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xb2,0xb3,0xb4,0xb5,0xb6,
+      0xb7,0xb8,0xb9,0xba,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda,0xe1,0xe2,
+      0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa
+   };
+   static const unsigned char std_dc_chrominance_nrcodes[] = {0,0,3,1,1,1,1,1,1,1,1,1,0,0,0,0,0};
+   static const unsigned char std_dc_chrominance_values[] = {0,1,2,3,4,5,6,7,8,9,10,11};
+   static const unsigned char std_ac_chrominance_nrcodes[] = {0,0,2,1,2,4,4,3,4,7,5,4,4,0,1,2,0x77};
+   static const unsigned char std_ac_chrominance_values[] = {
+      0x00,0x01,0x02,0x03,0x11,0x04,0x05,0x21,0x31,0x06,0x12,0x41,0x51,0x07,0x61,0x71,0x13,0x22,0x32,0x81,0x08,0x14,0x42,0x91,
+      0xa1,0xb1,0xc1,0x09,0x23,0x33,0x52,0xf0,0x15,0x62,0x72,0xd1,0x0a,0x16,0x24,0x34,0xe1,0x25,0xf1,0x17,0x18,0x19,0x1a,0x26,
+      0x27,0x28,0x29,0x2a,0x35,0x36,0x37,0x38,0x39,0x3a,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x53,0x54,0x55,0x56,0x57,0x58,
+      0x59,0x5a,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x82,0x83,0x84,0x85,0x86,0x87,
+      0x88,0x89,0x8a,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xb2,0xb3,0xb4,
+      0xb5,0xb6,0xb7,0xb8,0xb9,0xba,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda,
+      0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa
+   };
+   // Huffman tables
+   static const unsigned short YDC_HT[256][2] = { {0,2},{2,3},{3,3},{4,3},{5,3},{6,3},{14,4},{30,5},{62,6},{126,7},{254,8},{510,9}};
+   static const unsigned short UVDC_HT[256][2] = { {0,2},{1,2},{2,2},{6,3},{14,4},{30,5},{62,6},{126,7},{254,8},{510,9},{1022,10},{2046,11}};
+   static const unsigned short YAC_HT[256][2] = {
+      {10,4},{0,2},{1,2},{4,3},{11,4},{26,5},{120,7},{248,8},{1014,10},{65410,16},{65411,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {12,4},{27,5},{121,7},{502,9},{2038,11},{65412,16},{65413,16},{65414,16},{65415,16},{65416,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {28,5},{249,8},{1015,10},{4084,12},{65417,16},{65418,16},{65419,16},{65420,16},{65421,16},{65422,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {58,6},{503,9},{4085,12},{65423,16},{65424,16},{65425,16},{65426,16},{65427,16},{65428,16},{65429,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {59,6},{1016,10},{65430,16},{65431,16},{65432,16},{65433,16},{65434,16},{65435,16},{65436,16},{65437,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {122,7},{2039,11},{65438,16},{65439,16},{65440,16},{65441,16},{65442,16},{65443,16},{65444,16},{65445,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {123,7},{4086,12},{65446,16},{65447,16},{65448,16},{65449,16},{65450,16},{65451,16},{65452,16},{65453,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {250,8},{4087,12},{65454,16},{65455,16},{65456,16},{65457,16},{65458,16},{65459,16},{65460,16},{65461,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {504,9},{32704,15},{65462,16},{65463,16},{65464,16},{65465,16},{65466,16},{65467,16},{65468,16},{65469,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {505,9},{65470,16},{65471,16},{65472,16},{65473,16},{65474,16},{65475,16},{65476,16},{65477,16},{65478,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {506,9},{65479,16},{65480,16},{65481,16},{65482,16},{65483,16},{65484,16},{65485,16},{65486,16},{65487,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {1017,10},{65488,16},{65489,16},{65490,16},{65491,16},{65492,16},{65493,16},{65494,16},{65495,16},{65496,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {1018,10},{65497,16},{65498,16},{65499,16},{65500,16},{65501,16},{65502,16},{65503,16},{65504,16},{65505,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {2040,11},{65506,16},{65507,16},{65508,16},{65509,16},{65510,16},{65511,16},{65512,16},{65513,16},{65514,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {65515,16},{65516,16},{65517,16},{65518,16},{65519,16},{65520,16},{65521,16},{65522,16},{65523,16},{65524,16},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {2041,11},{65525,16},{65526,16},{65527,16},{65528,16},{65529,16},{65530,16},{65531,16},{65532,16},{65533,16},{65534,16},{0,0},{0,0},{0,0},{0,0},{0,0}
+   };
+   static const unsigned short UVAC_HT[256][2] = {
+      {0,2},{1,2},{4,3},{10,4},{24,5},{25,5},{56,6},{120,7},{500,9},{1014,10},{4084,12},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {11,4},{57,6},{246,8},{501,9},{2038,11},{4085,12},{65416,16},{65417,16},{65418,16},{65419,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {26,5},{247,8},{1015,10},{4086,12},{32706,15},{65420,16},{65421,16},{65422,16},{65423,16},{65424,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {27,5},{248,8},{1016,10},{4087,12},{65425,16},{65426,16},{65427,16},{65428,16},{65429,16},{65430,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {58,6},{502,9},{65431,16},{65432,16},{65433,16},{65434,16},{65435,16},{65436,16},{65437,16},{65438,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {59,6},{1017,10},{65439,16},{65440,16},{65441,16},{65442,16},{65443,16},{65444,16},{65445,16},{65446,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {121,7},{2039,11},{65447,16},{65448,16},{65449,16},{65450,16},{65451,16},{65452,16},{65453,16},{65454,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {122,7},{2040,11},{65455,16},{65456,16},{65457,16},{65458,16},{65459,16},{65460,16},{65461,16},{65462,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {249,8},{65463,16},{65464,16},{65465,16},{65466,16},{65467,16},{65468,16},{65469,16},{65470,16},{65471,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {503,9},{65472,16},{65473,16},{65474,16},{65475,16},{65476,16},{65477,16},{65478,16},{65479,16},{65480,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {504,9},{65481,16},{65482,16},{65483,16},{65484,16},{65485,16},{65486,16},{65487,16},{65488,16},{65489,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {505,9},{65490,16},{65491,16},{65492,16},{65493,16},{65494,16},{65495,16},{65496,16},{65497,16},{65498,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {506,9},{65499,16},{65500,16},{65501,16},{65502,16},{65503,16},{65504,16},{65505,16},{65506,16},{65507,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {2041,11},{65508,16},{65509,16},{65510,16},{65511,16},{65512,16},{65513,16},{65514,16},{65515,16},{65516,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {16352,14},{65517,16},{65518,16},{65519,16},{65520,16},{65521,16},{65522,16},{65523,16},{65524,16},{65525,16},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {1018,10},{32707,15},{65526,16},{65527,16},{65528,16},{65529,16},{65530,16},{65531,16},{65532,16},{65533,16},{65534,16},{0,0},{0,0},{0,0},{0,0},{0,0}
+   };
+   static const int YQT[] = {16,11,10,16,24,40,51,61,12,12,14,19,26,58,60,55,14,13,16,24,40,57,69,56,14,17,22,29,51,87,80,62,18,22,
+                             37,56,68,109,103,77,24,35,55,64,81,104,113,92,49,64,78,87,103,121,120,101,72,92,95,98,112,100,103,99};
+   static const int UVQT[] = {17,18,24,47,99,99,99,99,18,21,26,66,99,99,99,99,24,26,56,99,99,99,99,99,47,66,99,99,99,99,99,99,
+                              99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99};
+   static const float aasf[] = { 1.0f * 2.828427125f, 1.387039845f * 2.828427125f, 1.306562965f * 2.828427125f, 1.175875602f * 2.828427125f,
+                                 1.0f * 2.828427125f, 0.785694958f * 2.828427125f, 0.541196100f * 2.828427125f, 0.275899379f * 2.828427125f };
+
+   int row, col, i, k;
+   float fdtbl_Y[64], fdtbl_UV[64];
+   unsigned char YTable[64], UVTable[64];
+
+   if(!data || width <= 0 || height <= 0 || comp > 4 || comp < 1) {
+      return 0;
+   }
+
+   quality = quality ? quality : 90;
+   quality = quality < 1 ? 1 : quality > 100 ? 100 : quality;
+   quality = quality < 50 ? 5000 / quality : 200 - quality * 2;
+
+   for(i = 0; i < 64; ++i) {
+      int uvti, yti = (YQT[i]*quality+50)/100;
+      YTable[stbiw__jpg_ZigZag[i]] = (unsigned char) (yti < 1 ? 1 : yti > 255 ? 255 : yti);
+      uvti = (UVQT[i]*quality+50)/100;
+      UVTable[stbiw__jpg_ZigZag[i]] = (unsigned char) (uvti < 1 ? 1 : uvti > 255 ? 255 : uvti);
+   }
+
+   for(row = 0, k = 0; row < 8; ++row) {
+      for(col = 0; col < 8; ++col, ++k) {
+         fdtbl_Y[k]  = 1 / (YTable [stbiw__jpg_ZigZag[k]] * aasf[row] * aasf[col]);
+         fdtbl_UV[k] = 1 / (UVTable[stbiw__jpg_ZigZag[k]] * aasf[row] * aasf[col]);
+      }
+   }
+
+   // Write Headers
+   {
+      static const unsigned char head0[] = { 0xFF,0xD8,0xFF,0xE0,0,0x10,'J','F','I','F',0,1,1,0,0,1,0,1,0,0,0xFF,0xDB,0,0x84,0 };
+      static const unsigned char head2[] = { 0xFF,0xDA,0,0xC,3,1,0,2,0x11,3,0x11,0,0x3F,0 };
+      const unsigned char head1[] = { 0xFF,0xC0,0,0x11,8,(unsigned char)(height>>8),STBIW_UCHAR(height),(unsigned char)(width>>8),STBIW_UCHAR(width),
+                                      3,1,0x11,0,2,0x11,1,3,0x11,1,0xFF,0xC4,0x01,0xA2,0 };
+      s->func(s->context, (void*)head0, sizeof(head0));
+      s->func(s->context, (void*)YTable, sizeof(YTable));
+      stbiw__putc(s, 1);
+      s->func(s->context, UVTable, sizeof(UVTable));
+      s->func(s->context, (void*)head1, sizeof(head1));
+      s->func(s->context, (void*)(std_dc_luminance_nrcodes+1), sizeof(std_dc_luminance_nrcodes)-1);
+      s->func(s->context, (void*)std_dc_luminance_values, sizeof(std_dc_luminance_values));
+      stbiw__putc(s, 0x10); // HTYACinfo
+      s->func(s->context, (void*)(std_ac_luminance_nrcodes+1), sizeof(std_ac_luminance_nrcodes)-1);
+      s->func(s->context, (void*)std_ac_luminance_values, sizeof(std_ac_luminance_values));
+      stbiw__putc(s, 1); // HTUDCinfo
+      s->func(s->context, (void*)(std_dc_chrominance_nrcodes+1), sizeof(std_dc_chrominance_nrcodes)-1);
+      s->func(s->context, (void*)std_dc_chrominance_values, sizeof(std_dc_chrominance_values));
+      stbiw__putc(s, 0x11); // HTUACinfo
+      s->func(s->context, (void*)(std_ac_chrominance_nrcodes+1), sizeof(std_ac_chrominance_nrcodes)-1);
+      s->func(s->context, (void*)std_ac_chrominance_values, sizeof(std_ac_chrominance_values));
+      s->func(s->context, (void*)head2, sizeof(head2));
+   }
+
+   // Encode 8x8 macroblocks
+   {
+      static const unsigned short fillBits[] = {0x7F, 7};
+      const unsigned char *imageData = (const unsigned char *)data;
+      int DCY=0, DCU=0, DCV=0;
+      int bitBuf=0, bitCnt=0;
+      // comp == 2 is grey+alpha (alpha is ignored)
+      int ofsG = comp > 2 ? 1 : 0, ofsB = comp > 2 ? 2 : 0;
+      int x, y, pos;
+      for(y = 0; y < height; y += 8) {
+         for(x = 0; x < width; x += 8) {
+            float YDU[64], UDU[64], VDU[64];
+            for(row = y, pos = 0; row < y+8; ++row) {
+               for(col = x; col < x+8; ++col, ++pos) {
+                  // Clamp to the image before applying the vertical flip: partial edge blocks
+                  // replicate the last row/column, and flipping an unclamped row (row >= height)
+                  // would produce a negative index into imageData.
+                  int crow = row < height ? row : height-1;
+                  int ccol = col < width ? col : width-1;
+                  int srow = stbi__flip_vertically_on_write ? height-1-crow : crow;
+                  int p = srow*width*comp + ccol*comp;
+                  float r, g, b;
+
+                  r = imageData[p+0];
+                  g = imageData[p+ofsG];
+                  b = imageData[p+ofsB];
+                  YDU[pos]=+0.29900f*r+0.58700f*g+0.11400f*b-128;
+                  UDU[pos]=-0.16874f*r-0.33126f*g+0.50000f*b;
+                  VDU[pos]=+0.50000f*r-0.41869f*g-0.08131f*b;
+               }
+            }
+
+            DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, YDU, fdtbl_Y, DCY, YDC_HT, YAC_HT);
+            DCU = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, UDU, fdtbl_UV, DCU, UVDC_HT, UVAC_HT);
+            DCV = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, VDU, fdtbl_UV, DCV, UVDC_HT, UVAC_HT);
+         }
+      }
+
+      // Do the bit alignment of the EOI marker
+      stbiw__jpg_writeBits(s, &bitBuf, &bitCnt, fillBits);
+   }
+
+   // EOI
+   stbiw__putc(s, 0xFF);
+   stbiw__putc(s, 0xD9);
+
+   return 1;
+}
+// Callback variant: routes encoder output to func(context, ...) via stbi__start_write_callbacks and returns stbi_write_jpg_core's result (1 on success, 0 on invalid arguments).
+STBIWDEF int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int quality)
+{
+   stbi__write_context s;
+   stbi__start_write_callbacks(&s, func, context);
+   return stbi_write_jpg_core(&s, x, y, comp, data, quality); // core takes const void*, so no const-discarding cast is needed
+}
+
+// File variant: returns stbi_write_jpg_core's result, or 0 when stbi__start_write_file reports failure. Omitted when stdio support is compiled out.
+#ifndef STBI_WRITE_NO_STDIO
+STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void *data, int quality)
+{
+   stbi__write_context s;
+   if (stbi__start_write_file(&s,filename)) {
+      int r = stbi_write_jpg_core(&s, x, y, comp, data, quality);
+      stbi__end_write_file(&s);
+      return r;
+   } else
+      return 0;
+}
+#endif
+
 #endif // STB_IMAGE_WRITE_IMPLEMENTATION
 
 /* Revision history
+      1.09  (2018-02-11)
+             fix typo in zlib quality API, improve STB_I_W_STATIC in C++
+      1.08  (2018-01-29)
+             add stbi__flip_vertically_on_write, external zlib, zlib quality, choose PNG filter
+      1.07  (2017-07-24)
+             doc fix
+      1.06 (2017-07-23)
+             writing JPEG (using Jon Olick's code)
+      1.05   ???
+      1.04 (2017-03-03)
+             monochrome BMP expansion
+      1.03   ???
+      1.02 (2016-04-02)
+             avoid allocating large structures on the stack
+      1.01 (2016-01-16)
+             STBIW_REALLOC_SIZED: support allocators with no realloc support
+             avoid race-condition in crc initialization
+             minor compile issues
+      1.00 (2015-09-14)
+             installable file IO function
+      0.99 (2015-09-13)
+             warning fixes; TGA rle support
       0.98 (2015-04-08)
              added STBIW_MALLOC, STBIW_ASSERT etc
       0.97 (2015-01-18)
@@ -728,3 +1524,45 @@ int stbi_write_png(char const *filename, int x, int y, int comp, const void *dat
              first public release
       0.90   first internal release
 */
+
+/*
+------------------------------------------------------------------------------
+This software is available under 2 licenses -- choose whichever you prefer.
+------------------------------------------------------------------------------
+ALTERNATIVE A - MIT License
+Copyright (c) 2017 Sean Barrett
+Permission is hereby granted, free of charge, to any person obtaining a copy of 
+this software and associated documentation files (the "Software"), to deal in 
+the Software without restriction, including without limitation the rights to 
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 
+of the Software, and to permit persons to whom the Software is furnished to do 
+so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all 
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 
+SOFTWARE.
+------------------------------------------------------------------------------
+ALTERNATIVE B - Public Domain (www.unlicense.org)
+This is free and unencumbered software released into the public domain.
+Anyone is free to copy, modify, publish, use, compile, sell, or distribute this 
+software, either in source code form or as a compiled binary, for any purpose, 
+commercial or non-commercial, and by any means.
+In jurisdictions that recognize copyright laws, the author or authors of this 
+software dedicate any and all copyright interest in the software to the public 
+domain. We make this dedication for the benefit of the public at large and to 
+the detriment of our heirs and successors. We intend this dedication to be an 
+overt act of relinquishment in perpetuity of all present and future rights to 
+this software under copyright law.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
+AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+------------------------------------------------------------------------------
+*/
diff --git a/image.darknet/inst/include/darknet/src/super.c b/image.darknet/inst/include/darknet/src/super.c
deleted file mode 100644
index 63e9860..0000000
--- a/image.darknet/inst/include/darknet/src/super.c
+++ /dev/null
@@ -1,131 +0,0 @@
-#include "network.h"
-#include "cost_layer.h"
-#include "utils.h"
-#include "parser.h"
-
-#ifdef OPENCV
-#include "opencv2/highgui/highgui_c.h"
-#endif
-
-void train_super(char *cfgfile, char *weightfile)
-{
-    char *train_images = "/data/imagenet/imagenet1k.train.list";
-    char *backup_directory = "/home/pjreddie/backup/";
-    srand(time(0));
-    char *base = basecfg(cfgfile);
-    printf("%s\n", base);
-    float avg_loss = -1;
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
-    int imgs = net.batch*net.subdivisions;
-    int i = *net.seen/imgs;
-    data train, buffer;
-
-
-    list *plist = get_paths(train_images);
-    //int N = plist->size;
-    char **paths = (char **)list_to_array(plist);
-
-    load_args args = {0};
-    args.w = net.w;
-    args.h = net.h;
-    args.scale = 4;
-    args.paths = paths;
-    args.n = imgs;
-    args.m = plist->size;
-    args.d = &buffer;
-    args.type = SUPER_DATA;
-
-    pthread_t load_thread = load_data_in_thread(args);
-    clock_t time;
-    //while(i*imgs < N*120){
-    while(get_current_batch(net) < net.max_batches){
-        i += 1;
-        time=clock();
-        pthread_join(load_thread, 0);
-        train = buffer;
-        load_thread = load_data_in_thread(args);
-
-        printf("Loaded: %lf seconds\n", sec(clock()-time));
-
-        time=clock();
-        float loss = train_network(net, train);
-        if (avg_loss < 0) avg_loss = loss;
-        avg_loss = avg_loss*.9 + loss*.1;
-
-        printf("%d: %f, %f avg, %f rate, %lf seconds, %d images\n", i, loss, avg_loss, get_current_rate(net), sec(clock()-time), i*imgs);
-        if(i%1000==0){
-            char buff[256];
-            sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i);
-            save_weights(net, buff);
-        }
-        if(i%100==0){
-            char buff[256];
-            sprintf(buff, "%s/%s.backup", backup_directory, base);
-            save_weights(net, buff);
-        }
-        free_data(train);
-    }
-    char buff[256];
-    sprintf(buff, "%s/%s_final.weights", backup_directory, base);
-    save_weights(net, buff);
-}
-
-void test_super(char *cfgfile, char *weightfile, char *filename)
-{
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    set_batch_network(&net, 1);
-    srand(2222222);
-
-    clock_t time;
-    char buff[256];
-    char *input = buff;
-    while(1){
-        if(filename){
-            strncpy(input, filename, 256);
-        }else{
-            printf("Enter Image Path: ");
-            fflush(stdout);
-            input = fgets(input, 256, stdin);
-            if(!input) return;
-            strtok(input, "\n");
-        }
-        image im = load_image_color(input, 0, 0);
-        resize_network(&net, im.w, im.h);
-        printf("%d %d\n", im.w, im.h);
-
-        float *X = im.data;
-        time=clock();
-        network_predict(net, X);
-        image out = get_network_image(net);
-        printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time));
-        save_image(out, "out");
-
-        free_image(im);
-        if (filename) break;
-    }
-}
-
-
-void run_super(int argc, char **argv)
-{
-    if(argc < 4){
-        fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]);
-        return;
-    }
-
-    char *cfg = argv[3];
-    char *weights = (argc > 4) ? argv[4] : 0;
-    char *filename = (argc > 5) ? argv[5] : 0;
-    if(0==strcmp(argv[2], "train")) train_super(cfg, weights);
-    else if(0==strcmp(argv[2], "test")) test_super(cfg, weights, filename);
-    /*
-    else if(0==strcmp(argv[2], "valid")) validate_super(cfg, weights);
-    */
-}
diff --git a/image.darknet/inst/include/darknet/src/swag.c b/image.darknet/inst/include/darknet/src/swag.c
deleted file mode 100644
index 2cb3093..0000000
--- a/image.darknet/inst/include/darknet/src/swag.c
+++ /dev/null
@@ -1,91 +0,0 @@
-#include "network.h"
-#include "detection_layer.h"
-#include "cost_layer.h"
-#include "utils.h"
-#include "parser.h"
-#include "box.h"
-
-#ifdef OPENCV
-#include "opencv2/highgui/highgui_c.h"
-#endif
-
-void train_swag(char *cfgfile, char *weightfile)
-{
-    char *train_images = "data/voc.0712.trainval";
-    char *backup_directory = "/home/pjreddie/backup/";
-    srand(time(0));
-    char *base = basecfg(cfgfile);
-    printf("%s\n", base);
-    float avg_loss = -1;
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
-    int imgs = net.batch*net.subdivisions;
-    int i = *net.seen/imgs;
-    data train, buffer;
-
-    layer l = net.layers[net.n - 1];
-
-    int side = l.side;
-    int classes = l.classes;
-    float jitter = l.jitter;
-
-    list *plist = get_paths(train_images);
-    //int N = plist->size;
-    char **paths = (char **)list_to_array(plist);
-
-    load_args args = {0};
-    args.w = net.w;
-    args.h = net.h;
-    args.paths = paths;
-    args.n = imgs;
-    args.m = plist->size;
-    args.classes = classes;
-    args.jitter = jitter;
-    args.num_boxes = side;
-    args.d = &buffer;
-    args.type = REGION_DATA;
-
-    pthread_t load_thread = load_data_in_thread(args);
-    clock_t time;
-    //while(i*imgs < N*120){
-    while(get_current_batch(net) < net.max_batches){
-        i += 1;
-        time=clock();
-        pthread_join(load_thread, 0);
-        train = buffer;
-        load_thread = load_data_in_thread(args);
-
-        printf("Loaded: %lf seconds\n", sec(clock()-time));
-
-        time=clock();
-        float loss = train_network(net, train);
-        if (avg_loss < 0) avg_loss = loss;
-        avg_loss = avg_loss*.9 + loss*.1;
-
-        printf("%d: %f, %f avg, %f rate, %lf seconds, %d images\n", i, loss, avg_loss, get_current_rate(net), sec(clock()-time), i*imgs);
-        if(i%1000==0 || i == 600){
-            char buff[256];
-            sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i);
-            save_weights(net, buff);
-        }
-        free_data(train);
-    }
-    char buff[256];
-    sprintf(buff, "%s/%s_final.weights", backup_directory, base);
-    save_weights(net, buff);
-}
-
-void run_swag(int argc, char **argv)
-{
-    if(argc < 4){
-        fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]);
-        return;
-    }
-
-    char *cfg = argv[3];
-    char *weights = (argc > 4) ? argv[4] : 0;
-    if(0==strcmp(argv[2], "train")) train_swag(cfg, weights);
-}
diff --git a/image.darknet/inst/include/darknet/src/tag.c b/image.darknet/inst/include/darknet/src/tag.c
deleted file mode 100644
index 1e43e7d..0000000
--- a/image.darknet/inst/include/darknet/src/tag.c
+++ /dev/null
@@ -1,153 +0,0 @@
-#include "network.h"
-#include "utils.h"
-#include "parser.h"
-
-#ifdef OPENCV
-#include "opencv2/highgui/highgui_c.h"
-#endif
-
-void train_tag(char *cfgfile, char *weightfile, int clear)
-{
-    srand(time(0));
-    float avg_loss = -1;
-    char *base = basecfg(cfgfile);
-    char *backup_directory = "/home/pjreddie/backup/";
-    printf("%s\n", base);
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    if(clear) *net.seen = 0;
-    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
-    int imgs = 1024;
-    list *plist = get_paths("/home/pjreddie/tag/train.list");
-    char **paths = (char **)list_to_array(plist);
-    printf("%d\n", plist->size);
-    int N = plist->size;
-    clock_t time;
-    pthread_t load_thread;
-    data train;
-    data buffer;
-
-    load_args args = {0};
-    args.w = net.w;
-    args.h = net.h;
-
-    args.min = net.w;
-    args.max = net.max_crop;
-    args.size = net.w;
-
-    args.paths = paths;
-    args.classes = net.outputs;
-    args.n = imgs;
-    args.m = N;
-    args.d = &buffer;
-    args.type = TAG_DATA;
-
-    args.angle = net.angle;
-    args.exposure = net.exposure;
-    args.saturation = net.saturation;
-    args.hue = net.hue;
-
-    fprintf(stderr, "%d classes\n", net.outputs);
-
-    load_thread = load_data_in_thread(args);
-    int epoch = (*net.seen)/N;
-    while(get_current_batch(net) < net.max_batches || net.max_batches == 0){
-        time=clock();
-        pthread_join(load_thread, 0);
-        train = buffer;
-
-        load_thread = load_data_in_thread(args);
-        printf("Loaded: %lf seconds\n", sec(clock()-time));
-        time=clock();
-        float loss = train_network(net, train);
-        if(avg_loss == -1) avg_loss = loss;
-        avg_loss = avg_loss*.9 + loss*.1;
-        printf("%d, %.3f: %f, %f avg, %f rate, %lf seconds, %d images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen);
-        free_data(train);
-        if(*net.seen/N > epoch){
-            epoch = *net.seen/N;
-            char buff[256];
-            sprintf(buff, "%s/%s_%d.weights",backup_directory,base, epoch);
-            save_weights(net, buff);
-        }
-        if(get_current_batch(net)%100 == 0){
-            char buff[256];
-            sprintf(buff, "%s/%s.backup",backup_directory,base);
-            save_weights(net, buff);
-        }
-    }
-    char buff[256];
-    sprintf(buff, "%s/%s.weights", backup_directory, base);
-    save_weights(net, buff);
-
-    pthread_join(load_thread, 0);
-    free_data(buffer);
-    free_network(net);
-    free_ptrs((void**)paths, plist->size);
-    free_list(plist);
-    free(base);
-}
-
-void test_tag(char *cfgfile, char *weightfile, char *filename)
-{
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    set_batch_network(&net, 1);
-    srand(2222222);
-    int i = 0;
-    char **names = get_labels("data/tags.txt");
-    clock_t time;
-    int indexes[10];
-    char buff[256];
-    char *input = buff;
-    int size = net.w;
-    while(1){
-        if(filename){
-            strncpy(input, filename, 256);
-        }else{
-            printf("Enter Image Path: ");
-            fflush(stdout);
-            input = fgets(input, 256, stdin);
-            if(!input) return;
-            strtok(input, "\n");
-        }
-        image im = load_image_color(input, 0, 0);
-        image r = resize_min(im, size);
-        resize_network(&net, r.w, r.h);
-        printf("%d %d\n", r.w, r.h);
-
-        float *X = r.data;
-        time=clock();
-        float *predictions = network_predict(net, X);
-        top_predictions(net, 10, indexes);
-        printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time));
-        for(i = 0; i < 10; ++i){
-            int index = indexes[i];
-            printf("%.1f%%: %s\n", predictions[index]*100, names[index]);
-        }
-        if(r.data != im.data) free_image(r);
-        free_image(im);
-        if (filename) break;
-    }
-}
-
-
-void run_tag(int argc, char **argv)
-{
-    if(argc < 4){
-        fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]);
-        return;
-    }
-
-    int clear = find_arg(argc, argv, "-clear");
-    char *cfg = argv[3];
-    char *weights = (argc > 4) ? argv[4] : 0;
-    char *filename = (argc > 5) ? argv[5] : 0;
-    if(0==strcmp(argv[2], "train")) train_tag(cfg, weights, clear);
-    else if(0==strcmp(argv[2], "test")) test_tag(cfg, weights, filename);
-}
-
diff --git a/image.darknet/inst/include/darknet/src/tree.c b/image.darknet/inst/include/darknet/src/tree.c
index dd44515..67b6d43 100644
--- a/image.darknet/inst/include/darknet/src/tree.c
+++ b/image.darknet/inst/include/darknet/src/tree.c
@@ -24,33 +24,33 @@ void change_leaves(tree *t, char *leaf_list)
     fprintf(stderr, "Found %d leaves.\n", found);
 }
 
-float get_hierarchy_probability(float *x, tree *hier, int c)
+float get_hierarchy_probability(float *x, tree *hier, int c, int stride) /* product of x[] over node c and every ancestor reached via hier->parent */
 {
     float p = 1;
     while(c >= 0){
-        p = p * x[c];
+        p = p * x[c*stride]; /* node c's value is read at offset c*stride */
         c = hier->parent[c];
     }
     return p;
 }
 
-void hierarchy_predictions(float *predictions, int n, tree *hier, int only_leaves)
+void hierarchy_predictions(float *predictions, int n, tree *hier, int only_leaves, int stride) /* in place: scales each node's value by its parent's value; optionally zeroes non-leaf nodes */
 {
     int j;
     for(j = 0; j < n; ++j){
         int parent = hier->parent[j];
         if(parent >= 0){
-            predictions[j] *= predictions[parent]; 
+            predictions[j*stride] *= predictions[parent*stride]; /* node k's value lives at predictions[k*stride] */
         }
     }
     if(only_leaves){
         for(j = 0; j < n; ++j){
-            if(!hier->leaf[j]) predictions[j] = 0;
+            if(!hier->leaf[j]) predictions[j*stride] = 0; /* hier->leaf[j] == 0 means node j is not a leaf */
         }
     }
 }
 
-int hierarchy_top_prediction(float *predictions, tree *hier, float thresh)
+int hierarchy_top_prediction(float *predictions, tree *hier, float thresh, int stride)
 {
     float p = 1;
     int group = 0;
@@ -61,7 +61,7 @@ int hierarchy_top_prediction(float *predictions, tree *hier, float thresh)
 
         for(i = 0; i < hier->group_size[group]; ++i){
             int index = i + hier->group_offset[group];
-            float val = predictions[i + hier->group_offset[group]];
+            float val = predictions[(i + hier->group_offset[group])*stride];
             if(val > max){
                 max_i = index;
                 max = val;
@@ -71,6 +71,8 @@ int hierarchy_top_prediction(float *predictions, tree *hier, float thresh)
             p = p*max;
             group = hier->child[max_i];
             if(hier->child[max_i] < 0) return max_i;
+        } else if (group == 0){
+            return max_i;
         } else {
             return hier->parent[hier->group_offset[group]];
         }
diff --git a/image.darknet/inst/include/darknet/src/tree.h b/image.darknet/inst/include/darknet/src/tree.h
index dbd4c39..3802b8e 100644
--- a/image.darknet/inst/include/darknet/src/tree.h
+++ b/image.darknet/inst/include/darknet/src/tree.h
@@ -1,23 +1,8 @@
 #ifndef TREE_H
 #define TREE_H
+#include "darknet.h"
 
-typedef struct{
-    int *leaf;
-    int n;
-    int *parent;
-    int *child;
-    int *group;
-    char **name;
-
-    int groups;
-    int *group_size;
-    int *group_offset;
-} tree;
-
-tree *read_tree(char *filename);
-void hierarchy_predictions(float *predictions, int n, tree *hier, int only_leaves);
-void change_leaves(tree *t, char *leaf_list);
-int hierarchy_top_prediction(float *predictions, tree *hier, float thresh);
-float get_hierarchy_probability(float *x, tree *hier, int c);
+int hierarchy_top_prediction(float *predictions, tree *hier, float thresh, int stride);
+float get_hierarchy_probability(float *x, tree *hier, int c, int stride);
 
 #endif
diff --git a/image.darknet/inst/include/darknet/src/upsample_layer.c b/image.darknet/inst/include/darknet/src/upsample_layer.c
new file mode 100644
index 0000000..605f21f
--- /dev/null
+++ b/image.darknet/inst/include/darknet/src/upsample_layer.c
@@ -0,0 +1,106 @@
+#include "upsample_layer.h"
+#include "cuda.h"
+#include "blas.h"
+
+#include <stdio.h>
+
+layer make_upsample_layer(int batch, int w, int h, int c, int stride) /* builds an UPSAMPLE layer; a negative stride turns it into a downsampling ("reverse") layer */
+{
+    layer l = {0}; /* NOTE(review): l.scale is read by the forward/backward passes but never set here, so it stays 0 unless the caller assigns it - confirm */
+    l.type = UPSAMPLE;
+    l.batch = batch;
+    l.w = w;
+    l.h = h;
+    l.c = c;
+    l.out_w = w*stride;
+    l.out_h = h*stride;
+    l.out_c = c; /* channel count is unchanged */
+    if(stride < 0){ /* negative stride: keep |stride|, flag reverse, and shrink the output instead of growing it */
+        stride = -stride;
+        l.reverse=1;
+        l.out_w = w/stride; /* integer division */
+        l.out_h = h/stride;
+    }
+    l.stride = stride;
+    l.outputs = l.out_w*l.out_h*l.out_c; /* element counts for one sample; the buffers below hold batch samples */
+    l.inputs = l.w*l.h*l.c;
+    l.delta =  calloc(l.outputs*batch, sizeof(float)); /* allocation results are not checked */
+    l.output = calloc(l.outputs*batch, sizeof(float));;
+
+    l.forward = forward_upsample_layer;
+    l.backward = backward_upsample_layer;
+    #ifdef GPU
+    l.forward_gpu = forward_upsample_layer_gpu;
+    l.backward_gpu = backward_upsample_layer_gpu;
+
+    l.delta_gpu =  cuda_make_array(l.delta, l.outputs*batch);
+    l.output_gpu = cuda_make_array(l.output, l.outputs*batch);
+    #endif
+    if(l.reverse) fprintf(stderr, "downsample         %2dx  %4d x%4d x%4d   ->  %4d x%4d x%4d\n", stride, w, h, c, l.out_w, l.out_h, l.out_c); /* one summary line per layer on stderr */
+    else fprintf(stderr, "upsample           %2dx  %4d x%4d x%4d   ->  %4d x%4d x%4d\n", stride, w, h, c, l.out_w, l.out_h, l.out_c);
+    return l;
+}
+
+void resize_upsample_layer(layer *l, int w, int h) /* recomputes the output size for a new input w x h and resizes the output/delta buffers */
+{
+    l->w = w;
+    l->h = h;
+    l->out_w = w*l->stride;
+    l->out_h = h*l->stride;
+    if(l->reverse){ /* reverse layers shrink by stride (integer division) instead of growing */
+        l->out_w = w/l->stride;
+        l->out_h = h/l->stride;
+    }
+    l->outputs = l->out_w*l->out_h*l->out_c;
+    l->inputs = l->h*l->w*l->c;
+    l->delta =  realloc(l->delta, l->outputs*l->batch*sizeof(float)); /* realloc results are not checked; new space is not zeroed here */
+    l->output = realloc(l->output, l->outputs*l->batch*sizeof(float));
+
+#ifdef GPU
+    cuda_free(l->output_gpu); /* GPU buffers are freed and recreated from the resized host buffers */
+    cuda_free(l->delta_gpu);
+    l->output_gpu  = cuda_make_array(l->output, l->outputs*l->batch);
+    l->delta_gpu   = cuda_make_array(l->delta,  l->outputs*l->batch);
+#endif
+    
+}
+
+void forward_upsample_layer(const layer l, network net) /* CPU forward pass: clears l.output, then runs upsample_cpu between net.input and l.output */
+{
+    fill_cpu(l.outputs*l.batch, 0, l.output, 1); /* presumably sets every output element to 0 - see fill_cpu */
+    if(l.reverse){
+        upsample_cpu(l.output, l.out_w, l.out_h, l.c, l.batch, l.stride, 0, l.scale, net.input); /* reverse: buffers swapped and direction flag 0, using the (smaller) output dims */
+    }else{
+        upsample_cpu(net.input, l.w, l.h, l.c, l.batch, l.stride, 1, l.scale, l.output); /* normal: input dims, direction flag 1 */
+    }
+}
+
+void backward_upsample_layer(const layer l, network net) /* CPU backward pass: runs upsample_cpu between l.delta and net.delta with the direction flag opposite to the forward pass */
+{
+    if(l.reverse){
+        upsample_cpu(l.delta, l.out_w, l.out_h, l.c, l.batch, l.stride, 1, l.scale, net.delta); /* reverse layer: flag 1 here (forward used 0) */
+    }else{
+        upsample_cpu(net.delta, l.w, l.h, l.c, l.batch, l.stride, 0, l.scale, l.delta); /* normal layer: flag 0 here (forward used 1) */
+    }
+}
+
+#ifdef GPU
+void forward_upsample_layer_gpu(const layer l, network net) /* GPU counterpart of forward_upsample_layer: same call shape on the *_gpu buffers */
+{
+    fill_gpu(l.outputs*l.batch, 0, l.output_gpu, 1); /* presumably zeroes the output buffer - see fill_gpu */
+    if(l.reverse){
+        upsample_gpu(l.output_gpu, l.out_w, l.out_h, l.c, l.batch, l.stride, 0, l.scale, net.input_gpu); /* reverse: buffers swapped, direction flag 0 */
+    }else{
+        upsample_gpu(net.input_gpu, l.w, l.h, l.c, l.batch, l.stride, 1, l.scale, l.output_gpu); /* normal: direction flag 1 */
+    }
+}
+
+void backward_upsample_layer_gpu(const layer l, network net) /* GPU counterpart of backward_upsample_layer: same call shape on the *_gpu buffers */
+{
+    if(l.reverse){
+        upsample_gpu(l.delta_gpu, l.out_w, l.out_h, l.c, l.batch, l.stride, 1, l.scale, net.delta_gpu); /* reverse layer: flag 1 (forward used 0) */
+    }else{
+        upsample_gpu(net.delta_gpu, l.w, l.h, l.c, l.batch, l.stride, 0, l.scale, l.delta_gpu); /* normal layer: flag 0 (forward used 1) */
+    }
+}
+#endif
diff --git a/image.darknet/inst/include/darknet/src/upsample_layer.h b/image.darknet/inst/include/darknet/src/upsample_layer.h
new file mode 100644
index 0000000..86790d1
--- /dev/null
+++ b/image.darknet/inst/include/darknet/src/upsample_layer.h
@@ -0,0 +1,15 @@
+#ifndef UPSAMPLE_LAYER_H
+#define UPSAMPLE_LAYER_H
+#include "darknet.h"
+
+layer make_upsample_layer(int batch, int w, int h, int c, int stride);
+void forward_upsample_layer(const layer l, network net);
+void backward_upsample_layer(const layer l, network net);
+void resize_upsample_layer(layer *l, int w, int h);
+
+#ifdef GPU
+void forward_upsample_layer_gpu(const layer l, network net);
+void backward_upsample_layer_gpu(const layer l, network net);
+#endif
+
+#endif
diff --git a/image.darknet/inst/include/darknet/src/utils.c b/image.darknet/inst/include/darknet/src/utils.c
index b5181d7..626b467 100644
--- a/image.darknet/inst/include/darknet/src/utils.c
+++ b/image.darknet/inst/include/darknet/src/utils.c
@@ -6,9 +6,56 @@
 #include <unistd.h>
 #include <float.h>
 #include <limits.h>
+#include <time.h>
+#include <sys/time.h>
 
 #include "utils.h"
 
+
+/*
+// old timing. is it better? who knows!!
+double get_wall_time()
+{
+    struct timeval time;
+    if (gettimeofday(&time,NULL)){
+        return 0;
+    }
+    return (double)time.tv_sec + (double)time.tv_usec * .000001;
+}
+*/
+
+double what_time_is_it_now() /* current gettimeofday() time as fractional seconds; returns 0 if gettimeofday fails */
+{
+    struct timeval time;
+    if (gettimeofday(&time,NULL)){ /* non-zero return means failure */
+        return 0;
+    }
+    return (double)time.tv_sec + (double)time.tv_usec * .000001; /* whole seconds plus microseconds scaled to seconds */
+}
+
+/* Parses a comma-separated list of ints (e.g. "0,1,3") into a calloc'd array and stores the
+ * element count in *ngpus. When gpu_list is NULL the result is the single value d. */
+int *read_intlist(char *gpu_list, int *ngpus, int d)
+{
+    int *gpus = 0;
+    if(gpu_list){
+        int i, len = strlen(gpu_list);
+        *ngpus = 1;
+        for(i = 0; i < len; ++i) if (gpu_list[i] == ',') ++*ngpus; /* n commas -> n+1 entries */
+        gpus = calloc(*ngpus, sizeof(int));
+        for(i = 0; i < *ngpus; ++i){
+            gpus[i] = atoi(gpu_list);
+            char *comma = strchr(gpu_list, ',');
+            if(comma) gpu_list = comma+1; /* no comma after the last entry: do not do arithmetic on NULL */
+        }
+    } else {
+        gpus = calloc(1, sizeof(int)); /* element type is int (was sizeof(float)) */
+        *gpus = d;
+        *ngpus = 1;
+    }
+    return gpus;
+}
+
 int *read_map(char *filename)
 {
     int n = 0;
@@ -47,6 +94,22 @@ void shuffle(void *arr, size_t n, size_t size)
     }
 }
 
+/* Returns a calloc'd array of max-min ints holding min..max-1 in random (Fisher-Yates, rand()) order. */
+int *random_index_order(int min, int max)
+{
+    int i, n = max - min;
+    int *inds = calloc(n, sizeof(int));
+    /* Slots are 0..n-1; the old code wrote inds[min..max-1] and overran the array when min > 0. */
+    for(i = 0; i < n; ++i) inds[i] = min + i;
+    for(i = 0; i < n-1; ++i){
+        int swap = inds[i];
+        int index = i + rand()%(n-i); /* same rand() draws as before when min == 0 */
+        inds[i] = inds[index];
+        inds[index] = swap;
+    }
+    return inds;
+}
+
 void del_arg(int argc, char **argv, int index)
 {
     int i;
@@ -194,6 +257,21 @@ void error(const char *s)
     exit(-1);
 }
 
+/* Reads all of filename into a calloc'd buffer with one extra zero byte after the data; reports the failure and returns NULL on error. */
+unsigned char *read_file(char *filename)
+{
+    FILE *fp = fopen(filename, "rb");
+    if(!fp){ file_error(filename); return 0; } /* was: a NULL FILE* went straight into fseek */
+    long end = (fseek(fp, 0, SEEK_END) == 0) ? ftell(fp) : -1; /* ftell also yields -1 on failure */
+    if(end < 0){ fclose(fp); file_error(filename); return 0; } /* was: -1 turned into a huge size_t read length */
+    fseek(fp, 0, SEEK_SET);
+    unsigned char *text = calloc((size_t)end+1, sizeof(char));
+    if(!text){ fclose(fp); malloc_error(); return 0; } /* return 0 only matters if the error helper returns */
+    fread(text, 1, (size_t)end, fp); /* a short read leaves the remaining bytes zero */
+    fclose(fp);
+    return text;
+}
+
 void malloc_error()
 {
     fprintf(stderr, "Malloc error\n");
@@ -524,6 +602,20 @@ int sample_array(float *a, int n)
     return n-1;
 }
 
+int max_int_index(int *a, int n) /* index of the largest of the first n ints in a; -1 when n <= 0 */
+{
+    if(n <= 0) return -1;
+    int i, max_i = 0;
+    int max = a[0];
+    for(i = 1; i < n; ++i){
+        if(a[i] > max){ /* strict >, so the earliest index wins on ties */
+            max = a[i];
+            max_i = i;
+        }
+    }
+    return max_i;
+}
+
 int max_index(float *a, int n)
 {
     if(n <= 0) return -1;
@@ -538,6 +630,15 @@ int max_index(float *a, int n)
     return max_i;
 }
 
+int int_index(int *a, int val, int n) /* index of the first of the first n ints in a equal to val, or -1 if none */
+{
+    int i;
+    for(i = 0; i < n; ++i){
+        if(a[i] == val) return i;
+    }
+    return -1; /* not found (also the result when n <= 0) */
+}
+
 int rand_int(int min, int max)
 {
     if (max < min){
@@ -585,13 +686,13 @@ float rand_normal()
 size_t rand_size_t()
 {
     return  ((size_t)(rand()&0xff) << 56) | 
-            ((size_t)(rand()&0xff) << 48) |
-            ((size_t)(rand()&0xff) << 40) |
-            ((size_t)(rand()&0xff) << 32) |
-            ((size_t)(rand()&0xff) << 24) |
-            ((size_t)(rand()&0xff) << 16) |
-            ((size_t)(rand()&0xff) << 8) |
-            ((size_t)(rand()&0xff) << 0);
+        ((size_t)(rand()&0xff) << 48) |
+        ((size_t)(rand()&0xff) << 40) |
+        ((size_t)(rand()&0xff) << 32) |
+        ((size_t)(rand()&0xff) << 24) |
+        ((size_t)(rand()&0xff) << 16) |
+        ((size_t)(rand()&0xff) << 8) |
+        ((size_t)(rand()&0xff) << 0);
 }
 
 float rand_uniform(float min, float max)
diff --git a/image.darknet/inst/include/darknet/src/utils.h b/image.darknet/inst/include/darknet/src/utils.h
index bbc6765..ef24da7 100644
--- a/image.darknet/inst/include/darknet/src/utils.h
+++ b/image.darknet/inst/include/darknet/src/utils.h
@@ -2,16 +2,22 @@
 #define UTILS_H
 #include <stdio.h>
 #include <time.h>
+#include "darknet.h"
 #include "list.h"
 
-#define SECRET_NUM -1234
-#define TWO_PI 6.2831853071795864769252866
+#define TIME(a) \
+    do { \
+    double start = what_time_is_it_now(); \
+    a; \
+    printf("%s took: %f seconds\n", #a, what_time_is_it_now() - start); \
+    } while (0)
 
-int *read_map(char *filename);
+#define TWO_PI 6.2831853071795864769252866f
+
+double what_time_is_it_now();
 void shuffle(void *arr, size_t n, size_t size);
 void sorta_shuffle(void *arr, size_t n, size_t size, size_t sections);
 void free_ptrs(void **ptrs, int n);
-char *basecfg(char *cfgfile);
 int alphanum_to_int(char c);
 char int_to_alphanum(int i);
 int read_int(int fd);
@@ -21,44 +27,27 @@ void write_all(int fd, char *buffer, size_t bytes);
 int read_all_fail(int fd, char *buffer, size_t bytes);
 int write_all_fail(int fd, char *buffer, size_t bytes);
 void find_replace(char *str, char *orig, char *rep, char *output);
-void error(const char *s);
 void malloc_error();
 void file_error(char *s);
 void strip(char *s);
 void strip_char(char *s, char bad);
-void top_k(float *a, int n, int k, int *index);
 list *split_str(char *s, char delim);
 char *fgetl(FILE *fp);
 list *parse_csv_line(char *line);
 char *copy_string(char *s);
 int count_fields(char *line);
 float *parse_fields(char *line, int n);
-void normalize_array(float *a, int n);
-void scale_array(float *a, int n, float s);
 void translate_array(float *a, int n, float s);
-int max_index(float *a, int n);
 float constrain(float min, float max, float a);
 int constrain_int(int a, int min, int max);
-float mse_array(float *a, int n);
-float rand_normal();
-size_t rand_size_t();
-float rand_uniform(float min, float max);
 float rand_scale(float s);
 int rand_int(int min, int max);
-float sum_array(float *a, int n);
-float mean_array(float *a, int n);
 void mean_arrays(float **a, int n, int els, float *avg);
-float variance_array(float *a, int n);
-float mag_array(float *a, int n);
 float dist_array(float *a, float *b, int n, int sub);
 float **one_hot_encode(float *a, int n, int k);
 float sec(clock_t clocks);
-int find_int_arg(int argc, char **argv, char *arg, int def);
-float find_float_arg(int argc, char **argv, char *arg, float def);
-int find_arg(int argc, char* argv[], char *arg);
-char *find_char_arg(int argc, char **argv, char *arg, char *def);
-int sample_array(float *a, int n);
 void print_statistics(float *a, int n);
+int int_index(int *a, int val, int n);
 
 #endif
 
diff --git a/image.darknet/inst/include/darknet/src/voxel.c b/image.darknet/inst/include/darknet/src/voxel.c
deleted file mode 100644
index 1b53880..0000000
--- a/image.darknet/inst/include/darknet/src/voxel.c
+++ /dev/null
@@ -1,169 +0,0 @@
-#include "network.h"
-#include "cost_layer.h"
-#include "utils.h"
-#include "parser.h"
-
-#ifdef OPENCV
-#include "opencv2/highgui/highgui_c.h"
-image get_image_from_stream(CvCapture *cap);
-#endif
-
-void extract_voxel(char *lfile, char *rfile, char *prefix)
-{
-#ifdef OPENCV
-    int w = 1920;
-    int h = 1080;
-    int shift = 0;
-    int count = 0;
-    CvCapture *lcap = cvCaptureFromFile(lfile);
-    CvCapture *rcap = cvCaptureFromFile(rfile);
-    while(1){
-        image l = get_image_from_stream(lcap);
-        image r = get_image_from_stream(rcap);
-        if(!l.w || !r.w) break;
-        if(count%100 == 0) {
-            shift = best_3d_shift_r(l, r, -l.h/100, l.h/100);
-            printf("%d\n", shift);
-        }
-        image ls = crop_image(l, (l.w - w)/2, (l.h - h)/2, w, h);
-        image rs = crop_image(r, 105 + (r.w - w)/2, (r.h - h)/2 + shift, w, h);
-        char buff[256];
-        sprintf(buff, "%s_%05d_l", prefix, count);
-        save_image(ls, buff);
-        sprintf(buff, "%s_%05d_r", prefix, count);
-        save_image(rs, buff);
-        free_image(l);
-        free_image(r);
-        free_image(ls);
-        free_image(rs);
-        ++count;
-    }
-
-#else
-    printf("need OpenCV for extraction\n");
-#endif
-}
-
-void train_voxel(char *cfgfile, char *weightfile)
-{
-    char *train_images = "/data/imagenet/imagenet1k.train.list";
-    char *backup_directory = "/home/pjreddie/backup/";
-    srand(time(0));
-    char *base = basecfg(cfgfile);
-    printf("%s\n", base);
-    float avg_loss = -1;
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
-    int imgs = net.batch*net.subdivisions;
-    int i = *net.seen/imgs;
-    data train, buffer;
-
-
-    list *plist = get_paths(train_images);
-    //int N = plist->size;
-    char **paths = (char **)list_to_array(plist);
-
-    load_args args = {0};
-    args.w = net.w;
-    args.h = net.h;
-    args.scale = 4;
-    args.paths = paths;
-    args.n = imgs;
-    args.m = plist->size;
-    args.d = &buffer;
-    args.type = SUPER_DATA;
-
-    pthread_t load_thread = load_data_in_thread(args);
-    clock_t time;
-    //while(i*imgs < N*120){
-    while(get_current_batch(net) < net.max_batches){
-        i += 1;
-        time=clock();
-        pthread_join(load_thread, 0);
-        train = buffer;
-        load_thread = load_data_in_thread(args);
-
-        printf("Loaded: %lf seconds\n", sec(clock()-time));
-
-        time=clock();
-        float loss = train_network(net, train);
-        if (avg_loss < 0) avg_loss = loss;
-        avg_loss = avg_loss*.9 + loss*.1;
-
-        printf("%d: %f, %f avg, %f rate, %lf seconds, %d images\n", i, loss, avg_loss, get_current_rate(net), sec(clock()-time), i*imgs);
-        if(i%1000==0){
-            char buff[256];
-            sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i);
-            save_weights(net, buff);
-        }
-        if(i%100==0){
-            char buff[256];
-            sprintf(buff, "%s/%s.backup", backup_directory, base);
-            save_weights(net, buff);
-        }
-        free_data(train);
-    }
-    char buff[256];
-    sprintf(buff, "%s/%s_final.weights", backup_directory, base);
-    save_weights(net, buff);
-}
-
-void test_voxel(char *cfgfile, char *weightfile, char *filename)
-{
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    set_batch_network(&net, 1);
-    srand(2222222);
-
-    clock_t time;
-    char buff[256];
-    char *input = buff;
-    while(1){
-        if(filename){
-            strncpy(input, filename, 256);
-        }else{
-            printf("Enter Image Path: ");
-            fflush(stdout);
-            input = fgets(input, 256, stdin);
-            if(!input) return;
-            strtok(input, "\n");
-        }
-        image im = load_image_color(input, 0, 0);
-        resize_network(&net, im.w, im.h);
-        printf("%d %d\n", im.w, im.h);
-
-        float *X = im.data;
-        time=clock();
-        network_predict(net, X);
-        image out = get_network_image(net);
-        printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time));
-        save_image(out, "out");
-
-        free_image(im);
-        if (filename) break;
-    }
-}
-
-
-void run_voxel(int argc, char **argv)
-{
-    if(argc < 4){
-        fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]);
-        return;
-    }
-
-    char *cfg = argv[3];
-    char *weights = (argc > 4) ? argv[4] : 0;
-    char *filename = (argc > 5) ? argv[5] : 0;
-    if(0==strcmp(argv[2], "train")) train_voxel(cfg, weights);
-    else if(0==strcmp(argv[2], "test")) test_voxel(cfg, weights, filename);
-    else if(0==strcmp(argv[2], "extract")) extract_voxel(argv[3], argv[4], argv[5]);
-    /*
-       else if(0==strcmp(argv[2], "valid")) validate_voxel(cfg, weights);
-     */
-}
diff --git a/image.darknet/inst/include/darknet/src/yolo.c b/image.darknet/inst/include/darknet/src/yolo.c
deleted file mode 100644
index ee5f73b..0000000
--- a/image.darknet/inst/include/darknet/src/yolo.c
+++ /dev/null
@@ -1,355 +0,0 @@
-#include "network.h"
-#include "detection_layer.h"
-#include "cost_layer.h"
-#include "utils.h"
-#include "parser.h"
-#include "box.h"
-#include "demo.h"
-
-#ifdef OPENCV
-#include "opencv2/highgui/highgui_c.h"
-#endif
-
-char *voc_names[] = {"aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor"};
-
-void train_yolo(char *cfgfile, char *weightfile)
-{
-    char *train_images = "/data/voc/train.txt";
-    char *backup_directory = "/home/pjreddie/backup/";
-    srand(time(0));
-    char *base = basecfg(cfgfile);
-    printf("%s\n", base);
-    float avg_loss = -1;
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
-    int imgs = net.batch*net.subdivisions;
-    int i = *net.seen/imgs;
-    data train, buffer;
-
-
-    layer l = net.layers[net.n - 1];
-
-    int side = l.side;
-    int classes = l.classes;
-    float jitter = l.jitter;
-
-    list *plist = get_paths(train_images);
-    //int N = plist->size;
-    char **paths = (char **)list_to_array(plist);
-
-    load_args args = {0};
-    args.w = net.w;
-    args.h = net.h;
-    args.paths = paths;
-    args.n = imgs;
-    args.m = plist->size;
-    args.classes = classes;
-    args.jitter = jitter;
-    args.num_boxes = side;
-    args.d = &buffer;
-    args.type = REGION_DATA;
-
-    args.angle = net.angle;
-    args.exposure = net.exposure;
-    args.saturation = net.saturation;
-    args.hue = net.hue;
-
-    pthread_t load_thread = load_data_in_thread(args);
-    clock_t time;
-    //while(i*imgs < N*120){
-    while(get_current_batch(net) < net.max_batches){
-        i += 1;
-        time=clock();
-        pthread_join(load_thread, 0);
-        train = buffer;
-        load_thread = load_data_in_thread(args);
-
-        printf("Loaded: %lf seconds\n", sec(clock()-time));
-
-        time=clock();
-        float loss = train_network(net, train);
-        if (avg_loss < 0) avg_loss = loss;
-        avg_loss = avg_loss*.9 + loss*.1;
-
-        printf("%d: %f, %f avg, %f rate, %lf seconds, %d images\n", i, loss, avg_loss, get_current_rate(net), sec(clock()-time), i*imgs);
-        if(i%1000==0 || (i < 1000 && i%100 == 0)){
-            char buff[256];
-            sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i);
-            save_weights(net, buff);
-        }
-        free_data(train);
-    }
-    char buff[256];
-    sprintf(buff, "%s/%s_final.weights", backup_directory, base);
-    save_weights(net, buff);
-}
-
-void print_yolo_detections(FILE **fps, char *id, box *boxes, float **probs, int total, int classes, int w, int h)
-{
-    int i, j;
-    for(i = 0; i < total; ++i){
-        float xmin = boxes[i].x - boxes[i].w/2.;
-        float xmax = boxes[i].x + boxes[i].w/2.;
-        float ymin = boxes[i].y - boxes[i].h/2.;
-        float ymax = boxes[i].y + boxes[i].h/2.;
-
-        if (xmin < 0) xmin = 0;
-        if (ymin < 0) ymin = 0;
-        if (xmax > w) xmax = w;
-        if (ymax > h) ymax = h;
-
-        for(j = 0; j < classes; ++j){
-            if (probs[i][j]) fprintf(fps[j], "%s %f %f %f %f %f\n", id, probs[i][j],
-                    xmin, ymin, xmax, ymax);
-        }
-    }
-}
-
-void validate_yolo(char *cfgfile, char *weightfile)
-{
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    set_batch_network(&net, 1);
-    fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
-    srand(time(0));
-
-    char *base = "results/comp4_det_test_";
-    //list *plist = get_paths("data/voc.2007.test");
-    list *plist = get_paths("/home/pjreddie/data/voc/2007_test.txt");
-    //list *plist = get_paths("data/voc.2012.test");
-    char **paths = (char **)list_to_array(plist);
-
-    layer l = net.layers[net.n-1];
-    int classes = l.classes;
-
-    int j;
-    FILE **fps = calloc(classes, sizeof(FILE *));
-    for(j = 0; j < classes; ++j){
-        char buff[1024];
-        snprintf(buff, 1024, "%s%s.txt", base, voc_names[j]);
-        fps[j] = fopen(buff, "w");
-    }
-    box *boxes = calloc(l.side*l.side*l.n, sizeof(box));
-    float **probs = calloc(l.side*l.side*l.n, sizeof(float *));
-    for(j = 0; j < l.side*l.side*l.n; ++j) probs[j] = calloc(classes, sizeof(float *));
-
-    int m = plist->size;
-    int i=0;
-    int t;
-
-    float thresh = .001;
-    int nms = 1;
-    float iou_thresh = .5;
-
-    int nthreads = 8;
-    image *val = calloc(nthreads, sizeof(image));
-    image *val_resized = calloc(nthreads, sizeof(image));
-    image *buf = calloc(nthreads, sizeof(image));
-    image *buf_resized = calloc(nthreads, sizeof(image));
-    pthread_t *thr = calloc(nthreads, sizeof(pthread_t));
-
-    load_args args = {0};
-    args.w = net.w;
-    args.h = net.h;
-    args.type = IMAGE_DATA;
-
-    for(t = 0; t < nthreads; ++t){
-        args.path = paths[i+t];
-        args.im = &buf[t];
-        args.resized = &buf_resized[t];
-        thr[t] = load_data_in_thread(args);
-    }
-    time_t start = time(0);
-    for(i = nthreads; i < m+nthreads; i += nthreads){
-        fprintf(stderr, "%d\n", i);
-        for(t = 0; t < nthreads && i+t-nthreads < m; ++t){
-            pthread_join(thr[t], 0);
-            val[t] = buf[t];
-            val_resized[t] = buf_resized[t];
-        }
-        for(t = 0; t < nthreads && i+t < m; ++t){
-            args.path = paths[i+t];
-            args.im = &buf[t];
-            args.resized = &buf_resized[t];
-            thr[t] = load_data_in_thread(args);
-        }
-        for(t = 0; t < nthreads && i+t-nthreads < m; ++t){
-            char *path = paths[i+t-nthreads];
-            char *id = basecfg(path);
-            float *X = val_resized[t].data;
-            network_predict(net, X);
-            int w = val[t].w;
-            int h = val[t].h;
-            get_detection_boxes(l, w, h, thresh, probs, boxes, 0);
-            if (nms) do_nms_sort(boxes, probs, l.side*l.side*l.n, classes, iou_thresh);
-            print_yolo_detections(fps, id, boxes, probs, l.side*l.side*l.n, classes, w, h);
-            free(id);
-            free_image(val[t]);
-            free_image(val_resized[t]);
-        }
-    }
-    fprintf(stderr, "Total Detection Time: %f Seconds\n", (double)(time(0) - start));
-}
-
-void validate_yolo_recall(char *cfgfile, char *weightfile)
-{
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    set_batch_network(&net, 1);
-    fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
-    srand(time(0));
-
-    char *base = "results/comp4_det_test_";
-    list *plist = get_paths("data/voc.2007.test");
-    char **paths = (char **)list_to_array(plist);
-
-    layer l = net.layers[net.n-1];
-    int classes = l.classes;
-    int side = l.side;
-
-    int j, k;
-    FILE **fps = calloc(classes, sizeof(FILE *));
-    for(j = 0; j < classes; ++j){
-        char buff[1024];
-        snprintf(buff, 1024, "%s%s.txt", base, voc_names[j]);
-        fps[j] = fopen(buff, "w");
-    }
-    box *boxes = calloc(side*side*l.n, sizeof(box));
-    float **probs = calloc(side*side*l.n, sizeof(float *));
-    for(j = 0; j < side*side*l.n; ++j) probs[j] = calloc(classes, sizeof(float *));
-
-    int m = plist->size;
-    int i=0;
-
-    float thresh = .001;
-    float iou_thresh = .5;
-    float nms = 0;
-
-    int total = 0;
-    int correct = 0;
-    int proposals = 0;
-    float avg_iou = 0;
-
-    for(i = 0; i < m; ++i){
-        char *path = paths[i];
-        image orig = load_image_color(path, 0, 0);
-        image sized = resize_image(orig, net.w, net.h);
-        char *id = basecfg(path);
-        network_predict(net, sized.data);
-        get_detection_boxes(l, orig.w, orig.h, thresh, probs, boxes, 1);
-        if (nms) do_nms(boxes, probs, side*side*l.n, 1, nms);
-
-        char labelpath[4096];
-        find_replace(path, "images", "labels", labelpath);
-        find_replace(labelpath, "JPEGImages", "labels", labelpath);
-        find_replace(labelpath, ".jpg", ".txt", labelpath);
-        find_replace(labelpath, ".JPEG", ".txt", labelpath);
-
-        int num_labels = 0;
-        box_label *truth = read_boxes(labelpath, &num_labels);
-        for(k = 0; k < side*side*l.n; ++k){
-            if(probs[k][0] > thresh){
-                ++proposals;
-            }
-        }
-        for (j = 0; j < num_labels; ++j) {
-            ++total;
-            box t = {truth[j].x, truth[j].y, truth[j].w, truth[j].h};
-            float best_iou = 0;
-            for(k = 0; k < side*side*l.n; ++k){
-                float iou = box_iou(boxes[k], t);
-                if(probs[k][0] > thresh && iou > best_iou){
-                    best_iou = iou;
-                }
-            }
-            avg_iou += best_iou;
-            if(best_iou > iou_thresh){
-                ++correct;
-            }
-        }
-
-        fprintf(stderr, "%5d %5d %5d\tRPs/Img: %.2f\tIOU: %.2f%%\tRecall:%.2f%%\n", i, correct, total, (float)proposals/(i+1), avg_iou*100/total, 100.*correct/total);
-        free(id);
-        free_image(orig);
-        free_image(sized);
-    }
-}
-
-void test_yolo(char *cfgfile, char *weightfile, char *filename, float thresh)
-{
-    image **alphabet = load_alphabet();
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    detection_layer l = net.layers[net.n-1];
-    set_batch_network(&net, 1);
-    srand(2222222);
-    clock_t time;
-    char buff[256];
-    char *input = buff;
-    int j;
-    float nms=.4;
-    box *boxes = calloc(l.side*l.side*l.n, sizeof(box));
-    float **probs = calloc(l.side*l.side*l.n, sizeof(float *));
-    for(j = 0; j < l.side*l.side*l.n; ++j) probs[j] = calloc(l.classes, sizeof(float *));
-    while(1){
-        if(filename){
-            strncpy(input, filename, 256);
-        } else {
-            printf("Enter Image Path: ");
-            fflush(stdout);
-            input = fgets(input, 256, stdin);
-            if(!input) return;
-            strtok(input, "\n");
-        }
-        image im = load_image_color(input,0,0);
-        image sized = resize_image(im, net.w, net.h);
-        float *X = sized.data;
-        time=clock();
-        network_predict(net, X);
-        printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time));
-        get_detection_boxes(l, 1, 1, thresh, probs, boxes, 0);
-        if (nms) do_nms_sort(boxes, probs, l.side*l.side*l.n, l.classes, nms);
-        //draw_detections(im, l.side*l.side*l.n, thresh, boxes, probs, voc_names, alphabet, 20);
-        draw_detections(im, l.side*l.side*l.n, thresh, boxes, probs, voc_names, alphabet, 20);
-        save_image(im, "predictions");
-        show_image(im, "predictions");
-
-        free_image(im);
-        free_image(sized);
-#ifdef OPENCV
-        cvWaitKey(0);
-        cvDestroyAllWindows();
-#endif
-        if (filename) break;
-    }
-}
-
-void run_yolo(int argc, char **argv)
-{
-    char *prefix = find_char_arg(argc, argv, "-prefix", 0);
-    float thresh = find_float_arg(argc, argv, "-thresh", .2);
-    int cam_index = find_int_arg(argc, argv, "-c", 0);
-    int frame_skip = find_int_arg(argc, argv, "-s", 0);
-    if(argc < 4){
-        fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]);
-        return;
-    }
-
-    char *cfg = argv[3];
-    char *weights = (argc > 4) ? argv[4] : 0;
-    char *filename = (argc > 5) ? argv[5]: 0;
-    if(0==strcmp(argv[2], "test")) test_yolo(cfg, weights, filename, thresh);
-    else if(0==strcmp(argv[2], "train")) train_yolo(cfg, weights);
-    else if(0==strcmp(argv[2], "valid")) validate_yolo(cfg, weights);
-    else if(0==strcmp(argv[2], "recall")) validate_yolo_recall(cfg, weights);
-    else if(0==strcmp(argv[2], "demo")) demo(cfg, weights, thresh, cam_index, filename, voc_names, 20, frame_skip, prefix, .5);
-}
diff --git a/image.darknet/inst/include/darknet/src/yolo_layer.c b/image.darknet/inst/include/darknet/src/yolo_layer.c
new file mode 100644
index 0000000..c338036
--- /dev/null
+++ b/image.darknet/inst/include/darknet/src/yolo_layer.c
@@ -0,0 +1,374 @@
+#include "yolo_layer.h"
+#include "activations.h"
+#include "blas.h"
+#include "box.h"
+#include "cuda.h"
+#include "utils.h"
+
+#include <stdio.h>
+#include <assert.h>
+#include <string.h>
+#include <stdlib.h>
+
+layer make_yolo_layer(int batch, int w, int h, int n, int total, int *mask, int classes)
+{ /* Build a YOLO layer over a w x h grid: n boxes per cell drawn from `total` anchors, `classes` class scores per box. */
+    int i;
+    layer l = {0};
+    l.type = YOLO;
+
+    l.n = n;
+    l.total = total;
+    l.batch = batch;
+    l.h = h;
+    l.w = w;
+    l.c = n*(classes + 4 + 1); /* per box: 4 coordinates + 1 objectness + one score per class */
+    l.out_w = l.w;
+    l.out_h = l.h;
+    l.out_c = l.c;
+    l.classes = classes;
+    l.cost = calloc(1, sizeof(float));
+    l.biases = calloc(total*2, sizeof(float)); /* two floats per anchor; get_yolo_box reads them as (width, height) */
+    if(mask) l.mask = mask; /* a caller-supplied mask is stored as-is, not copied */
+    else{
+        l.mask = calloc(n, sizeof(int));
+        for(i = 0; i < n; ++i){
+            l.mask[i] = i; /* default: use the first n anchors in order */
+        }
+    }
+    l.bias_updates = calloc(n*2, sizeof(float));
+    l.outputs = h*w*n*(classes + 4 + 1);
+    l.inputs = l.outputs;
+    l.truths = 90*(4 + 1); /* truth floats per image: 90 boxes x (4 coords + class id). NOTE(review): forward_yolo_layer loops to l.max_boxes, which is not set here -- confirm it never exceeds 90 */
+    l.delta = calloc(batch*l.outputs, sizeof(float));
+    l.output = calloc(batch*l.outputs, sizeof(float));
+    for(i = 0; i < total*2; ++i){
+        l.biases[i] = .5; /* placeholder anchor size; presumably overwritten by the cfg parser -- TODO confirm */
+    }
+
+    l.forward = forward_yolo_layer;
+    l.backward = backward_yolo_layer;
+#ifdef GPU
+    l.forward_gpu = forward_yolo_layer_gpu;
+    l.backward_gpu = backward_yolo_layer_gpu;
+    l.output_gpu = cuda_make_array(l.output, batch*l.outputs);
+    l.delta_gpu = cuda_make_array(l.delta, batch*l.outputs);
+#endif
+
+    fprintf(stderr, "yolo\n");
+    srand(0); /* NOTE(review): reseeds the global C RNG as a side effect of constructing the layer */
+
+    return l;
+}
+
+void resize_yolo_layer(layer *l, int w, int h)
+{ /* Re-size the layer for a new w x h grid and reallocate its host (and, under GPU, device) buffers. */
+    l->w = l->out_w = w; /* keep out_w/out_h equal to w/h, the invariant make_yolo_layer establishes */
+    l->h = l->out_h = h;
+
+    l->outputs = h*w*l->n*(l->classes + 4 + 1);
+    l->inputs = l->outputs;
+
+    l->output = realloc(l->output, l->batch*l->outputs*sizeof(float)); /* NOTE(review): realloc results are not checked */
+    l->delta = realloc(l->delta, l->batch*l->outputs*sizeof(float));
+
+#ifdef GPU
+    cuda_free(l->delta_gpu);
+    cuda_free(l->output_gpu);
+
+    l->delta_gpu =     cuda_make_array(l->delta, l->batch*l->outputs);
+    l->output_gpu =    cuda_make_array(l->output, l->batch*l->outputs);
+#endif
+}
+
+box get_yolo_box(float *x, float *biases, int n, int index, int i, int j, int lw, int lh, int w, int h, int stride)
+{ /* Decode the four raw values at x[index + k*stride], k = 0..3, into a box using grid cell (i, j) and anchor n. */
+    box decoded;
+    decoded.x = (i + x[index]) / lw;                               /* cell column plus predicted offset, over grid width */
+    decoded.y = (j + x[index + stride]) / lh;                      /* cell row plus predicted offset, over grid height */
+    decoded.w = exp(x[index + 2*stride]) * biases[2*n]   / w;      /* anchor width scaled by exp(raw), over w */
+    decoded.h = exp(x[index + 3*stride]) * biases[2*n+1] / h;      /* anchor height scaled by exp(raw), over h */
+    return decoded;
+}
+
+float delta_yolo_box(box truth, float *x, float *biases, int n, int index, int i, int j, int lw, int lh, int w, int h, float *delta, float scale, int stride)
+{ /* Write the scaled box-regression gradient for one prediction into delta; returns the IoU of the current prediction with truth. */
+    int k;
+    float target[4]; /* truth re-expressed in raw-output space, in the order x, y, w, h */
+    float iou = box_iou(get_yolo_box(x, biases, n, index, i, j, lw, lh, w, h, stride), truth);
+
+    target[0] = (truth.x*lw - i);
+    target[1] = (truth.y*lh - j);
+    target[2] = log(truth.w*w / biases[2*n]);
+    target[3] = log(truth.h*h / biases[2*n + 1]);
+
+    for(k = 0; k < 4; ++k){
+        delta[index + k*stride] = scale * (target[k] - x[index + k*stride]);
+    }
+    return iou;
+}
+
+
+void delta_yolo_class(float *output, float *delta, int index, int class, int classes, int stride, float *avg_cat)
+{ /* Fill class gradients for one box; if avg_cat is non-NULL, add the true class's current score to it. */
+    int c, hit = index + stride*class; /* hit: slot of the ground-truth class */
+    if (delta[index] != 0){ /* first class slot already carries a gradient: only push the true class towards 1 */
+        delta[hit] = 1 - output[hit];
+        if(avg_cat) *avg_cat += output[hit];
+        return;
+    }
+    for(c = 0; c < classes; ++c){ /* otherwise target 1 for the true class and 0 for every other class */
+        delta[index + stride*c] = ((c == class)?1 : 0) - output[index + stride*c];
+        if(avg_cat && c == class) *avg_cat += output[hit];
+    }
+}
+
+static int entry_index(layer l, int batch, int location, int entry)
+{ /* Flat offset into l.output of channel `entry` for the (anchor, cell) pair encoded as location = anchor*w*h + cell. */
+    const int area = l.w*l.h; /* cells per channel plane */
+    const int anchor = location / area, cell = location % area;
+    return batch*l.outputs + anchor*area*(4+l.classes+1) + entry*area + cell;
+}
+
+void forward_yolo_layer(const layer l, network net)
+{ /* Forward pass: copy net.input to l.output, apply logistic activations (non-GPU builds), and when net.train is set fill l.delta and l.cost from net.truth. */
+    int i,j,b,t,n;
+    memcpy(l.output, net.input, l.outputs*l.batch*sizeof(float));
+
+#ifndef GPU
+    for (b = 0; b < l.batch; ++b){
+        for(n = 0; n < l.n; ++n){
+            int index = entry_index(l, b, n*l.w*l.h, 0);
+            activate_array(l.output + index, 2*l.w*l.h, LOGISTIC); /* entries 0-1 (x, y offsets) of anchor n */
+            index = entry_index(l, b, n*l.w*l.h, 4);
+            activate_array(l.output + index, (1+l.classes)*l.w*l.h, LOGISTIC); /* entry 4 (objectness) and all class scores */
+        }
+    }
+#endif
+
+    memset(l.delta, 0, l.outputs * l.batch * sizeof(float));
+    if(!net.train) return; /* inference stops here with l.delta zeroed */
+    float avg_iou = 0;
+    float recall = 0;
+    float recall75 = 0;
+    float avg_cat = 0;
+    float avg_obj = 0;
+    float avg_anyobj = 0;
+    int count = 0;
+    int class_count = 0;
+    *(l.cost) = 0;
+    for (b = 0; b < l.batch; ++b) {
+        for (j = 0; j < l.h; ++j) {
+            for (i = 0; i < l.w; ++i) {
+                for (n = 0; n < l.n; ++n) { /* pass 1: objectness target for every predicted box */
+                    int box_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 0);
+                    box pred = get_yolo_box(l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, net.w, net.h, l.w*l.h);
+                    float best_iou = 0;
+                    int best_t = 0;
+                    for(t = 0; t < l.max_boxes; ++t){
+                        box truth = float_to_box(net.truth + t*(4 + 1) + b*l.truths, 1);
+                        if(!truth.x) break; /* a zero x ends this image's truth list */
+                        float iou = box_iou(pred, truth);
+                        if (iou > best_iou) {
+                            best_iou = iou;
+                            best_t = t;
+                        }
+                    }
+                    int obj_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4);
+                    avg_anyobj += l.output[obj_index];
+                    l.delta[obj_index] = 0 - l.output[obj_index]; /* default target: no object */
+                    if (best_iou > l.ignore_thresh) {
+                        l.delta[obj_index] = 0; /* overlaps a truth enough: no objectness gradient */
+                    }
+                    if (best_iou > l.truth_thresh) { /* overlap high enough to train this box directly on its best truth */
+                        l.delta[obj_index] = 1 - l.output[obj_index];
+
+                        int class = net.truth[best_t*(4 + 1) + b*l.truths + 4];
+                        if (l.map) class = l.map[class];
+                        int class_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4 + 1);
+                        delta_yolo_class(l.output, l.delta, class_index, class, l.classes, l.w*l.h, 0);
+                        box truth = float_to_box(net.truth + best_t*(4 + 1) + b*l.truths, 1);
+                        delta_yolo_box(truth, l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, net.w, net.h, l.delta, (2-truth.w*truth.h), l.w*l.h);
+                    }
+                }
+            }
+        }
+        for(t = 0; t < l.max_boxes; ++t){ /* pass 2: assign each truth box to its best-shaped anchor */
+            box truth = float_to_box(net.truth + t*(4 + 1) + b*l.truths, 1);
+
+            if(!truth.x) break;
+            float best_iou = 0;
+            int best_n = 0;
+            i = (truth.x * l.w); /* grid cell containing the truth centre */
+            j = (truth.y * l.h);
+            box truth_shift = truth;
+            truth_shift.x = truth_shift.y = 0; /* compare shapes only: both boxes sit at the origin */
+            for(n = 0; n < l.total; ++n){
+                box pred = {0};
+                pred.w = l.biases[2*n]/net.w;
+                pred.h = l.biases[2*n+1]/net.h;
+                float iou = box_iou(pred, truth_shift);
+                if (iou > best_iou){
+                    best_iou = iou;
+                    best_n = n;
+                }
+            }
+
+            int mask_n = int_index(l.mask, best_n, l.n); /* presumably the position of best_n within l.mask, negative if absent -- confirm int_index */
+            if(mask_n >= 0){
+                int box_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 0);
+                float iou = delta_yolo_box(truth, l.output, l.biases, best_n, box_index, i, j, l.w, l.h, net.w, net.h, l.delta, (2-truth.w*truth.h), l.w*l.h);
+
+                int obj_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 4);
+                avg_obj += l.output[obj_index];
+                l.delta[obj_index] = 1 - l.output[obj_index];
+
+                int class = net.truth[t*(4 + 1) + b*l.truths + 4];
+                if (l.map) class = l.map[class];
+                int class_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 4 + 1);
+                delta_yolo_class(l.output, l.delta, class_index, class, l.classes, l.w*l.h, &avg_cat);
+
+                ++count;
+                ++class_count;
+                if(iou > .5) recall += 1;
+                if(iou > .75) recall75 += 1;
+                avg_iou += iou;
+            }
+        }
+    }
+    *(l.cost) = pow(mag_array(l.delta, l.outputs * l.batch), 2); /* presumably the squared L2 norm of delta -- confirm mag_array */
+    printf("Region %d Avg IOU: %f, Class: %f, Obj: %f, No Obj: %f, .5R: %f, .75R: %f,  count: %d\n", net.index, avg_iou/count, avg_cat/class_count, avg_obj/count, avg_anyobj/(l.w*l.h*l.n*l.batch), recall/count, recall75/count, count); /* NOTE(review): count == 0 makes these averages divide by zero */
+}
+
+void backward_yolo_layer(const layer l, network net)
+{ /* Backward pass: hands l.delta on to net.delta over l.batch*l.inputs floats. */
+   axpy_cpu(l.batch*l.inputs, 1, l.delta, 1, net.delta, 1); /* presumably net.delta += 1 * l.delta -- confirm axpy_cpu */
+}
+
+void correct_yolo_boxes(detection *dets, int n, int w, int h, int netw, int neth, int relative)
+{ /* Map n boxes from letterboxed network coordinates back to the w x h image; scaled to pixels unless `relative` is set. */
+    int k;
+    /* Size of the image once fitted inside netw x neth with its aspect ratio kept. */
+    int fit_w = netw;
+    int fit_h = neth;
+    if (((float)netw/w) < ((float)neth/h)) fit_h = (h * netw)/w;
+    else                                    fit_w = (w * neth)/h;
+    /* Loop invariants, typed as in the per-box expressions they replace. */
+    const double pad_x = (netw - fit_w)/2./netw;  /* left padding as a fraction of netw */
+    const double pad_y = (neth - fit_h)/2./neth;  /* top padding as a fraction of neth */
+    const float shrink_x = (float)fit_w/netw;
+    const float shrink_y = (float)fit_h/neth;
+    const float grow_x = (float)netw/fit_w;
+    const float grow_y = (float)neth/fit_h;
+    for (k = 0; k < n; ++k){
+        box *b = &dets[k].bbox; /* updated in place */
+        b->x = (b->x - pad_x) / shrink_x;
+        b->y = (b->y - pad_y) / shrink_y;
+        b->w *= grow_x;
+        b->h *= grow_y;
+        if(relative) continue; /* caller wants coordinates relative to the image */
+        b->x *= w;
+        b->w *= w;
+        b->y *= h;
+        b->h *= h;
+    }
+}
+
+int yolo_num_detections(layer l, float thresh)
+{ /* Count the boxes of batch item 0 whose objectness score exceeds thresh. */
+    const int cells = l.w*l.h;
+    int cell, anchor, hits = 0;
+    for (cell = 0; cell < cells; ++cell){
+        for(anchor = 0; anchor < l.n; ++anchor){
+            /* entry 4 of each box is the score compared against thresh */
+            const float objectness = l.output[entry_index(l, 0, anchor*cells + cell, 4)];
+            hits += (objectness > thresh);
+        }
+    }
+    /* entry_index is always called with batch 0 above */
+    return hits;
+}
+
+void avg_flipped_yolo(layer l)
+{ /* Mirror the second batch item's output left-to-right in place, then average it into the first batch item. */
+    int i,j,n,z;
+    float *flip = l.output + l.outputs; /* start of the second batch item */
+    for (j = 0; j < l.h; ++j) {
+        for (i = 0; i < l.w/2; ++i) { /* swap column i with its mirror column w-1-i */
+            for (n = 0; n < l.n; ++n) {
+                for(z = 0; z < l.classes + 4 + 1; ++z){
+                    int i1 = z*l.w*l.h*l.n + n*l.w*l.h + j*l.w + i;
+                    int i2 = z*l.w*l.h*l.n + n*l.w*l.h + j*l.w + (l.w - i - 1);
+                    float swap = flip[i1];
+                    flip[i1] = flip[i2];
+                    flip[i2] = swap;
+                    if(z == 0){ /* NOTE(review): planes are addressed as z*n_anchors + n here but as n*(4+classes+1) + entry in entry_index, so z == 0 may not select each anchor's x plane -- confirm */
+                        flip[i1] = -flip[i1];
+                        flip[i2] = -flip[i2];
+                    }
+                }
+            }
+        }
+    }
+    for(i = 0; i < l.outputs; ++i){
+        l.output[i] = (l.output[i] + flip[i])/2.; /* mean of the original and the un-mirrored copy */
+    }
+}
+
+int get_yolo_detections(layer l, int w, int h, int netw, int neth, float thresh, int *map, int relative, detection *dets)
+{ /* Fill dets with every box of batch item 0 whose objectness exceeds thresh; returns how many were written. */
+    const int cells = l.w*l.h;
+    float *out = l.output;
+    int cell, anchor, c, found = 0;
+    /* With a batch of exactly two, the second item is averaged into the first before decoding. */
+    if (l.batch == 2) avg_flipped_yolo(l);
+    for (cell = 0; cell < cells; ++cell){
+        for(anchor = 0; anchor < l.n; ++anchor){
+            const int loc = anchor*cells + cell;
+            const float objectness = out[entry_index(l, 0, loc, 4)];
+            if(objectness <= thresh) continue;
+            detection *d = dets + found;
+            /* column = cell % l.w, row = cell / l.w */
+            d->bbox = get_yolo_box(out, l.biases, l.mask[anchor], entry_index(l, 0, loc, 0), cell % l.w, cell / l.w, l.w, l.h, netw, neth, cells);
+            d->objectness = objectness;
+            d->classes = l.classes;
+            for(c = 0; c < l.classes; ++c){
+                /* class probability = objectness * class score; values at or below thresh become 0 */
+                const float prob = objectness*out[entry_index(l, 0, loc, 4 + 1 + c)];
+                d->prob[c] = (prob > thresh) ? prob : 0;
+            }
+            ++found;
+        }
+    }
+    correct_yolo_boxes(dets, found, w, h, netw, neth, relative);
+    return found;
+}
+
+#ifdef GPU
+
+void forward_yolo_layer_gpu(const layer l, network net)
+{ /* GPU forward: copy input to output on the device and apply logistic activations there; when training, run forward_yolo_layer on the host for the loss. */
+    copy_gpu(l.batch*l.inputs, net.input_gpu, 1, l.output_gpu, 1);
+    int b, n;
+    for (b = 0; b < l.batch; ++b){
+        for(n = 0; n < l.n; ++n){
+            int index = entry_index(l, b, n*l.w*l.h, 0);
+            activate_array_gpu(l.output_gpu + index, 2*l.w*l.h, LOGISTIC); /* entries 0-1 (x, y offsets) */
+            index = entry_index(l, b, n*l.w*l.h, 4);
+            activate_array_gpu(l.output_gpu + index, (1+l.classes)*l.w*l.h, LOGISTIC); /* entry 4 (objectness) and class scores */
+        }
+    }
+    if(!net.train || l.onlyforward){
+        cuda_pull_array(l.output_gpu, l.output, l.batch*l.outputs); /* inference: only the activated output is fetched into l.output */
+        return;
+    }
+
+    cuda_pull_array(l.output_gpu, net.input, l.batch*l.inputs); /* activated output goes to net.input; forward_yolo_layer copies it and skips its own activations under GPU */
+    forward_yolo_layer(l, net);
+    cuda_push_array(l.delta_gpu, l.delta, l.batch*l.outputs); /* hand the host-computed l.delta to l.delta_gpu */
+}
+
+void backward_yolo_layer_gpu(const layer l, network net)
+{ /* GPU backward: hands l.delta_gpu on to net.delta_gpu over l.batch*l.inputs floats. */
+    axpy_gpu(l.batch*l.inputs, 1, l.delta_gpu, 1, net.delta_gpu, 1); /* presumably net.delta_gpu += 1 * l.delta_gpu -- confirm axpy_gpu */
+}
+#endif
+
diff --git a/image.darknet/inst/include/darknet/src/yolo_layer.h b/image.darknet/inst/include/darknet/src/yolo_layer.h
new file mode 100644
index 0000000..d2a0243
--- /dev/null
+++ b/image.darknet/inst/include/darknet/src/yolo_layer.h
@@ -0,0 +1,19 @@
+#ifndef YOLO_LAYER_H
+#define YOLO_LAYER_H
+
+#include "darknet.h"
+#include "layer.h"
+#include "network.h"
+
+layer make_yolo_layer(int batch, int w, int h, int n, int total, int *mask, int classes); /* construct a YOLO layer */
+void forward_yolo_layer(const layer l, network net);   /* host forward pass; also computes the loss when net.train is set */
+void backward_yolo_layer(const layer l, network net);  /* host backward pass */
+void resize_yolo_layer(layer *l, int w, int h);        /* change the grid size and reallocate buffers */
+int yolo_num_detections(layer l, float thresh);        /* number of boxes whose objectness exceeds thresh */
+
+#ifdef GPU
+void forward_yolo_layer_gpu(const layer l, network net);
+void backward_yolo_layer_gpu(const layer l, network net); /* const added to match the definition and the sibling prototypes */
+#endif
+
+#endif
diff --git a/image.darknet/inst/models/.gitignore b/image.darknet/inst/models/.gitignore
new file mode 100644
index 0000000..7e26728
--- /dev/null
+++ b/image.darknet/inst/models/.gitignore
@@ -0,0 +1 @@
+yolov3.weights
diff --git a/image.darknet/man/image_darknet_detect.Rd b/image.darknet/man/image_darknet_detect.Rd
index 91b0cd3..52f770b 100644
--- a/image.darknet/man/image_darknet_detect.Rd
+++ b/image.darknet/man/image_darknet_detect.Rd
@@ -45,8 +45,8 @@ x <- image_darknet_detect(file = f, object = yolo_tiny_voc)
 weights <- file.path(system.file(package="image.darknet", "models"), "yolo.weights")
 download.file(url = "http://pjreddie.com/media/files/yolo.weights", destfile = weights)
 yolo_coco <- image_darknet_model(type = 'detect', 
- model = "yolo.cfg", 
- weights = system.file(package="image.darknet", "models", "yolo.weights"), 
+ model = "yolov3.cfg", 
+ weights = system.file(package="image.darknet", "models", "yolov3.weights"), 
  labels = system.file(package="image.darknet", "include", "darknet", "data", "coco.names"))
 yolo_coco
 
diff --git a/image.darknet/src/__R_API_classifier.c b/image.darknet/src/__R_API_classifier.c
index cfffa5a..4e88201 100644
--- a/image.darknet/src/__R_API_classifier.c
+++ b/image.darknet/src/__R_API_classifier.c
@@ -22,11 +22,8 @@ image get_image_from_stream(CvCapture *cap);
 void darknet_predict_classifier(char *datacfg, char *cfgfile, char *weightfile, char *filename, int top,
                                 char **pred_lab, double *pred_score, char **names, int resize){
   
-  network net = parse_network_cfg(cfgfile);
-  if(weightfile){
-    load_weights(&net, weightfile);
-  }
-  set_batch_network(&net, 1);
+  network *net = load_network(cfgfile, weightfile, 0);
+  set_batch_network(net, 1);
   srand(2222222);
   /*
   list *options = read_data_cfg(datacfg);
@@ -41,21 +38,21 @@ void darknet_predict_classifier(char *datacfg, char *cfgfile, char *weightfile,
   int *indexes = calloc(top, sizeof(int));
   char buff[256];
   char *input = buff;
-  int size = net.w;
+  int size = net->w;
   while(1){
     strncpy(input, filename, 256);
     image im = load_image_color(input, 0, 0);
     image r = resize_min(im, size);
     if(resize > 0) {
-      resize_network(&net, r.w, r.h);
+      resize_network(net, r.w, r.h);
     }
     //printf("%d %d\n", r.w, r.h);
     
     float *X = r.data;
     time=clock();
     float *predictions = network_predict(net, X);
-    if(net.hierarchy) hierarchy_predictions(predictions, net.outputs, net.hierarchy, 0);
-    top_k(predictions, net.outputs, top, indexes);
+    if(net->hierarchy) hierarchy_predictions(predictions, net->outputs, net->hierarchy, 0, 1);
+    top_k(predictions, net->outputs, top, indexes);
     //printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time));
     for(i = 0; i < top; ++i){
       int index = indexes[i];
diff --git a/image.darknet/src/__R_API_detector.c b/image.darknet/src/__R_API_detector.c
index 905e582..b0b0f1b 100644
--- a/image.darknet/src/__R_API_detector.c
+++ b/image.darknet/src/__R_API_detector.c
@@ -32,11 +32,9 @@ image **load_alphabet_pkg(char *path)
 int darknet_test_detector(char *cfgfile, char *weightfile, char *filename, float thresh, float hier_thresh, char **names, char *path)
 {
   image **alphabet = load_alphabet_pkg(path);
-  network net = parse_network_cfg(cfgfile);
-  if(weightfile){
-    load_weights(&net, weightfile);
-  }
-  set_batch_network(&net, 1);
+  network *net = load_network(cfgfile, weightfile, 0);
+  
+  set_batch_network(net, 1);
   srand(2222222);
   clock_t time;
   char buff[256];
@@ -47,8 +45,9 @@ int darknet_test_detector(char *cfgfile, char *weightfile, char *filename, float
   while(1){
     strncpy(input, filename, 256);
     image im = load_image_color(input,0,0);
-    image sized = resize_image(im, net.w, net.h);
-    layer l = net.layers[net.n-1];
+    
+    image sized = letterbox_image(im, net->w, net->h);
+    layer l = net->layers[net->n-1];
     
     box *boxes = calloc(l.w*l.h*l.n, sizeof(box));
     float **probs = calloc(l.w*l.h*l.n, sizeof(float *));
@@ -58,20 +57,15 @@ int darknet_test_detector(char *cfgfile, char *weightfile, char *filename, float
     time=clock();
     network_predict(net, X);
     printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time));
-    get_region_boxes(l, 1, 1, thresh, probs, boxes, 0, 0, hier_thresh);
-    if (l.softmax_tree && nms) do_nms_obj(boxes, probs, l.w*l.h*l.n, l.classes, nms);
-    else if (nms) do_nms_sort(boxes, probs, l.w*l.h*l.n, l.classes, nms);
     
-    for(int i = 0; i < l.w*l.h*l.n; ++i){
-      int class = max_index(probs[i], l.classes);
-      float prob = probs[i][class];
-      if(prob > thresh){
-        boxes_abovethreshold  = boxes_abovethreshold + 1;
-      }
-    }
+    int nboxes = 0;
+    detection *dets = get_network_boxes(net, im.w, im.h, thresh, hier_thresh, 0, 1, &nboxes);
+    //printf("%d\n", nboxes);
+    //if (nms) do_nms_obj(boxes, probs, l.w*l.h*l.n, l.classes, nms);
+    if (nms) do_nms_sort(dets, nboxes, l.classes, nms);
+    draw_detections(im, dets, nboxes, thresh, names, alphabet, l.classes);
+    free_detections(dets, nboxes);
     
-    printf("Boxes: %d of which %d above the threshold.\n", l.w*l.h*l.n, boxes_abovethreshold);
-    draw_detections(im, l.w*l.h*l.n, thresh, boxes, probs, names, alphabet, l.classes);
     save_image(im, "predictions");
     
     free_image(im);
diff --git a/image.darknet/src/activation_kernels.cu b/image.darknet/src/activation_kernels.cu
index 994e206..4dc5804 100644
--- a/image.darknet/src/activation_kernels.cu
+++ b/image.darknet/src/activation_kernels.cu
@@ -10,8 +10,8 @@ extern "C" {
 
 __device__ float lhtan_activate_kernel(float x)
 {
-    if(x < 0) return .001*x;
-    if(x > 1) return .001*(x-1) + 1;
+    if(x < 0) return .001f*x;
+    if(x > 1) return .001f*(x-1.f) + 1.f;
     return x;
 }
 __device__ float lhtan_gradient_kernel(float x)
@@ -27,25 +27,26 @@ __device__ float hardtan_activate_kernel(float x)
     return x;
 }
 __device__ float linear_activate_kernel(float x){return x;}
-__device__ float logistic_activate_kernel(float x){return 1./(1. + exp(-x));}
-__device__ float loggy_activate_kernel(float x){return 2./(1. + exp(-x)) - 1;}
+__device__ float logistic_activate_kernel(float x){return 1.f/(1.f + expf(-x));}
+__device__ float loggy_activate_kernel(float x){return 2.f/(1.f + expf(-x)) - 1;}
 __device__ float relu_activate_kernel(float x){return x*(x>0);}
-__device__ float elu_activate_kernel(float x){return (x >= 0)*x + (x < 0)*(exp(x)-1);}
-__device__ float relie_activate_kernel(float x){return (x>0) ? x : .01*x;}
-__device__ float ramp_activate_kernel(float x){return x*(x>0)+.1*x;}
-__device__ float leaky_activate_kernel(float x){return (x>0) ? x : .1*x;}
-__device__ float tanh_activate_kernel(float x){return (2/(1 + exp(-2*x)) - 1);}
+__device__ float elu_activate_kernel(float x){return (x >= 0)*x + (x < 0)*(expf(x)-1);}
+__device__ float selu_activate_kernel(float x){return (x >= 0)*1.0507f*x + (x < 0)*1.0507f*1.6732f*(expf(x)-1);}
+__device__ float relie_activate_kernel(float x){return (x>0) ? x : .01f*x;}
+__device__ float ramp_activate_kernel(float x){return x*(x>0)+.1f*x;}
+__device__ float leaky_activate_kernel(float x){return (x>0) ? x : .1f*x;}
+__device__ float tanh_activate_kernel(float x){return (2.f/(1 + expf(-2*x)) - 1);}
 __device__ float plse_activate_kernel(float x)
 {
-    if(x < -4) return .01 * (x + 4);
-    if(x > 4)  return .01 * (x - 4) + 1;
-    return .125*x + .5;
+    if(x < -4) return .01f * (x + 4);
+    if(x > 4)  return .01f * (x - 4) + 1;
+    return .125f*x + .5f;
 }
 __device__ float stair_activate_kernel(float x)
 {
-    int n = floor(x);
-    if (n%2 == 0) return floor(x/2.);
-    else return (x - n) + floor(x/2.);
+    int n = floorf(x);
+    if (n%2 == 0) return floorf(x/2);
+    else return (x - n) + floorf(x/2);
 }
  
 
@@ -58,19 +59,20 @@ __device__ float linear_gradient_kernel(float x){return 1;}
 __device__ float logistic_gradient_kernel(float x){return (1-x)*x;}
 __device__ float loggy_gradient_kernel(float x)
 {
-    float y = (x+1.)/2.;
+    float y = (x+1)/2;
     return 2*(1-y)*y;
 }
 __device__ float relu_gradient_kernel(float x){return (x>0);}
 __device__ float elu_gradient_kernel(float x){return (x >= 0) + (x < 0)*(x + 1);}
-__device__ float relie_gradient_kernel(float x){return (x>0) ? 1 : .01;}
-__device__ float ramp_gradient_kernel(float x){return (x>0)+.1;}
-__device__ float leaky_gradient_kernel(float x){return (x>0) ? 1 : .1;}
+__device__ float selu_gradient_kernel(float x){return (x >= 0)*1.0507f + (x < 0)*(x + 1.0507f*1.6732f);} // float literals keep the math in single precision, as in selu_activate_kernel
+__device__ float relie_gradient_kernel(float x){return (x>0) ? 1 : .01f;}
+__device__ float ramp_gradient_kernel(float x){return (x>0)+.1f;}
+__device__ float leaky_gradient_kernel(float x){return (x>0) ? 1 : .1f;}
 __device__ float tanh_gradient_kernel(float x){return 1-x*x;}
-__device__ float plse_gradient_kernel(float x){return (x < 0 || x > 1) ? .01 : .125;}
+__device__ float plse_gradient_kernel(float x){return (x < 0 || x > 1) ? .01f : .125f;}
 __device__ float stair_gradient_kernel(float x)
 {
-    if (floor(x) == x) return 0;
+    if (floorf(x) == x) return 0;
     return 1;
 }
 
@@ -87,6 +89,8 @@ __device__ float activate_kernel(float x, ACTIVATION a)
             return relu_activate_kernel(x);
         case ELU:
             return elu_activate_kernel(x);
+        case SELU:
+            return selu_activate_kernel(x);
         case RELIE:
             return relie_activate_kernel(x);
         case RAMP:
@@ -120,6 +124,8 @@ __device__ float gradient_kernel(float x, ACTIVATION a)
             return relu_gradient_kernel(x);
         case ELU:
             return elu_gradient_kernel(x);
+        case SELU:
+            return selu_gradient_kernel(x);
         case RELIE:
             return relie_gradient_kernel(x);
         case RAMP:
@@ -140,6 +146,41 @@ __device__ float gradient_kernel(float x, ACTIVATION a)
     return 0;
 }
 
+__global__ void binary_gradient_array_kernel(float *x, float *dy, int n, int s, BINARY_ACTIVATION a, float *dx)
+{
+    // Backward of the pairwise product: for output gradient dy[id], the x1 slot of dx gets x2*dy and the x2 slot gets x1*dy.
+    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if(id >= n) return; // surplus threads of the last block must not read x past its end
+    int i = id % s;
+    int b = id / s;
+    float x1 = x[b*s + i];
+    float x2 = x[b*s + s/2 + i];
+    float de = dy[id];
+    dx[b*s + i] = x2*de;
+    dx[b*s + s/2 + i] = x1*de;
+}
+
+extern "C" void binary_gradient_array_gpu(float *x, float *dx, int n, int size, BINARY_ACTIVATION a, float *y) 
+{ // Host launcher for binary_gradient_array_kernel; note this `dx` is passed as the kernel's `dy` and this `y` as the kernel's `dx`.
+    binary_gradient_array_kernel<<<cuda_gridsize(n/2), BLOCK>>>(x, dx, n/2, size, a, y); // one thread per output element, n/2 in total
+    check_error(cudaPeekAtLastError());
+}
+__global__ void binary_activate_array_kernel(float *x, int n, int s, BINARY_ACTIVATION a, float *y)
+{ // Pairwise product: y[id] = x[b*s + i] * x[b*s + s/2 + i] with i = id % s and b = id / s.
+    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if(id >= n) return; // surplus threads of the last block must not read x past its end
+    int i = id % s;
+    int b = id / s;
+    float x1 = x[b*s + i];
+    y[id] = x1*x[b*s + s/2 + i];
+}
+
+extern "C" void binary_activate_array_gpu(float *x, int n, int size, BINARY_ACTIVATION a, float *y) 
+{ // Host launcher for binary_activate_array_kernel.
+    binary_activate_array_kernel<<<cuda_gridsize(n/2), BLOCK>>>(x, n/2, size, a, y); // one thread per output element, n/2 in total
+    check_error(cudaPeekAtLastError());
+}
+
 __global__ void activate_array_kernel(float *x, int n, ACTIVATION a)
 {
     int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
@@ -152,13 +193,13 @@ __global__ void gradient_array_kernel(float *x, int n, ACTIVATION a, float *delt
     if(i < n) delta[i] *= gradient_kernel(x[i], a);
 }
 
-extern "C" void activate_array_ongpu(float *x, int n, ACTIVATION a) 
+extern "C" void activate_array_gpu(float *x, int n, ACTIVATION a) 
 {
     activate_array_kernel<<<cuda_gridsize(n), BLOCK>>>(x, n, a);
     check_error(cudaPeekAtLastError());
 }
 
-extern "C" void gradient_array_ongpu(float *x, int n, ACTIVATION a, float *delta) 
+extern "C" void gradient_array_gpu(float *x, int n, ACTIVATION a, float *delta) 
 {
     gradient_array_kernel<<<cuda_gridsize(n), BLOCK>>>(x, n, a, delta);
     check_error(cudaPeekAtLastError());
diff --git a/image.darknet/src/activation_layer.c b/image.darknet/src/activation_layer.c
index 3430dac..b4ba953 100644
--- a/image.darknet/src/activation_layer.c
+++ b/image.darknet/src/activation_layer.c
@@ -35,29 +35,29 @@ layer make_activation_layer(int batch, int inputs, ACTIVATION activation)
     return l;
 }
 
-void forward_activation_layer(layer l, network_state state)
+void forward_activation_layer(layer l, network net)
 {
-    copy_cpu(l.outputs*l.batch, state.input, 1, l.output, 1);
+    copy_cpu(l.outputs*l.batch, net.input, 1, l.output, 1);
     activate_array(l.output, l.outputs*l.batch, l.activation);
 }
 
-void backward_activation_layer(layer l, network_state state)
+void backward_activation_layer(layer l, network net)
 {
     gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);
-    copy_cpu(l.outputs*l.batch, l.delta, 1, state.delta, 1);
+    copy_cpu(l.outputs*l.batch, l.delta, 1, net.delta, 1);
 }
 
 #ifdef GPU
 
-void forward_activation_layer_gpu(layer l, network_state state)
+void forward_activation_layer_gpu(layer l, network net)
 {
-    copy_ongpu(l.outputs*l.batch, state.input, 1, l.output_gpu, 1);
-    activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
+    copy_gpu(l.outputs*l.batch, net.input_gpu, 1, l.output_gpu, 1);
+    activate_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation);
 }
 
-void backward_activation_layer_gpu(layer l, network_state state)
+void backward_activation_layer_gpu(layer l, network net)
 {
-    gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
-    copy_ongpu(l.outputs*l.batch, l.delta_gpu, 1, state.delta, 1);
+    gradient_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
+    copy_gpu(l.outputs*l.batch, l.delta_gpu, 1, net.delta_gpu, 1);
 }
 #endif
diff --git a/image.darknet/src/activation_layer.h b/image.darknet/src/activation_layer.h
index a09756a..42118a8 100644
--- a/image.darknet/src/activation_layer.h
+++ b/image.darknet/src/activation_layer.h
@@ -7,12 +7,12 @@
 
 layer make_activation_layer(int batch, int inputs, ACTIVATION activation);
 
-void forward_activation_layer(layer l, network_state state);
-void backward_activation_layer(layer l, network_state state);
+void forward_activation_layer(layer l, network net);
+void backward_activation_layer(layer l, network net);
 
 #ifdef GPU
-void forward_activation_layer_gpu(layer l, network_state state);
-void backward_activation_layer_gpu(layer l, network_state state);
+void forward_activation_layer_gpu(layer l, network net);
+void backward_activation_layer_gpu(layer l, network net);
 #endif
 
 #endif
diff --git a/image.darknet/src/activations.c b/image.darknet/src/activations.c
index 0cbb2f5..da1a17a 100644
--- a/image.darknet/src/activations.c
+++ b/image.darknet/src/activations.c
@@ -16,6 +16,8 @@ char *get_activation_string(ACTIVATION a)
             return "relu";
         case ELU:
             return "elu";
+        case SELU:
+            return "selu";
         case RELIE:
             return "relie";
         case RAMP:
@@ -46,6 +48,7 @@ ACTIVATION get_activation(char *s)
     if (strcmp(s, "loggy")==0) return LOGGY;
     if (strcmp(s, "relu")==0) return RELU;
     if (strcmp(s, "elu")==0) return ELU;
+    if (strcmp(s, "selu")==0) return SELU;
     if (strcmp(s, "relie")==0) return RELIE;
     if (strcmp(s, "plse")==0) return PLSE;
     if (strcmp(s, "hardtan")==0) return HARDTAN;
@@ -72,6 +75,8 @@ float activate(float x, ACTIVATION a)
             return relu_activate(x);
         case ELU:
             return elu_activate(x);
+        case SELU:
+            return selu_activate(x);
         case RELIE:
             return relie_activate(x);
         case RAMP:
@@ -113,6 +118,8 @@ float gradient(float x, ACTIVATION a)
             return relu_gradient(x);
         case ELU:
             return elu_gradient(x);
+        case SELU:
+            return selu_gradient(x);
         case RELIE:
             return relie_gradient(x);
         case RAMP:
diff --git a/image.darknet/src/activations.h b/image.darknet/src/activations.h
index 1c36ff5..9780d2c 100644
--- a/image.darknet/src/activations.h
+++ b/image.darknet/src/activations.h
@@ -1,12 +1,9 @@
 #ifndef ACTIVATIONS_H
 #define ACTIVATIONS_H
+#include "darknet.h"
 #include "cuda.h"
 #include "math.h"
 
-typedef enum{
-    LOGISTIC, RELU, RELIE, LINEAR, RAMP, TANH, PLSE, LEAKY, ELU, LOGGY, STAIR, HARDTAN, LHTAN
-}ACTIVATION;
-
 ACTIVATION get_activation(char *s);
 
 char *get_activation_string(ACTIVATION a);
@@ -15,8 +12,8 @@ float gradient(float x, ACTIVATION a);
 void gradient_array(const float *x, const int n, const ACTIVATION a, float *delta);
 void activate_array(float *x, const int n, const ACTIVATION a);
 #ifdef GPU
-void activate_array_ongpu(float *x, int n, ACTIVATION a);
-void gradient_array_ongpu(float *x, int n, ACTIVATION a, float *delta);
+void activate_array_gpu(float *x, int n, ACTIVATION a);
+void gradient_array_gpu(float *x, int n, ACTIVATION a, float *delta);
 #endif
 
 static inline float stair_activate(float x)
@@ -36,6 +33,7 @@ static inline float logistic_activate(float x){return 1./(1. + exp(-x));}
 static inline float loggy_activate(float x){return 2./(1. + exp(-x)) - 1;}
 static inline float relu_activate(float x){return x*(x>0);}
 static inline float elu_activate(float x){return (x >= 0)*x + (x < 0)*(exp(x)-1);}
+/* SELU forward: 1.0507*x for x >= 0, and 1.0507*1.6732*(exp(x)-1) for x < 0. */
+static inline float selu_activate(float x){return (x >= 0)*1.0507*x + (x < 0)*1.0507*1.6732*(exp(x)-1);}
 static inline float relie_activate(float x){return (x>0) ? x : .01*x;}
 static inline float ramp_activate(float x){return x*(x>0)+.1*x;}
 static inline float leaky_activate(float x){return (x>0) ? x : .1*x;}
@@ -78,6 +76,7 @@ static inline float stair_gradient(float x)
 }
 static inline float relu_gradient(float x){return (x>0);}
 static inline float elu_gradient(float x){return (x >= 0) + (x < 0)*(x + 1);}
+/* SELU slope: 1.0507 for x >= 0, and x + 1.0507*1.6732 for x < 0.
+ * NOTE(review): the x < 0 branch equals the SELU derivative only if x is the
+ * already-activated output (as elu_gradient's x + 1 suggests) - confirm with callers. */
+static inline float selu_gradient(float x){return (x >= 0)*1.0507 + (x < 0)*(x + 1.0507*1.6732);}
 static inline float relie_gradient(float x){return (x>0) ? 1 : .01;}
 static inline float ramp_gradient(float x){return (x>0)+.1;}
 static inline float leaky_gradient(float x){return (x>0) ? 1 : .1;}
diff --git a/image.darknet/src/avgpool_layer.c b/image.darknet/src/avgpool_layer.c
index b6932fe..83034db 100644
--- a/image.darknet/src/avgpool_layer.c
+++ b/image.darknet/src/avgpool_layer.c
@@ -37,7 +37,7 @@ void resize_avgpool_layer(avgpool_layer *l, int w, int h)
     l->inputs = h*w*l->c;
 }
 
-void forward_avgpool_layer(const avgpool_layer l, network_state state)
+void forward_avgpool_layer(const avgpool_layer l, network net)
 {
     int b,i,k;
 
@@ -47,14 +47,14 @@ void forward_avgpool_layer(const avgpool_layer l, network_state state)
             l.output[out_index] = 0;
             for(i = 0; i < l.h*l.w; ++i){
                 int in_index = i + l.h*l.w*(k + b*l.c);
-                l.output[out_index] += state.input[in_index];
+                l.output[out_index] += net.input[in_index];
             }
             l.output[out_index] /= l.h*l.w;
         }
     }
 }
 
-void backward_avgpool_layer(const avgpool_layer l, network_state state)
+void backward_avgpool_layer(const avgpool_layer l, network net)
 {
     int b,i,k;
 
@@ -63,7 +63,7 @@ void backward_avgpool_layer(const avgpool_layer l, network_state state)
             int out_index = k + b*l.c;
             for(i = 0; i < l.h*l.w; ++i){
                 int in_index = i + l.h*l.w*(k + b*l.c);
-                state.delta[in_index] += l.delta[out_index] / (l.h*l.w);
+                net.delta[in_index] += l.delta[out_index] / (l.h*l.w);
             }
         }
     }
diff --git a/image.darknet/src/avgpool_layer.h b/image.darknet/src/avgpool_layer.h
index f8329ae..3bd356c 100644
--- a/image.darknet/src/avgpool_layer.h
+++ b/image.darknet/src/avgpool_layer.h
@@ -11,12 +11,12 @@ typedef layer avgpool_layer;
 image get_avgpool_image(avgpool_layer l);
 avgpool_layer make_avgpool_layer(int batch, int w, int h, int c);
 void resize_avgpool_layer(avgpool_layer *l, int w, int h);
-void forward_avgpool_layer(const avgpool_layer l, network_state state);
-void backward_avgpool_layer(const avgpool_layer l, network_state state);
+void forward_avgpool_layer(const avgpool_layer l, network net);
+void backward_avgpool_layer(const avgpool_layer l, network net);
 
 #ifdef GPU
-void forward_avgpool_layer_gpu(avgpool_layer l, network_state state);
-void backward_avgpool_layer_gpu(avgpool_layer l, network_state state);
+void forward_avgpool_layer_gpu(avgpool_layer l, network net);
+void backward_avgpool_layer_gpu(avgpool_layer l, network net);
 #endif
 
 #endif
diff --git a/image.darknet/src/avgpool_layer_kernels.cu b/image.darknet/src/avgpool_layer_kernels.cu
index b7e2770..a7eca3a 100644
--- a/image.darknet/src/avgpool_layer_kernels.cu
+++ b/image.darknet/src/avgpool_layer_kernels.cu
@@ -43,19 +43,19 @@ __global__ void backward_avgpool_layer_kernel(int n, int w, int h, int c, float
     }
 }
 
-extern "C" void forward_avgpool_layer_gpu(avgpool_layer layer, network_state state)
+extern "C" void forward_avgpool_layer_gpu(avgpool_layer layer, network net)
 {
     size_t n = layer.c*layer.batch;
 
-    forward_avgpool_layer_kernel<<<cuda_gridsize(n), BLOCK>>>(n, layer.w, layer.h, layer.c, state.input, layer.output_gpu);
+    forward_avgpool_layer_kernel<<<cuda_gridsize(n), BLOCK>>>(n, layer.w, layer.h, layer.c, net.input_gpu, layer.output_gpu);
     check_error(cudaPeekAtLastError());
 }
 
-extern "C" void backward_avgpool_layer_gpu(avgpool_layer layer, network_state state)
+extern "C" void backward_avgpool_layer_gpu(avgpool_layer layer, network net)
 {
     size_t n = layer.c*layer.batch;
 
-    backward_avgpool_layer_kernel<<<cuda_gridsize(n), BLOCK>>>(n, layer.w, layer.h, layer.c, state.delta, layer.delta_gpu);
+    backward_avgpool_layer_kernel<<<cuda_gridsize(n), BLOCK>>>(n, layer.w, layer.h, layer.c, net.delta_gpu, layer.delta_gpu);
     check_error(cudaPeekAtLastError());
 }
 
diff --git a/image.darknet/src/batchnorm_layer.c b/image.darknet/src/batchnorm_layer.c
index b53548b..ebff387 100644
--- a/image.darknet/src/batchnorm_layer.c
+++ b/image.darknet/src/batchnorm_layer.c
@@ -1,3 +1,4 @@
+#include "convolutional_layer.h"
 #include "batchnorm_layer.h"
 #include "blas.h"
 #include <stdio.h>
@@ -5,55 +6,81 @@
+/*
+ * Create a stand-alone batch-normalisation layer for `batch` inputs of
+ * w x h x c floats; the output has the same shape. Buffers come from
+ * calloc (zero-filled); the per-channel scales are then set to 1.
+ */
 layer make_batchnorm_layer(int batch, int w, int h, int c)
 {
     fprintf(stderr, "Batch Normalization Layer: %d x %d x %d image\n", w,h,c);
-    layer layer = {0};
-    layer.type = BATCHNORM;
-    layer.batch = batch;
-    layer.h = layer.out_h = h;
-    layer.w = layer.out_w = w;
-    layer.c = layer.out_c = c;
-    layer.output = calloc(h * w * c * batch, sizeof(float));
-    layer.delta  = calloc(h * w * c * batch, sizeof(float));
-    layer.inputs = w*h*c;
-    layer.outputs = layer.inputs;
-
-    layer.scales = calloc(c, sizeof(float));
-    layer.scale_updates = calloc(c, sizeof(float));
+    layer l = {0};
+    l.type = BATCHNORM;
+    l.batch = batch;
+    l.h = l.out_h = h;
+    l.w = l.out_w = w;
+    l.c = l.out_c = c;
+    l.output = calloc(h * w * c * batch, sizeof(float));
+    l.delta  = calloc(h * w * c * batch, sizeof(float));
+    l.inputs = w*h*c;
+    l.outputs = l.inputs;
+
+    l.scales = calloc(c, sizeof(float));
+    l.scale_updates = calloc(c, sizeof(float));
+    l.biases = calloc(c, sizeof(float));
+    l.bias_updates = calloc(c, sizeof(float));
     int i;
     for(i = 0; i < c; ++i){
-        layer.scales[i] = 1;
+        l.scales[i] = 1;
     }
 
-    layer.mean = calloc(c, sizeof(float));
-    layer.variance = calloc(c, sizeof(float));
+    l.mean = calloc(c, sizeof(float));
+    l.variance = calloc(c, sizeof(float));
 
-    layer.rolling_mean = calloc(c, sizeof(float));
-    layer.rolling_variance = calloc(c, sizeof(float));
+    l.rolling_mean = calloc(c, sizeof(float));
+    l.rolling_variance = calloc(c, sizeof(float));
+
+    /* CPU scratch buffers. forward_batchnorm_layer() copies into l.x on every
+     * call (and into l.x_norm when training) and the backward pass passes
+     * l.mean_delta / l.variance_delta to the *_delta_cpu helpers, so they
+     * must not stay NULL from the zero-initialised struct. */
+    l.mean_delta = calloc(c, sizeof(float));
+    l.variance_delta = calloc(c, sizeof(float));
+    l.x = calloc(h * w * c * batch, sizeof(float));
+    l.x_norm = calloc(h * w * c * batch, sizeof(float));
 
-    layer.forward = forward_batchnorm_layer;
-    layer.backward = backward_batchnorm_layer;
+    l.forward = forward_batchnorm_layer;
+    l.backward = backward_batchnorm_layer;
 #ifdef GPU
-    layer.forward_gpu = forward_batchnorm_layer_gpu;
-    layer.backward_gpu = backward_batchnorm_layer_gpu;
+    l.forward_gpu = forward_batchnorm_layer_gpu;
+    l.backward_gpu = backward_batchnorm_layer_gpu;
+
+    l.output_gpu =  cuda_make_array(l.output, h * w * c * batch);
+    l.delta_gpu =   cuda_make_array(l.delta, h * w * c * batch);
+
+    l.biases_gpu = cuda_make_array(l.biases, c);
+    l.bias_updates_gpu = cuda_make_array(l.bias_updates, c);
 
-    layer.output_gpu =  cuda_make_array(layer.output, h * w * c * batch);
-    layer.delta_gpu =   cuda_make_array(layer.delta, h * w * c * batch);
+    l.scales_gpu = cuda_make_array(l.scales, c);
+    l.scale_updates_gpu = cuda_make_array(l.scale_updates, c);
 
-    layer.scales_gpu = cuda_make_array(layer.scales, c);
-    layer.scale_updates_gpu = cuda_make_array(layer.scale_updates, c);
+    l.mean_gpu = cuda_make_array(l.mean, c);
+    l.variance_gpu = cuda_make_array(l.variance, c);
 
-    layer.mean_gpu = cuda_make_array(layer.mean, c);
-    layer.variance_gpu = cuda_make_array(layer.variance, c);
+    l.rolling_mean_gpu = cuda_make_array(l.mean, c);
+    l.rolling_variance_gpu = cuda_make_array(l.variance, c);
 
-    layer.rolling_mean_gpu = cuda_make_array(layer.mean, c);
-    layer.rolling_variance_gpu = cuda_make_array(layer.variance, c);
+    l.mean_delta_gpu = cuda_make_array(l.mean, c);
+    l.variance_delta_gpu = cuda_make_array(l.variance, c);
 
-    layer.mean_delta_gpu = cuda_make_array(layer.mean, c);
-    layer.variance_delta_gpu = cuda_make_array(layer.variance, c);
+    l.x_gpu = cuda_make_array(l.output, l.batch*l.outputs);
+    l.x_norm_gpu = cuda_make_array(l.output, l.batch*l.outputs);
+    #ifdef CUDNN
+    cudnnCreateTensorDescriptor(&l.normTensorDesc);
+    cudnnCreateTensorDescriptor(&l.dstTensorDesc);
+    cudnnSetTensor4dDescriptor(l.dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l.batch, l.out_c, l.out_h, l.out_w); 
+    cudnnSetTensor4dDescriptor(l.normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l.out_c, 1, 1); 
 
-    layer.x_gpu = cuda_make_array(layer.output, layer.batch*layer.outputs);
-    layer.x_norm_gpu = cuda_make_array(layer.output, layer.batch*layer.outputs);
+    #endif
 #endif
-    return layer;
+    return l;
 }
 
 void backward_scale_cpu(float *x_norm, float *delta, int batch, int n, int size, float *scale_updates)
@@ -108,7 +121,7 @@ void normalize_delta_cpu(float *x, float *mean, float *variance, float *mean_del
         for(f = 0; f < filters; ++f){
             for(k = 0; k < spatial; ++k){
                 int index = j*filters*spatial + f*spatial + k;
-                delta[index] = delta[index] * 1./(sqrt(variance[f]) + .00001f) + variance_delta[f] * 2. * (x[index] - mean[f]) / (spatial * batch) + mean_delta[f]/(spatial*batch);
+                delta[index] = delta[index] * 1./(sqrt(variance[f] + .00001f)) + variance_delta[f] * 2. * (x[index] - mean[f]) / (spatial * batch) + mean_delta[f]/(spatial*batch);
             }
         }
     }
@@ -119,33 +132,35 @@ void resize_batchnorm_layer(layer *layer, int w, int h)
     fprintf(stderr, "Not implemented\n");
 }
 
-void forward_batchnorm_layer(layer l, network_state state)
+void forward_batchnorm_layer(layer l, network net)
 {
-    if(l.type == BATCHNORM) copy_cpu(l.outputs*l.batch, state.input, 1, l.output, 1);
-    if(l.type == CONNECTED){
-        l.out_c = l.outputs;
-        l.out_h = l.out_w = 1;
-    }
-    if(state.train){
+    if(l.type == BATCHNORM) copy_cpu(l.outputs*l.batch, net.input, 1, l.output, 1);
+    copy_cpu(l.outputs*l.batch, l.output, 1, l.x, 1);
+    if(net.train){
         mean_cpu(l.output, l.batch, l.out_c, l.out_h*l.out_w, l.mean);
         variance_cpu(l.output, l.mean, l.batch, l.out_c, l.out_h*l.out_w, l.variance);
 
-        scal_cpu(l.out_c, .9, l.rolling_mean, 1);
-        axpy_cpu(l.out_c, .1, l.mean, 1, l.rolling_mean, 1);
-        scal_cpu(l.out_c, .9, l.rolling_variance, 1);
-        axpy_cpu(l.out_c, .1, l.variance, 1, l.rolling_variance, 1);
+        scal_cpu(l.out_c, .99, l.rolling_mean, 1);
+        axpy_cpu(l.out_c, .01, l.mean, 1, l.rolling_mean, 1);
+        scal_cpu(l.out_c, .99, l.rolling_variance, 1);
+        axpy_cpu(l.out_c, .01, l.variance, 1, l.rolling_variance, 1);
 
-        copy_cpu(l.outputs*l.batch, l.output, 1, l.x, 1);
         normalize_cpu(l.output, l.mean, l.variance, l.batch, l.out_c, l.out_h*l.out_w);   
         copy_cpu(l.outputs*l.batch, l.output, 1, l.x_norm, 1);
     } else {
         normalize_cpu(l.output, l.rolling_mean, l.rolling_variance, l.batch, l.out_c, l.out_h*l.out_w);
     }
     scale_bias(l.output, l.scales, l.batch, l.out_c, l.out_h*l.out_w);
+    add_bias(l.output, l.biases, l.batch, l.out_c, l.out_h*l.out_w);
 }
 
-void backward_batchnorm_layer(const layer l, network_state state)
+void backward_batchnorm_layer(layer l, network net)
 {
+    if(!net.train){
+        l.mean = l.rolling_mean;
+        l.variance = l.rolling_variance;
+    }
+    backward_bias(l.bias_updates, l.delta, l.batch, l.out_c, l.out_w*l.out_h);
     backward_scale_cpu(l.x_norm, l.delta, l.batch, l.out_c, l.out_w*l.out_h, l.scale_updates);
 
     scale_bias(l.delta, l.scales, l.batch, l.out_c, l.out_h*l.out_w);
@@ -153,7 +168,7 @@ void backward_batchnorm_layer(const layer l, network_state state)
     mean_delta_cpu(l.delta, l.variance, l.batch, l.out_c, l.out_w*l.out_h, l.mean_delta);
     variance_delta_cpu(l.x, l.delta, l.mean, l.variance, l.batch, l.out_c, l.out_w*l.out_h, l.variance_delta);
     normalize_delta_cpu(l.x, l.mean, l.variance, l.mean_delta, l.variance_delta, l.batch, l.out_c, l.out_w*l.out_h, l.delta);
-    if(l.type == BATCHNORM) copy_cpu(l.outputs*l.batch, l.delta, 1, state.delta, 1);
+    if(l.type == BATCHNORM) copy_cpu(l.outputs*l.batch, l.delta, 1, net.delta, 1);
 }
 
 #ifdef GPU
@@ -171,34 +186,86 @@ void push_batchnorm_layer(layer l)
     cuda_push_array(l.rolling_variance_gpu, l.rolling_variance, l.c);
 }
 
-void forward_batchnorm_layer_gpu(layer l, network_state state)
+void forward_batchnorm_layer_gpu(layer l, network net)
 {
-    if(l.type == BATCHNORM) copy_ongpu(l.outputs*l.batch, state.input, 1, l.output_gpu, 1);
-    if(l.type == CONNECTED){
-        l.out_c = l.outputs;
-        l.out_h = l.out_w = 1;
-    }
-    if (state.train) {
+    if(l.type == BATCHNORM) copy_gpu(l.outputs*l.batch, net.input_gpu, 1, l.output_gpu, 1);
+    copy_gpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1);
+    if (net.train) {
+#ifdef CUDNN
+        float one = 1;
+        float zero = 0;
+        cudnnBatchNormalizationForwardTraining(cudnn_handle(),
+                CUDNN_BATCHNORM_SPATIAL,
+                &one,
+                &zero,
+                l.dstTensorDesc,
+                l.x_gpu,
+                l.dstTensorDesc,
+                l.output_gpu,
+                l.normTensorDesc,
+                l.scales_gpu,
+                l.biases_gpu,
+                .01,
+                l.rolling_mean_gpu,
+                l.rolling_variance_gpu,
+                .00001,
+                l.mean_gpu,
+                l.variance_gpu);
+#else
         fast_mean_gpu(l.output_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.mean_gpu);
         fast_variance_gpu(l.output_gpu, l.mean_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.variance_gpu);
 
-        scal_ongpu(l.out_c, .99, l.rolling_mean_gpu, 1);
-        axpy_ongpu(l.out_c, .01, l.mean_gpu, 1, l.rolling_mean_gpu, 1);
-        scal_ongpu(l.out_c, .99, l.rolling_variance_gpu, 1);
-        axpy_ongpu(l.out_c, .01, l.variance_gpu, 1, l.rolling_variance_gpu, 1);
+        scal_gpu(l.out_c, .99, l.rolling_mean_gpu, 1);
+        axpy_gpu(l.out_c, .01, l.mean_gpu, 1, l.rolling_mean_gpu, 1);
+        scal_gpu(l.out_c, .99, l.rolling_variance_gpu, 1);
+        axpy_gpu(l.out_c, .01, l.variance_gpu, 1, l.rolling_variance_gpu, 1);
 
-        copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1);
+        copy_gpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1);
         normalize_gpu(l.output_gpu, l.mean_gpu, l.variance_gpu, l.batch, l.out_c, l.out_h*l.out_w);
-        copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_norm_gpu, 1);
+        copy_gpu(l.outputs*l.batch, l.output_gpu, 1, l.x_norm_gpu, 1);
+
+        scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
+        add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h);
+#endif
     } else {
         normalize_gpu(l.output_gpu, l.rolling_mean_gpu, l.rolling_variance_gpu, l.batch, l.out_c, l.out_h*l.out_w);
+        scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
+        add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h);
     }
 
-    scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
 }
 
-void backward_batchnorm_layer_gpu(const layer l, network_state state)
+void backward_batchnorm_layer_gpu(layer l, network net)
 {
+    if(!net.train){
+        l.mean_gpu = l.rolling_mean_gpu;
+        l.variance_gpu = l.rolling_variance_gpu;
+    }
+#ifdef CUDNN
+    float one = 1;
+    float zero = 0;
+    cudnnBatchNormalizationBackward(cudnn_handle(),
+            CUDNN_BATCHNORM_SPATIAL,
+            &one,
+            &zero,
+            &one,
+            &one,
+            l.dstTensorDesc,
+            l.x_gpu,
+            l.dstTensorDesc,
+            l.delta_gpu,
+            l.dstTensorDesc,
+            l.x_norm_gpu,
+            l.normTensorDesc,
+            l.scales_gpu,
+            l.scale_updates_gpu,
+            l.bias_updates_gpu,
+            .00001,
+            l.mean_gpu,
+            l.variance_gpu);
+    copy_gpu(l.outputs*l.batch, l.x_norm_gpu, 1, l.delta_gpu, 1);
+#else
+    backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.out_c, l.out_w*l.out_h);
     backward_scale_gpu(l.x_norm_gpu, l.delta_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.scale_updates_gpu);
 
     scale_bias_gpu(l.delta_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
@@ -206,6 +273,7 @@ void backward_batchnorm_layer_gpu(const layer l, network_state state)
     fast_mean_delta_gpu(l.delta_gpu, l.variance_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.mean_delta_gpu);
     fast_variance_delta_gpu(l.x_gpu, l.delta_gpu, l.mean_gpu, l.variance_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.variance_delta_gpu);
     normalize_delta_gpu(l.x_gpu, l.mean_gpu, l.variance_gpu, l.mean_delta_gpu, l.variance_delta_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.delta_gpu);
-    if(l.type == BATCHNORM) copy_ongpu(l.outputs*l.batch, l.delta_gpu, 1, state.delta, 1);
+#endif
+    if(l.type == BATCHNORM) copy_gpu(l.outputs*l.batch, l.delta_gpu, 1, net.delta_gpu, 1);
 }
 #endif
diff --git a/image.darknet/src/batchnorm_layer.h b/image.darknet/src/batchnorm_layer.h
index 99d1d0f..25a18a3 100644
--- a/image.darknet/src/batchnorm_layer.h
+++ b/image.darknet/src/batchnorm_layer.h
@@ -6,12 +6,12 @@
 #include "network.h"
 
 layer make_batchnorm_layer(int batch, int w, int h, int c);
-void forward_batchnorm_layer(layer l, network_state state);
-void backward_batchnorm_layer(layer l, network_state state);
+void forward_batchnorm_layer(layer l, network net);
+void backward_batchnorm_layer(layer l, network net);
 
 #ifdef GPU
-void forward_batchnorm_layer_gpu(layer l, network_state state);
-void backward_batchnorm_layer_gpu(layer l, network_state state);
+void forward_batchnorm_layer_gpu(layer l, network net);
+void backward_batchnorm_layer_gpu(layer l, network net);
 void pull_batchnorm_layer(layer l);
 void push_batchnorm_layer(layer l);
 #endif
diff --git a/image.darknet/src/blas.c b/image.darknet/src/blas.c
index 31bd86b..9e16044 100644
--- a/image.darknet/src/blas.c
+++ b/image.darknet/src/blas.c
@@ -1,5 +1,6 @@
 #include "blas.h"
-#include "math.h"
+
+#include <math.h>
 #include <assert.h>
 #include <float.h>
 #include <stdio.h>
@@ -54,7 +55,17 @@ void weighted_sum_cpu(float *a, float *b, float *s, int n, float *c)
     }
 }
 
-void shortcut_cpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float *out)
+/*
+ * Accumulate gradients over n elements from the incoming gradient dc:
+ *     da[k] += dc[k] * s[k]            (skipped when da is NULL)
+ *     db[k] += dc[k] * (1 - s[k])      (skipped when db is NULL)
+ *     ds[k] += dc[k] * (a[k] - b[k])   (always; ds is dereferenced unchecked)
+ * These match the partial derivatives of a blend s*a + (1-s)*b.
+ */
+void weighted_delta_cpu(float *a, float *b, float *s, float *da, float *db, float *ds, int n, float *dc)
+{
+    int k = 0;
+    while(k < n){
+        if(da){
+            da[k] += dc[k] * s[k];
+        }
+        if(db){
+            db[k] += dc[k] * (1-s[k]);
+        }
+        ds[k] += dc[k] * (a[k] - b[k]);
+        ++k;
+    }
+}
+
+void shortcut_cpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float s1, float s2, float *out)
 {
     int stride = w1/w2;
     int sample = w2/w1;
@@ -73,7 +84,7 @@ void shortcut_cpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2,
                 for(i = 0; i < minw; ++i){
                     int out_index = i*sample + w2*(j*sample + h2*(k + c2*b));
                     int add_index = i*stride + w1*(j*stride + h1*(k + c1*b));
-                    out[out_index] += add[add_index];
+                    out[out_index] = s1*out[out_index] + s2*add[add_index];
                 }
             }
         }
@@ -112,6 +123,27 @@ void variance_cpu(float *x, float *mean, int batch, int filters, int spatial, fl
     }
 }
 
+/*
+ * L2-normalise x in place across the filter axis.
+ *
+ * x is indexed as x[b*filters*spatial + f*spatial + i]. For every (b, i)
+ * pair the Euclidean norm over f is computed, each x value is divided by it,
+ * and dx receives (1 - x_normalised) / norm for the same element.
+ *
+ * When the norm is exactly 0 (all values at that position are 0) the
+ * division is skipped: x stays 0 and dx is set to 0 instead of producing
+ * NaN/inf that would spread through later layers.
+ */
+void l2normalize_cpu(float *x, float *dx, int batch, int filters, int spatial)
+{
+    int b,f,i;
+    for(b = 0; b < batch; ++b){
+        for(i = 0; i < spatial; ++i){
+            float sum = 0;
+            for(f = 0; f < filters; ++f){
+                int index = b*filters*spatial + f*spatial + i;
+                sum += powf(x[index], 2);
+            }
+            sum = sqrtf(sum);
+            if(sum == 0){
+                /* Zero vector: nothing to scale, and no finite gradient term. */
+                for(f = 0; f < filters; ++f){
+                    dx[b*filters*spatial + f*spatial + i] = 0;
+                }
+                continue;
+            }
+            for(f = 0; f < filters; ++f){
+                int index = b*filters*spatial + f*spatial + i;
+                x[index] /= sum;
+                dx[index] = (1 - x[index]) / sum;
+            }
+        }
+    }
+}
+
+
 void normalize_cpu(float *x, float *mean, float *variance, int batch, int filters, int spatial)
 {
     int b, f, i;
@@ -161,12 +193,48 @@ void fill_cpu(int N, float ALPHA, float *X, int INCX)
     for(i = 0; i < N; ++i) X[i*INCX] = ALPHA;
 }
 
+/*
+ * Scatter-add an interleaved buffer back into its two sources.
+ * OUT is read sequentially as B records of NX values followed by NY values;
+ * the NX part of record j is added to X[j*NX ..] and the NY part to
+ * Y[j*NY ..]. A NULL X or Y is skipped, but its slots in OUT are still
+ * stepped over.
+ */
+void deinter_cpu(int NX, float *X, int NY, float *Y, int B, float *OUT)
+{
+    int rec, k;
+    float *src = OUT;
+    for(rec = 0; rec < B; ++rec){
+        for(k = 0; k < NX; ++k, ++src){
+            if(X) X[rec*NX + k] += *src;
+        }
+        for(k = 0; k < NY; ++k, ++src){
+            if(Y) Y[rec*NY + k] += *src;
+        }
+    }
+}
+
+/*
+ * Interleave two buffers record by record.
+ * For each of the B records, NX values from X[j*NX ..] are written to OUT,
+ * followed by NY values from Y[j*NY ..]; OUT therefore receives
+ * B*(NX+NY) values. X and Y are dereferenced without a NULL check.
+ */
+void inter_cpu(int NX, float *X, int NY, float *Y, int B, float *OUT)
+{
+    int rec, k;
+    float *dst = OUT;
+    for(rec = 0; rec < B; ++rec){
+        for(k = 0; k < NX; ++k){
+            *dst = X[rec*NX + k];
+            ++dst;
+        }
+        for(k = 0; k < NY; ++k){
+            *dst = Y[rec*NY + k];
+            ++dst;
+        }
+    }
+}
+
 void copy_cpu(int N, float *X, int INCX, float *Y, int INCY)
 {
     int i;
     for(i = 0; i < N; ++i) Y[i*INCY] = X[i*INCX];
 }
 
+/*
+ * Fused multiply-accumulate over N elements: Z[k] += X[k] * Y[k].
+ */
+void mult_add_into_cpu(int N, float *X, float *Y, float *Z)
+{
+    int k = 0;
+    while(k < N){
+        Z[k] += X[k]*Y[k];
+        ++k;
+    }
+}
+
 void smooth_l1_cpu(int n, float *pred, float *truth, float *delta, float *error)
 {
     int i;
@@ -179,11 +247,43 @@ void smooth_l1_cpu(int n, float *pred, float *truth, float *delta, float *error)
         }
         else {
             error[i] = 2*abs_val - 1;
-            delta[i] = (diff < 0) ? -1 : 1;
+            delta[i] = (diff < 0) ? 1 : -1;
         }
     }
 }
 
+/*
+ * L1 loss over n elements.
+ * error[k] = |truth[k] - pred[k]|; delta[k] is +1 when truth exceeds pred
+ * and -1 otherwise (including when they are equal).
+ */
+void l1_cpu(int n, float *pred, float *truth, float *delta, float *error)
+{
+    int k;
+    for(k = 0; k < n; ++k){
+        const float residual = truth[k] - pred[k];
+        error[k] = fabs(residual);
+        if(residual > 0){
+            delta[k] = 1;
+        } else {
+            delta[k] = -1;
+        }
+    }
+}
+
+/*
+ * Cross-entropy terms for n predictions against n targets.
+ * error[k] = -log(pred[k]) where truth[k] is non-zero, else 0;
+ * delta[k] = truth[k] - pred[k].
+ */
+void softmax_x_ent_cpu(int n, float *pred, float *truth, float *delta, float *error)
+{
+    int k;
+    for(k = 0; k < n; ++k){
+        const float target = truth[k];
+        const float prob = pred[k];
+        if(target){
+            error[k] = -log(prob);
+        } else {
+            error[k] = 0;
+        }
+        delta[k] = target-prob;
+    }
+}
+
+/*
+ * Binary cross-entropy terms for n predictions against n targets.
+ * error[k] = -t*log(p) - (1-t)*log(1-p) and delta[k] = t - p, with
+ * t = truth[k] and p = pred[k].
+ * NOTE(review): p is not clamped, so p == 0 or p == 1 puts log(0) into the
+ * error term (inf, or NaN when multiplied by a zero weight).
+ */
+void logistic_x_ent_cpu(int n, float *pred, float *truth, float *delta, float *error)
+{
+    int k = 0;
+    while(k < n){
+        const float t = truth[k];
+        const float p = pred[k];
+        delta[k] = t-p;
+        error[k] = -t*log(p) - (1-t)*log(1-p);
+        ++k;
+    }
+}
+
 void l2_cpu(int n, float *pred, float *truth, float *delta, float *error)
 {
     int i;
@@ -202,21 +302,50 @@ float dot_cpu(int N, float *X, int INCX, float *Y, int INCY)
     return dot;
 }
 
-void softmax(float *input, int n, float temp, float *output)
+void softmax(float *input, int n, float temp, int stride, float *output)
 {
     int i;
     float sum = 0;
     float largest = -FLT_MAX;
     for(i = 0; i < n; ++i){
-        if(input[i] > largest) largest = input[i];
+        if(input[i*stride] > largest) largest = input[i*stride];
     }
     for(i = 0; i < n; ++i){
-        float e = exp(input[i]/temp - largest/temp);
+        float e = exp(input[i*stride]/temp - largest/temp);
         sum += e;
-        output[i] = e;
+        output[i*stride] = e;
     }
     for(i = 0; i < n; ++i){
-        output[i] /= sum;
+        output[i*stride] /= sum;
     }
 }
 
+
+/*
+ * Run softmax() over batch*groups slices of input.
+ * Slice (b, g) starts at offset b*batch_offset + g*group_offset in both
+ * input and output; n, temp and stride are forwarded to softmax() as-is.
+ */
+void softmax_cpu(float *input, int n, int batch, int batch_offset, int groups, int group_offset, int stride, float temp, float *output)
+{
+    int b;
+    for(b = 0; b < batch; ++b){
+        int g;
+        for(g = 0; g < groups; ++g){
+            const int start = b*batch_offset + g*group_offset;
+            softmax(input + start, n, temp, stride, output + start);
+        }
+    }
+}
+
+/*
+ * Nearest-neighbour upsampling by an integer stride, or its reverse.
+ * `in` is indexed as batch x c x h x w and `out` as
+ * batch x c x (h*stride) x (w*stride); output pixel (row, col) maps to
+ * input pixel (row/stride, col/stride).
+ *   forward != 0: out = scale * in                    (overwrites out)
+ *   forward == 0: in += scale * out                   (accumulates into in)
+ */
+void upsample_cpu(float *in, int w, int h, int c, int batch, int stride, int forward, float scale, float *out)
+{
+    int b, k, row, col;
+    for(b = 0; b < batch; ++b){
+        for(k = 0; k < c; ++k){
+            for(row = 0; row < h*stride; ++row){
+                for(col = 0; col < w*stride; ++col){
+                    const int src = b*w*h*c + k*w*h + (row/stride)*w + col/stride;
+                    const int dst = b*w*h*c*stride*stride + k*w*h*stride*stride + row*w*stride + col;
+                    if(forward){
+                        out[dst] = scale*in[src];
+                    } else {
+                        in[src] += scale*out[dst];
+                    }
+                }
+            }
+        }
+    }
+}
+
+
diff --git a/image.darknet/src/blas.h b/image.darknet/src/blas.h
index 3d6ee7d..707291d 100644
--- a/image.darknet/src/blas.h
+++ b/image.darknet/src/blas.h
@@ -1,5 +1,7 @@
 #ifndef BLAS_H
 #define BLAS_H
+#include "darknet.h"
+
 void flatten(float *x, int size, int layers, int batch, int forward);
 void pm(int M, int N, float *A);
 float *random_matrix(int rows, int cols);
@@ -8,53 +10,60 @@ void reorg_cpu(float *x, int w, int h, int c, int batch, int stride, int forward
 
 void test_blas();
 
+void inter_cpu(int NX, float *X, int NY, float *Y, int B, float *OUT);
+void deinter_cpu(int NX, float *X, int NY, float *Y, int B, float *OUT);
+void mult_add_into_cpu(int N, float *X, float *Y, float *Z);
+
 void const_cpu(int N, float ALPHA, float *X, int INCX);
-void constrain_ongpu(int N, float ALPHA, float * X, int INCX);
+void constrain_gpu(int N, float ALPHA, float * X, int INCX);
 void pow_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY);
 void mul_cpu(int N, float *X, int INCX, float *Y, int INCY);
 
-void axpy_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY);
-void copy_cpu(int N, float *X, int INCX, float *Y, int INCY);
-void scal_cpu(int N, float ALPHA, float *X, int INCX);
-void fill_cpu(int N, float ALPHA, float * X, int INCX);
-float dot_cpu(int N, float *X, int INCX, float *Y, int INCY);
-void test_gpu_blas();
-void shortcut_cpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float *out);
+int test_gpu_blas();
+void shortcut_cpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float s1, float s2, float *out);
 
 void mean_cpu(float *x, int batch, int filters, int spatial, float *mean);
 void variance_cpu(float *x, float *mean, int batch, int filters, int spatial, float *variance);
-void normalize_cpu(float *x, float *mean, float *variance, int batch, int filters, int spatial);
 
 void scale_bias(float *output, float *scales, int batch, int n, int size);
 void backward_scale_cpu(float *x_norm, float *delta, int batch, int n, int size, float *scale_updates);
 void mean_delta_cpu(float *delta, float *variance, int batch, int filters, int spatial, float *mean_delta);
 void  variance_delta_cpu(float *x, float *delta, float *mean, float *variance, int batch, int filters, int spatial, float *variance_delta);
 void normalize_delta_cpu(float *x, float *mean, float *variance, float *mean_delta, float *variance_delta, int batch, int filters, int spatial, float *delta);
+void l2normalize_cpu(float *x, float *dx, int batch, int filters, int spatial);
 
 void smooth_l1_cpu(int n, float *pred, float *truth, float *delta, float *error);
 void l2_cpu(int n, float *pred, float *truth, float *delta, float *error);
+void l1_cpu(int n, float *pred, float *truth, float *delta, float *error);
+void logistic_x_ent_cpu(int n, float *pred, float *truth, float *delta, float *error);
+void softmax_x_ent_cpu(int n, float *pred, float *truth, float *delta, float *error);
 void weighted_sum_cpu(float *a, float *b, float *s, int num, float *c);
+void weighted_delta_cpu(float *a, float *b, float *s, float *da, float *db, float *ds, int n, float *dc);
 
-void softmax(float *input, int n, float temp, float *output);
+void softmax(float *input, int n, float temp, int stride, float *output);
+void softmax_cpu(float *input, int n, int batch, int batch_offset, int groups, int group_offset, int stride, float temp, float *output);
+void upsample_cpu(float *in, int w, int h, int c, int batch, int stride, int forward, float scale, float *out);
 
 #ifdef GPU
 #include "cuda.h"
-
-void axpy_ongpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY);
-void axpy_ongpu_offset(int N, float ALPHA, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY);
-void copy_ongpu(int N, float * X, int INCX, float * Y, int INCY);
-void copy_ongpu_offset(int N, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY);
-void scal_ongpu(int N, float ALPHA, float * X, int INCX);
-void supp_ongpu(int N, float ALPHA, float * X, int INCX);
-void mask_ongpu(int N, float * X, float mask_num, float * mask);
-void const_ongpu(int N, float ALPHA, float *X, int INCX);
-void pow_ongpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY);
-void mul_ongpu(int N, float *X, int INCX, float *Y, int INCY);
-void fill_ongpu(int N, float ALPHA, float * X, int INCX);
+#include "tree.h"
+
+void axpy_gpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY);
+void axpy_gpu_offset(int N, float ALPHA, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY);
+void copy_gpu(int N, float * X, int INCX, float * Y, int INCY);
+void copy_gpu_offset(int N, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY);
+void add_gpu(int N, float ALPHA, float * X, int INCX);
+void supp_gpu(int N, float ALPHA, float * X, int INCX);
+void mask_gpu(int N, float * X, float mask_num, float * mask, float val);
+void scale_mask_gpu(int N, float * X, float mask_num, float * mask, float scale);
+void const_gpu(int N, float ALPHA, float *X, int INCX);
+void pow_gpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY);
+void mul_gpu(int N, float *X, int INCX, float *Y, int INCY);
 
 void mean_gpu(float *x, int batch, int filters, int spatial, float *mean);
 void variance_gpu(float *x, float *mean, int batch, int filters, int spatial, float *variance);
 void normalize_gpu(float *x, float *mean, float *variance, int batch, int filters, int spatial);
+void l2normalize_gpu(float *x, float *dx, int batch, int filters, int spatial);
 
 void normalize_delta_gpu(float *x, float *mean, float *variance, float *mean_delta, float *variance_delta, int batch, int filters, int spatial, float *delta);
 
@@ -63,25 +72,34 @@ void fast_variance_delta_gpu(float *x, float *delta, float *mean, float *varianc
 
 void fast_variance_gpu(float *x, float *mean, int batch, int filters, int spatial, float *variance);
 void fast_mean_gpu(float *x, int batch, int filters, int spatial, float *mean);
-void shortcut_gpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float *out);
+void shortcut_gpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float s1, float s2, float *out);
 void scale_bias_gpu(float *output, float *biases, int batch, int n, int size);
 void backward_scale_gpu(float *x_norm, float *delta, int batch, int n, int size, float *scale_updates);
 void scale_bias_gpu(float *output, float *biases, int batch, int n, int size);
 void add_bias_gpu(float *output, float *biases, int batch, int n, int size);
 void backward_bias_gpu(float *bias_updates, float *delta, int batch, int n, int size);
 
+void logistic_x_ent_gpu(int n, float *pred, float *truth, float *delta, float *error);
+void softmax_x_ent_gpu(int n, float *pred, float *truth, float *delta, float *error);
 void smooth_l1_gpu(int n, float *pred, float *truth, float *delta, float *error);
 void l2_gpu(int n, float *pred, float *truth, float *delta, float *error);
+void l1_gpu(int n, float *pred, float *truth, float *delta, float *error);
+void wgan_gpu(int n, float *pred, float *truth, float *delta, float *error);
 void weighted_delta_gpu(float *a, float *b, float *s, float *da, float *db, float *ds, int num, float *dc);
 void weighted_sum_gpu(float *a, float *b, float *s, int num, float *c);
 void mult_add_into_gpu(int num, float *a, float *b, float *c);
+void inter_gpu(int NX, float *X, int NY, float *Y, int B, float *OUT);
+void deinter_gpu(int NX, float *X, int NY, float *Y, int B, float *OUT);
 
-void reorg_ongpu(float *x, int w, int h, int c, int batch, int stride, int forward, float *out);
+void reorg_gpu(float *x, int w, int h, int c, int batch, int stride, int forward, float *out);
 
-void softmax_gpu(float *input, int n, int offset, int groups, float temp, float *output);
+void softmax_gpu(float *input, int n, int batch, int batch_offset, int groups, int group_offset, int stride, float temp, float *output);
+void adam_update_gpu(float *w, float *d, float *m, float *v, float B1, float B2, float eps, float decay, float rate, int n, int batch, int t);
 void adam_gpu(int n, float *x, float *m, float *v, float B1, float B2, float rate, float eps, int t);
 
-void flatten_ongpu(float *x, int spatial, int layers, int batch, int forward, float *out);
+void flatten_gpu(float *x, int spatial, int layers, int batch, int forward, float *out);
+void softmax_tree(float *input, int spatial, int batch, int stride, float temp, float *output, tree hier);
+void upsample_gpu(float *in, int w, int h, int c, int batch, int stride, int forward, float scale, float *out);
 
 #endif
 #endif
diff --git a/image.darknet/src/blas_kernels.cu b/image.darknet/src/blas_kernels.cu
index d940176..47e8217 100644
--- a/image.darknet/src/blas_kernels.cu
+++ b/image.darknet/src/blas_kernels.cu
@@ -53,24 +53,40 @@ void backward_scale_gpu(float *x_norm, float *delta, int batch, int n, int size,
     check_error(cudaPeekAtLastError());
 }
 
-__global__ void add_bias_kernel(float *output, float *biases, int n, int size)
+__global__ void add_bias_kernel(float *output, float *biases, int batch, int n, int size) // each in-range thread adds biases[j] to one element of output
 {
-    int offset = blockIdx.x * blockDim.x + threadIdx.x;
-    int filter = blockIdx.y;
-    int batch = blockIdx.z;
+    int index = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; // flat thread id built from the x/y block coordinates
+    if (index >= n*size*batch) return; // threads past the last element exit
+    int i = index % size; // offset inside one group of `size` values
+    index /= size;
+    int j = index % n; // which bias applies
+    index /= n;
+    int k = index; // remaining quotient: batch item
 
-    if(offset < size) output[(batch*n+filter)*size + offset] += biases[filter];
+    output[(k*n+j)*size + i] += biases[j]; // (k*n+j)*size + i reconstructs the original flat index
 }
 
 void add_bias_gpu(float *output, float *biases, int batch, int n, int size)
 {
-    dim3 dimGrid((size-1)/BLOCK + 1, n, batch);
-    dim3 dimBlock(BLOCK, 1, 1);
+    int num = n*size*batch; // total number of output elements the kernel must cover
 
-    add_bias_kernel<<<dimGrid, dimBlock>>>(output, biases, n, size);
+    add_bias_kernel<<<cuda_gridsize(num), BLOCK>>>(output, biases, batch, n, size); // the kernel bounds-checks against n*size*batch itself
     check_error(cudaPeekAtLastError());
 }
 
+__global__ void backward_bias_conn_kernel(float *bias_updates, float *delta, int batch, int n) // for each index < n, sums delta over the batch and adds the total to bias_updates
+{
+    int index = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if (index >= n) return; // threads past n have nothing to do
+    int b;
+    float sum = 0;
+    for(b = 0; b < batch; ++b){
+        int i = b*n + index; // delta is read as `batch` consecutive rows of n values
+        sum += delta[i];
+    }
+    bias_updates[index] += sum; // accumulates into the existing value rather than overwriting it
+}
+
 __global__ void backward_bias_kernel(float *bias_updates, float *delta, int batch, int n, int size)
 {
     __shared__ float part[BLOCK];
@@ -91,6 +107,16 @@ __global__ void backward_bias_kernel(float *bias_updates, float *delta, int batc
     }
 }
 
+void backward_bias_gpu(float *bias_updates, float *delta, int batch, int n, int size) // chooses the bias-update kernel based on size
+{
+    if(size == 1){ // size == 1: use the per-index kernel, which takes no size argument
+        backward_bias_conn_kernel<<<cuda_gridsize(n), BLOCK>>>(bias_updates, delta, batch, n);
+    }else{
+        backward_bias_kernel<<<n, BLOCK>>>(bias_updates, delta, batch, n, size); // launched with n blocks of BLOCK threads
+    }
+    check_error(cudaPeekAtLastError()); // checks whichever launch was issued above
+}
+
 /*
 __global__ void dot_kernel(float *output, float scale, int batch, int n, int size, float *delta)
 {
@@ -133,20 +159,16 @@ void dot_error_gpu(layer l)
 }
 */
 
-void backward_bias_gpu(float *bias_updates, float *delta, int batch, int n, int size)
-{
-    backward_bias_kernel<<<n, BLOCK>>>(bias_updates, delta, batch, n, size);
-    check_error(cudaPeekAtLastError());
-}
-
 
 __global__ void adam_kernel(int N, float *x, float *m, float *v, float B1, float B2, float rate, float eps, int t)
 {
     int index = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
     if (index >= N) return;
+
+    float mhat = m[index] / (1.f - powf(B1, t)); // m rescaled by 1/(1 - B1^t)
+    float vhat = v[index] / (1.f - powf(B2, t)); // v rescaled by 1/(1 - B2^t)
     
-    x[index] = x[index] - (rate * sqrt(1.-pow(B2, t)) / (1.-pow(B1, t)) * m[index] / (sqrt(v[index]) + eps));
-    //if(index == 0) printf("%f %f %f %f\n", m[index], v[index], (rate * sqrt(1.-pow(B2, t)) / (1.-pow(B1, t)) * m[index] / (sqrt(v[index]) + eps)));
+    x[index] = x[index] + rate * mhat / (sqrtf(vhat) + eps); // NOTE(review): the step is added (the removed line subtracted it) - presumably m already holds the update direction; confirm with callers
 }
 
 extern "C" void adam_gpu(int n, float *x, float *m, float *v, float B1, float B2, float rate, float eps, int t)
@@ -155,13 +177,27 @@ extern "C" void adam_gpu(int n, float *x, float *m, float *v, float B1, float B2
     check_error(cudaPeekAtLastError());
 }
 
+extern "C" void adam_update_gpu(float *w, float *d, float *m, float *v, float B1, float B2, float eps, float decay, float rate, int n, int batch, int t) // updates n values of w from buffer d using moment buffers m and v, then clears d
+{
+    scal_gpu(n, B1, m, 1); // m *= B1
+    scal_gpu(n, B2, v, 1); // v *= B2
+    axpy_gpu(n, -decay*batch, w, 1, d, 1); // presumably d += (-decay*batch) * w - axpy_kernel is not shown here, confirm
+
+    axpy_gpu(n, (1-B1), d, 1, m, 1); // presumably m += (1-B1) * d
+    mul_gpu(n, d, 1, d, 1); // presumably squares d in place (same buffer passed twice) - confirm against mul_kernel
+    axpy_gpu(n, (1-B2), d, 1, v, 1); // presumably v += (1-B2) * d, with d as left by the mul_gpu call
+
+    adam_gpu(n, w, m, v, B1, B2, rate, eps, t); // apply the step to w
+    fill_gpu(n, 0, d, 1); // set d to 0
+}
+
 __global__ void normalize_kernel(int N, float *x, float *mean, float *variance, int batch, int filters, int spatial)
 {
     int index = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
     if (index >= N) return;
     int f = (index/spatial)%filters;
     
-    x[index] = (x[index] - mean[f])/(sqrt(variance[f]) + .000001f);
+    x[index] = (x[index] - mean[f])/(sqrtf(variance[f] + .00001f)); // epsilon (now .00001f) is added under the square root instead of to its result
 }
 
 __global__ void normalize_delta_kernel(int N, float *x, float *mean, float *variance, float *mean_delta, float *variance_delta, int batch, int filters, int spatial, float *delta)
@@ -170,7 +206,7 @@ __global__ void normalize_delta_kernel(int N, float *x, float *mean, float *vari
     if (index >= N) return;
     int f = (index/spatial)%filters;
     
-    delta[index] = delta[index] * 1./(sqrt(variance[f]) + .000001f) + variance_delta[f] * 2. * (x[index] - mean[f]) / (spatial * batch) + mean_delta[f]/(spatial*batch);
+    delta[index] = delta[index] * 1.f/(sqrtf(variance[f] + .00001f)) + variance_delta[f] * 2.f * (x[index] - mean[f]) / (spatial * batch) + mean_delta[f]/(spatial*batch);
 }
 
 extern "C" void normalize_delta_gpu(float *x, float *mean, float *variance, float *mean_delta, float *variance_delta, int batch, int filters, int spatial, float *delta)
@@ -192,7 +228,7 @@ __global__ void  variance_delta_kernel(float *x, float *delta, float *mean, floa
             variance_delta[i] += delta[index]*(x[index] - mean[i]);
         }
     }
-    variance_delta[i] *= -.5 * pow(variance[i] + .000001f, (float)(-3./2.));
+    variance_delta[i] *= -.5f * powf(variance[i] + .00001f, (float)(-3.f/2.f));
 }
 
 __global__ void accumulate_kernel(float *x, int n, int groups, float *sum)
@@ -224,12 +260,14 @@ __global__ void fast_mean_delta_kernel(float *delta, float *variance, int batch,
         }
     }
 
+    __syncthreads();
+
     if(id == 0){
         mean_delta[filter] = 0;
         for(i = 0; i < threads; ++i){
             mean_delta[filter] += local[i];
         }
-        mean_delta[filter] *= (-1./sqrt(variance[filter] + .000001f));
+        mean_delta[filter] *= (-1.f/sqrtf(variance[filter] + .00001f));
     }
 }
 
@@ -252,12 +290,14 @@ __global__ void  fast_variance_delta_kernel(float *x, float *delta, float *mean,
         }
     }
 
+    __syncthreads();
+
     if(id == 0){
         variance_delta[filter] = 0;
         for(i = 0; i < threads; ++i){
             variance_delta[filter] += local[i];
         }
-        variance_delta[filter] *= -.5 * pow(variance[filter] + .000001f, (float)(-3./2.));
+        variance_delta[filter] *= -.5f * powf(variance[filter] + .00001f, (float)(-3.f/2.f));
     }
 }
 
@@ -274,7 +314,7 @@ __global__ void mean_delta_kernel(float *delta, float *variance, int batch, int
             mean_delta[i] += delta[index];
         }
     }
-    mean_delta[i] *= (-1./sqrt(variance[i] + .000001f));
+    mean_delta[i] *= (-1.f/sqrtf(variance[i] + .00001f));
 }
 
 extern "C" void mean_delta_gpu(float *delta, float *variance, int batch, int filters, int spatial, float *mean_delta)
@@ -297,7 +337,7 @@ extern "C" void fast_variance_delta_gpu(float *x, float *delta, float *mean, flo
 
 __global__ void  mean_kernel(float *x, int batch, int filters, int spatial, float *mean)
 {
-    float scale = 1./(batch * spatial);
+    float scale = 1.f/(batch * spatial);
     int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
     if (i >= filters) return;
     int j,k;
@@ -313,7 +353,7 @@ __global__ void  mean_kernel(float *x, int batch, int filters, int spatial, floa
 
 __global__ void variance_kernel(float *x, float *mean, int batch, int filters, int spatial, float *variance)
 {
-    float scale = 1./(batch * spatial - 1);
+    float scale = 1.f/(batch * spatial - 1);
     int j,k;
     int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
     if (i >= filters) return;
@@ -321,7 +361,7 @@ __global__ void variance_kernel(float *x, float *mean, int batch, int filters, i
     for(j = 0; j < batch; ++j){
         for(k = 0; k < spatial; ++k){
             int index = j*filters*spatial + i*spatial + k;
-            variance[i] += pow((x[index] - mean[i]), 2);
+            variance[i] += powf((x[index] - mean[i]), 2);
         }
     }
     variance[i] *= scale;
@@ -391,22 +431,22 @@ __global__ void supp_kernel(int N, float ALPHA, float *X, int INCX)
     }
 }
 
-__global__ void scal_kernel(int N, float ALPHA, float *X, int INCX)
+__global__ void add_kernel(int N, float ALPHA, float *X, int INCX) // adds ALPHA to N elements of X spaced INCX apart
 {
     int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
-    if(i < N) X[i*INCX] *= ALPHA;
+    if(i < N) X[i*INCX] += ALPHA; // additive counterpart of scal_kernel
 }
 
-__global__ void fill_kernel(int N, float ALPHA, float *X, int INCX)
+__global__ void scal_kernel(int N, float ALPHA, float *X, int INCX) // multiplies N elements of X, spaced INCX apart, by ALPHA
 {
     int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
-    if(i < N) X[i*INCX] = ALPHA;
+    if(i < N) X[i*INCX] *= ALPHA; // in-place scale; threads with i >= N do nothing
 }
 
-__global__ void mask_kernel(int n,  float *x, float mask_num, float *mask)
+__global__ void fill_kernel(int N, float ALPHA, float *X, int INCX) // sets N elements of X, spaced INCX apart, to ALPHA
 {
     int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
-    if(i < n && mask[i] == mask_num) x[i] = mask_num;
+    if(i < N) X[i*INCX] = ALPHA; // overwrite; threads with i >= N do nothing
 }
 
 __global__ void copy_kernel(int N,  float *X, int OFFX, int INCX, float *Y, int OFFY, int INCY)
@@ -429,6 +469,35 @@ extern "C" void normalize_gpu(float *x, float *mean, float *variance, int batch,
     check_error(cudaPeekAtLastError());
 }
 
+__global__ void l2norm_kernel(int N, float *x, float *dx, int batch, int filters, int spatial) // per (batch item, spatial position): divides x by its L2 norm taken across filters, and fills dx
+{
+    int index = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if (index >= N) return;
+    int b = index / spatial; // batch item
+    int i = index % spatial; // spatial position
+    int f;
+    float sum = 0;
+    for(f = 0; f < filters; ++f){
+        int index = b*filters*spatial + f*spatial + i; // shadows the outer index inside this loop
+        sum += powf(x[index], 2); // accumulate the sum of squares across filters
+    }
+    sum = sqrtf(sum); // sum now holds the L2 norm
+    if(sum == 0) sum = 1; // avoid dividing by zero when every value is 0
+    //printf("%f\n", sum);
+    for(f = 0; f < filters; ++f){
+        int index = b*filters*spatial + f*spatial + i;
+        x[index] /= sum; // normalize in place
+        dx[index] = (1 - x[index]) / sum; // reads x after the division on the previous line
+    }
+}
+
+extern "C" void l2normalize_gpu(float *x, float *dx, int batch, int filters, int spatial) // launches l2norm_kernel over batch*spatial positions
+{
+    size_t N = batch*spatial; // the product is computed in int; l2norm_kernel also receives N as int
+    l2norm_kernel<<<cuda_gridsize(N), BLOCK>>>(N, x, dx, batch, filters, spatial);
+    check_error(cudaPeekAtLastError());
+}
+
 __global__ void  fast_mean_kernel(float *x, int batch, int filters, int spatial, float *mean)
 {
     const int threads = BLOCK;
@@ -447,6 +516,8 @@ __global__ void  fast_mean_kernel(float *x, int batch, int filters, int spatial,
         }
     }
 
+    __syncthreads();
+
     if(id == 0){
         mean[filter] = 0;
         for(i = 0; i < threads; ++i){
@@ -471,10 +542,12 @@ __global__ void  fast_variance_kernel(float *x, float *mean, int batch, int filt
         for(i = 0; i < spatial; i += threads){
             int index = j*spatial*filters + filter*spatial + i + id;
 
-            local[id] += (i+id < spatial) ? pow((x[index] - mean[filter]), 2) : 0;
+            local[id] += (i+id < spatial) ? powf((x[index] - mean[filter]), 2) : 0;
         }
     }
 
+    __syncthreads();
+
     if(id == 0){
         variance[filter] = 0;
         for(i = 0; i < threads; ++i){
@@ -509,35 +582,35 @@ extern "C" void variance_gpu(float *x, float *mean, int batch, int filters, int
     check_error(cudaPeekAtLastError());
 }
 
-extern "C" void axpy_ongpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY)
+extern "C" void axpy_gpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY)
 {
-    axpy_ongpu_offset(N, ALPHA, X, 0, INCX, Y, 0, INCY);
+    axpy_gpu_offset(N, ALPHA, X, 0, INCX, Y, 0, INCY);
 }
 
-extern "C" void pow_ongpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY)
+extern "C" void pow_gpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY)
 {
     pow_kernel<<<cuda_gridsize(N), BLOCK>>>(N, ALPHA, X, INCX, Y, INCY);
     check_error(cudaPeekAtLastError());
 }
 
-extern "C" void axpy_ongpu_offset(int N, float ALPHA, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY)
+extern "C" void axpy_gpu_offset(int N, float ALPHA, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY)
 {
     axpy_kernel<<<cuda_gridsize(N), BLOCK>>>(N, ALPHA, X, OFFX, INCX, Y, OFFY, INCY);
     check_error(cudaPeekAtLastError());
 }
 
-extern "C" void copy_ongpu(int N, float * X, int INCX, float * Y, int INCY)
+extern "C" void copy_gpu(int N, float * X, int INCX, float * Y, int INCY)
 {
-    copy_ongpu_offset(N, X, 0, INCX, Y, 0, INCY);
+    copy_gpu_offset(N, X, 0, INCX, Y, 0, INCY);
 }
 
-extern "C" void mul_ongpu(int N, float * X, int INCX, float * Y, int INCY)
+extern "C" void mul_gpu(int N, float * X, int INCX, float * Y, int INCY)
 {
     mul_kernel<<<cuda_gridsize(N), BLOCK>>>(N, X, INCX, Y, INCY);
     check_error(cudaPeekAtLastError());
 }
 
-extern "C" void copy_ongpu_offset(int N, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY)
+extern "C" void copy_gpu_offset(int N, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY)
 {
     copy_kernel<<<cuda_gridsize(N), BLOCK>>>(N, X, OFFX, INCX, Y, OFFY, INCY);
     check_error(cudaPeekAtLastError());
@@ -560,58 +633,82 @@ __global__ void flatten_kernel(int N, float *x, int spatial, int layers, int bat
     else out[i1] = x[i2];
 }
 
-extern "C" void flatten_ongpu(float *x, int spatial, int layers, int batch, int forward, float *out)
+extern "C" void flatten_gpu(float *x, int spatial, int layers, int batch, int forward, float *out)
 {
     int size = spatial*batch*layers;
     flatten_kernel<<<cuda_gridsize(size), BLOCK>>>(size, x, spatial, layers, batch, forward, out);
     check_error(cudaPeekAtLastError());
 }
 
-extern "C" void reorg_ongpu(float *x, int w, int h, int c, int batch, int stride, int forward, float *out)
+extern "C" void reorg_gpu(float *x, int w, int h, int c, int batch, int stride, int forward, float *out)
 {
     int size = w*h*c*batch;
     reorg_kernel<<<cuda_gridsize(size), BLOCK>>>(size, x, w, h, c, batch, stride, forward, out);
     check_error(cudaPeekAtLastError());
 }
 
-extern "C" void mask_ongpu(int N, float * X, float mask_num, float * mask)
+__global__ void mask_kernel(int n,  float *x, float mask_num, float *mask, float val) // writes val into x wherever mask equals mask_num
 {
-    mask_kernel<<<cuda_gridsize(N), BLOCK>>>(N, X, mask_num, mask);
+    int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if(i < n && mask[i] == mask_num) x[i] = val; // exact float comparison; the replacement is now val rather than mask_num
+}
+
+extern "C" void mask_gpu(int N, float * X, float mask_num, float * mask, float val) // host wrapper: launches mask_kernel over N elements
+{
+    mask_kernel<<<cuda_gridsize(N), BLOCK>>>(N, X, mask_num, mask, val);
+    check_error(cudaPeekAtLastError());
+}
+
+__global__ void scale_mask_kernel(int n,  float *x, float mask_num, float *mask, float scale) // multiplies x by scale wherever mask equals mask_num
+{
+    int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if(i < n && mask[i] == mask_num) x[i] *= scale; // exact float comparison against mask_num; other elements are left unchanged
+}
+
+extern "C" void scale_mask_gpu(int N, float * X, float mask_num, float * mask, float scale) // host wrapper: launches scale_mask_kernel over N elements
+{
+    scale_mask_kernel<<<cuda_gridsize(N), BLOCK>>>(N, X, mask_num, mask, scale);
     check_error(cudaPeekAtLastError());
 }
 
-extern "C" void const_ongpu(int N, float ALPHA, float * X, int INCX)
+extern "C" void const_gpu(int N, float ALPHA, float * X, int INCX)
 {
     const_kernel<<<cuda_gridsize(N), BLOCK>>>(N, ALPHA, X, INCX);
     check_error(cudaPeekAtLastError());
 }
 
-extern "C" void constrain_ongpu(int N, float ALPHA, float * X, int INCX)
+extern "C" void constrain_gpu(int N, float ALPHA, float * X, int INCX)
 {
     constrain_kernel<<<cuda_gridsize(N), BLOCK>>>(N, ALPHA, X, INCX);
     check_error(cudaPeekAtLastError());
 }
 
 
-extern "C" void scal_ongpu(int N, float ALPHA, float * X, int INCX)
+extern "C" void add_gpu(int N, float ALPHA, float * X, int INCX) // host wrapper: launches add_kernel (X[i*INCX] += ALPHA) over N elements
+{
+    add_kernel<<<cuda_gridsize(N), BLOCK>>>(N, ALPHA, X, INCX);
+    check_error(cudaPeekAtLastError());
+}
+
+extern "C" void scal_gpu(int N, float ALPHA, float * X, int INCX)
 {
     scal_kernel<<<cuda_gridsize(N), BLOCK>>>(N, ALPHA, X, INCX);
     check_error(cudaPeekAtLastError());
 }
 
-extern "C" void supp_ongpu(int N, float ALPHA, float * X, int INCX)
+extern "C" void supp_gpu(int N, float ALPHA, float * X, int INCX)
 {
     supp_kernel<<<cuda_gridsize(N), BLOCK>>>(N, ALPHA, X, INCX);
     check_error(cudaPeekAtLastError());
 }
 
-extern "C" void fill_ongpu(int N, float ALPHA, float * X, int INCX)
+extern "C" void fill_gpu(int N, float ALPHA, float * X, int INCX)
 {
     fill_kernel<<<cuda_gridsize(N), BLOCK>>>(N, ALPHA, X, INCX);
     check_error(cudaPeekAtLastError());
 }
 
-__global__ void shortcut_kernel(int size, int minw, int minh, int minc, int stride, int sample, int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float *out)
+__global__ void shortcut_kernel(int size, int minw, int minh, int minc, int stride, int sample, int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float s1, float s2, float *out)
 {
     int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
     if (id >= size) return;
@@ -625,10 +722,11 @@ __global__ void shortcut_kernel(int size, int minw, int minh, int minc, int stri
 
     int out_index = i*sample + w2*(j*sample + h2*(k + c2*b));
     int add_index = i*stride + w1*(j*stride + h1*(k + c1*b));
-    out[out_index] += add[add_index];
+    out[out_index] = s1*out[out_index] + s2*add[add_index];
+    //out[out_index] += add[add_index];
 }
 
-extern "C" void shortcut_gpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float *out)
+extern "C" void shortcut_gpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float s1, float s2, float *out)
 {
     int minw = (w1 < w2) ? w1 : w2;
     int minh = (h1 < h2) ? h1 : h2;
@@ -642,7 +740,7 @@ extern "C" void shortcut_gpu(int batch, int w1, int h1, int c1, float *add, int
     if(sample < 1) sample = 1;
 
     int size = batch * minw * minh * minc;
-    shortcut_kernel<<<cuda_gridsize(size), BLOCK>>>(size, minw, minh, minc, stride, sample, batch, w1, h1, c1, add, w2, h2, c2, out);
+    shortcut_kernel<<<cuda_gridsize(size), BLOCK>>>(size, minw, minh, minc, stride, sample, batch, w1, h1, c1, add, w2, h2, c2, s1, s2, out);
     check_error(cudaPeekAtLastError());
 }
 
@@ -651,14 +749,14 @@ __global__ void smooth_l1_kernel(int n, float *pred, float *truth, float *delta,
     int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
     if(i < n){
         float diff = truth[i] - pred[i];
-        float abs_val = abs(diff);
+        float abs_val = fabsf(diff);
         if(abs_val < 1) {
             error[i] = diff * diff;
             delta[i] = diff;
         }
         else {
             error[i] = 2*abs_val - 1;
-            delta[i] = (diff < 0) ? -1 : 1;
+            delta[i] = (diff > 0) ? 1 : -1;
         }
     }
 }
@@ -669,6 +767,40 @@ extern "C" void smooth_l1_gpu(int n, float *pred, float *truth, float *delta, fl
     check_error(cudaPeekAtLastError());
 }
 
+__global__ void softmax_x_ent_kernel(int n, float *pred, float *truth, float *delta, float *error) // per-element error and delta from pred and truth
+{
+    int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if(i < n){
+        float t = truth[i];
+        float p = pred[i];
+        error[i] = (t) ? -log(p) : 0; // only entries with nonzero truth contribute; p == 0 there gives an infinite error
+        delta[i] = t-p; // written for every element, regardless of t
+    }
+}
+
+extern "C" void softmax_x_ent_gpu(int n, float *pred, float *truth, float *delta, float *error) // host wrapper: launches softmax_x_ent_kernel over n elements
+{
+    softmax_x_ent_kernel<<<cuda_gridsize(n), BLOCK>>>(n, pred, truth, delta, error);
+    check_error(cudaPeekAtLastError());
+}
+
+__global__ void logistic_x_ent_kernel(int n, float *pred, float *truth, float *delta, float *error) // per-element logistic cross-entropy error and delta (t - p)
+{
+    int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if(i < n){
+        float t = truth[i];
+        float p = pred[i];
+        error[i] = -t*logf(p+.0000001f) - (1-t)*logf(1-p+.0000001f); // float literals + logf keep the math in single precision; the small offset keeps log() away from 0
+        delta[i] = t-p;
+    }
+}
+
+extern "C" void logistic_x_ent_gpu(int n, float *pred, float *truth, float *delta, float *error) // host wrapper: launches logistic_x_ent_kernel over n elements
+{
+    logistic_x_ent_kernel<<<cuda_gridsize(n), BLOCK>>>(n, pred, truth, delta, error);
+    check_error(cudaPeekAtLastError());
+}
+
 __global__ void l2_kernel(int n, float *pred, float *truth, float *delta, float *error)
 {
     int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
@@ -685,6 +817,38 @@ extern "C" void l2_gpu(int n, float *pred, float *truth, float *delta, float *er
     check_error(cudaPeekAtLastError());
 }
 
+__global__ void l1_kernel(int n, float *pred, float *truth, float *delta, float *error) // per-element L1 error |truth - pred| and a +/-1 delta
+{
+    int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if(i < n){
+        float diff = truth[i] - pred[i];
+        error[i] = fabsf(diff); // explicit float abs, matching smooth_l1_kernel; avoids any integer abs() overload
+        delta[i] = (diff > 0) ? 1 : -1; // diff == 0 maps to -1
+    }
+}
+
+extern "C" void l1_gpu(int n, float *pred, float *truth, float *delta, float *error) // host wrapper: launches l1_kernel over n elements
+{
+    l1_kernel<<<cuda_gridsize(n), BLOCK>>>(n, pred, truth, delta, error);
+    check_error(cudaPeekAtLastError());
+}
+
+__global__ void wgan_kernel(int n, float *pred, float *truth, float *delta, float *error) // per-element error and delta whose sign is chosen by truth
+{
+    int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if(i < n){
+        error[i] = truth[i] ? -pred[i] : pred[i]; // nonzero truth: -pred, otherwise +pred
+        delta[i] = (truth[i] > 0) ? 1 : -1; // NOTE(review): tests truth > 0 while the line above tests truth != 0 - they differ for negative truth
+    }
+}
+
+extern "C" void wgan_gpu(int n, float *pred, float *truth, float *delta, float *error) // host wrapper: launches wgan_kernel over n elements
+{
+    wgan_kernel<<<cuda_gridsize(n), BLOCK>>>(n, pred, truth, delta, error);
+    check_error(cudaPeekAtLastError());
+}
+
+
 
 
 __global__ void weighted_sum_kernel(int n, float *a, float *b, float *s, float *c)
@@ -695,6 +859,46 @@ __global__ void weighted_sum_kernel(int n, float *a, float *b, float *s, float *
     }
 }
 
+__global__ void deinter_kernel(int NX, float *X, int NY, float *Y, int B, float *OUT) // adds OUT, read as B rows of NX+NY values, back into X (first NX of each row) and Y (remaining NY)
+{
+    int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if(i < (NX+NY)*B){
+        int b = i / (NX+NY); // row
+        int j = i % (NX+NY); // column within the row
+        if (j < NX){
+            if(X) X[b*NX + j] += OUT[i]; // a null X is skipped
+        } else {
+            if(Y) Y[b*NY + j - NX] += OUT[i]; // a null Y is skipped
+        }
+    }
+}
+
+extern "C" void deinter_gpu(int NX, float *X, int NY, float *Y, int B, float *OUT) // host wrapper: launches deinter_kernel over (NX+NY)*B elements
+{
+    deinter_kernel<<<cuda_gridsize((NX+NY)*B), BLOCK>>>(NX, X, NY, Y, B, OUT);
+    check_error(cudaPeekAtLastError());
+}
+
+__global__ void inter_kernel(int NX, float *X, int NY, float *Y, int B, float *OUT) // fills OUT as B rows, each the NX values of X's row followed by the NY values of Y's row
+{
+    int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if(i < (NX+NY)*B){
+        int b = i / (NX+NY); // row
+        int j = i % (NX+NY); // column within the row
+        if (j < NX){
+            OUT[i] = X[b*NX + j]; // unlike deinter_kernel, X is dereferenced without a null check
+        } else {
+            OUT[i] = Y[b*NY + j - NX]; // likewise Y
+        }
+    }
+}
+
+extern "C" void inter_gpu(int NX, float *X, int NY, float *Y, int B, float *OUT) // host wrapper: launches inter_kernel over (NX+NY)*B elements
+{
+    inter_kernel<<<cuda_gridsize((NX+NY)*B), BLOCK>>>(NX, X, NY, Y, B, OUT);
+    check_error(cudaPeekAtLastError());
+}
+
 extern "C" void weighted_sum_gpu(float *a, float *b, float *s, int num, float *c)
 {
     weighted_sum_kernel<<<cuda_gridsize(num), BLOCK>>>(num, a, b, s, c);
@@ -706,8 +910,8 @@ __global__ void weighted_delta_kernel(int n, float *a, float *b, float *s, float
     int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
     if(i < n){
         if(da) da[i] += dc[i] * s[i];
-        db[i] += dc[i] * (1-s[i]);
-        ds[i] += dc[i] * a[i] + dc[i] * -b[i];
+        if(db) db[i] += dc[i] * (1-s[i]);
+        ds[i] += dc[i] * (a[i] - b[i]);
     }
 }
 
@@ -732,36 +936,100 @@ extern "C" void mult_add_into_gpu(int num, float *a, float *b, float *c)
 }
 
 
-__device__ void softmax_device(int n, float *input, float temp, float *output)
+__device__ void softmax_device(float *input, int n, float temp, int stride, float *output) // temperature softmax over n values spaced `stride` elements apart; writes to output with the same spacing
 {
     int i;
     float sum = 0;
     float largest = -INFINITY;
     for(i = 0; i < n; ++i){
-        int val = input[i];
+        float val = input[i*stride]; // keep full float precision: an int here truncated the value, so `largest` was not the true maximum
         largest = (val>largest) ? val : largest;
     }
     for(i = 0; i < n; ++i){
-        float e = exp(input[i]/temp - largest/temp);
+        float e = expf(input[i*stride]/temp - largest/temp); // subtracting the maximum keeps the exponent <= 0 for positive temp
         sum += e;
-        output[i] = e;
+        output[i*stride] = e; // store the unnormalised value; it is divided by sum in the next loop
     }
     for(i = 0; i < n; ++i){
-        output[i] /= sum;
+        output[i*stride] /= sum; // normalise so the n outputs sum to 1
     }
 }
 
-__global__ void softmax_kernel(int n, int offset, int batch, float *input, float temp, float *output)
+
+__global__ void softmax_tree_kernel(float *input, int spatial, int batch, int stride, float temp, float *output, int groups, int *group_size, int *group_offset) // one thread per (batch item, group, spatial position)
 {
-    int b = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
-    if(b >= batch) return;
-    softmax_device(n, input + b*offset, temp, output + b*offset);
+    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; // flat thread index over a 2D grid of 1D blocks
+    if (id >= spatial*batch*groups) return; // surplus threads exit
+    int s = id % spatial; // spatial position (fastest-varying part of id)
+    id = id / spatial;
+    int g = id % groups; // group index
+    int b = id / groups; // batch index
+    int goff = group_offset[g]*spatial; // element offset of group g (group_offset[g] scaled by spatial)
+    int boff = b*stride; // element offset of batch item b
+    softmax_device(input + goff + boff + s, group_size[g], temp, spatial, output + goff + boff + s); // softmax over group_size[g] values spaced `spatial` apart
+}
+
+extern "C" void softmax_tree(float *input, int spatial, int batch, int stride, float temp, float *output, tree hier) // host-side launcher: per-group softmax using the group tables in hier
+{
+    int *tree_groups_size = cuda_make_int_array(hier.group_size, hier.groups); // presumably a device copy of hier.group_size, made on every call - confirm against cuda_make_int_array
+    int *tree_groups_offset = cuda_make_int_array(hier.group_offset, hier.groups); // same for hier.group_offset
+    /*
+       static int *tree_groups_size = 0;
+       static int *tree_groups_offset = 0;
+       if(!tree_groups_size){
+       tree_groups_size = cuda_make_int_array(hier.group_size, hier.groups);
+       tree_groups_offset = cuda_make_int_array(hier.group_offset, hier.groups);
+       }
+     */
+    int num = spatial*batch*hier.groups; // total threads needed: one per (batch item, group, spatial position)
+    softmax_tree_kernel<<<cuda_gridsize(num), BLOCK>>>(input, spatial, batch, stride, temp, output, hier.groups, tree_groups_size, tree_groups_offset);
+    check_error(cudaPeekAtLastError()); // hand the last CUDA error status to check_error
+    cuda_free((float *)tree_groups_size); // release the per-call arrays; cast because cuda_free is called with float * here
+    cuda_free((float *)tree_groups_offset);
+}
+
+__global__ void softmax_kernel(float *input, int n, int batch, int batch_offset, int groups, int group_offset, int stride, float temp, float *output) // one thread per (batch item, group) pair
+{
+    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; // flat thread index over a 2D grid of 1D blocks
+    if (id >= batch*groups) return; // surplus threads exit
+    int b = id / groups; // batch index
+    int g = id % groups; // group index within the batch item
+    softmax_device(input + b*batch_offset + g*group_offset, n, temp, stride, output + b*batch_offset + g*group_offset); // n-way softmax starting at the (b, g) offset, elements `stride` apart
 }
 
-extern "C" void softmax_gpu(float *input, int n, int offset, int groups, float temp, float *output)
+extern "C" void softmax_gpu(float *input, int n, int batch, int batch_offset, int groups, int group_offset, int stride, float temp, float *output) // host-side launcher for softmax_kernel
+{
+    softmax_kernel<<<cuda_gridsize(batch*groups), BLOCK>>>(input, n, batch, batch_offset, groups, group_offset, stride, temp, output); // batch*groups threads requested, BLOCK threads per block
+    check_error(cudaPeekAtLastError()); // hand the last CUDA error status to check_error
+}
+
+
+__global__ void upsample_kernel(size_t N, float *x, int w, int h, int c, int batch, int stride, int forward, float scale, float *out) // one thread per element of the upsampled tensor (w*stride wide, h*stride high)
+{
+    size_t i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; // flat thread index over a 2D grid of 1D blocks
+    if(i >= N) return; // surplus threads exit
+    int out_index = i; // NOTE(review): narrows size_t to int; assumes N fits in an int
+    int out_w = i%(w*stride); // peel i apart into (b, out_c, out_h, out_w), innermost dimension first
+    i = i/(w*stride);
+    int out_h = i%(h*stride);
+    i = i/(h*stride);
+    int out_c = i%c;
+    i = i/c;
+    int b = i%batch;
+
+    int in_w = out_w / stride; // every stride x stride patch of the output maps to one input pixel
+    int in_h = out_h / stride;
+    int in_c = out_c;
+
+    int in_index = b*w*h*c + in_c*w*h + in_h*w + in_w; // index into the w x h x c x batch tensor x
+
+
+    if(forward) out[out_index] += scale * x[in_index]; // forward != 0: accumulate the scaled input into out
+    else atomicAdd(x+in_index, scale * out[out_index]); // forward == 0: add scaled out into x; atomic because several outputs share one in_index
+}
+extern "C" void upsample_gpu(float *in, int w, int h, int c, int batch, int stride, int forward, float scale, float *out) // host-side launcher for upsample_kernel
 {
-    int inputs = n;
-    int batch = groups;
-    softmax_kernel<<<cuda_gridsize(batch), BLOCK>>>(inputs, offset, batch, input, temp, output);
+    size_t size = w*h*c*batch*stride*stride; // element count of the upsampled tensor; NOTE(review): product is evaluated in int before widening to size_t
+    upsample_kernel<<<cuda_gridsize(size), BLOCK>>>(size, in, w, h, c, batch, stride, forward, scale, out); // one thread per upsampled element, BLOCK threads per block
     check_error(cudaPeekAtLastError());
 }
diff --git a/image.darknet/src/box.c b/image.darknet/src/box.c
index 39dea06..8a1772c 100644
--- a/image.darknet/src/box.c
+++ b/image.darknet/src/box.c
@@ -3,13 +3,98 @@
 #include <math.h>
 #include <stdlib.h>
 
-box float_to_box(float *f)
+int nms_comparator(const void *pa, const void *pb) // qsort comparator over detection structs; orders higher scores first
 {
-    box b;
+    detection a = *(detection *)pa; // compare by-value copies of the two detections
+    detection b = *(detection *)pb;
+    float diff = 0;
+    if(b.sort_class >= 0){ // the key is chosen by b's sort_class; callers in this file set sort_class on every element before sorting
+        diff = a.prob[b.sort_class] - b.prob[b.sort_class]; // compare the probability of that class
+    } else {
+        diff = a.objectness - b.objectness; // negative sort_class: compare objectness instead
+    }
+    if(diff < 0) return 1; // a scores lower, so a sorts after b (descending order)
+    else if(diff > 0) return -1;
+    return 0;
+}
+
+void do_nms_obj(detection *dets, int total, int classes, float thresh) // non-maximum suppression ranked by objectness; modifies dets in place
+{
+    int i, j, k;
+    k = total-1; // k marks the last slot not yet known to hold a zero-objectness detection
+    for(i = 0; i <= k; ++i){
+        if(dets[i].objectness == 0){ // move zero-objectness entries to the tail of the array
+            detection swap = dets[i];
+            dets[i] = dets[k];
+            dets[k] = swap;
+            --k;
+            --i; // re-examine slot i, which now holds the element swapped in from the tail
+        }
+    }
+    total = k+1; // from here on only the leading entries with nonzero objectness are considered
+
+    for(i = 0; i < total; ++i){
+        dets[i].sort_class = -1; // negative sort_class makes nms_comparator order by objectness
+    }
+
+    qsort(dets, total, sizeof(detection), nms_comparator); // highest objectness first
+    for(i = 0; i < total; ++i){
+        if(dets[i].objectness == 0) continue; // already suppressed by an earlier box
+        box a = dets[i].bbox;
+        for(j = i+1; j < total; ++j){
+            if(dets[j].objectness == 0) continue;
+            box b = dets[j].bbox;
+            if (box_iou(a, b) > thresh){ // lower-ranked box overlaps a beyond thresh: suppress it
+                dets[j].objectness = 0;
+                for(k = 0; k < classes; ++k){ // k is reused here as the class index
+                    dets[j].prob[k] = 0; // clear every class probability of the suppressed box
+                }
+            }
+        }
+    }
+}
+
+
+void do_nms_sort(detection *dets, int total, int classes, float thresh) // per-class non-maximum suppression; modifies dets in place
+{
+    int i, j, k;
+    k = total-1; // k marks the last slot not yet known to hold a zero-objectness detection
+    for(i = 0; i <= k; ++i){
+        if(dets[i].objectness == 0){ // move zero-objectness entries to the tail of the array
+            detection swap = dets[i];
+            dets[i] = dets[k];
+            dets[k] = swap;
+            --k;
+            --i; // re-examine slot i, which now holds the element swapped in from the tail
+        }
+    }
+    total = k+1; // from here on only the leading entries with nonzero objectness are considered
+
+    for(k = 0; k < classes; ++k){ // k is reused as the class index: one sort + suppression pass per class
+        for(i = 0; i < total; ++i){
+            dets[i].sort_class = k; // tell nms_comparator to order by prob[k]
+        }
+        qsort(dets, total, sizeof(detection), nms_comparator); // highest prob[k] first
+        for(i = 0; i < total; ++i){
+            if(dets[i].prob[k] == 0) continue; // nothing to suppress with: class k is absent or already suppressed here
+            box a = dets[i].bbox;
+            for(j = i+1; j < total; ++j){
+                box b = dets[j].bbox;
+                if (box_iou(a, b) > thresh){ // lower-ranked box overlaps a beyond thresh
+                    dets[j].prob[k] = 0; // suppress only class k; other classes and objectness are left as they are
+                }
+            }
+        }
+    }
+}
+
+box float_to_box(float *f, int stride) // builds a box from four floats (x, y, w, h) located `stride` elements apart, starting at f
+{
+    box b = {0}; // zero-initialise the whole struct before the four fields are assigned
     b.x = f[0];
-    b.y = f[1];
-    b.w = f[2];
-    b.h = f[3];
+    b.y = f[1*stride]; // y is one stride after x
+    b.w = f[2*stride]; // w is two strides after x
+    b.h = f[3*stride]; // h is three strides after x
     return b;
 }
 
@@ -230,79 +315,6 @@ dbox diou(box a, box b)
     return dd;
 }
 
-typedef struct{
-    int index;
-    int class;
-    float **probs;
-} sortable_bbox;
-
-int nms_comparator(const void *pa, const void *pb)
-{
-    sortable_bbox a = *(sortable_bbox *)pa;
-    sortable_bbox b = *(sortable_bbox *)pb;
-    float diff = a.probs[a.index][b.class] - b.probs[b.index][b.class];
-    if(diff < 0) return 1;
-    else if(diff > 0) return -1;
-    return 0;
-}
-
-void do_nms_obj(box *boxes, float **probs, int total, int classes, float thresh)
-{
-    int i, j, k;
-    sortable_bbox *s = calloc(total, sizeof(sortable_bbox));
-
-    for(i = 0; i < total; ++i){
-        s[i].index = i;       
-        s[i].class = classes;
-        s[i].probs = probs;
-    }
-
-    qsort(s, total, sizeof(sortable_bbox), nms_comparator);
-    for(i = 0; i < total; ++i){
-        if(probs[s[i].index][classes] == 0) continue;
-        box a = boxes[s[i].index];
-        for(j = i+1; j < total; ++j){
-            box b = boxes[s[j].index];
-            if (box_iou(a, b) > thresh){
-                for(k = 0; k < classes+1; ++k){
-                    probs[s[j].index][k] = 0;
-                }
-            }
-        }
-    }
-    free(s);
-}
-
-
-void do_nms_sort(box *boxes, float **probs, int total, int classes, float thresh)
-{
-    int i, j, k;
-    sortable_bbox *s = calloc(total, sizeof(sortable_bbox));
-
-    for(i = 0; i < total; ++i){
-        s[i].index = i;       
-        s[i].class = 0;
-        s[i].probs = probs;
-    }
-
-    for(k = 0; k < classes; ++k){
-        for(i = 0; i < total; ++i){
-            s[i].class = k;
-        }
-        qsort(s, total, sizeof(sortable_bbox), nms_comparator);
-        for(i = 0; i < total; ++i){
-            if(probs[s[i].index][k] == 0) continue;
-            box a = boxes[s[i].index];
-            for(j = i+1; j < total; ++j){
-                box b = boxes[s[j].index];
-                if (box_iou(a, b) > thresh){
-                    probs[s[j].index][k] = 0;
-                }
-            }
-        }
-    }
-    free(s);
-}
 
 void do_nms(box *boxes, float **probs, int total, int classes, float thresh)
 {
diff --git a/image.darknet/src/box.h b/image.darknet/src/box.h
index c65589b..dda3e59 100644
--- a/image.darknet/src/box.h
+++ b/image.darknet/src/box.h
@@ -1,21 +1,13 @@
 #ifndef BOX_H
 #define BOX_H
-
-typedef struct{
-    float x, y, w, h;
-} box;
+#include "darknet.h"
 
 typedef struct{
     float dx, dy, dw, dh;
 } dbox;
 
-box float_to_box(float *f);
-float box_iou(box a, box b);
 float box_rmse(box a, box b);
 dbox diou(box a, box b);
-void do_nms(box *boxes, float **probs, int total, int classes, float thresh);
-void do_nms_sort(box *boxes, float **probs, int total, int classes, float thresh);
-void do_nms_obj(box *boxes, float **probs, int total, int classes, float thresh);
 box decode_box(box b, box anchor);
 box encode_box(box b, box anchor);
 
diff --git a/image.darknet/src/captcha.c b/image.darknet/src/captcha.c
deleted file mode 100644
index 3d449b2..0000000
--- a/image.darknet/src/captcha.c
+++ /dev/null
@@ -1,364 +0,0 @@
-#include "network.h"
-#include "utils.h"
-#include "parser.h"
-
-void fix_data_captcha(data d, int mask)
-{
-    matrix labels = d.y;
-    int i, j;
-    for(i = 0; i < d.y.rows; ++i){
-        for(j = 0; j < d.y.cols; j += 2){
-            if (mask){
-                if(!labels.vals[i][j]){
-                    labels.vals[i][j] = SECRET_NUM;
-                    labels.vals[i][j+1] = SECRET_NUM;
-                }else if(labels.vals[i][j+1]){
-                    labels.vals[i][j] = 0;
-                }
-            } else{
-                if (labels.vals[i][j]) {
-                    labels.vals[i][j+1] = 0;
-                } else {
-                    labels.vals[i][j+1] = 1;
-                }
-            }
-        }
-    }
-}
-
-void train_captcha(char *cfgfile, char *weightfile)
-{
-    srand(time(0));
-    float avg_loss = -1;
-    char *base = basecfg(cfgfile);
-    printf("%s\n", base);
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
-    int imgs = 1024;
-    int i = *net.seen/imgs;
-    int solved = 1;
-    list *plist;
-    char **labels = get_labels("/data/captcha/reimgs.labels.list");
-    if (solved){
-        plist = get_paths("/data/captcha/reimgs.solved.list");
-    }else{
-        plist = get_paths("/data/captcha/reimgs.raw.list");
-    }
-    char **paths = (char **)list_to_array(plist);
-    printf("%d\n", plist->size);
-    clock_t time;
-    pthread_t load_thread;
-    data train;
-    data buffer;
-
-    load_args args = {0};
-    args.w = net.w;
-    args.h = net.h;
-    args.paths = paths;
-    args.classes = 26;
-    args.n = imgs;
-    args.m = plist->size;
-    args.labels = labels;
-    args.d = &buffer;
-    args.type = CLASSIFICATION_DATA;
-
-    load_thread = load_data_in_thread(args);
-    while(1){
-        ++i;
-        time=clock();
-        pthread_join(load_thread, 0);
-        train = buffer;
-        fix_data_captcha(train, solved);
-
-        /*
-           image im = float_to_image(256, 256, 3, train.X.vals[114]);
-           show_image(im, "training");
-           cvWaitKey(0);
-         */
-
-        load_thread = load_data_in_thread(args);
-        printf("Loaded: %lf seconds\n", sec(clock()-time));
-        time=clock();
-        float loss = train_network(net, train);
-        if(avg_loss == -1) avg_loss = loss;
-        avg_loss = avg_loss*.9 + loss*.1;
-        printf("%d: %f, %f avg, %lf seconds, %d images\n", i, loss, avg_loss, sec(clock()-time), *net.seen);
-        free_data(train);
-        if(i%100==0){
-            char buff[256];
-            sprintf(buff, "/home/pjreddie/imagenet_backup/%s_%d.weights",base, i);
-            save_weights(net, buff);
-        }
-    }
-}
-
-void test_captcha(char *cfgfile, char *weightfile, char *filename)
-{
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    set_batch_network(&net, 1);
-    srand(2222222);
-    int i = 0;
-    char **names = get_labels("/data/captcha/reimgs.labels.list");
-    char buff[256];
-    char *input = buff;
-    int indexes[26];
-    while(1){
-        if(filename){
-            strncpy(input, filename, 256);
-        }else{
-            //printf("Enter Image Path: ");
-            //fflush(stdout);
-            input = fgets(input, 256, stdin);
-            if(!input) return;
-            strtok(input, "\n");
-        }
-        image im = load_image_color(input, net.w, net.h);
-        float *X = im.data;
-        float *predictions = network_predict(net, X);
-        top_predictions(net, 26, indexes);
-        //printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time));
-        for(i = 0; i < 26; ++i){
-            int index = indexes[i];
-            if(i != 0) printf(", ");
-            printf("%s %f", names[index], predictions[index]);
-        }
-        printf("\n");
-        fflush(stdout);
-        free_image(im);
-        if (filename) break;
-    }
-}
-
-void valid_captcha(char *cfgfile, char *weightfile, char *filename)
-{
-    char **labels = get_labels("/data/captcha/reimgs.labels.list");
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    list *plist = get_paths("/data/captcha/reimgs.fg.list");
-    char **paths = (char **)list_to_array(plist);
-    int N = plist->size;
-    int outputs = net.outputs;
-
-    set_batch_network(&net, 1);
-    srand(2222222);
-    int i, j;
-    for(i = 0; i < N; ++i){
-        if (i%100 == 0) fprintf(stderr, "%d\n", i);
-        image im = load_image_color(paths[i], net.w, net.h);
-        float *X = im.data;
-        float *predictions = network_predict(net, X);
-        //printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time));
-        int truth = -1;
-        for(j = 0; j < 13; ++j){
-            if (strstr(paths[i], labels[j])) truth = j;
-        }
-        if (truth == -1){
-            fprintf(stderr, "bad: %s\n", paths[i]);
-            return;
-        }
-        printf("%d, ", truth);
-        for(j = 0; j < outputs; ++j){
-            if (j != 0) printf(", ");
-            printf("%f", predictions[j]);
-        }
-        printf("\n");
-        fflush(stdout);
-        free_image(im);
-        if (filename) break;
-    }
-}
-
-/*
-   void train_captcha(char *cfgfile, char *weightfile)
-   {
-   float avg_loss = -1;
-   srand(time(0));
-   char *base = basecfg(cfgfile);
-   printf("%s\n", base);
-   network net = parse_network_cfg(cfgfile);
-   if(weightfile){
-   load_weights(&net, weightfile);
-   }
-   printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
-   int imgs = 1024;
-   int i = net.seen/imgs;
-   list *plist = get_paths("/data/captcha/train.auto5");
-   char **paths = (char **)list_to_array(plist);
-   printf("%d\n", plist->size);
-   clock_t time;
-   while(1){
-   ++i;
-   time=clock();
-   data train = load_data_captcha(paths, imgs, plist->size, 10, 200, 60);
-   translate_data_rows(train, -128);
-   scale_data_rows(train, 1./128);
-   printf("Loaded: %lf seconds\n", sec(clock()-time));
-   time=clock();
-   float loss = train_network(net, train);
-   net.seen += imgs;
-   if(avg_loss == -1) avg_loss = loss;
-   avg_loss = avg_loss*.9 + loss*.1;
-   printf("%d: %f, %f avg, %lf seconds, %d images\n", i, loss, avg_loss, sec(clock()-time), net.seen);
-   free_data(train);
-   if(i%10==0){
-   char buff[256];
-   sprintf(buff, "/home/pjreddie/imagenet_backup/%s_%d.weights",base, i);
-   save_weights(net, buff);
-   }
-   }
-   }
-
-   void decode_captcha(char *cfgfile, char *weightfile)
-   {
-   setbuf(stdout, NULL);
-   srand(time(0));
-   network net = parse_network_cfg(cfgfile);
-   set_batch_network(&net, 1);
-   if(weightfile){
-   load_weights(&net, weightfile);
-   }
-   char filename[256];
-   while(1){
-   printf("Enter filename: ");
-   fgets(filename, 256, stdin);
-   strtok(filename, "\n");
-   image im = load_image_color(filename, 300, 57);
-   scale_image(im, 1./255.);
-   float *X = im.data;
-   float *predictions = network_predict(net, X);
-   image out  = float_to_image(300, 57, 1, predictions);
-   show_image(out, "decoded");
-#ifdef OPENCV
-cvWaitKey(0);
-#endif
-free_image(im);
-}
-}
-
-void encode_captcha(char *cfgfile, char *weightfile)
-{
-float avg_loss = -1;
-srand(time(0));
-char *base = basecfg(cfgfile);
-printf("%s\n", base);
-network net = parse_network_cfg(cfgfile);
-if(weightfile){
-    load_weights(&net, weightfile);
-}
-printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
-int imgs = 1024;
-int i = net.seen/imgs;
-list *plist = get_paths("/data/captcha/encode.list");
-char **paths = (char **)list_to_array(plist);
-printf("%d\n", plist->size);
-clock_t time;
-while(1){
-    ++i;
-    time=clock();
-    data train = load_data_captcha_encode(paths, imgs, plist->size, 300, 57);
-    scale_data_rows(train, 1./255);
-    printf("Loaded: %lf seconds\n", sec(clock()-time));
-    time=clock();
-    float loss = train_network(net, train);
-    net.seen += imgs;
-    if(avg_loss == -1) avg_loss = loss;
-    avg_loss = avg_loss*.9 + loss*.1;
-    printf("%d: %f, %f avg, %lf seconds, %d images\n", i, loss, avg_loss, sec(clock()-time), net.seen);
-    free_matrix(train.X);
-    if(i%100==0){
-        char buff[256];
-        sprintf(buff, "/home/pjreddie/imagenet_backup/%s_%d.weights",base, i);
-        save_weights(net, buff);
-    }
-}
-}
-
-void validate_captcha(char *cfgfile, char *weightfile)
-{
-    srand(time(0));
-    char *base = basecfg(cfgfile);
-    printf("%s\n", base);
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    int numchars = 37;
-    list *plist = get_paths("/data/captcha/solved.hard");
-    char **paths = (char **)list_to_array(plist);
-    int imgs = plist->size;
-    data valid = load_data_captcha(paths, imgs, 0, 10, 200, 60);
-    translate_data_rows(valid, -128);
-    scale_data_rows(valid, 1./128);
-    matrix pred = network_predict_data(net, valid);
-    int i, k;
-    int correct = 0;
-    int total = 0;
-    int accuracy = 0;
-    for(i = 0; i < imgs; ++i){
-        int allcorrect = 1;
-        for(k = 0; k < 10; ++k){
-            char truth = int_to_alphanum(max_index(valid.y.vals[i]+k*numchars, numchars));
-            char prediction = int_to_alphanum(max_index(pred.vals[i]+k*numchars, numchars));
-            if (truth != prediction) allcorrect=0;
-            if (truth != '.' && truth == prediction) ++correct;
-            if (truth != '.' || truth != prediction) ++total;
-        }
-        accuracy += allcorrect;
-    }
-    printf("Word Accuracy: %f, Char Accuracy %f\n", (float)accuracy/imgs, (float)correct/total);
-    free_data(valid);
-}
-
-void test_captcha(char *cfgfile, char *weightfile)
-{
-    setbuf(stdout, NULL);
-    srand(time(0));
-    //char *base = basecfg(cfgfile);
-    //printf("%s\n", base);
-    network net = parse_network_cfg(cfgfile);
-    set_batch_network(&net, 1);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    char filename[256];
-    while(1){
-        //printf("Enter filename: ");
-        fgets(filename, 256, stdin);
-        strtok(filename, "\n");
-        image im = load_image_color(filename, 200, 60);
-        translate_image(im, -128);
-        scale_image(im, 1/128.);
-        float *X = im.data;
-        float *predictions = network_predict(net, X);
-        print_letters(predictions, 10);
-        free_image(im);
-    }
-}
-    */
-void run_captcha(int argc, char **argv)
-{
-    if(argc < 4){
-        fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]);
-        return;
-    }
-
-    char *cfg = argv[3];
-    char *weights = (argc > 4) ? argv[4] : 0;
-    char *filename = (argc > 5) ? argv[5]: 0;
-    if(0==strcmp(argv[2], "train")) train_captcha(cfg, weights);
-    else if(0==strcmp(argv[2], "test")) test_captcha(cfg, weights, filename);
-    else if(0==strcmp(argv[2], "valid")) valid_captcha(cfg, weights, filename);
-    //if(0==strcmp(argv[2], "test")) test_captcha(cfg, weights);
-    //else if(0==strcmp(argv[2], "encode")) encode_captcha(cfg, weights);
-    //else if(0==strcmp(argv[2], "decode")) decode_captcha(cfg, weights);
-    //else if(0==strcmp(argv[2], "valid")) validate_captcha(cfg, weights);
-}
-
diff --git a/image.darknet/src/classifier.c b/image.darknet/src/classifier.c
deleted file mode 100644
index 586530a..0000000
--- a/image.darknet/src/classifier.c
+++ /dev/null
@@ -1,1167 +0,0 @@
-#include "network.h"
-#include "utils.h"
-#include "parser.h"
-#include "option_list.h"
-#include "blas.h"
-#include "assert.h"
-#include "classifier.h"
-#include "cuda.h"
-#include <sys/time.h>
-
-#ifdef OPENCV
-#include "opencv2/highgui/highgui_c.h"
-image get_image_from_stream(CvCapture *cap);
-#endif
-
-float *get_regression_values(char **labels, int n)
-{
-    float *v = calloc(n, sizeof(float));
-    int i;
-    for(i = 0; i < n; ++i){
-        char *p = strchr(labels[i], ' ');
-        *p = 0;
-        v[i] = atof(p+1);
-    }
-    return v;
-}
-
-void train_classifier(char *datacfg, char *cfgfile, char *weightfile, int *gpus, int ngpus, int clear)
-{
-    int i;
-
-    float avg_loss = -1;
-    char *base = basecfg(cfgfile);
-    printf("%s\n", base);
-    printf("%d\n", ngpus);
-    network *nets = calloc(ngpus, sizeof(network));
-
-    srand(time(0));
-    int seed = rand();
-    for(i = 0; i < ngpus; ++i){
-        srand(seed);
-#ifdef GPU
-        cuda_set_device(gpus[i]);
-#endif
-        nets[i] = parse_network_cfg(cfgfile);
-        if(weightfile){
-            load_weights(&nets[i], weightfile);
-        }
-        if(clear) *nets[i].seen = 0;
-        nets[i].learning_rate *= ngpus;
-    }
-    srand(time(0));
-    network net = nets[0];
-
-    int imgs = net.batch * net.subdivisions * ngpus;
-
-    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
-    list *options = read_data_cfg(datacfg);
-
-    char *backup_directory = option_find_str(options, "backup", "/backup/");
-    char *label_list = option_find_str(options, "labels", "data/labels.list");
-    char *train_list = option_find_str(options, "train", "data/train.list");
-    int classes = option_find_int(options, "classes", 2);
-
-    char **labels = get_labels(label_list);
-    list *plist = get_paths(train_list);
-    char **paths = (char **)list_to_array(plist);
-    printf("%d\n", plist->size);
-    int N = plist->size;
-    clock_t time;
-
-    load_args args = {0};
-    args.w = net.w;
-    args.h = net.h;
-    args.threads = 32;
-    args.hierarchy = net.hierarchy;
-
-    args.min = net.min_crop;
-    args.max = net.max_crop;
-    args.angle = net.angle;
-    args.aspect = net.aspect;
-    args.exposure = net.exposure;
-    args.saturation = net.saturation;
-    args.hue = net.hue;
-    args.size = net.w;
-
-    args.paths = paths;
-    args.classes = classes;
-    args.n = imgs;
-    args.m = N;
-    args.labels = labels;
-    args.type = CLASSIFICATION_DATA;
-
-    data train;
-    data buffer;
-    pthread_t load_thread;
-    args.d = &buffer;
-    load_thread = load_data(args);
-
-    int epoch = (*net.seen)/N;
-    while(get_current_batch(net) < net.max_batches || net.max_batches == 0){
-        time=clock();
-
-        pthread_join(load_thread, 0);
-        train = buffer;
-        load_thread = load_data(args);
-
-        printf("Loaded: %lf seconds\n", sec(clock()-time));
-        time=clock();
-
-        float loss = 0;
-#ifdef GPU
-        if(ngpus == 1){
-            loss = train_network(net, train);
-        } else {
-            loss = train_networks(nets, ngpus, train, 4);
-        }
-#else
-        loss = train_network(net, train);
-#endif
-        if(avg_loss == -1) avg_loss = loss;
-        avg_loss = avg_loss*.9 + loss*.1;
-        printf("%d, %.3f: %f, %f avg, %f rate, %lf seconds, %d images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen);
-        free_data(train);
-        if(*net.seen/N > epoch){
-            epoch = *net.seen/N;
-            char buff[256];
-            sprintf(buff, "%s/%s_%d.weights",backup_directory,base, epoch);
-            save_weights(net, buff);
-        }
-        if(get_current_batch(net)%100 == 0){
-            char buff[256];
-            sprintf(buff, "%s/%s.backup",backup_directory,base);
-            save_weights(net, buff);
-        }
-    }
-    char buff[256];
-    sprintf(buff, "%s/%s.weights", backup_directory, base);
-    save_weights(net, buff);
-
-    free_network(net);
-    free_ptrs((void**)labels, classes);
-    free_ptrs((void**)paths, plist->size);
-    free_list(plist);
-    free(base);
-}
-
-
-/*
-   void train_classifier(char *datacfg, char *cfgfile, char *weightfile, int clear)
-   {
-   srand(time(0));
-   float avg_loss = -1;
-   char *base = basecfg(cfgfile);
-   printf("%s\n", base);
-   network net = parse_network_cfg(cfgfile);
-   if(weightfile){
-   load_weights(&net, weightfile);
-   }
-   if(clear) *net.seen = 0;
-
-   int imgs = net.batch * net.subdivisions;
-
-   printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
-   list *options = read_data_cfg(datacfg);
-
-   char *backup_directory = option_find_str(options, "backup", "/backup/");
-   char *label_list = option_find_str(options, "labels", "data/labels.list");
-   char *train_list = option_find_str(options, "train", "data/train.list");
-   int classes = option_find_int(options, "classes", 2);
-
-   char **labels = get_labels(label_list);
-   list *plist = get_paths(train_list);
-   char **paths = (char **)list_to_array(plist);
-   printf("%d\n", plist->size);
-   int N = plist->size;
-   clock_t time;
-
-   load_args args = {0};
-   args.w = net.w;
-   args.h = net.h;
-   args.threads = 8;
-
-   args.min = net.min_crop;
-   args.max = net.max_crop;
-   args.angle = net.angle;
-   args.aspect = net.aspect;
-   args.exposure = net.exposure;
-   args.saturation = net.saturation;
-   args.hue = net.hue;
-   args.size = net.w;
-   args.hierarchy = net.hierarchy;
-
-   args.paths = paths;
-   args.classes = classes;
-   args.n = imgs;
-   args.m = N;
-   args.labels = labels;
-   args.type = CLASSIFICATION_DATA;
-
-   data train;
-   data buffer;
-   pthread_t load_thread;
-   args.d = &buffer;
-   load_thread = load_data(args);
-
-   int epoch = (*net.seen)/N;
-   while(get_current_batch(net) < net.max_batches || net.max_batches == 0){
-   time=clock();
-
-   pthread_join(load_thread, 0);
-   train = buffer;
-   load_thread = load_data(args);
-
-   printf("Loaded: %lf seconds\n", sec(clock()-time));
-   time=clock();
-
-#ifdef OPENCV
-if(0){
-int u;
-for(u = 0; u < imgs; ++u){
-    image im = float_to_image(net.w, net.h, 3, train.X.vals[u]);
-    show_image(im, "loaded");
-    cvWaitKey(0);
-}
-}
-#endif
-
-float loss = train_network(net, train);
-free_data(train);
-
-if(avg_loss == -1) avg_loss = loss;
-avg_loss = avg_loss*.9 + loss*.1;
-printf("%d, %.3f: %f, %f avg, %f rate, %lf seconds, %d images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen);
-if(*net.seen/N > epoch){
-    epoch = *net.seen/N;
-    char buff[256];
-    sprintf(buff, "%s/%s_%d.weights",backup_directory,base, epoch);
-    save_weights(net, buff);
-}
-if(get_current_batch(net)%100 == 0){
-    char buff[256];
-    sprintf(buff, "%s/%s.backup",backup_directory,base);
-    save_weights(net, buff);
-}
-}
-char buff[256];
-sprintf(buff, "%s/%s.weights", backup_directory, base);
-save_weights(net, buff);
-
-free_network(net);
-free_ptrs((void**)labels, classes);
-free_ptrs((void**)paths, plist->size);
-free_list(plist);
-free(base);
-}
-*/
-
-void validate_classifier_crop(char *datacfg, char *filename, char *weightfile)
-{
-    int i = 0;
-    network net = parse_network_cfg(filename);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    srand(time(0));
-
-    list *options = read_data_cfg(datacfg);
-
-    char *label_list = option_find_str(options, "labels", "data/labels.list");
-    char *valid_list = option_find_str(options, "valid", "data/train.list");
-    int classes = option_find_int(options, "classes", 2);
-    int topk = option_find_int(options, "top", 1);
-
-    char **labels = get_labels(label_list);
-    list *plist = get_paths(valid_list);
-
-    char **paths = (char **)list_to_array(plist);
-    int m = plist->size;
-    free_list(plist);
-
-    clock_t time;
-    float avg_acc = 0;
-    float avg_topk = 0;
-    int splits = m/1000;
-    int num = (i+1)*m/splits - i*m/splits;
-
-    data val, buffer;
-
-    load_args args = {0};
-    args.w = net.w;
-    args.h = net.h;
-
-    args.paths = paths;
-    args.classes = classes;
-    args.n = num;
-    args.m = 0;
-    args.labels = labels;
-    args.d = &buffer;
-    args.type = OLD_CLASSIFICATION_DATA;
-
-    pthread_t load_thread = load_data_in_thread(args);
-    for(i = 1; i <= splits; ++i){
-        time=clock();
-
-        pthread_join(load_thread, 0);
-        val = buffer;
-
-        num = (i+1)*m/splits - i*m/splits;
-        char **part = paths+(i*m/splits);
-        if(i != splits){
-            args.paths = part;
-            load_thread = load_data_in_thread(args);
-        }
-        printf("Loaded: %d images in %lf seconds\n", val.X.rows, sec(clock()-time));
-
-        time=clock();
-        float *acc = network_accuracies(net, val, topk);
-        avg_acc += acc[0];
-        avg_topk += acc[1];
-        printf("%d: top 1: %f, top %d: %f, %lf seconds, %d images\n", i, avg_acc/i, topk, avg_topk/i, sec(clock()-time), val.X.rows);
-        free_data(val);
-    }
-}
-
-void validate_classifier_10(char *datacfg, char *filename, char *weightfile)
-{
-    int i, j;
-    network net = parse_network_cfg(filename);
-    set_batch_network(&net, 1);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    srand(time(0));
-
-    list *options = read_data_cfg(datacfg);
-
-    char *label_list = option_find_str(options, "labels", "data/labels.list");
-    char *valid_list = option_find_str(options, "valid", "data/train.list");
-    int classes = option_find_int(options, "classes", 2);
-    int topk = option_find_int(options, "top", 1);
-
-    char **labels = get_labels(label_list);
-    list *plist = get_paths(valid_list);
-
-    char **paths = (char **)list_to_array(plist);
-    int m = plist->size;
-    free_list(plist);
-
-    float avg_acc = 0;
-    float avg_topk = 0;
-    int *indexes = calloc(topk, sizeof(int));
-
-    for(i = 0; i < m; ++i){
-        int class = -1;
-        char *path = paths[i];
-        for(j = 0; j < classes; ++j){
-            if(strstr(path, labels[j])){
-                class = j;
-                break;
-            }
-        }
-        int w = net.w;
-        int h = net.h;
-        int shift = 32;
-        image im = load_image_color(paths[i], w+shift, h+shift);
-        image images[10];
-        images[0] = crop_image(im, -shift, -shift, w, h);
-        images[1] = crop_image(im, shift, -shift, w, h);
-        images[2] = crop_image(im, 0, 0, w, h);
-        images[3] = crop_image(im, -shift, shift, w, h);
-        images[4] = crop_image(im, shift, shift, w, h);
-        flip_image(im);
-        images[5] = crop_image(im, -shift, -shift, w, h);
-        images[6] = crop_image(im, shift, -shift, w, h);
-        images[7] = crop_image(im, 0, 0, w, h);
-        images[8] = crop_image(im, -shift, shift, w, h);
-        images[9] = crop_image(im, shift, shift, w, h);
-        float *pred = calloc(classes, sizeof(float));
-        for(j = 0; j < 10; ++j){
-            float *p = network_predict(net, images[j].data);
-            if(net.hierarchy) hierarchy_predictions(p, net.outputs, net.hierarchy, 1);
-            axpy_cpu(classes, 1, p, 1, pred, 1);
-            free_image(images[j]);
-        }
-        free_image(im);
-        top_k(pred, classes, topk, indexes);
-        free(pred);
-        if(indexes[0] == class) avg_acc += 1;
-        for(j = 0; j < topk; ++j){
-            if(indexes[j] == class) avg_topk += 1;
-        }
-
-        printf("%d: top 1: %f, top %d: %f\n", i, avg_acc/(i+1), topk, avg_topk/(i+1));
-    }
-}
-
-void validate_classifier_full(char *datacfg, char *filename, char *weightfile)
-{
-    int i, j;
-    network net = parse_network_cfg(filename);
-    set_batch_network(&net, 1);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    srand(time(0));
-
-    list *options = read_data_cfg(datacfg);
-
-    char *label_list = option_find_str(options, "labels", "data/labels.list");
-    char *valid_list = option_find_str(options, "valid", "data/train.list");
-    int classes = option_find_int(options, "classes", 2);
-    int topk = option_find_int(options, "top", 1);
-
-    char **labels = get_labels(label_list);
-    list *plist = get_paths(valid_list);
-
-    char **paths = (char **)list_to_array(plist);
-    int m = plist->size;
-    free_list(plist);
-
-    float avg_acc = 0;
-    float avg_topk = 0;
-    int *indexes = calloc(topk, sizeof(int));
-
-    int size = net.w;
-    for(i = 0; i < m; ++i){
-        int class = -1;
-        char *path = paths[i];
-        for(j = 0; j < classes; ++j){
-            if(strstr(path, labels[j])){
-                class = j;
-                break;
-            }
-        }
-        image im = load_image_color(paths[i], 0, 0);
-        image resized = resize_min(im, size);
-        resize_network(&net, resized.w, resized.h);
-        //show_image(im, "orig");
-        //show_image(crop, "cropped");
-        //cvWaitKey(0);
-        float *pred = network_predict(net, resized.data);
-        if(net.hierarchy) hierarchy_predictions(pred, net.outputs, net.hierarchy, 1);
-
-        free_image(im);
-        free_image(resized);
-        top_k(pred, classes, topk, indexes);
-
-        if(indexes[0] == class) avg_acc += 1;
-        for(j = 0; j < topk; ++j){
-            if(indexes[j] == class) avg_topk += 1;
-        }
-
-        printf("%d: top 1: %f, top %d: %f\n", i, avg_acc/(i+1), topk, avg_topk/(i+1));
-    }
-}
-
-
-void validate_classifier_single(char *datacfg, char *filename, char *weightfile)
-{
-    int i, j;
-    network net = parse_network_cfg(filename);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    set_batch_network(&net, 1);
-    srand(time(0));
-
-    list *options = read_data_cfg(datacfg);
-
-    char *label_list = option_find_str(options, "labels", "data/labels.list");
-    char *leaf_list = option_find_str(options, "leaves", 0);
-    if(leaf_list) change_leaves(net.hierarchy, leaf_list);
-    char *valid_list = option_find_str(options, "valid", "data/train.list");
-    int classes = option_find_int(options, "classes", 2);
-    int topk = option_find_int(options, "top", 1);
-
-    char **labels = get_labels(label_list);
-    list *plist = get_paths(valid_list);
-
-    char **paths = (char **)list_to_array(plist);
-    int m = plist->size;
-    free_list(plist);
-
-    float avg_acc = 0;
-    float avg_topk = 0;
-    int *indexes = calloc(topk, sizeof(int));
-
-    for(i = 0; i < m; ++i){
-        int class = -1;
-        char *path = paths[i];
-        for(j = 0; j < classes; ++j){
-            if(strstr(path, labels[j])){
-                class = j;
-                break;
-            }
-        }
-        image im = load_image_color(paths[i], 0, 0);
-        image resized = resize_min(im, net.w);
-        image crop = crop_image(resized, (resized.w - net.w)/2, (resized.h - net.h)/2, net.w, net.h);
-        //show_image(im, "orig");
-        //show_image(crop, "cropped");
-        //cvWaitKey(0);
-        float *pred = network_predict(net, crop.data);
-        if(net.hierarchy) hierarchy_predictions(pred, net.outputs, net.hierarchy, 1);
-
-        if(resized.data != im.data) free_image(resized);
-        free_image(im);
-        free_image(crop);
-        top_k(pred, classes, topk, indexes);
-
-        if(indexes[0] == class) avg_acc += 1;
-        for(j = 0; j < topk; ++j){
-            if(indexes[j] == class) avg_topk += 1;
-        }
-
-        printf("%d: top 1: %f, top %d: %f\n", i, avg_acc/(i+1), topk, avg_topk/(i+1));
-    }
-}
-
-void validate_classifier_multi(char *datacfg, char *filename, char *weightfile)
-{
-    int i, j;
-    network net = parse_network_cfg(filename);
-    set_batch_network(&net, 1);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    srand(time(0));
-
-    list *options = read_data_cfg(datacfg);
-
-    char *label_list = option_find_str(options, "labels", "data/labels.list");
-    char *valid_list = option_find_str(options, "valid", "data/train.list");
-    int classes = option_find_int(options, "classes", 2);
-    int topk = option_find_int(options, "top", 1);
-
-    char **labels = get_labels(label_list);
-    list *plist = get_paths(valid_list);
-    int scales[] = {224, 288, 320, 352, 384};
-    int nscales = sizeof(scales)/sizeof(scales[0]);
-
-    char **paths = (char **)list_to_array(plist);
-    int m = plist->size;
-    free_list(plist);
-
-    float avg_acc = 0;
-    float avg_topk = 0;
-    int *indexes = calloc(topk, sizeof(int));
-
-    for(i = 0; i < m; ++i){
-        int class = -1;
-        char *path = paths[i];
-        for(j = 0; j < classes; ++j){
-            if(strstr(path, labels[j])){
-                class = j;
-                break;
-            }
-        }
-        float *pred = calloc(classes, sizeof(float));
-        image im = load_image_color(paths[i], 0, 0);
-        for(j = 0; j < nscales; ++j){
-            image r = resize_min(im, scales[j]);
-            resize_network(&net, r.w, r.h);
-            float *p = network_predict(net, r.data);
-            if(net.hierarchy) hierarchy_predictions(p, net.outputs, net.hierarchy, 1);
-            axpy_cpu(classes, 1, p, 1, pred, 1);
-            flip_image(r);
-            p = network_predict(net, r.data);
-            axpy_cpu(classes, 1, p, 1, pred, 1);
-            if(r.data != im.data) free_image(r);
-        }
-        free_image(im);
-        top_k(pred, classes, topk, indexes);
-        free(pred);
-        if(indexes[0] == class) avg_acc += 1;
-        for(j = 0; j < topk; ++j){
-            if(indexes[j] == class) avg_topk += 1;
-        }
-
-        printf("%d: top 1: %f, top %d: %f\n", i, avg_acc/(i+1), topk, avg_topk/(i+1));
-    }
-}
-
-void try_classifier(char *datacfg, char *cfgfile, char *weightfile, char *filename, int layer_num)
-{
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    set_batch_network(&net, 1);
-    srand(2222222);
-
-    list *options = read_data_cfg(datacfg);
-
-    char *name_list = option_find_str(options, "names", 0);
-    if(!name_list) name_list = option_find_str(options, "labels", "data/labels.list");
-    int top = option_find_int(options, "top", 1);
-
-    int i = 0;
-    char **names = get_labels(name_list);
-    clock_t time;
-    int *indexes = calloc(top, sizeof(int));
-    char buff[256];
-    char *input = buff;
-    while(1){
-        if(filename){
-            strncpy(input, filename, 256);
-        }else{
-            printf("Enter Image Path: ");
-            fflush(stdout);
-            input = fgets(input, 256, stdin);
-            if(!input) return;
-            strtok(input, "\n");
-        }
-        image orig = load_image_color(input, 0, 0);
-        image r = resize_min(orig, 256);
-        image im = crop_image(r, (r.w - 224 - 1)/2 + 1, (r.h - 224 - 1)/2 + 1, 224, 224);
-        float mean[] = {0.48263312050943, 0.45230225481413, 0.40099074308742};
-        float std[] = {0.22590347483426, 0.22120921437787, 0.22103996251583};
-        float var[3];
-        var[0] = std[0]*std[0];
-        var[1] = std[1]*std[1];
-        var[2] = std[2]*std[2];
-
-        normalize_cpu(im.data, mean, var, 1, 3, im.w*im.h);
-
-        float *X = im.data;
-        time=clock();
-        float *predictions = network_predict(net, X);
-
-        layer l = net.layers[layer_num];
-        for(i = 0; i < l.c; ++i){
-            if(l.rolling_mean) printf("%f %f %f\n", l.rolling_mean[i], l.rolling_variance[i], l.scales[i]);
-        }
-#ifdef GPU
-        cuda_pull_array(l.output_gpu, l.output, l.outputs);
-#endif
-        for(i = 0; i < l.outputs; ++i){
-            printf("%f\n", l.output[i]);
-        }
-        /*
-
-           printf("\n\nWeights\n");
-           for(i = 0; i < l.n*l.size*l.size*l.c; ++i){
-           printf("%f\n", l.filters[i]);
-           }
-
-           printf("\n\nBiases\n");
-           for(i = 0; i < l.n; ++i){
-           printf("%f\n", l.biases[i]);
-           }
-         */
-
-        top_predictions(net, top, indexes);
-        printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time));
-        for(i = 0; i < top; ++i){
-            int index = indexes[i];
-            printf("%s: %f\n", names[index], predictions[index]);
-        }
-        free_image(im);
-        if (filename) break;
-    }
-}
-
-void predict_classifier(char *datacfg, char *cfgfile, char *weightfile, char *filename, int top)
-{
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    set_batch_network(&net, 1);
-    srand(2222222);
-
-    list *options = read_data_cfg(datacfg);
-
-    char *name_list = option_find_str(options, "names", 0);
-    if(!name_list) name_list = option_find_str(options, "labels", "data/labels.list");
-    if(top == 0) top = option_find_int(options, "top", 1);
-
-    int i = 0;
-    char **names = get_labels(name_list);
-    clock_t time;
-    int *indexes = calloc(top, sizeof(int));
-    char buff[256];
-    char *input = buff;
-    int size = net.w;
-    while(1){
-        if(filename){
-            strncpy(input, filename, 256);
-        }else{
-            printf("Enter Image Path: ");
-            fflush(stdout);
-            input = fgets(input, 256, stdin);
-            if(!input) return;
-            strtok(input, "\n");
-        }
-        image im = load_image_color(input, 0, 0);
-        image r = resize_min(im, size);
-        resize_network(&net, r.w, r.h);
-        printf("%d %d\n", r.w, r.h);
-
-        float *X = r.data;
-        time=clock();
-        float *predictions = network_predict(net, X);
-        if(net.hierarchy) hierarchy_predictions(predictions, net.outputs, net.hierarchy, 0);
-        top_k(predictions, net.outputs, top, indexes);
-        printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time));
-        for(i = 0; i < top; ++i){
-            int index = indexes[i];
-            if(net.hierarchy) printf("%d, %s: %f, parent: %s \n",index, names[index], predictions[index], (net.hierarchy->parent[index] >= 0) ? names[net.hierarchy->parent[index]] : "Root");
-            else printf("%s: %f\n",names[index], predictions[index]);
-        }
-        if(r.data != im.data) free_image(r);
-        free_image(im);
-        if (filename) break;
-    }
-}
-
-
-void label_classifier(char *datacfg, char *filename, char *weightfile)
-{
-    int i;
-    network net = parse_network_cfg(filename);
-    set_batch_network(&net, 1);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    srand(time(0));
-
-    list *options = read_data_cfg(datacfg);
-
-    char *label_list = option_find_str(options, "names", "data/labels.list");
-    char *test_list = option_find_str(options, "test", "data/train.list");
-    int classes = option_find_int(options, "classes", 2);
-
-    char **labels = get_labels(label_list);
-    list *plist = get_paths(test_list);
-
-    char **paths = (char **)list_to_array(plist);
-    int m = plist->size;
-    free_list(plist);
-
-    for(i = 0; i < m; ++i){
-        image im = load_image_color(paths[i], 0, 0);
-        image resized = resize_min(im, net.w);
-        image crop = crop_image(resized, (resized.w - net.w)/2, (resized.h - net.h)/2, net.w, net.h);
-        float *pred = network_predict(net, crop.data);
-
-        if(resized.data != im.data) free_image(resized);
-        free_image(im);
-        free_image(crop);
-        int ind = max_index(pred, classes);
-
-        printf("%s\n", labels[ind]);
-    }
-}
-
-
-void test_classifier(char *datacfg, char *cfgfile, char *weightfile, int target_layer)
-{
-    int curr = 0;
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    srand(time(0));
-
-    list *options = read_data_cfg(datacfg);
-
-    char *test_list = option_find_str(options, "test", "data/test.list");
-    int classes = option_find_int(options, "classes", 2);
-
-    list *plist = get_paths(test_list);
-
-    char **paths = (char **)list_to_array(plist);
-    int m = plist->size;
-    free_list(plist);
-
-    clock_t time;
-
-    data val, buffer;
-
-    load_args args = {0};
-    args.w = net.w;
-    args.h = net.h;
-    args.paths = paths;
-    args.classes = classes;
-    args.n = net.batch;
-    args.m = 0;
-    args.labels = 0;
-    args.d = &buffer;
-    args.type = OLD_CLASSIFICATION_DATA;
-
-    pthread_t load_thread = load_data_in_thread(args);
-    for(curr = net.batch; curr < m; curr += net.batch){
-        time=clock();
-
-        pthread_join(load_thread, 0);
-        val = buffer;
-
-        if(curr < m){
-            args.paths = paths + curr;
-            if (curr + net.batch > m) args.n = m - curr;
-            load_thread = load_data_in_thread(args);
-        }
-        fprintf(stderr, "Loaded: %d images in %lf seconds\n", val.X.rows, sec(clock()-time));
-
-        time=clock();
-        matrix pred = network_predict_data(net, val);
-
-        int i, j;
-        if (target_layer >= 0){
-            //layer l = net.layers[target_layer];
-        }
-
-        for(i = 0; i < pred.rows; ++i){
-            printf("%s", paths[curr-net.batch+i]);
-            for(j = 0; j < pred.cols; ++j){
-                printf("\t%g", pred.vals[i][j]);
-            }
-            printf("\n");
-        }
-
-        free_matrix(pred);
-
-        fprintf(stderr, "%lf seconds, %d images, %d total\n", sec(clock()-time), val.X.rows, curr);
-        free_data(val);
-    }
-}
-
-
-void threat_classifier(char *datacfg, char *cfgfile, char *weightfile, int cam_index, const char *filename)
-{
-#ifdef OPENCV
-    float threat = 0;
-    float roll = .2;
-
-    printf("Classifier Demo\n");
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    set_batch_network(&net, 1);
-    list *options = read_data_cfg(datacfg);
-
-    srand(2222222);
-    CvCapture * cap;
-
-    if(filename){
-        cap = cvCaptureFromFile(filename);
-    }else{
-        cap = cvCaptureFromCAM(cam_index);
-    }
-
-    int top = option_find_int(options, "top", 1);
-
-    char *name_list = option_find_str(options, "names", 0);
-    char **names = get_labels(name_list);
-
-    int *indexes = calloc(top, sizeof(int));
-
-    if(!cap) error("Couldn't connect to webcam.\n");
-    //cvNamedWindow("Threat", CV_WINDOW_NORMAL); 
-    //cvResizeWindow("Threat", 512, 512);
-    float fps = 0;
-    int i;
-
-    int count = 0;
-
-    while(1){
-        ++count;
-        struct timeval tval_before, tval_after, tval_result;
-        gettimeofday(&tval_before, NULL);
-
-        image in = get_image_from_stream(cap);
-        if(!in.data) break;
-        image in_s = resize_image(in, net.w, net.h);
-
-        image out = in;
-        int x1 = out.w / 20;
-        int y1 = out.h / 20;
-        int x2 = 2*x1;
-        int y2 = out.h - out.h/20;
-
-        int border = .01*out.h;
-        int h = y2 - y1 - 2*border;
-        int w = x2 - x1 - 2*border;
-
-        float *predictions = network_predict(net, in_s.data);
-        float curr_threat = 0;
-        if(1){
-            curr_threat = predictions[0] * 0 + 
-                predictions[1] * .6 + 
-                predictions[2];
-        } else {
-            curr_threat = predictions[218] +
-                predictions[539] + 
-                predictions[540] + 
-                predictions[368] + 
-                predictions[369] + 
-                predictions[370];
-        }
-        threat = roll * curr_threat + (1-roll) * threat;
-
-        draw_box_width(out, x2 + border, y1 + .02*h, x2 + .5 * w, y1 + .02*h + border, border, 0,0,0);
-        if(threat > .97) {
-            draw_box_width(out,  x2 + .5 * w + border,
-                    y1 + .02*h - 2*border, 
-                    x2 + .5 * w + 6*border, 
-                    y1 + .02*h + 3*border, 3*border, 1,0,0);
-        }
-        draw_box_width(out,  x2 + .5 * w + border,
-                y1 + .02*h - 2*border, 
-                x2 + .5 * w + 6*border, 
-                y1 + .02*h + 3*border, .5*border, 0,0,0);
-        draw_box_width(out, x2 + border, y1 + .42*h, x2 + .5 * w, y1 + .42*h + border, border, 0,0,0);
-        if(threat > .57) {
-            draw_box_width(out,  x2 + .5 * w + border,
-                    y1 + .42*h - 2*border, 
-                    x2 + .5 * w + 6*border, 
-                    y1 + .42*h + 3*border, 3*border, 1,1,0);
-        }
-        draw_box_width(out,  x2 + .5 * w + border,
-                y1 + .42*h - 2*border, 
-                x2 + .5 * w + 6*border, 
-                y1 + .42*h + 3*border, .5*border, 0,0,0);
-
-        draw_box_width(out, x1, y1, x2, y2, border, 0,0,0);
-        for(i = 0; i < threat * h ; ++i){
-            float ratio = (float) i / h;
-            float r = (ratio < .5) ? (2*(ratio)) : 1;
-            float g = (ratio < .5) ? 1 : 1 - 2*(ratio - .5);
-            draw_box_width(out, x1 + border, y2 - border - i, x2 - border, y2 - border - i, 1, r, g, 0);
-        }
-        top_predictions(net, top, indexes);
-        char buff[256];
-        sprintf(buff, "/home/pjreddie/tmp/threat_%06d", count);
-        //save_image(out, buff);
-
-        printf("\033[2J");
-        printf("\033[1;1H");
-        printf("\nFPS:%.0f\n",fps);
-
-        for(i = 0; i < top; ++i){
-            int index = indexes[i];
-            printf("%.1f%%: %s\n", predictions[index]*100, names[index]);
-        }
-
-        if(1){
-            show_image(out, "Threat");
-            cvWaitKey(10);
-        }
-        free_image(in_s);
-        free_image(in);
-
-        gettimeofday(&tval_after, NULL);
-        timersub(&tval_after, &tval_before, &tval_result);
-        float curr = 1000000.f/((long int)tval_result.tv_usec);
-        fps = .9*fps + .1*curr;
-    }
-#endif
-}
-
-
-void gun_classifier(char *datacfg, char *cfgfile, char *weightfile, int cam_index, const char *filename)
-{
-#ifdef OPENCV
-    int bad_cats[] = {218, 539, 540, 1213, 1501, 1742, 1911, 2415, 4348, 19223, 368, 369, 370, 1133, 1200, 1306, 2122, 2301, 2537, 2823, 3179, 3596, 3639, 4489, 5107, 5140, 5289, 6240, 6631, 6762, 7048, 7171, 7969, 7984, 7989, 8824, 8927, 9915, 10270, 10448, 13401, 15205, 18358, 18894, 18895, 19249, 19697};
-
-    printf("Classifier Demo\n");
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    set_batch_network(&net, 1);
-    list *options = read_data_cfg(datacfg);
-
-    srand(2222222);
-    CvCapture * cap;
-
-    if(filename){
-        cap = cvCaptureFromFile(filename);
-    }else{
-        cap = cvCaptureFromCAM(cam_index);
-    }
-
-    int top = option_find_int(options, "top", 1);
-
-    char *name_list = option_find_str(options, "names", 0);
-    char **names = get_labels(name_list);
-
-    int *indexes = calloc(top, sizeof(int));
-
-    if(!cap) error("Couldn't connect to webcam.\n");
-    cvNamedWindow("Threat Detection", CV_WINDOW_NORMAL); 
-    cvResizeWindow("Threat Detection", 512, 512);
-    float fps = 0;
-    int i;
-
-    while(1){
-        struct timeval tval_before, tval_after, tval_result;
-        gettimeofday(&tval_before, NULL);
-
-        image in = get_image_from_stream(cap);
-        image in_s = resize_image(in, net.w, net.h);
-        show_image(in, "Threat Detection");
-
-        float *predictions = network_predict(net, in_s.data);
-        top_predictions(net, top, indexes);
-
-        printf("\033[2J");
-        printf("\033[1;1H");
-
-        int threat = 0;
-        for(i = 0; i < sizeof(bad_cats)/sizeof(bad_cats[0]); ++i){
-            int index = bad_cats[i];
-            if(predictions[index] > .01){
-                printf("Threat Detected!\n");
-                threat = 1;
-                break;
-            }
-        }
-        if(!threat) printf("Scanning...\n");
-        for(i = 0; i < sizeof(bad_cats)/sizeof(bad_cats[0]); ++i){
-            int index = bad_cats[i];
-            if(predictions[index] > .01){
-                printf("%s\n", names[index]);
-            }
-        }
-
-        free_image(in_s);
-        free_image(in);
-
-        cvWaitKey(10);
-
-        gettimeofday(&tval_after, NULL);
-        timersub(&tval_after, &tval_before, &tval_result);
-        float curr = 1000000.f/((long int)tval_result.tv_usec);
-        fps = .9*fps + .1*curr;
-    }
-#endif
-}
-
-void demo_classifier(char *datacfg, char *cfgfile, char *weightfile, int cam_index, const char *filename)
-{
-#ifdef OPENCV
-    printf("Classifier Demo\n");
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    set_batch_network(&net, 1);
-    list *options = read_data_cfg(datacfg);
-
-    srand(2222222);
-    CvCapture * cap;
-
-    if(filename){
-        cap = cvCaptureFromFile(filename);
-    }else{
-        cap = cvCaptureFromCAM(cam_index);
-    }
-
-    int top = option_find_int(options, "top", 1);
-
-    char *name_list = option_find_str(options, "names", 0);
-    char **names = get_labels(name_list);
-
-    int *indexes = calloc(top, sizeof(int));
-
-    if(!cap) error("Couldn't connect to webcam.\n");
-    cvNamedWindow("Classifier", CV_WINDOW_NORMAL); 
-    cvResizeWindow("Classifier", 512, 512);
-    float fps = 0;
-    int i;
-
-    while(1){
-        struct timeval tval_before, tval_after, tval_result;
-        gettimeofday(&tval_before, NULL);
-
-        image in = get_image_from_stream(cap);
-        image in_s = resize_image(in, net.w, net.h);
-        show_image(in, "Classifier");
-
-        float *predictions = network_predict(net, in_s.data);
-        if(net.hierarchy) hierarchy_predictions(predictions, net.outputs, net.hierarchy, 1);
-        top_predictions(net, top, indexes);
-
-        printf("\033[2J");
-        printf("\033[1;1H");
-        printf("\nFPS:%.0f\n",fps);
-
-        for(i = 0; i < top; ++i){
-            int index = indexes[i];
-            printf("%.1f%%: %s\n", predictions[index]*100, names[index]);
-        }
-
-        free_image(in_s);
-        free_image(in);
-
-        cvWaitKey(10);
-
-        gettimeofday(&tval_after, NULL);
-        timersub(&tval_after, &tval_before, &tval_result);
-        float curr = 1000000.f/((long int)tval_result.tv_usec);
-        fps = .9*fps + .1*curr;
-    }
-#endif
-}
-
-
-void run_classifier(int argc, char **argv)
-{
-    if(argc < 4){
-        fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]);
-        return;
-    }
-
-    char *gpu_list = find_char_arg(argc, argv, "-gpus", 0);
-    int *gpus = 0;
-    int gpu = 0;
-    int ngpus = 0;
-    if(gpu_list){
-        printf("%s\n", gpu_list);
-        int len = strlen(gpu_list);
-        ngpus = 1;
-        int i;
-        for(i = 0; i < len; ++i){
-            if (gpu_list[i] == ',') ++ngpus;
-        }
-        gpus = calloc(ngpus, sizeof(int));
-        for(i = 0; i < ngpus; ++i){
-            gpus[i] = atoi(gpu_list);
-            gpu_list = strchr(gpu_list, ',')+1;
-        }
-    } else {
-        gpu = gpu_index;
-        gpus = &gpu;
-        ngpus = 1;
-    }
-
-    int cam_index = find_int_arg(argc, argv, "-c", 0);
-    int top = find_int_arg(argc, argv, "-t", 0);
-    int clear = find_arg(argc, argv, "-clear");
-    char *data = argv[3];
-    char *cfg = argv[4];
-    char *weights = (argc > 5) ? argv[5] : 0;
-    char *filename = (argc > 6) ? argv[6]: 0;
-    char *layer_s = (argc > 7) ? argv[7]: 0;
-    int layer = layer_s ? atoi(layer_s) : -1;
-    if(0==strcmp(argv[2], "predict")) predict_classifier(data, cfg, weights, filename, top);
-    else if(0==strcmp(argv[2], "try")) try_classifier(data, cfg, weights, filename, atoi(layer_s));
-    else if(0==strcmp(argv[2], "train")) train_classifier(data, cfg, weights, gpus, ngpus, clear);
-    else if(0==strcmp(argv[2], "demo")) demo_classifier(data, cfg, weights, cam_index, filename);
-    else if(0==strcmp(argv[2], "gun")) gun_classifier(data, cfg, weights, cam_index, filename);
-    else if(0==strcmp(argv[2], "threat")) threat_classifier(data, cfg, weights, cam_index, filename);
-    else if(0==strcmp(argv[2], "test")) test_classifier(data, cfg, weights, layer);
-    else if(0==strcmp(argv[2], "label")) label_classifier(data, cfg, weights);
-    else if(0==strcmp(argv[2], "valid")) validate_classifier_single(data, cfg, weights);
-    else if(0==strcmp(argv[2], "validmulti")) validate_classifier_multi(data, cfg, weights);
-    else if(0==strcmp(argv[2], "valid10")) validate_classifier_10(data, cfg, weights);
-    else if(0==strcmp(argv[2], "validcrop")) validate_classifier_crop(data, cfg, weights);
-    else if(0==strcmp(argv[2], "validfull")) validate_classifier_full(data, cfg, weights);
-}
-
-
diff --git a/image.darknet/src/classifier.h b/image.darknet/src/classifier.h
index 3c89f49..8b13789 100644
--- a/image.darknet/src/classifier.h
+++ b/image.darknet/src/classifier.h
@@ -1,2 +1 @@
 
-list *read_data_cfg(char *filename);
diff --git a/image.darknet/src/col2im.h b/image.darknet/src/col2im.h
index 0237497..3fbe053 100644
--- a/image.darknet/src/col2im.h
+++ b/image.darknet/src/col2im.h
@@ -6,7 +6,7 @@ void col2im_cpu(float* data_col,
         int ksize, int stride, int pad, float* data_im);
 
 #ifdef GPU
-void col2im_ongpu(float *data_col,
+void col2im_gpu(float *data_col,
         int channels, int height, int width,
         int ksize, int stride, int pad, float *data_im);
 #endif
diff --git a/image.darknet/src/col2im_kernels.cu b/image.darknet/src/col2im_kernels.cu
index aed2df9..ba45e0f 100644
--- a/image.darknet/src/col2im_kernels.cu
+++ b/image.darknet/src/col2im_kernels.cu
@@ -41,7 +41,7 @@ __global__ void col2im_gpu_kernel(const int n, const float* data_col,
     }
 }
 
-void col2im_ongpu(float *data_col,
+void col2im_gpu(float *data_col,
         int channels, int height, int width,
         int ksize, int stride, int pad, float *data_im){
     // We are going to launch channels * height_col * width_col kernels, each
diff --git a/image.darknet/src/compare.c b/image.darknet/src/compare.c
index 4fd266c..ef1de6c 100644
--- a/image.darknet/src/compare.c
+++ b/image.darknet/src/compare.c
@@ -14,10 +14,8 @@ void train_compare(char *cfgfile, char *weightfile)
     char *base = basecfg(cfgfile);
     char *backup_directory = "/home/pjreddie/backup/";
     printf("%s\n", base);
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
+    network net = *load_network(cfgfile, weightfile, 0);
+    
     printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
     int imgs = 1024;
     list *plist = get_paths("data/compare.train.list");
@@ -51,40 +49,37 @@ void train_compare(char *cfgfile, char *weightfile)
         load_thread = load_data_in_thread(args);
         printf("Loaded: %lf seconds\n", sec(clock()-time));
         time=clock();
-        float loss = train_network(net, train);
+        float loss = train_network(&net, train);
         if(avg_loss == -1) avg_loss = loss;
         avg_loss = avg_loss*.9 + loss*.1;
-        printf("%.3f: %f, %f avg, %lf seconds, %d images\n", (float)*net.seen/N, loss, avg_loss, sec(clock()-time), *net.seen);
+        printf("%.3f: %f, %f avg, %lf seconds, %ld images\n", (float)*net.seen/N, loss, avg_loss, sec(clock()-time), *net.seen);
         free_data(train);
         if(i%100 == 0){
             char buff[256];
             sprintf(buff, "%s/%s_%d_minor_%d.weights",backup_directory,base, epoch, i);
-            save_weights(net, buff);
+            save_weights(&net, buff);
         }
         if(*net.seen/N > epoch){
             epoch = *net.seen/N;
             i = 0;
             char buff[256];
             sprintf(buff, "%s/%s_%d.weights",backup_directory,base, epoch);
-            save_weights(net, buff);
+            save_weights(&net, buff);
             if(epoch%22 == 0) net.learning_rate *= .1;
         }
     }
     pthread_join(load_thread, 0);
     free_data(buffer);
-    free_network(net);
+    free_network(&net);
     free_ptrs((void**)paths, plist->size);
     free_list(plist);
     free(base);
 }
 
-void validate_compare(char *filename, char *weightfile)
+void validate_compare(char *cfgfile, char *weightfile)
 {
     int i = 0;
-    network net = parse_network_cfg(filename);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
+  network net = *load_network(cfgfile, weightfile, 0);
     srand(time(0));
 
     list *plist = get_paths("data/compare.val.list");
@@ -127,7 +122,7 @@ void validate_compare(char *filename, char *weightfile)
         printf("Loaded: %d images in %lf seconds\n", val.X.rows, sec(clock()-time));
 
         time=clock();
-        matrix pred = network_predict_data(net, val);
+        matrix pred = network_predict_data(&net, val);
         int j,k;
         for(j = 0; j < val.y.rows; ++j){
             for(k = 0; k < 20; ++k){
@@ -179,7 +174,7 @@ int bbox_comparator(const void *a, const void *b)
     float *X  = calloc(net.w*net.h*net.c, sizeof(float));
     memcpy(X,                   im1.data, im1.w*im1.h*im1.c*sizeof(float));
     memcpy(X+im1.w*im1.h*im1.c, im2.data, im2.w*im2.h*im2.c*sizeof(float));
-    float *predictions = network_predict(net, X);
+    float *predictions = network_predict(&net, X);
     
     free_image(im1);
     free_image(im2);
@@ -208,7 +203,7 @@ void bbox_fight(network net, sortable_bbox *a, sortable_bbox *b, int classes, in
     float *X  = calloc(net.w*net.h*net.c, sizeof(float));
     memcpy(X,                   im1.data, im1.w*im1.h*im1.c*sizeof(float));
     memcpy(X+im1.w*im1.h*im1.c, im2.data, im2.w*im2.h*im2.c*sizeof(float));
-    float *predictions = network_predict(net, X);
+    float *predictions = network_predict(&net, X);
     ++total_compares;
 
     int i;
@@ -224,15 +219,12 @@ void bbox_fight(network net, sortable_bbox *a, sortable_bbox *b, int classes, in
     free(X);
 }
 
-void SortMaster3000(char *filename, char *weightfile)
+void SortMaster3000(char *cfgfile, char *weightfile)
 {
     int i = 0;
-    network net = parse_network_cfg(filename);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
+    network *net = load_network(cfgfile, weightfile, 0);
     srand(time(0));
-    set_batch_network(&net, 1);
+    set_batch_network(net, 1);
 
     list *plist = get_paths("data/compare.sort.list");
     //list *plist = get_paths("data/compare.val.old");
@@ -243,7 +235,7 @@ void SortMaster3000(char *filename, char *weightfile)
     printf("Sorting %d boxes...\n", N);
     for(i = 0; i < N; ++i){
         boxes[i].filename = paths[i];
-        boxes[i].net = net;
+        boxes[i].net = *net;
         boxes[i].class = 7;
         boxes[i].elo = 1500;
     }
@@ -255,17 +247,13 @@ void SortMaster3000(char *filename, char *weightfile)
     printf("Sorted in %d compares, %f secs\n", total_compares, sec(clock()-time));
 }
 
-void BattleRoyaleWithCheese(char *filename, char *weightfile)
+void BattleRoyaleWithCheese(char *cfgfile, char *weightfile)
 {
     int classes = 20;
     int i,j;
-    network net = parse_network_cfg(filename);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
+    network *net = load_network(cfgfile, weightfile, 0);
     srand(time(0));
-    set_batch_network(&net, 1);
-
+    set_batch_network(net, 1);
     list *plist = get_paths("data/compare.sort.list");
     //list *plist = get_paths("data/compare.small.list");
     //list *plist = get_paths("data/compare.cat.list");
@@ -278,7 +266,7 @@ void BattleRoyaleWithCheese(char *filename, char *weightfile)
     printf("Battling %d boxes...\n", N);
     for(i = 0; i < N; ++i){
         boxes[i].filename = paths[i];
-        boxes[i].net = net;
+        boxes[i].net = *net;
         boxes[i].classes = classes;
         boxes[i].elos = calloc(classes, sizeof(float));;
         for(j = 0; j < classes; ++j){
@@ -292,7 +280,7 @@ void BattleRoyaleWithCheese(char *filename, char *weightfile)
         printf("Round: %d\n", round);
         shuffle(boxes, N, sizeof(sortable_bbox));
         for(i = 0; i < N/2; ++i){
-            bbox_fight(net, boxes+i*2, boxes+i*2+1, classes, -1);
+            bbox_fight(*net, boxes+i*2, boxes+i*2+1, classes, -1);
         }
         printf("Round: %f secs, %d remaining\n", sec(clock()-round_time), N);
     }
@@ -312,7 +300,7 @@ void BattleRoyaleWithCheese(char *filename, char *weightfile)
 
             sorta_shuffle(boxes, N, sizeof(sortable_bbox), 10);
             for(i = 0; i < N/2; ++i){
-                bbox_fight(net, boxes+i*2, boxes+i*2+1, classes, class);
+                bbox_fight(*net, boxes+i*2, boxes+i*2+1, classes, class);
             }
             qsort(boxes, N, sizeof(sortable_bbox), elo_comparator);
             if(round <= 20) N = (N*9/10)/2*2;
diff --git a/image.darknet/src/connected_layer.c b/image.darknet/src/connected_layer.c
index b678ed0..353f4e5 100644
--- a/image.darknet/src/connected_layer.c
+++ b/image.darknet/src/connected_layer.c
@@ -1,4 +1,5 @@
 #include "connected_layer.h"
+#include "convolutional_layer.h"
 #include "batchnorm_layer.h"
 #include "utils.h"
 #include "cuda.h"
@@ -10,10 +11,11 @@
 #include <stdlib.h>
 #include <string.h>
 
-connected_layer make_connected_layer(int batch, int inputs, int outputs, ACTIVATION activation, int batch_normalize)
+layer make_connected_layer(int batch, int inputs, int outputs, ACTIVATION activation, int batch_normalize, int adam)
 {
     int i;
-    connected_layer l = {0};
+    layer l = {0};
+    l.learning_rate_scale = 1;
     l.type = CONNECTED;
 
     l.inputs = inputs;
@@ -50,6 +52,14 @@ connected_layer make_connected_layer(int batch, int inputs, int outputs, ACTIVAT
         l.biases[i] = 0;
     }
 
+    if(adam){
+        l.m = calloc(l.inputs*l.outputs, sizeof(float));
+        l.v = calloc(l.inputs*l.outputs, sizeof(float));
+        l.bias_m = calloc(l.outputs, sizeof(float));
+        l.scale_m = calloc(l.outputs, sizeof(float));
+        l.bias_v = calloc(l.outputs, sizeof(float));
+        l.scale_v = calloc(l.outputs, sizeof(float));
+    }
     if(batch_normalize){
         l.scales = calloc(outputs, sizeof(float));
         l.scale_updates = calloc(outputs, sizeof(float));
@@ -82,10 +92,16 @@ connected_layer make_connected_layer(int batch, int inputs, int outputs, ACTIVAT
 
     l.output_gpu = cuda_make_array(l.output, outputs*batch);
     l.delta_gpu = cuda_make_array(l.delta, outputs*batch);
-    if(batch_normalize){
-        l.scales_gpu = cuda_make_array(l.scales, outputs);
-        l.scale_updates_gpu = cuda_make_array(l.scale_updates, outputs);
+    if (adam) {
+        l.m_gpu =       cuda_make_array(0, inputs*outputs);
+        l.v_gpu =       cuda_make_array(0, inputs*outputs);
+        l.bias_m_gpu =  cuda_make_array(0, outputs);
+        l.bias_v_gpu =  cuda_make_array(0, outputs);
+        l.scale_m_gpu = cuda_make_array(0, outputs);
+        l.scale_v_gpu = cuda_make_array(0, outputs);
+    }
 
+    if(batch_normalize){
         l.mean_gpu = cuda_make_array(l.mean, outputs);
         l.variance_gpu = cuda_make_array(l.variance, outputs);
 
@@ -95,8 +111,17 @@ connected_layer make_connected_layer(int batch, int inputs, int outputs, ACTIVAT
         l.mean_delta_gpu = cuda_make_array(l.mean, outputs);
         l.variance_delta_gpu = cuda_make_array(l.variance, outputs);
 
+        l.scales_gpu = cuda_make_array(l.scales, outputs);
+        l.scale_updates_gpu = cuda_make_array(l.scale_updates, outputs);
+
         l.x_gpu = cuda_make_array(l.output, l.batch*outputs);
         l.x_norm_gpu = cuda_make_array(l.output, l.batch*outputs);
+#ifdef CUDNN
+        cudnnCreateTensorDescriptor(&l.normTensorDesc);
+        cudnnCreateTensorDescriptor(&l.dstTensorDesc);
+        cudnnSetTensor4dDescriptor(l.dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l.batch, l.out_c, l.out_h, l.out_w); 
+        cudnnSetTensor4dDescriptor(l.normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l.out_c, 1, 1); 
+#endif
     }
 #endif
     l.activation = activation;
@@ -104,8 +129,12 @@ connected_layer make_connected_layer(int batch, int inputs, int outputs, ACTIVAT
     return l;
 }
 
-void update_connected_layer(connected_layer l, int batch, float learning_rate, float momentum, float decay)
+void update_connected_layer(layer l, update_args a)
 {
+    float learning_rate = a.learning_rate*l.learning_rate_scale;
+    float momentum = a.momentum;
+    float decay = a.decay;
+    int batch = a.batch;
     axpy_cpu(l.outputs, learning_rate/batch, l.bias_updates, 1, l.biases, 1);
     scal_cpu(l.outputs, momentum, l.bias_updates, 1);
 
@@ -119,63 +148,39 @@ void update_connected_layer(connected_layer l, int batch, float learning_rate, f
     scal_cpu(l.inputs*l.outputs, momentum, l.weight_updates, 1);
 }
 
-void forward_connected_layer(connected_layer l, network_state state)
+void forward_connected_layer(layer l, network net) // CPU forward pass: zero l.output, gemm net.input with l.weights into it, then batchnorm or bias, then the activation
 {
-    int i;
     fill_cpu(l.outputs*l.batch, 0, l.output, 1);
     int m = l.batch;
     int k = l.inputs;
     int n = l.outputs;
-    float *a = state.input;
+    float *a = net.input; // the layer input is read from the network struct
     float *b = l.weights;
     float *c = l.output;
     gemm(0,1,m,n,k,1,a,k,b,k,1,c,n);
     if(l.batch_normalize){
-        if(state.train){
-            mean_cpu(l.output, l.batch, l.outputs, 1, l.mean);
-            variance_cpu(l.output, l.mean, l.batch, l.outputs, 1, l.variance);
-
-            scal_cpu(l.outputs, .95, l.rolling_mean, 1);
-            axpy_cpu(l.outputs, .05, l.mean, 1, l.rolling_mean, 1);
-            scal_cpu(l.outputs, .95, l.rolling_variance, 1);
-            axpy_cpu(l.outputs, .05, l.variance, 1, l.rolling_variance, 1);
-
-            copy_cpu(l.outputs*l.batch, l.output, 1, l.x, 1);
-            normalize_cpu(l.output, l.mean, l.variance, l.batch, l.outputs, 1);   
-            copy_cpu(l.outputs*l.batch, l.output, 1, l.x_norm, 1);
-        } else {
-            normalize_cpu(l.output, l.rolling_mean, l.rolling_variance, l.batch, l.outputs, 1);
-        }
-        scale_bias(l.output, l.scales, l.batch, l.outputs, 1);
-    }
-    for(i = 0; i < l.batch; ++i){
-        axpy_cpu(l.outputs, 1, l.biases, 1, l.output + i*l.outputs, 1);
+        forward_batchnorm_layer(l, net); // batchnorm path is delegated; NOTE(review): presumably this routine also applies l.biases - confirm
+    } else {
+        add_bias(l.output, l.biases, l.batch, l.outputs, 1); // bias is added here only when batch_normalize is off
     }
     activate_array(l.output, l.outputs*l.batch, l.activation);
 }
 
-void backward_connected_layer(connected_layer l, network_state state)
+void backward_connected_layer(layer l, network net)
 {
-    int i;
     gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);
-    for(i = 0; i < l.batch; ++i){
-        axpy_cpu(l.outputs, 1, l.delta + i*l.outputs, 1, l.bias_updates, 1);
-    }
-    if(l.batch_normalize){
-        backward_scale_cpu(l.x_norm, l.delta, l.batch, l.outputs, 1, l.scale_updates);
-
-        scale_bias(l.delta, l.scales, l.batch, l.outputs, 1);
 
-        mean_delta_cpu(l.delta, l.variance, l.batch, l.outputs, 1, l.mean_delta);
-        variance_delta_cpu(l.x, l.delta, l.mean, l.variance, l.batch, l.outputs, 1, l.variance_delta);
-        normalize_delta_cpu(l.x, l.mean, l.variance, l.mean_delta, l.variance_delta, l.batch, l.outputs, 1, l.delta);
+    if(l.batch_normalize){
+        backward_batchnorm_layer(l, net);
+    } else {
+        backward_bias(l.bias_updates, l.delta, l.batch, l.outputs, 1);
     }
 
     int m = l.outputs;
     int k = l.batch;
     int n = l.inputs;
     float *a = l.delta;
-    float *b = state.input;
+    float *b = net.input;
     float *c = l.weight_updates;
     gemm(1,0,m,n,k,1,a,m,b,n,1,c,n);
 
@@ -185,7 +190,7 @@ void backward_connected_layer(connected_layer l, network_state state)
 
     a = l.delta;
     b = l.weights;
-    c = state.delta;
+    c = net.delta;
 
     if(c) gemm(0,0,m,n,k,1,a,k,b,n,1,c,n);
 }
@@ -213,11 +218,11 @@ void statistics_connected_layer(layer l)
         printf("Scales ");
         print_statistics(l.scales, l.outputs);
         /*
-        printf("Rolling Mean ");
-        print_statistics(l.rolling_mean, l.outputs);
-        printf("Rolling Variance ");
-        print_statistics(l.rolling_variance, l.outputs);
-        */
+           printf("Rolling Mean ");
+           print_statistics(l.rolling_mean, l.outputs);
+           printf("Rolling Variance ");
+           print_statistics(l.rolling_variance, l.outputs);
+         */
     }
     printf("Biases ");
     print_statistics(l.biases, l.outputs);
@@ -227,7 +232,7 @@ void statistics_connected_layer(layer l)
 
 #ifdef GPU
 
-void pull_connected_layer(connected_layer l)
+void pull_connected_layer(layer l)
 {
     cuda_pull_array(l.weights_gpu, l.weights, l.inputs*l.outputs);
     cuda_pull_array(l.biases_gpu, l.biases, l.outputs);
@@ -240,7 +245,7 @@ void pull_connected_layer(connected_layer l)
     }
 }
 
-void push_connected_layer(connected_layer l)
+void push_connected_layer(layer l)
 {
     cuda_push_array(l.weights_gpu, l.weights, l.inputs*l.outputs);
     cuda_push_array(l.biases_gpu, l.biases, l.outputs);
@@ -253,62 +258,70 @@ void push_connected_layer(connected_layer l)
     }
 }
 
-void update_connected_layer_gpu(connected_layer l, int batch, float learning_rate, float momentum, float decay)
+void update_connected_layer_gpu(layer l, update_args a) // GPU parameter update: Adam path when a.adam is set, otherwise the axpy/scal path below
 {
-    axpy_ongpu(l.outputs, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);
-    scal_ongpu(l.outputs, momentum, l.bias_updates_gpu, 1);
+    float learning_rate = a.learning_rate*l.learning_rate_scale; // shared learning rate multiplied by this layer's scale
+    float momentum = a.momentum;
+    float decay = a.decay;
+    int batch = a.batch;
+    if(a.adam){
+        adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.inputs*l.outputs, batch, a.t);
+        adam_update_gpu(l.biases_gpu, l.bias_updates_gpu, l.bias_m_gpu, l.bias_v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.outputs, batch, a.t);
+        if(l.scales_gpu){ // scales are updated only when the scales buffer exists
+            adam_update_gpu(l.scales_gpu, l.scale_updates_gpu, l.scale_m_gpu, l.scale_v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.outputs, batch, a.t);
+        }
+    }else{
+        axpy_gpu(l.outputs, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);
+        scal_gpu(l.outputs, momentum, l.bias_updates_gpu, 1);
 
-    if(l.batch_normalize){
-        axpy_ongpu(l.outputs, learning_rate/batch, l.scale_updates_gpu, 1, l.scales_gpu, 1);
-        scal_ongpu(l.outputs, momentum, l.scale_updates_gpu, 1);
-    }
+        if(l.batch_normalize){ // this path keys on batch_normalize, whereas the Adam path keys on l.scales_gpu
+            axpy_gpu(l.outputs, learning_rate/batch, l.scale_updates_gpu, 1, l.scales_gpu, 1);
+            scal_gpu(l.outputs, momentum, l.scale_updates_gpu, 1);
+        }
 
-    axpy_ongpu(l.inputs*l.outputs, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
-    axpy_ongpu(l.inputs*l.outputs, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
-    scal_ongpu(l.inputs*l.outputs, momentum, l.weight_updates_gpu, 1);
+        axpy_gpu(l.inputs*l.outputs, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1); // decay term goes into the weight updates before they are applied
+        axpy_gpu(l.inputs*l.outputs, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
+        scal_gpu(l.inputs*l.outputs, momentum, l.weight_updates_gpu, 1); // updates are scaled by momentum rather than cleared
+    }
 }
 
-void forward_connected_layer_gpu(connected_layer l, network_state state)
+void forward_connected_layer_gpu(layer l, network net) // GPU forward pass: same sequence as the CPU version, run on the *_gpu buffers
 {
-    int i;
-    fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1);
+    fill_gpu(l.outputs*l.batch, 0, l.output_gpu, 1); // output is zeroed before gemm_gpu is run on it
 
     int m = l.batch;
     int k = l.inputs;
     int n = l.outputs;
-    float * a = state.input;
+    float * a = net.input_gpu; // input is read from the network's input_gpu field
     float * b = l.weights_gpu;
     float * c = l.output_gpu;
-    gemm_ongpu(0,1,m,n,k,1,a,k,b,k,1,c,n);
-    if(l.batch_normalize){
-        forward_batchnorm_layer_gpu(l, state);
-    }
-    for(i = 0; i < l.batch; ++i){
-        axpy_ongpu(l.outputs, 1, l.biases_gpu, 1, l.output_gpu + i*l.outputs, 1);
+    gemm_gpu(0,1,m,n,k,1,a,k,b,k,1,c,n);
+
+    if (l.batch_normalize) {
+        forward_batchnorm_layer_gpu(l, net); // NOTE(review): presumably applies l.biases_gpu itself - confirm
+    } else {
+        add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.outputs, 1); // bias is added here only when batch_normalize is off
     }
-    activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
+    activate_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation);
 }
 
-void backward_connected_layer_gpu(connected_layer l, network_state state)
+void backward_connected_layer_gpu(layer l, network net)
 {
-    int i;
-    constrain_ongpu(l.outputs*l.batch, 1, l.delta_gpu, 1);
-    gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
-    for(i = 0; i < l.batch; ++i){
-        axpy_ongpu(l.outputs, 1, l.delta_gpu + i*l.outputs, 1, l.bias_updates_gpu, 1);
-    }
-
+    constrain_gpu(l.outputs*l.batch, 1, l.delta_gpu, 1);
+    gradient_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
     if(l.batch_normalize){
-        backward_batchnorm_layer_gpu(l, state);
+        backward_batchnorm_layer_gpu(l, net);
+    } else {
+        backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.outputs, 1);
     }
 
     int m = l.outputs;
     int k = l.batch;
     int n = l.inputs;
     float * a = l.delta_gpu;
-    float * b = state.input;
+    float * b = net.input_gpu;
     float * c = l.weight_updates_gpu;
-    gemm_ongpu(1,0,m,n,k,1,a,m,b,n,1,c,n);
+    gemm_gpu(1,0,m,n,k,1,a,m,b,n,1,c,n);
 
     m = l.batch;
     k = l.outputs;
@@ -316,8 +329,8 @@ void backward_connected_layer_gpu(connected_layer l, network_state state)
 
     a = l.delta_gpu;
     b = l.weights_gpu;
-    c = state.delta;
+    c = net.delta_gpu;
 
-    if(c) gemm_ongpu(0,0,m,n,k,1,a,k,b,n,1,c,n);
+    if(c) gemm_gpu(0,0,m,n,k,1,a,k,b,n,1,c,n);
 }
 #endif
diff --git a/image.darknet/src/connected_layer.h b/image.darknet/src/connected_layer.h
index 23797b1..6727a96 100644
--- a/image.darknet/src/connected_layer.h
+++ b/image.darknet/src/connected_layer.h
@@ -5,22 +5,18 @@
 #include "layer.h"
 #include "network.h"
 
-typedef layer connected_layer;
+layer make_connected_layer(int batch, int inputs, int outputs, ACTIVATION activation, int batch_normalize, int adam);
 
-connected_layer make_connected_layer(int batch, int inputs, int outputs, ACTIVATION activation, int batch_normalize);
-
-void forward_connected_layer(connected_layer layer, network_state state);
-void backward_connected_layer(connected_layer layer, network_state state);
-void update_connected_layer(connected_layer layer, int batch, float learning_rate, float momentum, float decay);
-void denormalize_connected_layer(layer l);
-void statistics_connected_layer(layer l);
+void forward_connected_layer(layer l, network net);
+void backward_connected_layer(layer l, network net);
+void update_connected_layer(layer l, update_args a);
 
 #ifdef GPU
-void forward_connected_layer_gpu(connected_layer layer, network_state state);
-void backward_connected_layer_gpu(connected_layer layer, network_state state);
-void update_connected_layer_gpu(connected_layer layer, int batch, float learning_rate, float momentum, float decay);
-void push_connected_layer(connected_layer layer);
-void pull_connected_layer(connected_layer layer);
+void forward_connected_layer_gpu(layer l, network net);
+void backward_connected_layer_gpu(layer l, network net);
+void update_connected_layer_gpu(layer l, update_args a);
+void push_connected_layer(layer l);
+void pull_connected_layer(layer l);
 #endif
 
 #endif
diff --git a/image.darknet/src/convolutional_kernels.cu b/image.darknet/src/convolutional_kernels.cu
index fcaea03..4a1047b 100644
--- a/image.darknet/src/convolutional_kernels.cu
+++ b/image.darknet/src/convolutional_kernels.cu
@@ -33,7 +33,7 @@ __global__ void binarize_input_kernel(float *input, int n, int size, float *bina
     int i = 0;
     float mean = 0;
     for(i = 0; i < n; ++i){
-        mean += abs(input[i*size + s]);
+        mean += fabsf(input[i*size + s]);
     }
     mean = mean / n;
     for(i = 0; i < n; ++i){
@@ -55,7 +55,7 @@ __global__ void binarize_weights_kernel(float *weights, int n, int size, float *
     int i = 0;
     float mean = 0;
     for(i = 0; i < size; ++i){
-        mean += abs(weights[f*size + i]);
+        mean += fabsf(weights[f*size + i]);
     }
     mean = mean / size;
     for(i = 0; i < size; ++i){
@@ -70,19 +70,19 @@ void binarize_weights_gpu(float *weights, int n, int size, float *binary)
     check_error(cudaPeekAtLastError());
 }
 
-void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)
+void forward_convolutional_layer_gpu(convolutional_layer l, network net)
 {
-    fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1);
+    fill_gpu(l.outputs*l.batch, 0, l.output_gpu, 1);
     if(l.binary){
-        binarize_weights_gpu(l.weights_gpu, l.n, l.c*l.size*l.size, l.binary_weights_gpu);
+        binarize_weights_gpu(l.weights_gpu, l.n, l.c/l.groups*l.size*l.size, l.binary_weights_gpu);
         swap_binary(&l);
     }
 
     if(l.xnor){
-        binarize_weights_gpu(l.weights_gpu, l.n, l.c*l.size*l.size, l.binary_weights_gpu);
+        binarize_weights_gpu(l.weights_gpu, l.n, l.c/l.groups*l.size*l.size, l.binary_weights_gpu);
         swap_binary(&l);
-        binarize_gpu(state.input, l.c*l.h*l.w*l.batch, l.binary_input_gpu);
-        state.input = l.binary_input_gpu;
+        binarize_gpu(net.input_gpu, l.c*l.h*l.w*l.batch, l.binary_input_gpu);
+        net.input_gpu = l.binary_input_gpu;
     }
 
 #ifdef CUDNN
@@ -90,74 +90,126 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)
     cudnnConvolutionForward(cudnn_handle(),
                 &one,
                 l.srcTensorDesc,
-                state.input,
+                net.input_gpu,
                 l.weightDesc,
                 l.weights_gpu,
                 l.convDesc,
                 l.fw_algo,
-                state.workspace,
+                net.workspace,
                 l.workspace_size,
                 &one,
                 l.dstTensorDesc,
                 l.output_gpu);
 
 #else
-    int i;
-    int m = l.n;
-    int k = l.size*l.size*l.c;
+    int i, j;
+    int m = l.n/l.groups;
+    int k = l.size*l.size*l.c/l.groups;
     int n = l.out_w*l.out_h;
     for(i = 0; i < l.batch; ++i){
-        im2col_ongpu(state.input + i*l.c*l.h*l.w, l.c,  l.h,  l.w,  l.size,  l.stride, l.pad, state.workspace);
-        float * a = l.weights_gpu;
-        float * b = state.workspace;
-        float * c = l.output_gpu;
-        gemm_ongpu(0,0,m,n,k,1.,a,k,b,n,1.,c+i*m*n,n);
+        for(j = 0; j < l.groups; ++j){
+            float *a = l.weights_gpu + j*l.nweights/l.groups;
+            float *b = net.workspace;
+            float *c = l.output_gpu + (i*l.groups + j)*n*m;
+            float *im = net.input_gpu + (i*l.groups + j)*l.c/l.groups*l.h*l.w;
+
+            if (l.size == 1){
+                b = im;
+            } else {
+                im2col_gpu(im, l.c/l.groups, l.h, l.w, l.size, l.stride, l.pad, b);
+            }
+            gemm_gpu(0,0,m,n,k,1,a,k,b,n,1,c,n);
+        }
     }
 #endif
 
     if (l.batch_normalize) {
-        forward_batchnorm_layer_gpu(l, state);
+        forward_batchnorm_layer_gpu(l, net);
+    } else {
+        add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h);
     }
-    add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h);
 
-    activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
+    activate_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation);
     //if(l.dot > 0) dot_error_gpu(l);
     if(l.binary || l.xnor) swap_binary(&l);
 }
 
-void backward_convolutional_layer_gpu(convolutional_layer l, network_state state)
+__global__ void smooth_kernel(float *x, int n, int w, int h, int c, int size, float rate, float *delta) // per element: delta += rate*(neighbour - self) over a size x size window of x
+{
+    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; // flat thread index across a 2D grid of 1D blocks
+    if(id >= n) return; // threads beyond the n elements do nothing
+
+    int j = id % w; // unpack id into column j, row i, channel k, batch b (j varies fastest)
+    id /= w;
+    int i = id % h;
+    id /= h;
+    int k = id % c;
+    id /= c;
+    int b = id;
+
+    int w_offset = -(size/2.f); // float division, truncated toward zero when stored in an int (size 5 gives -2)
+    int h_offset = -(size/2.f);
+
+    int out_index = j + w*(i + h*(k + c*b)); // repacks (b,k,i,j) into this element's own flat index
+    int l, m;
+    for(l = 0; l < size; ++l){
+        for(m = 0; m < size; ++m){
+            int cur_h = h_offset + i + l;
+            int cur_w = w_offset + j + m;
+            int index = cur_w + w*(cur_h + h*(k + b*c)); // flat index of the neighbour in the same channel and batch item
+            int valid = (cur_h >= 0 && cur_h < h &&
+                    cur_w >= 0 && cur_w < w); // neighbours outside the h x w plane contribute nothing
+            delta[out_index] += valid ? rate*(x[index] - x[out_index]) : 0; // x[index] is only read when valid
+        }
+    }
+}
+
+extern "C" void smooth_layer(layer l, int size, float rate) // launches smooth_kernel over every element of the layer output, accumulating into l.delta_gpu
 {
-    //constrain_ongpu(l.outputs*l.batch, 1, l.delta_gpu, 1);
-    gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
+    int h = l.out_h; // output dims: these size n and must also drive the kernel's indexing
+    int w = l.out_w;
+    int c = l.out_c;
+
+    size_t n = h*w*c*l.batch; // one thread per output element
+
+    smooth_kernel<<<cuda_gridsize(n), BLOCK>>>(l.output_gpu, n, w, h, c, size, rate, l.delta_gpu); // pass the OUTPUT dims (was l.w, l.h, l.c): with input dims the neighbour indexing is wrong whenever out_w/out_h differ from w/h
+    check_error(cudaPeekAtLastError());
+}
+
+void backward_convolutional_layer_gpu(convolutional_layer l, network net)
+{
+    if(l.smooth){
+        smooth_layer(l, 5, l.smooth);
+    }
+    //constrain_gpu(l.outputs*l.batch, 1, l.delta_gpu, 1);
+    gradient_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
 
-    backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.n, l.out_w*l.out_h);
 
     if(l.batch_normalize){
-        backward_batchnorm_layer_gpu(l, state);
-        //axpy_ongpu(l.outputs*l.batch, -state.net.decay, l.x_gpu, 1, l.delta_gpu, 1);
+        backward_batchnorm_layer_gpu(l, net);
     } else {
-        //axpy_ongpu(l.outputs*l.batch, -state.net.decay, l.output_gpu, 1, l.delta_gpu, 1);
+        backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.n, l.out_w*l.out_h);
     }
-    float *original_input = state.input;
+    float *original_input = net.input_gpu;
 
-    if(l.xnor) state.input = l.binary_input_gpu;
+    if(l.xnor) net.input_gpu = l.binary_input_gpu;
 #ifdef CUDNN
     float one = 1;
     cudnnConvolutionBackwardFilter(cudnn_handle(),
             &one,
             l.srcTensorDesc,
-            state.input,
+            net.input_gpu,
             l.ddstTensorDesc,
             l.delta_gpu,
             l.convDesc,
             l.bf_algo,
-            state.workspace,
+            net.workspace,
             l.workspace_size,
             &one,
             l.dweightDesc,
             l.weight_updates_gpu);
 
-    if(state.delta){
+    if(net.delta_gpu){
         if(l.binary || l.xnor) swap_binary(&l);
         cudnnConvolutionBackwardData(cudnn_handle(),
                 &one,
@@ -167,108 +219,111 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state
                 l.delta_gpu,
                 l.convDesc,
                 l.bd_algo,
-                state.workspace,
+                net.workspace,
                 l.workspace_size,
                 &one,
                 l.dsrcTensorDesc,
-                state.delta);
+                net.delta_gpu);
         if(l.binary || l.xnor) swap_binary(&l);
-        if(l.xnor) gradient_array_ongpu(original_input, l.batch*l.c*l.h*l.w, HARDTAN, state.delta);
+        if(l.xnor) gradient_array_gpu(original_input, l.batch*l.c*l.h*l.w, HARDTAN, net.delta_gpu);
     }
 
 #else
-    int m = l.n;
-    int n = l.size*l.size*l.c;
+    int m = l.n/l.groups;
+    int n = l.size*l.size*l.c/l.groups;
     int k = l.out_w*l.out_h;
 
-    int i;
+    int i, j;
     for(i = 0; i < l.batch; ++i){
-        float * a = l.delta_gpu;
-        float * b = state.workspace;
-        float * c = l.weight_updates_gpu;
-
-        im2col_ongpu(state.input + i*l.c*l.h*l.w, l.c,  l.h,  l.w,  l.size,  l.stride, l.pad, state.workspace);
-        gemm_ongpu(0,1,m,n,k,1,a + i*m*k,k,b,k,1,c,n);
-
-        if(state.delta){
-            if(l.binary || l.xnor) swap_binary(&l);
-            float * a = l.weights_gpu;
-            float * b = l.delta_gpu;
-            float * c = state.workspace;
-
-            gemm_ongpu(1,0,n,k,m,1,a,n,b + i*k*m,k,0,c,k);
-
-            col2im_ongpu(state.workspace, l.c,  l.h,  l.w,  l.size,  l.stride, l.pad, state.delta + i*l.c*l.h*l.w);
-            if(l.binary || l.xnor) {
-                swap_binary(&l);
+        for(j = 0; j < l.groups; ++j){
+            float *a = l.delta_gpu + (i*l.groups + j)*m*k;
+            float *b = net.workspace;
+            float *c = l.weight_updates_gpu + j*l.nweights/l.groups;
+
+            float *im  = net.input_gpu+(i*l.groups + j)*l.c/l.groups*l.h*l.w;
+            float *imd = net.delta_gpu+(i*l.groups + j)*l.c/l.groups*l.h*l.w;
+
+            im2col_gpu(im, l.c/l.groups, l.h, l.w, l.size, l.stride, l.pad, b);
+            gemm_gpu(0,1,m,n,k,1,a,k,b,k,1,c,n);
+
+            if (net.delta_gpu) {
+                if (l.binary || l.xnor) swap_binary(&l);
+                a = l.weights_gpu + j*l.nweights/l.groups;
+                b = l.delta_gpu + (i*l.groups + j)*m*k;
+                c = net.workspace;
+                if (l.size == 1) {
+                    c = imd;
+                }
+
+                gemm_gpu(1,0,n,k,m,1,a,n,b,k,0,c,k);
+
+                if (l.size != 1) {
+                    col2im_gpu(net.workspace, l.c/l.groups, l.h, l.w, l.size, l.stride, l.pad, imd);
+                }
+                if(l.binary || l.xnor) {
+                    swap_binary(&l);
+                }
             }
-            if(l.xnor) gradient_array_ongpu(original_input + i*l.c*l.h*l.w, l.c*l.h*l.w, HARDTAN, state.delta + i*l.c*l.h*l.w);
+            if(l.xnor) gradient_array_gpu(original_input + i*l.c*l.h*l.w, l.c*l.h*l.w, HARDTAN, net.delta_gpu + i*l.c*l.h*l.w);
         }
     }
 #endif
 }
 
-void pull_convolutional_layer(convolutional_layer layer)
+void pull_convolutional_layer(layer l) // cuda_pull_array from each *_gpu parameter buffer into its host counterpart (presumably device-to-host - confirm)
 {
-    cuda_pull_array(layer.weights_gpu, layer.weights, layer.c*layer.n*layer.size*layer.size);
-    cuda_pull_array(layer.biases_gpu, layer.biases, layer.n);
-    cuda_pull_array(layer.weight_updates_gpu, layer.weight_updates, layer.c*layer.n*layer.size*layer.size);
-    cuda_pull_array(layer.bias_updates_gpu, layer.bias_updates, layer.n);
-    if (layer.batch_normalize){
-        cuda_pull_array(layer.scales_gpu, layer.scales, layer.n);
-        cuda_pull_array(layer.rolling_mean_gpu, layer.rolling_mean, layer.n);
-        cuda_pull_array(layer.rolling_variance_gpu, layer.rolling_variance, layer.n);
-    }
-    if (layer.adam){
-        cuda_pull_array(layer.m_gpu, layer.m, layer.c*layer.n*layer.size*layer.size);
-        cuda_pull_array(layer.v_gpu, layer.v, layer.c*layer.n*layer.size*layer.size);
+    cuda_pull_array(l.weights_gpu, l.weights, l.nweights); // weight count comes from l.nweights
+    cuda_pull_array(l.biases_gpu, l.biases, l.n);
+    cuda_pull_array(l.weight_updates_gpu, l.weight_updates, l.nweights);
+    cuda_pull_array(l.bias_updates_gpu, l.bias_updates, l.n);
+    if (l.batch_normalize){ // scales and rolling statistics are transferred only for batch-normalized layers
+        cuda_pull_array(l.scales_gpu, l.scales, l.n);
+        cuda_pull_array(l.rolling_mean_gpu, l.rolling_mean, l.n);
+        cuda_pull_array(l.rolling_variance_gpu, l.rolling_variance, l.n); // NOTE(review): the Adam m/v buffers are not transferred by this function
     }
 }
 
-void push_convolutional_layer(convolutional_layer layer)
+void push_convolutional_layer(layer l) // cuda_push_array from each host parameter buffer into its *_gpu counterpart (presumably host-to-device - confirm)
 {
-    cuda_push_array(layer.weights_gpu, layer.weights, layer.c*layer.n*layer.size*layer.size);
-    cuda_push_array(layer.biases_gpu, layer.biases, layer.n);
-    cuda_push_array(layer.weight_updates_gpu, layer.weight_updates, layer.c*layer.n*layer.size*layer.size);
-    cuda_push_array(layer.bias_updates_gpu, layer.bias_updates, layer.n);
-    if (layer.batch_normalize){
-        cuda_push_array(layer.scales_gpu, layer.scales, layer.n);
-        cuda_push_array(layer.rolling_mean_gpu, layer.rolling_mean, layer.n);
-        cuda_push_array(layer.rolling_variance_gpu, layer.rolling_variance, layer.n);
-    }
-    if (layer.adam){
-        cuda_push_array(layer.m_gpu, layer.m, layer.c*layer.n*layer.size*layer.size);
-        cuda_push_array(layer.v_gpu, layer.v, layer.c*layer.n*layer.size*layer.size);
+    cuda_push_array(l.weights_gpu, l.weights, l.nweights); // weight count comes from l.nweights
+    cuda_push_array(l.biases_gpu, l.biases, l.n);
+    cuda_push_array(l.weight_updates_gpu, l.weight_updates, l.nweights);
+    cuda_push_array(l.bias_updates_gpu, l.bias_updates, l.n);
+    if (l.batch_normalize){ // scales and rolling statistics are transferred only for batch-normalized layers
+        cuda_push_array(l.scales_gpu, l.scales, l.n);
+        cuda_push_array(l.rolling_mean_gpu, l.rolling_mean, l.n);
+        cuda_push_array(l.rolling_variance_gpu, l.rolling_variance, l.n); // NOTE(review): the Adam m/v buffers are not transferred by this function
     }
 }
 
-void update_convolutional_layer_gpu(convolutional_layer layer, int batch, float learning_rate, float momentum, float decay)
+void update_convolutional_layer_gpu(layer l, update_args a) // GPU parameter update: Adam path when a.adam is set, otherwise axpy/scal; optional weight clipping afterwards
 {
-    int size = layer.size*layer.size*layer.c*layer.n;
-    axpy_ongpu(layer.n, learning_rate/batch, layer.bias_updates_gpu, 1, layer.biases_gpu, 1);
-    scal_ongpu(layer.n, momentum, layer.bias_updates_gpu, 1);
-
-    if(layer.scales_gpu){
-        axpy_ongpu(layer.n, learning_rate/batch, layer.scale_updates_gpu, 1, layer.scales_gpu, 1);
-        scal_ongpu(layer.n, momentum, layer.scale_updates_gpu, 1);
-    }
-
-    if(layer.adam){
-        scal_ongpu(size, layer.B1, layer.m_gpu, 1);
-        scal_ongpu(size, layer.B2, layer.v_gpu, 1);
-
-        axpy_ongpu(size, -decay*batch, layer.weights_gpu, 1, layer.weight_updates_gpu, 1);
+    float learning_rate = a.learning_rate*l.learning_rate_scale; // shared learning rate multiplied by this layer's scale
+    float momentum = a.momentum;
+    float decay = a.decay;
+    int batch = a.batch;
+
+    if(a.adam){
+        adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.nweights, batch, a.t);
+        adam_update_gpu(l.biases_gpu, l.bias_updates_gpu, l.bias_m_gpu, l.bias_v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.n, batch, a.t);
+        if(l.scales_gpu){ // scales are updated only when the scales buffer exists
+            adam_update_gpu(l.scales_gpu, l.scale_updates_gpu, l.scale_m_gpu, l.scale_v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.n, batch, a.t);
+        }
+    }else{
+        axpy_gpu(l.nweights, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1); // decay term goes into the weight updates before they are applied
+        axpy_gpu(l.nweights, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
+        scal_gpu(l.nweights, momentum, l.weight_updates_gpu, 1); // updates are scaled by momentum rather than cleared
 
-        axpy_ongpu(size, -(1-layer.B1), layer.weight_updates_gpu, 1, layer.m_gpu, 1);
-        mul_ongpu(size, layer.weight_updates_gpu, 1, layer.weight_updates_gpu, 1);
-        axpy_ongpu(size, (1-layer.B2), layer.weight_updates_gpu, 1, layer.v_gpu, 1);
+        axpy_gpu(l.n, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);
+        scal_gpu(l.n, momentum, l.bias_updates_gpu, 1);
 
-        adam_gpu(size, layer.weights_gpu, layer.m_gpu, layer.v_gpu, layer.B1, layer.B2, learning_rate/batch, layer.eps, layer.t+1);
-        fill_ongpu(size, 0, layer.weight_updates_gpu, 1);
-    }else{
-        axpy_ongpu(size, -decay*batch, layer.weights_gpu, 1, layer.weight_updates_gpu, 1);
-        axpy_ongpu(size, learning_rate/batch, layer.weight_updates_gpu, 1, layer.weights_gpu, 1);
-        scal_ongpu(size, momentum, layer.weight_updates_gpu, 1);
+        if(l.scales_gpu){
+            axpy_gpu(l.n, learning_rate/batch, l.scale_updates_gpu, 1, l.scales_gpu, 1);
+            scal_gpu(l.n, momentum, l.scale_updates_gpu, 1);
+        }
+    }
+    if(l.clip){ // runs after either update path, only when l.clip is non-zero
+        constrain_gpu(l.nweights, l.clip, l.weights_gpu, 1); // NOTE(review): presumably clamps weights to [-clip, clip] - confirm
     }
 }
 
diff --git a/image.darknet/src/convolutional_layer.c b/image.darknet/src/convolutional_layer.c
index 37211ab..1fb58b0 100644
--- a/image.darknet/src/convolutional_layer.c
+++ b/image.darknet/src/convolutional_layer.c
@@ -12,22 +12,17 @@
 #include "xnor_layer.h"
 #endif
 
-#ifndef AI2
-#define AI2 0
-void forward_xnor_layer(layer l, network_state state);
-#endif
-
 void swap_binary(convolutional_layer *l)
 {
     float *swap = l->weights;
     l->weights = l->binary_weights;
     l->binary_weights = swap;
 
-    #ifdef GPU
+#ifdef GPU // GPU builds also exchange the weights_gpu / binary_weights_gpu pointers
     swap = l->weights_gpu;
     l->weights_gpu = l->binary_weights_gpu;
     l->binary_weights_gpu = swap;
-    #endif
+#endif // GPU
 }
 
 void binarize_weights(float *weights, int n, int size, float *binary)
@@ -80,23 +75,15 @@ int convolutional_out_width(convolutional_layer l)
 
 image get_convolutional_image(convolutional_layer l)
 {
-    int h,w,c;
-    h = convolutional_out_height(l);
-    w = convolutional_out_width(l);
-    c = l.n;
-    return float_to_image(w,h,c,l.output);
+    return float_to_image(l.out_w,l.out_h,l.out_c,l.output); // hands l.output to float_to_image using the stored out_w/out_h/out_c rather than recomputing the output size
 }
 
 image get_convolutional_delta(convolutional_layer l)
 {
-    int h,w,c;
-    h = convolutional_out_height(l);
-    w = convolutional_out_width(l);
-    c = l.n;
-    return float_to_image(w,h,c,l.delta);
+    return float_to_image(l.out_w,l.out_h,l.out_c,l.delta); // same as get_convolutional_image but over l.delta, using the stored out_w/out_h/out_c
 }
 
-size_t get_workspace_size(layer l){
+static size_t get_workspace_size(layer l){
 #ifdef CUDNN
     if(gpu_index >= 0){
         size_t most = 0;
@@ -127,8 +114,8 @@ size_t get_workspace_size(layer l){
         if (s > most) most = s;
         return most;
     }
-    #endif
-    return (size_t)l.out_h*l.out_w*l.size*l.size*l.c*sizeof(float);
+#endif
+    return (size_t)l.out_h*l.out_w*l.size*l.size*l.c/l.groups*sizeof(float);
 }
 
 #ifdef GPU
@@ -137,46 +124,62 @@ void cudnn_convolutional_setup(layer *l)
 {
     cudnnSetTensor4dDescriptor(l->dsrcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w); 
     cudnnSetTensor4dDescriptor(l->ddstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w); 
-    cudnnSetFilter4dDescriptor(l->dweightDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size); 
 
     cudnnSetTensor4dDescriptor(l->srcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w); 
     cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w); 
-    cudnnSetFilter4dDescriptor(l->weightDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size); 
+    cudnnSetTensor4dDescriptor(l->normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l->out_c, 1, 1); 
+
+    cudnnSetFilter4dDescriptor(l->dweightDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c/l->groups, l->size, l->size); 
+    cudnnSetFilter4dDescriptor(l->weightDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c/l->groups, l->size, l->size); 
+    #if CUDNN_MAJOR >= 6
+    cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT);
+    #else
     cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION);
+    #endif
+
+    #if CUDNN_MAJOR >= 7
+    cudnnSetConvolutionGroupCount(l->convDesc, l->groups);
+    #else
+    if(l->groups > 1){
+        error("CUDNN < 7 doesn't support groups, please upgrade!");
+    }
+    #endif
+
     cudnnGetConvolutionForwardAlgorithm(cudnn_handle(),
             l->srcTensorDesc,
             l->weightDesc,
             l->convDesc,
             l->dstTensorDesc,
-            CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
-            0,
+            CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+            2000000000,
             &l->fw_algo);
     cudnnGetConvolutionBackwardDataAlgorithm(cudnn_handle(),
             l->weightDesc,
             l->ddstTensorDesc,
             l->convDesc,
             l->dsrcTensorDesc,
-            CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST,
-            0,
+            CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
+            2000000000,
             &l->bd_algo);
     cudnnGetConvolutionBackwardFilterAlgorithm(cudnn_handle(),
             l->srcTensorDesc,
             l->ddstTensorDesc,
             l->convDesc,
             l->dweightDesc,
-            CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST,
-            0,
+            CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
+            2000000000,
             &l->bf_algo);
 }
 #endif
 #endif
 
-convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam)
+convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int n, int groups, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam)
 {
     int i;
     convolutional_layer l = {0};
     l.type = CONVOLUTIONAL;
 
+    l.groups = groups;
     l.h = h;
     l.w = w;
     l.c = c;
@@ -189,17 +192,23 @@ convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int
     l.pad = padding;
     l.batch_normalize = batch_normalize;
 
-    l.weights = calloc(c*n*size*size, sizeof(float));
-    l.weight_updates = calloc(c*n*size*size, sizeof(float));
+    l.weights = calloc(c/groups*n*size*size, sizeof(float));
+    l.weight_updates = calloc(c/groups*n*size*size, sizeof(float));
 
     l.biases = calloc(n, sizeof(float));
     l.bias_updates = calloc(n, sizeof(float));
 
+    l.nweights = c/groups*n*size*size;
+    l.nbiases = n;
+
     // float scale = 1./sqrt(size*size*c);
-    float scale = sqrt(2./(size*size*c));
-    for(i = 0; i < c*n*size*size; ++i) l.weights[i] = scale*rand_uniform(-1, 1);
-    int out_h = convolutional_out_height(l);
+    float scale = sqrt(2./(size*size*c/l.groups));
+    //printf("convscale %f\n", scale);
+    //scale = .02;
+    //for(i = 0; i < c*n*size*size; ++i) l.weights[i] = scale*rand_uniform(-1, 1);
+    for(i = 0; i < l.nweights; ++i) l.weights[i] = scale*rand_normal();
     int out_w = convolutional_out_width(l);
+    int out_h = convolutional_out_height(l);
     l.out_h = out_h;
     l.out_w = out_w;
     l.out_c = n;
@@ -213,12 +222,12 @@ convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int
     l.backward = backward_convolutional_layer;
     l.update = update_convolutional_layer;
     if(binary){
-        l.binary_weights = calloc(c*n*size*size, sizeof(float));
-        l.cweights = calloc(c*n*size*size, sizeof(char));
+        l.binary_weights = calloc(l.nweights, sizeof(float));
+        l.cweights = calloc(l.nweights, sizeof(char));
         l.scales = calloc(n, sizeof(float));
     }
     if(xnor){
-        l.binary_weights = calloc(c*n*size*size, sizeof(float));
+        l.binary_weights = calloc(l.nweights, sizeof(float));
         l.binary_input = calloc(l.inputs*l.batch, sizeof(float));
     }
 
@@ -241,9 +250,12 @@ convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int
         l.x_norm = calloc(l.batch*l.outputs, sizeof(float));
     }
     if(adam){
-        l.adam = 1;
-        l.m = calloc(c*n*size*size, sizeof(float));
-        l.v = calloc(c*n*size*size, sizeof(float));
+        l.m = calloc(l.nweights, sizeof(float));
+        l.v = calloc(l.nweights, sizeof(float));
+        l.bias_m = calloc(n, sizeof(float));
+        l.scale_m = calloc(n, sizeof(float));
+        l.bias_v = calloc(n, sizeof(float));
+        l.scale_v = calloc(n, sizeof(float));
     }
 
 #ifdef GPU
@@ -253,12 +265,16 @@ convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int
 
     if(gpu_index >= 0){
         if (adam) {
-            l.m_gpu = cuda_make_array(l.m, c*n*size*size);
-            l.v_gpu = cuda_make_array(l.v, c*n*size*size);
+            l.m_gpu = cuda_make_array(l.m, l.nweights);
+            l.v_gpu = cuda_make_array(l.v, l.nweights);
+            l.bias_m_gpu = cuda_make_array(l.bias_m, n);
+            l.bias_v_gpu = cuda_make_array(l.bias_v, n);
+            l.scale_m_gpu = cuda_make_array(l.scale_m, n);
+            l.scale_v_gpu = cuda_make_array(l.scale_v, n);
         }
 
-        l.weights_gpu = cuda_make_array(l.weights, c*n*size*size);
-        l.weight_updates_gpu = cuda_make_array(l.weight_updates, c*n*size*size);
+        l.weights_gpu = cuda_make_array(l.weights, l.nweights);
+        l.weight_updates_gpu = cuda_make_array(l.weight_updates, l.nweights);
 
         l.biases_gpu = cuda_make_array(l.biases, n);
         l.bias_updates_gpu = cuda_make_array(l.bias_updates, n);
@@ -267,10 +283,10 @@ convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int
         l.output_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);
 
         if(binary){
-            l.binary_weights_gpu = cuda_make_array(l.weights, c*n*size*size);
+            l.binary_weights_gpu = cuda_make_array(l.weights, l.nweights);
         }
         if(xnor){
-            l.binary_weights_gpu = cuda_make_array(l.weights, c*n*size*size);
+            l.binary_weights_gpu = cuda_make_array(l.weights, l.nweights);
             l.binary_input_gpu = cuda_make_array(0, l.inputs*l.batch);
         }
 
@@ -291,6 +307,7 @@ convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int
             l.x_norm_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);
         }
 #ifdef CUDNN
+        cudnnCreateTensorDescriptor(&l.normTensorDesc);
         cudnnCreateTensorDescriptor(&l.srcTensorDesc);
         cudnnCreateTensorDescriptor(&l.dstTensorDesc);
         cudnnCreateFilterDescriptor(&l.weightDesc);
@@ -305,7 +322,7 @@ convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int
     l.workspace_size = get_workspace_size(l);
     l.activation = activation;
 
-    fprintf(stderr, "conv  %5d %2d x%2d /%2d  %4d x%4d x%4d   ->  %4d x%4d x%4d\n", n, size, size, stride, w, h, c, l.out_w, l.out_h, l.out_c);
+    fprintf(stderr, "conv  %5d %2d x%2d /%2d  %4d x%4d x%4d   ->  %4d x%4d x%4d  %5.3f BFLOPs\n", n, size, size, stride, w, h, c, l.out_w, l.out_h, l.out_c, (2.0 * l.n * l.size*l.size*l.c/l.groups * l.out_h*l.out_w)/1000000000.);
 
     return l;
 }
@@ -315,8 +332,8 @@ void denormalize_convolutional_layer(convolutional_layer l)
     int i, j;
     for(i = 0; i < l.n; ++i){
         float scale = l.scales[i]/sqrt(l.rolling_variance[i] + .00001);
-        for(j = 0; j < l.c*l.size*l.size; ++j){
-            l.weights[i*l.c*l.size*l.size + j] *= scale;
+        for(j = 0; j < l.c/l.groups*l.size*l.size; ++j){
+            l.weights[i*l.c/l.groups*l.size*l.size + j] *= scale;
         }
         l.biases[i] -= l.rolling_mean[i] * scale;
         l.scales[i] = 1;
@@ -325,6 +342,7 @@ void denormalize_convolutional_layer(convolutional_layer l)
     }
 }
 
+/*
 void test_convolutional_layer()
 {
     convolutional_layer l = make_convolutional_layer(1, 5, 5, 3, 2, 5, 2, 1, LEAKY, 1, 0, 0, 0);
@@ -344,10 +362,10 @@ void test_convolutional_layer()
         3,3,3,3,3,
         3,3,3,3,3,
         3,3,3,3,3};
-    network_state state = {0};
-    state.input = data;
-    forward_convolutional_layer(l, state);
+    //net.input = data;
+    //forward_convolutional_layer(l);
 }
+*/
 
 void resize_convolutional_layer(convolutional_layer *l, int w, int h)
 {
@@ -424,88 +442,106 @@ void backward_bias(float *bias_updates, float *delta, int batch, int n, int size
     }
 }
 
-void forward_convolutional_layer(convolutional_layer l, network_state state)
+void forward_convolutional_layer(convolutional_layer l, network net) // CPU forward pass: per image and per group, GEMM of the weights against the (im2col'd) input, then batchnorm or bias, then activation
 {
-    int out_h = convolutional_out_height(l);
-    int out_w = convolutional_out_width(l);
-    int i;
+    int i, j; // i: batch index, j: group index
 
     fill_cpu(l.outputs*l.batch, 0, l.output, 1);
 
     if(l.xnor){
-        binarize_weights(l.weights, l.n, l.c*l.size*l.size, l.binary_weights);
+        binarize_weights(l.weights, l.n, l.c/l.groups*l.size*l.size, l.binary_weights);
         swap_binary(&l);
-        binarize_cpu(state.input, l.c*l.h*l.w*l.batch, l.binary_input);
-        state.input = l.binary_input;
+        binarize_cpu(net.input, l.c*l.h*l.w*l.batch, l.binary_input);
+        net.input = l.binary_input; // net is a by-value copy, so only this call sees the binarized input
     }
 
-    int m = l.n;
-    int k = l.size*l.size*l.c;
-    int n = out_h*out_w;
-
-
-    float *a = l.weights;
-    float *b = state.workspace;
-    float *c = l.output;
-
+    int m = l.n/l.groups; // filters per group (GEMM M)
+    int k = l.size*l.size*l.c/l.groups; // inputs feeding one filter (GEMM K)
+    int n = l.out_w*l.out_h; // output positions per channel (GEMM N)
     for(i = 0; i < l.batch; ++i){
-        im2col_cpu(state.input, l.c, l.h, l.w, 
-                l.size, l.stride, l.pad, b);
-        gemm(0,0,m,n,k,1,a,k,b,n,1,c,n);
-        c += n*m;
-        state.input += l.c*l.h*l.w;
+        for(j = 0; j < l.groups; ++j){
+            float *a = l.weights + j*l.nweights/l.groups; // weight slice of group j
+            float *b = net.workspace;
+            float *c = l.output + (i*l.groups + j)*n*m; // output slice of image i, group j
+            float *im =  net.input + (i*l.groups + j)*l.c/l.groups*l.h*l.w; // input slice of image i, group j
+
+            if (l.size == 1 && l.stride == 1 && l.pad == 0) { // the raw input equals the column matrix only for an unstrided, unpadded 1x1 kernel (then n == l.h*l.w)
+                b = im;
+            } else {
+                im2col_cpu(im, l.c/l.groups, l.h, l.w, l.size, l.stride, l.pad, b); // general case, including strided or padded 1x1 kernels
+            }
+            gemm(0,0,m,n,k,1,a,k,b,n,1,c,n);
+        }
     }
 
     if(l.batch_normalize){
-        forward_batchnorm_layer(l, state);
+        forward_batchnorm_layer(l, net); // presumably applies the biases itself, since add_bias is skipped on this path - confirm
+    } else {
+        add_bias(l.output, l.biases, l.batch, l.n, l.out_h*l.out_w); // bias is added here only when batchnorm is off
     }
-    add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w);
 
-    activate_array(l.output, m*n*l.batch, l.activation);
+    activate_array(l.output, l.outputs*l.batch, l.activation);
     if(l.binary || l.xnor) swap_binary(&l);
 }
 
-void backward_convolutional_layer(convolutional_layer l, network_state state)
+void backward_convolutional_layer(convolutional_layer l, network net) // CPU backward pass: activation gradient, batchnorm or bias gradient, then per image/group weight gradients and (if net.delta is set) input gradients
 {
-    int i;
-    int m = l.n;
-    int n = l.size*l.size*l.c;
-    int k = convolutional_out_height(l)*
-        convolutional_out_width(l);
+    int i, j; // i: batch index, j: group index
+    int m = l.n/l.groups; // filters per group
+    int n = l.size*l.size*l.c/l.groups; // weights per filter
+    int k = l.out_w*l.out_h; // output positions per channel
 
-    gradient_array(l.output, m*k*l.batch, l.activation, l.delta);
-    backward_bias(l.bias_updates, l.delta, l.batch, l.n, k);
+    gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);
 
     if(l.batch_normalize){
-        backward_batchnorm_layer(l, state);
+        backward_batchnorm_layer(l, net);
+    } else {
+        backward_bias(l.bias_updates, l.delta, l.batch, l.n, k); // bias gradient is computed here only when batchnorm is off
     }
 
     for(i = 0; i < l.batch; ++i){
-        float *a = l.delta + i*m*k;
-        float *b = state.workspace;
-        float *c = l.weight_updates;
-
-        float *im = state.input+i*l.c*l.h*l.w;
+        for(j = 0; j < l.groups; ++j){
+            float *a = l.delta + (i*l.groups + j)*m*k; // output-gradient slice of image i, group j
+            float *b = net.workspace;
+            float *c = l.weight_updates + j*l.nweights/l.groups; // weight-gradient slice of group j
+
+            float *im  = net.input + (i*l.groups + j)*l.c/l.groups*l.h*l.w;
+            float *imd = net.delta + (i*l.groups + j)*l.c/l.groups*l.h*l.w; // only dereferenced inside the net.delta check below
+
+            if(l.size == 1 && l.stride == 1 && l.pad == 0){ // the raw input equals the column matrix only for an unstrided, unpadded 1x1 kernel
+                b = im;
+            } else {
+                im2col_cpu(im, l.c/l.groups, l.h, l.w, 
+                        l.size, l.stride, l.pad, b);
+            }
 
-        im2col_cpu(im, l.c, l.h, l.w, 
-                l.size, l.stride, l.pad, b);
-        gemm(0,1,m,n,k,1,a,k,b,k,1,c,n);
+            gemm(0,1,m,n,k,1,a,k,b,k,1,c,n); // accumulates (beta=1) into the weight-gradient slice
 
-        if(state.delta){
-            a = l.weights;
-            b = l.delta + i*m*k;
-            c = state.workspace;
+            if (net.delta) {
+                a = l.weights + j*l.nweights/l.groups;
+                b = l.delta + (i*l.groups + j)*m*k;
+                c = net.workspace;
+                if (l.size == 1 && l.stride == 1 && l.pad == 0) { // must mirror the condition used for b above
+                    c = imd;
+                }
 
-            gemm(1,0,n,k,m,1,a,n,b,k,0,c,k);
+                gemm(1,0,n,k,m,1,a,n,b,k,0,c,k); // NOTE(review): with c == imd this beta=0 GEMM overwrites net.delta; confirm that matches what col2im_cpu does on the other path
 
-            col2im_cpu(state.workspace, l.c,  l.h,  l.w,  l.size,  l.stride, l.pad, state.delta+i*l.c*l.h*l.w);
+                if (l.size != 1 || l.stride != 1 || l.pad != 0) { // exact negation of the direct-write condition above
+                    col2im_cpu(net.workspace, l.c/l.groups, l.h, l.w, l.size, l.stride, l.pad, imd);
+                }
+            }
         }
     }
 }
 
-void update_convolutional_layer(convolutional_layer l, int batch, float learning_rate, float momentum, float decay)
+void update_convolutional_layer(convolutional_layer l, update_args a)
 {
-    int size = l.size*l.size*l.c*l.n;
+    float learning_rate = a.learning_rate*l.learning_rate_scale;
+    float momentum = a.momentum;
+    float decay = a.decay;
+    int batch = a.batch;
+
     axpy_cpu(l.n, learning_rate/batch, l.bias_updates, 1, l.biases, 1);
     scal_cpu(l.n, momentum, l.bias_updates, 1);
 
@@ -514,9 +550,9 @@ void update_convolutional_layer(convolutional_layer l, int batch, float learning
         scal_cpu(l.n, momentum, l.scale_updates, 1);
     }
 
-    axpy_cpu(size, -decay*batch, l.weights, 1, l.weight_updates, 1);
-    axpy_cpu(size, learning_rate/batch, l.weight_updates, 1, l.weights, 1);
-    scal_cpu(size, momentum, l.weight_updates, 1);
+    axpy_cpu(l.nweights, -decay*batch, l.weights, 1, l.weight_updates, 1);
+    axpy_cpu(l.nweights, learning_rate/batch, l.weight_updates, 1, l.weights, 1);
+    scal_cpu(l.nweights, momentum, l.weight_updates, 1);
 }
 
 
@@ -524,7 +560,7 @@ image get_convolutional_weight(convolutional_layer l, int i)
 {
     int h = l.size;
     int w = l.size;
-    int c = l.c;
+    int c = l.c/l.groups;
     return float_to_image(w,h,c,l.weights+i*h*w*c);
 }
 
@@ -558,8 +594,14 @@ image *get_weights(convolutional_layer l)
     int i;
     for(i = 0; i < l.n; ++i){
         weights[i] = copy_image(get_convolutional_weight(l, i));
-        //normalize_image(weights[i]);
+        normalize_image(weights[i]);
+        /*
+           char buff[256];
+           sprintf(buff, "filter%d", i);
+           save_image(weights[i], buff);
+         */
     }
+    //error("hey");
     return weights;
 }
 
diff --git a/image.darknet/src/convolutional_layer.h b/image.darknet/src/convolutional_layer.h
index 970aa10..6c261f5 100644
--- a/image.darknet/src/convolutional_layer.h
+++ b/image.darknet/src/convolutional_layer.h
@@ -10,31 +10,31 @@
 typedef layer convolutional_layer;
 
 #ifdef GPU
-void forward_convolutional_layer_gpu(convolutional_layer layer, network_state state);
-void backward_convolutional_layer_gpu(convolutional_layer layer, network_state state);
-void update_convolutional_layer_gpu(convolutional_layer layer, int batch, float learning_rate, float momentum, float decay);
+void forward_convolutional_layer_gpu(convolutional_layer layer, network net);
+void backward_convolutional_layer_gpu(convolutional_layer layer, network net);
+void update_convolutional_layer_gpu(convolutional_layer layer, update_args a);
 
 void push_convolutional_layer(convolutional_layer layer);
 void pull_convolutional_layer(convolutional_layer layer);
 
 void add_bias_gpu(float *output, float *biases, int batch, int n, int size);
 void backward_bias_gpu(float *bias_updates, float *delta, int batch, int n, int size);
+void adam_update_gpu(float *w, float *d, float *m, float *v, float B1, float B2, float eps, float decay, float rate, int n, int batch, int t);
 #ifdef CUDNN
 void cudnn_convolutional_setup(layer *l);
 #endif
 #endif
 
-convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam);
-void denormalize_convolutional_layer(convolutional_layer l);
+convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int n, int groups, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam);
 void resize_convolutional_layer(convolutional_layer *layer, int w, int h);
-void forward_convolutional_layer(const convolutional_layer layer, network_state state);
-void update_convolutional_layer(convolutional_layer layer, int batch, float learning_rate, float momentum, float decay);
+void forward_convolutional_layer(const convolutional_layer layer, network net);
+void update_convolutional_layer(convolutional_layer layer, update_args a);
 image *visualize_convolutional_layer(convolutional_layer layer, char *window, image *prev_weights);
 void binarize_weights(float *weights, int n, int size, float *binary);
 void swap_binary(convolutional_layer *l);
 void binarize_weights2(float *weights, int n, int size, char *binary, float *scales);
 
-void backward_convolutional_layer(convolutional_layer layer, network_state state);
+void backward_convolutional_layer(convolutional_layer layer, network net);
 
 void add_bias(float *output, float *biases, int batch, int n, int size);
 void backward_bias(float *bias_updates, float *delta, int batch, int n, int size);
@@ -45,8 +45,6 @@ image get_convolutional_weight(convolutional_layer layer, int i);
 
 int convolutional_out_height(convolutional_layer layer);
 int convolutional_out_width(convolutional_layer layer);
-void rescale_weights(convolutional_layer l, float scale, float trans);
-void rgbgr_weights(convolutional_layer l);
 
 #endif
 
diff --git a/image.darknet/src/cost_layer.c b/image.darknet/src/cost_layer.c
index 39d2398..2138ff2 100644
--- a/image.darknet/src/cost_layer.c
+++ b/image.darknet/src/cost_layer.c
@@ -9,9 +9,12 @@
 
 COST_TYPE get_cost_type(char *s)
 {
+    if (strcmp(s, "seg")==0) return SEG; // names are matched exactly and case-sensitively; anything unrecognised falls back to SSE below
     if (strcmp(s, "sse")==0) return SSE;
     if (strcmp(s, "masked")==0) return MASKED;
     if (strcmp(s, "smooth")==0) return SMOOTH;
+    if (strcmp(s, "L1")==0) return L1; // capital L required: "l1" would not match and would fall back to SSE
+    if (strcmp(s, "wgan")==0) return WGAN;
     fprintf(stderr, "Couldn't find cost type %s, going with SSE\n", s);
     return SSE;
 }
@@ -19,12 +22,18 @@ COST_TYPE get_cost_type(char *s)
 char *get_cost_string(COST_TYPE a)
 {
     switch(a){
+        case SEG: // any value without a case here falls out of the switch and is reported as "sse"
+            return "seg";
         case SSE:
             return "sse";
         case MASKED:
             return "masked";
         case SMOOTH:
             return "smooth";
+        case L1:
+            return "L1"; // capitalised, the same spelling get_cost_type compares against
+        case WGAN:
+            return "wgan";
     }
     return "sse";
 }
@@ -70,26 +79,28 @@ void resize_cost_layer(cost_layer *l, int inputs)
 #endif
 }
 
-void forward_cost_layer(cost_layer l, network_state state)
+void forward_cost_layer(cost_layer l, network net) // CPU cost: fills l.delta and l.output per element, then stores the sum of l.output in l.cost[0]
 {
-    if (!state.truth) return;
+    if (!net.truth) return; // no ground truth supplied: nothing is computed
     if(l.cost_type == MASKED){
         int i;
         for(i = 0; i < l.batch*l.inputs; ++i){
-            if(state.truth[i] == SECRET_NUM) state.input[i] = SECRET_NUM;
+            if(net.truth[i] == SECRET_NUM) net.input[i] = SECRET_NUM; // copies the sentinel into the input where truth is masked; presumably so those entries contribute no error - confirm
         }
     }
     if(l.cost_type == SMOOTH){
-        smooth_l1_cpu(l.batch*l.inputs, state.input, state.truth, l.delta, l.output);
+        smooth_l1_cpu(l.batch*l.inputs, net.input, net.truth, l.delta, l.output);
+    }else if(l.cost_type == L1){
+        l1_cpu(l.batch*l.inputs, net.input, net.truth, l.delta, l.output);
     } else {
-        l2_cpu(l.batch*l.inputs, state.input, state.truth, l.delta, l.output);
+        l2_cpu(l.batch*l.inputs, net.input, net.truth, l.delta, l.output); // every cost type other than SMOOTH and L1 (including SEG and WGAN) takes this branch on the CPU
     }
     l.cost[0] = sum_array(l.output, l.batch*l.inputs);
 }
 
-void backward_cost_layer(const cost_layer l, network_state state)
+void backward_cost_layer(const cost_layer l, network net) // passes the cost layer's delta, weighted by l.scale, on to net.delta
 {
-    axpy_cpu(l.batch*l.inputs, l.scale, l.delta, 1, state.delta, 1);
+    axpy_cpu(l.batch*l.inputs, l.scale, l.delta, 1, net.delta, 1); // presumably net.delta += l.scale * l.delta (BLAS axpy convention) - confirm
 }
 
 #ifdef GPU
@@ -113,17 +124,30 @@ int float_abs_compare (const void * a, const void * b)
     return (fa > fb) - (fa < fb);
 }
 
-void forward_cost_layer_gpu(cost_layer l, network_state state)
+void forward_cost_layer_gpu(cost_layer l, network net)
 {
-    if (!state.truth) return;
-    if (l.cost_type == MASKED) {
-        mask_ongpu(l.batch*l.inputs, state.input, SECRET_NUM, state.truth);
+    if (!net.truth) return;
+    if(l.smooth){
+        scal_gpu(l.batch*l.inputs, (1-l.smooth), net.truth_gpu, 1);
+        add_gpu(l.batch*l.inputs, l.smooth * 1./l.inputs, net.truth_gpu, 1);
     }
 
     if(l.cost_type == SMOOTH){
-        smooth_l1_gpu(l.batch*l.inputs, state.input, state.truth, l.delta_gpu, l.output_gpu);
+        smooth_l1_gpu(l.batch*l.inputs, net.input_gpu, net.truth_gpu, l.delta_gpu, l.output_gpu);
+    } else if (l.cost_type == L1){
+        l1_gpu(l.batch*l.inputs, net.input_gpu, net.truth_gpu, l.delta_gpu, l.output_gpu);
+    } else if (l.cost_type == WGAN){
+        wgan_gpu(l.batch*l.inputs, net.input_gpu, net.truth_gpu, l.delta_gpu, l.output_gpu);
     } else {
-        l2_gpu(l.batch*l.inputs, state.input, state.truth, l.delta_gpu, l.output_gpu);
+        l2_gpu(l.batch*l.inputs, net.input_gpu, net.truth_gpu, l.delta_gpu, l.output_gpu);
+    }
+
+    if (l.cost_type == SEG && l.noobject_scale != 1) {
+        scale_mask_gpu(l.batch*l.inputs, l.delta_gpu, 0, net.truth_gpu, l.noobject_scale);
+        scale_mask_gpu(l.batch*l.inputs, l.output_gpu, 0, net.truth_gpu, l.noobject_scale);
+    }
+    if (l.cost_type == MASKED) {
+        mask_gpu(l.batch*l.inputs, net.delta_gpu, SECRET_NUM, net.truth_gpu, 0);
     }
 
     if(l.ratio){
@@ -133,16 +157,20 @@ void forward_cost_layer_gpu(cost_layer l, network_state state)
         float thresh = l.delta[n];
         thresh = 0;
         printf("%f\n", thresh);
-        supp_ongpu(l.batch*l.inputs, thresh, l.delta_gpu, 1);
+        supp_gpu(l.batch*l.inputs, thresh, l.delta_gpu, 1);
+    }
+
+    if(l.thresh){
+        supp_gpu(l.batch*l.inputs, l.thresh*1./l.inputs, l.delta_gpu, 1);
     }
 
     cuda_pull_array(l.output_gpu, l.output, l.batch*l.inputs);
     l.cost[0] = sum_array(l.output, l.batch*l.inputs);
 }
 
-void backward_cost_layer_gpu(const cost_layer l, network_state state)
+void backward_cost_layer_gpu(const cost_layer l, network net) // GPU counterpart of backward_cost_layer, operating on the *_gpu buffers
 {
-    axpy_ongpu(l.batch*l.inputs, l.scale, l.delta_gpu, 1, state.delta, 1);
+    axpy_gpu(l.batch*l.inputs, l.scale, l.delta_gpu, 1, net.delta_gpu, 1); // the destination is now net.delta_gpu; the old code passed state.delta
 }
 #endif
 
diff --git a/image.darknet/src/cost_layer.h b/image.darknet/src/cost_layer.h
index a692831..ceb64de 100644
--- a/image.darknet/src/cost_layer.h
+++ b/image.darknet/src/cost_layer.h
@@ -8,13 +8,13 @@ typedef layer cost_layer;
 COST_TYPE get_cost_type(char *s);
 char *get_cost_string(COST_TYPE a);
 cost_layer make_cost_layer(int batch, int inputs, COST_TYPE type, float scale);
-void forward_cost_layer(const cost_layer l, network_state state);
-void backward_cost_layer(const cost_layer l, network_state state);
+void forward_cost_layer(const cost_layer l, network net);
+void backward_cost_layer(const cost_layer l, network net);
 void resize_cost_layer(cost_layer *l, int inputs);
 
 #ifdef GPU
-void forward_cost_layer_gpu(cost_layer l, network_state state);
-void backward_cost_layer_gpu(const cost_layer l, network_state state);
+void forward_cost_layer_gpu(cost_layer l, network net);
+void backward_cost_layer_gpu(const cost_layer l, network net);
 #endif
 
 #endif
diff --git a/image.darknet/src/crnn_layer.c b/image.darknet/src/crnn_layer.c
index 5495880..7dd29f6 100644
--- a/image.darknet/src/crnn_layer.c
+++ b/image.darknet/src/crnn_layer.c
@@ -48,17 +48,17 @@ layer make_crnn_layer(int batch, int h, int w, int c, int hidden_filters, int ou
 
     l.input_layer = malloc(sizeof(layer));
     fprintf(stderr, "\t\t");
-    *(l.input_layer) = make_convolutional_layer(batch*steps, h, w, c, hidden_filters, 3, 1, 1,  activation, batch_normalize, 0, 0, 0);
+    *(l.input_layer) = make_convolutional_layer(batch*steps, h, w, c, hidden_filters, 1, 3, 1, 1,  activation, batch_normalize, 0, 0, 0);
     l.input_layer->batch = batch;
 
     l.self_layer = malloc(sizeof(layer));
     fprintf(stderr, "\t\t");
-    *(l.self_layer) = make_convolutional_layer(batch*steps, h, w, hidden_filters, hidden_filters, 3, 1, 1,  activation, batch_normalize, 0, 0, 0);
+    *(l.self_layer) = make_convolutional_layer(batch*steps, h, w, hidden_filters, hidden_filters, 1, 3, 1, 1,  activation, batch_normalize, 0, 0, 0);
     l.self_layer->batch = batch;
 
     l.output_layer = malloc(sizeof(layer));
     fprintf(stderr, "\t\t");
-    *(l.output_layer) = make_convolutional_layer(batch*steps, h, w, hidden_filters, output_filters, 3, 1, 1,  activation, batch_normalize, 0, 0, 0);
+    *(l.output_layer) = make_convolutional_layer(batch*steps, h, w, hidden_filters, output_filters, 1, 3, 1, 1,  activation, batch_normalize, 0, 0, 0);
     l.output_layer->batch = batch;
 
     l.output = l.output_layer->output;
@@ -81,17 +81,17 @@ layer make_crnn_layer(int batch, int h, int w, int c, int hidden_filters, int ou
     return l;
 }
 
-void update_crnn_layer(layer l, int batch, float learning_rate, float momentum, float decay)
+void update_crnn_layer(layer l, update_args a) // forwards the same update_args to each of the three convolutional sub-layers
 {
-    update_convolutional_layer(*(l.input_layer), batch, learning_rate, momentum, decay);
-    update_convolutional_layer(*(l.self_layer), batch, learning_rate, momentum, decay);
-    update_convolutional_layer(*(l.output_layer), batch, learning_rate, momentum, decay);
+    update_convolutional_layer(*(l.input_layer),  a); // sub-layer structs are passed by value (dereferenced copies)
+    update_convolutional_layer(*(l.self_layer),   a);
+    update_convolutional_layer(*(l.output_layer), a);
 }
 
-void forward_crnn_layer(layer l, network_state state)
+void forward_crnn_layer(layer l, network net)
 {
-    network_state s = {0};
-    s.train = state.train;
+    network s = net;
+    s.train = net.train;
     int i;
     layer input_layer = *(l.input_layer);
     layer self_layer = *(l.self_layer);
@@ -100,17 +100,17 @@ void forward_crnn_layer(layer l, network_state state)
     fill_cpu(l.outputs * l.batch * l.steps, 0, output_layer.delta, 1);
     fill_cpu(l.hidden * l.batch * l.steps, 0, self_layer.delta, 1);
     fill_cpu(l.hidden * l.batch * l.steps, 0, input_layer.delta, 1);
-    if(state.train) fill_cpu(l.hidden * l.batch, 0, l.state, 1);
+    if(net.train) fill_cpu(l.hidden * l.batch, 0, l.state, 1);
 
     for (i = 0; i < l.steps; ++i) {
-        s.input = state.input;
+        s.input = net.input;
         forward_convolutional_layer(input_layer, s);
 
         s.input = l.state;
         forward_convolutional_layer(self_layer, s);
 
         float *old_state = l.state;
-        if(state.train) l.state += l.hidden*l.batch;
+        if(net.train) l.state += l.hidden*l.batch;
         if(l.shortcut){
             copy_cpu(l.hidden * l.batch, old_state, 1, l.state, 1);
         }else{
@@ -122,17 +122,16 @@ void forward_crnn_layer(layer l, network_state state)
         s.input = l.state;
         forward_convolutional_layer(output_layer, s);
 
-        state.input += l.inputs*l.batch;
+        net.input += l.inputs*l.batch;
         increment_layer(&input_layer, 1);
         increment_layer(&self_layer, 1);
         increment_layer(&output_layer, 1);
     }
 }
 
-void backward_crnn_layer(layer l, network_state state)
+void backward_crnn_layer(layer l, network net)
 {
-    network_state s = {0};
-    s.train = state.train;
+    network s = net;
     int i;
     layer input_layer = *(l.input_layer);
     layer self_layer = *(l.self_layer);
@@ -168,8 +167,8 @@ void backward_crnn_layer(layer l, network_state state)
 
         copy_cpu(l.hidden*l.batch, self_layer.delta, 1, input_layer.delta, 1);
         if (i > 0 && l.shortcut) axpy_cpu(l.hidden*l.batch, 1, self_layer.delta, 1, self_layer.delta - l.hidden*l.batch, 1);
-        s.input = state.input + i*l.inputs*l.batch;
-        if(state.delta) s.delta = state.delta + i*l.inputs*l.batch;
+        s.input = net.input + i*l.inputs*l.batch;
+        if(net.delta) s.delta = net.delta + i*l.inputs*l.batch;
         else s.delta = 0;
         backward_convolutional_layer(input_layer, s);
 
@@ -195,58 +194,57 @@ void push_crnn_layer(layer l)
     push_convolutional_layer(*(l.output_layer));
 }
 
-void update_crnn_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay)
+void update_crnn_layer_gpu(layer l, update_args a)
 {
-    update_convolutional_layer_gpu(*(l.input_layer), batch, learning_rate, momentum, decay);
-    update_convolutional_layer_gpu(*(l.self_layer), batch, learning_rate, momentum, decay);
-    update_convolutional_layer_gpu(*(l.output_layer), batch, learning_rate, momentum, decay);
+    update_convolutional_layer_gpu(*(l.input_layer),  a);
+    update_convolutional_layer_gpu(*(l.self_layer),   a);
+    update_convolutional_layer_gpu(*(l.output_layer), a);
 }
 
-void forward_crnn_layer_gpu(layer l, network_state state)
+void forward_crnn_layer_gpu(layer l, network net)
 {
-    network_state s = {0};
-    s.train = state.train;
+    network s = net;
     int i;
     layer input_layer = *(l.input_layer);
     layer self_layer = *(l.self_layer);
     layer output_layer = *(l.output_layer);
 
-    fill_ongpu(l.outputs * l.batch * l.steps, 0, output_layer.delta_gpu, 1);
-    fill_ongpu(l.hidden * l.batch * l.steps, 0, self_layer.delta_gpu, 1);
-    fill_ongpu(l.hidden * l.batch * l.steps, 0, input_layer.delta_gpu, 1);
-    if(state.train) fill_ongpu(l.hidden * l.batch, 0, l.state_gpu, 1);
+    fill_gpu(l.outputs * l.batch * l.steps, 0, output_layer.delta_gpu, 1);
+    fill_gpu(l.hidden * l.batch * l.steps, 0, self_layer.delta_gpu, 1);
+    fill_gpu(l.hidden * l.batch * l.steps, 0, input_layer.delta_gpu, 1);
+    if(net.train) fill_gpu(l.hidden * l.batch, 0, l.state_gpu, 1);
 
     for (i = 0; i < l.steps; ++i) {
-        s.input = state.input;
+        s.input_gpu = net.input_gpu;
         forward_convolutional_layer_gpu(input_layer, s);
 
-        s.input = l.state_gpu;
+        s.input_gpu = l.state_gpu;
         forward_convolutional_layer_gpu(self_layer, s);
 
         float *old_state = l.state_gpu;
-        if(state.train) l.state_gpu += l.hidden*l.batch;
+        if(net.train) l.state_gpu += l.hidden*l.batch;
         if(l.shortcut){
-            copy_ongpu(l.hidden * l.batch, old_state, 1, l.state_gpu, 1);
+            copy_gpu(l.hidden * l.batch, old_state, 1, l.state_gpu, 1);
         }else{
-            fill_ongpu(l.hidden * l.batch, 0, l.state_gpu, 1);
+            fill_gpu(l.hidden * l.batch, 0, l.state_gpu, 1);
         }
-        axpy_ongpu(l.hidden * l.batch, 1, input_layer.output_gpu, 1, l.state_gpu, 1);
-        axpy_ongpu(l.hidden * l.batch, 1, self_layer.output_gpu, 1, l.state_gpu, 1);
+        axpy_gpu(l.hidden * l.batch, 1, input_layer.output_gpu, 1, l.state_gpu, 1);
+        axpy_gpu(l.hidden * l.batch, 1, self_layer.output_gpu, 1, l.state_gpu, 1);
 
-        s.input = l.state_gpu;
+        s.input_gpu = l.state_gpu;
         forward_convolutional_layer_gpu(output_layer, s);
 
-        state.input += l.inputs*l.batch;
+        net.input_gpu += l.inputs*l.batch;
         increment_layer(&input_layer, 1);
         increment_layer(&self_layer, 1);
         increment_layer(&output_layer, 1);
     }
 }
 
-void backward_crnn_layer_gpu(layer l, network_state state)
+void backward_crnn_layer_gpu(layer l, network net)
 {
-    network_state s = {0};
-    s.train = state.train;
+    network s = net;
+    s.train = net.train;
     int i;
     layer input_layer = *(l.input_layer);
     layer self_layer = *(l.self_layer);
@@ -256,25 +254,25 @@ void backward_crnn_layer_gpu(layer l, network_state state)
     increment_layer(&output_layer, l.steps - 1);
     l.state_gpu += l.hidden*l.batch*l.steps;
     for (i = l.steps-1; i >= 0; --i) {
-        copy_ongpu(l.hidden * l.batch, input_layer.output_gpu, 1, l.state_gpu, 1);
-        axpy_ongpu(l.hidden * l.batch, 1, self_layer.output_gpu, 1, l.state_gpu, 1);
+        copy_gpu(l.hidden * l.batch, input_layer.output_gpu, 1, l.state_gpu, 1);
+        axpy_gpu(l.hidden * l.batch, 1, self_layer.output_gpu, 1, l.state_gpu, 1);
 
-        s.input = l.state_gpu;
-        s.delta = self_layer.delta_gpu;
+        s.input_gpu = l.state_gpu;
+        s.delta_gpu = self_layer.delta_gpu;
         backward_convolutional_layer_gpu(output_layer, s);
 
         l.state_gpu -= l.hidden*l.batch;
 
-        s.input = l.state_gpu;
-        s.delta = self_layer.delta_gpu - l.hidden*l.batch;
-        if (i == 0) s.delta = 0;
+        s.input_gpu = l.state_gpu;
+        s.delta_gpu = self_layer.delta_gpu - l.hidden*l.batch;
+        if (i == 0) s.delta_gpu = 0;
         backward_convolutional_layer_gpu(self_layer, s);
 
-        copy_ongpu(l.hidden*l.batch, self_layer.delta_gpu, 1, input_layer.delta_gpu, 1);
-        if (i > 0 && l.shortcut) axpy_ongpu(l.hidden*l.batch, 1, self_layer.delta_gpu, 1, self_layer.delta_gpu - l.hidden*l.batch, 1);
-        s.input = state.input + i*l.inputs*l.batch;
-        if(state.delta) s.delta = state.delta + i*l.inputs*l.batch;
-        else s.delta = 0;
+        copy_gpu(l.hidden*l.batch, self_layer.delta_gpu, 1, input_layer.delta_gpu, 1);
+        if (i > 0 && l.shortcut) axpy_gpu(l.hidden*l.batch, 1, self_layer.delta_gpu, 1, self_layer.delta_gpu - l.hidden*l.batch, 1);
+        s.input_gpu = net.input_gpu + i*l.inputs*l.batch;
+        if(net.delta_gpu) s.delta_gpu = net.delta_gpu + i*l.inputs*l.batch;
+        else s.delta_gpu = 0;
         backward_convolutional_layer_gpu(input_layer, s);
 
         increment_layer(&input_layer,  -1);
diff --git a/image.darknet/src/crnn_layer.h b/image.darknet/src/crnn_layer.h
index 0da942e..515f378 100644
--- a/image.darknet/src/crnn_layer.h
+++ b/image.darknet/src/crnn_layer.h
@@ -8,14 +8,14 @@
 
 layer make_crnn_layer(int batch, int h, int w, int c, int hidden_filters, int output_filters, int steps, ACTIVATION activation, int batch_normalize);
 
-void forward_crnn_layer(layer l, network_state state);
-void backward_crnn_layer(layer l, network_state state);
-void update_crnn_layer(layer l, int batch, float learning_rate, float momentum, float decay);
+void forward_crnn_layer(layer l, network net);
+void backward_crnn_layer(layer l, network net);
+void update_crnn_layer(layer l, update_args a);
 
 #ifdef GPU
-void forward_crnn_layer_gpu(layer l, network_state state);
-void backward_crnn_layer_gpu(layer l, network_state state);
-void update_crnn_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay);
+void forward_crnn_layer_gpu(layer l, network net);
+void backward_crnn_layer_gpu(layer l, network net);
+void update_crnn_layer_gpu(layer l, update_args a);
 void push_crnn_layer(layer l);
 void pull_crnn_layer(layer l);
 #endif
diff --git a/image.darknet/src/crop_layer.c b/image.darknet/src/crop_layer.c
index 11c59b4..3b91852 100644
--- a/image.darknet/src/crop_layer.c
+++ b/image.darknet/src/crop_layer.c
@@ -10,8 +10,8 @@ image get_crop_image(crop_layer l)
     return float_to_image(w,h,c,l.output);
 }
 
-void backward_crop_layer(const crop_layer l, network_state state){}
-void backward_crop_layer_gpu(const crop_layer l, network_state state){}
+void backward_crop_layer(const crop_layer l, network net){}
+void backward_crop_layer_gpu(const crop_layer l, network net){}
 
 crop_layer make_crop_layer(int batch, int h, int w, int c, int crop_height, int crop_width, int flip, float angle, float saturation, float exposure)
 {
@@ -64,7 +64,7 @@ void resize_crop_layer(layer *l, int w, int h)
 }
 
 
-void forward_crop_layer(const crop_layer l, network_state state)
+void forward_crop_layer(const crop_layer l, network net)
 {
     int i,j,c,b,row,col;
     int index;
@@ -78,7 +78,7 @@ void forward_crop_layer(const crop_layer l, network_state state)
         scale = 1;
         trans = 0;
     }
-    if(!state.train){
+    if(!net.train){
         flip = 0;
         dh = (l.h - l.out_h)/2;
         dw = (l.w - l.out_w)/2;
@@ -94,7 +94,7 @@ void forward_crop_layer(const crop_layer l, network_state state)
                     }
                     row = i + dh;
                     index = col+l.w*(row+l.h*(c + l.c*b)); 
-                    l.output[count++] = state.input[index]*scale + trans;
+                    l.output[count++] = net.input[index]*scale + trans;
                 }
             }
         }
diff --git a/image.darknet/src/crop_layer.h b/image.darknet/src/crop_layer.h
index 3aa2d3d..3b5883c 100644
--- a/image.darknet/src/crop_layer.h
+++ b/image.darknet/src/crop_layer.h
@@ -9,11 +9,11 @@ typedef layer crop_layer;
 
 image get_crop_image(crop_layer l);
 crop_layer make_crop_layer(int batch, int h, int w, int c, int crop_height, int crop_width, int flip, float angle, float saturation, float exposure);
-void forward_crop_layer(const crop_layer l, network_state state);
+void forward_crop_layer(const crop_layer l, network net);
 void resize_crop_layer(layer *l, int w, int h);
 
 #ifdef GPU
-void forward_crop_layer_gpu(crop_layer l, network_state state);
+void forward_crop_layer_gpu(crop_layer l, network net);
 #endif
 
 #endif
diff --git a/image.darknet/src/crop_layer_kernels.cu b/image.darknet/src/crop_layer_kernels.cu
index 8a08630..b5b9f55 100644
--- a/image.darknet/src/crop_layer_kernels.cu
+++ b/image.darknet/src/crop_layer_kernels.cu
@@ -113,9 +113,9 @@ __global__ void levels_image_kernel(float *image, float *rand, int batch, int w,
     float r3 = rand[8*id + 3];
 
     saturation = r0*(saturation - 1) + 1;
-    saturation = (r1 > .5) ? 1./saturation : saturation;
+    saturation = (r1 > .5f) ? 1.f/saturation : saturation;
     exposure = r2*(exposure - 1) + 1;
-    exposure = (r3 > .5) ? 1./exposure : exposure;
+    exposure = (r3 > .5f) ? 1.f/exposure : exposure;
 
     size_t offset = id * h * w * 3;
     image += offset;
@@ -131,9 +131,9 @@ __global__ void levels_image_kernel(float *image, float *rand, int batch, int w,
     } else {
         shift = 0;
     }
-    image[x + w*(y + h*0)] = rgb.x*scale + translate + (rshift - .5)*shift;
-    image[x + w*(y + h*1)] = rgb.y*scale + translate + (gshift - .5)*shift;
-    image[x + w*(y + h*2)] = rgb.z*scale + translate + (bshift - .5)*shift;
+    image[x + w*(y + h*0)] = rgb.x*scale + translate + (rshift - .5f)*shift;
+    image[x + w*(y + h*1)] = rgb.y*scale + translate + (gshift - .5f)*shift;
+    image[x + w*(y + h*2)] = rgb.z*scale + translate + (bshift - .5f)*shift;
 }
 
 __global__ void forward_crop_layer_kernel(float *input, float *rand, int size, int c, int h, int w, int crop_height, int crop_width, int train, int flip, float angle, float *output)
@@ -141,8 +141,8 @@ __global__ void forward_crop_layer_kernel(float *input, float *rand, int size, i
     int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
     if(id >= size) return;
 
-    float cx = w/2.;
-    float cy = h/2.;
+    float cx = w/2.f;
+    float cy = h/2.f;
 
     int count = id;
     int j = id % crop_width;
@@ -160,11 +160,11 @@ __global__ void forward_crop_layer_kernel(float *input, float *rand, int size, i
 
     float dw = (w - crop_width)*r4;
     float dh = (h - crop_height)*r5;
-    flip = (flip && (r6 > .5));
+    flip = (flip && (r6 > .5f));
     angle = 2*angle*r7 - angle;
     if(!train){
-        dw = (w - crop_width)/2.;
-        dh = (h - crop_height)/2.;
+        dw = (w - crop_width)/2.f;
+        dh = (h - crop_height)/2.f;
         flip = 0;
         angle = 0;
     }
@@ -174,17 +174,17 @@ __global__ void forward_crop_layer_kernel(float *input, float *rand, int size, i
     float x = (flip) ? w - dw - j - 1 : j + dw;    
     float y = i + dh;
 
-    float rx = cos(angle)*(x-cx) - sin(angle)*(y-cy) + cx;
-    float ry = sin(angle)*(x-cx) + cos(angle)*(y-cy) + cy;
+    float rx = cosf(angle)*(x-cx) - sinf(angle)*(y-cy) + cx;
+    float ry = sinf(angle)*(x-cx) + cosf(angle)*(y-cy) + cy;
 
     output[count] = bilinear_interpolate_kernel(input, w, h, rx, ry, k);
 }
 
-extern "C" void forward_crop_layer_gpu(crop_layer layer, network_state state)
+extern "C" void forward_crop_layer_gpu(crop_layer layer, network net)
 {
     cuda_random(layer.rand_gpu, layer.batch*8);
 
-    float radians = layer.angle*3.14159265/180.;
+    float radians = layer.angle*3.14159265f/180.f;
 
     float scale = 2;
     float translate = -1;
@@ -195,12 +195,12 @@ extern "C" void forward_crop_layer_gpu(crop_layer layer, network_state state)
 
     int size = layer.batch * layer.w * layer.h;
 
-    levels_image_kernel<<<cuda_gridsize(size), BLOCK>>>(state.input, layer.rand_gpu, layer.batch, layer.w, layer.h, state.train, layer.saturation, layer.exposure, translate, scale, layer.shift);
+    levels_image_kernel<<<cuda_gridsize(size), BLOCK>>>(net.input_gpu, layer.rand_gpu, layer.batch, layer.w, layer.h, net.train, layer.saturation, layer.exposure, translate, scale, layer.shift);
     check_error(cudaPeekAtLastError());
 
     size = layer.batch*layer.c*layer.out_w*layer.out_h;
 
-    forward_crop_layer_kernel<<<cuda_gridsize(size), BLOCK>>>(state.input, layer.rand_gpu, size, layer.c, layer.h, layer.w, layer.out_h, layer.out_w, state.train, layer.flip, radians, layer.output_gpu);
+    forward_crop_layer_kernel<<<cuda_gridsize(size), BLOCK>>>(net.input_gpu, layer.rand_gpu, size, layer.c, layer.h, layer.w, layer.out_h, layer.out_w, net.train, layer.flip, radians, layer.output_gpu);
     check_error(cudaPeekAtLastError());
 
 /*
diff --git a/image.darknet/src/cuda.c b/image.darknet/src/cuda.c
index 1b51271..48aba6e 100644
--- a/image.darknet/src/cuda.c
+++ b/image.darknet/src/cuda.c
@@ -5,7 +5,7 @@ int gpu_index = 0;
 #include "cuda.h"
 #include "utils.h"
 #include "blas.h"
-#include "assert.h"
+#include <assert.h>
 #include <stdlib.h>
 #include <time.h>
 
@@ -96,6 +96,8 @@ float *cuda_make_array(float *x, size_t n)
     if(x){
         status = cudaMemcpy(x_gpu, x, size, cudaMemcpyHostToDevice);
         check_error(status);
+    } else {
+        fill_gpu(n, 0, x_gpu, 1);
     }
     if(!x_gpu) error("Cuda malloc failed\n");
     return x_gpu;
@@ -128,12 +130,17 @@ float cuda_compare(float *x_gpu, float *x, size_t n, char *s)
     return err;
 }
 
-int *cuda_make_int_array(size_t n)
+int *cuda_make_int_array(int *x, size_t n)
 {
     int *x_gpu;
     size_t size = sizeof(int)*n;
     cudaError_t status = cudaMalloc((void **)&x_gpu, size);
     check_error(status);
+    if(x){
+        status = cudaMemcpy(x_gpu, x, size, cudaMemcpyHostToDevice);
+        check_error(status);
+    }
+    if(!x_gpu) error("Cuda malloc failed\n");
     return x_gpu;
 }
 
@@ -157,4 +164,15 @@ void cuda_pull_array(float *x_gpu, float *x, size_t n)
     check_error(status);
 }
 
+float cuda_mag_array(float *x_gpu, size_t n)
+{
+    float *temp = calloc(n, sizeof(float));
+    cuda_pull_array(x_gpu, temp, n);
+    float m = mag_array(temp, n);
+    free(temp);
+    return m;
+}
+#else
+void cuda_set_device(int n){}
+
 #endif
diff --git a/image.darknet/src/cuda.h b/image.darknet/src/cuda.h
index 29b1eef..a1bc216 100644
--- a/image.darknet/src/cuda.h
+++ b/image.darknet/src/cuda.h
@@ -1,28 +1,13 @@
 #ifndef CUDA_H
 #define CUDA_H
 
-extern int gpu_index;
+#include "darknet.h"
 
 #ifdef GPU
 
-#define BLOCK 512
-
-#include "cuda_runtime.h"
-#include "curand.h"
-#include "cublas_v2.h"
-
-#ifdef CUDNN
-#include "cudnn.h"
-#endif
-
 void check_error(cudaError_t status);
 cublasHandle_t blas_handle();
-float *cuda_make_array(float *x, size_t n);
-int *cuda_make_int_array(size_t n);
-void cuda_push_array(float *x_gpu, float *x, size_t n);
-void cuda_pull_array(float *x_gpu, float *x, size_t n);
-void cuda_set_device(int n);
-void cuda_free(float *x_gpu);
+int *cuda_make_int_array(int *x, size_t n);
 void cuda_random(float *x_gpu, size_t n);
 float cuda_compare(float *x_gpu, float *x, size_t n, char *s);
 dim3 cuda_gridsize(size_t n);
diff --git a/image.darknet/src/darknet.c b/image.darknet/src/darknet.c
deleted file mode 100644
index 6e56072..0000000
--- a/image.darknet/src/darknet.c
+++ /dev/null
@@ -1,452 +0,0 @@
-#include <time.h>
-#include <stdlib.h>
-#include <stdio.h>
-
-#include "parser.h"
-#include "utils.h"
-#include "cuda.h"
-#include "blas.h"
-#include "connected_layer.h"
-
-#ifdef OPENCV
-#include "opencv2/highgui/highgui_c.h"
-#endif
-
-extern void predict_classifier(char *datacfg, char *cfgfile, char *weightfile, char *filename, int top);
-extern void test_detector(char *datacfg, char *cfgfile, char *weightfile, char *filename, float thresh, float hier_thresh);
-extern void run_voxel(int argc, char **argv);
-extern void run_yolo(int argc, char **argv);
-extern void run_detector(int argc, char **argv);
-extern void run_coco(int argc, char **argv);
-extern void run_writing(int argc, char **argv);
-extern void run_captcha(int argc, char **argv);
-extern void run_nightmare(int argc, char **argv);
-extern void run_dice(int argc, char **argv);
-extern void run_compare(int argc, char **argv);
-extern void run_classifier(int argc, char **argv);
-extern void run_char_rnn(int argc, char **argv);
-extern void run_vid_rnn(int argc, char **argv);
-extern void run_tag(int argc, char **argv);
-extern void run_cifar(int argc, char **argv);
-extern void run_go(int argc, char **argv);
-extern void run_art(int argc, char **argv);
-extern void run_super(int argc, char **argv);
-
-void average(int argc, char *argv[])
-{
-    char *cfgfile = argv[2];
-    char *outfile = argv[3];
-    gpu_index = -1;
-    network net = parse_network_cfg(cfgfile);
-    network sum = parse_network_cfg(cfgfile);
-
-    char *weightfile = argv[4];   
-    load_weights(&sum, weightfile);
-
-    int i, j;
-    int n = argc - 5;
-    for(i = 0; i < n; ++i){
-        weightfile = argv[i+5];   
-        load_weights(&net, weightfile);
-        for(j = 0; j < net.n; ++j){
-            layer l = net.layers[j];
-            layer out = sum.layers[j];
-            if(l.type == CONVOLUTIONAL){
-                int num = l.n*l.c*l.size*l.size;
-                axpy_cpu(l.n, 1, l.biases, 1, out.biases, 1);
-                axpy_cpu(num, 1, l.weights, 1, out.weights, 1);
-                if(l.batch_normalize){
-                    axpy_cpu(l.n, 1, l.scales, 1, out.scales, 1);
-                    axpy_cpu(l.n, 1, l.rolling_mean, 1, out.rolling_mean, 1);
-                    axpy_cpu(l.n, 1, l.rolling_variance, 1, out.rolling_variance, 1);
-                }
-            }
-            if(l.type == CONNECTED){
-                axpy_cpu(l.outputs, 1, l.biases, 1, out.biases, 1);
-                axpy_cpu(l.outputs*l.inputs, 1, l.weights, 1, out.weights, 1);
-            }
-        }
-    }
-    n = n+1;
-    for(j = 0; j < net.n; ++j){
-        layer l = sum.layers[j];
-        if(l.type == CONVOLUTIONAL){
-            int num = l.n*l.c*l.size*l.size;
-            scal_cpu(l.n, 1./n, l.biases, 1);
-            scal_cpu(num, 1./n, l.weights, 1);
-                if(l.batch_normalize){
-                    scal_cpu(l.n, 1./n, l.scales, 1);
-                    scal_cpu(l.n, 1./n, l.rolling_mean, 1);
-                    scal_cpu(l.n, 1./n, l.rolling_variance, 1);
-                }
-        }
-        if(l.type == CONNECTED){
-            scal_cpu(l.outputs, 1./n, l.biases, 1);
-            scal_cpu(l.outputs*l.inputs, 1./n, l.weights, 1);
-        }
-    }
-    save_weights(sum, outfile);
-}
-
-void speed(char *cfgfile, int tics)
-{
-    if (tics == 0) tics = 1000;
-    network net = parse_network_cfg(cfgfile);
-    set_batch_network(&net, 1);
-    int i;
-    time_t start = time(0);
-    image im = make_image(net.w, net.h, net.c);
-    for(i = 0; i < tics; ++i){
-        network_predict(net, im.data);
-    }
-    double t = difftime(time(0), start);
-    printf("\n%d evals, %f Seconds\n", tics, t);
-    printf("Speed: %f sec/eval\n", t/tics);
-    printf("Speed: %f Hz\n", tics/t);
-}
-
-void operations(char *cfgfile)
-{
-    gpu_index = -1;
-    network net = parse_network_cfg(cfgfile);
-    int i;
-    long ops = 0;
-    for(i = 0; i < net.n; ++i){
-        layer l = net.layers[i];
-        if(l.type == CONVOLUTIONAL){
-            ops += 2l * l.n * l.size*l.size*l.c * l.out_h*l.out_w;
-        } else if(l.type == CONNECTED){
-            ops += 2l * l.inputs * l.outputs;
-        }
-    }
-    printf("Floating Point Operations: %ld\n", ops);
-    printf("Floating Point Operations: %.2f Bn\n", (float)ops/1000000000.);
-}
-
-void oneoff(char *cfgfile, char *weightfile, char *outfile)
-{
-    gpu_index = -1;
-    network net = parse_network_cfg(cfgfile);
-    int oldn = net.layers[net.n - 2].n;
-    int c = net.layers[net.n - 2].c;
-    scal_cpu(oldn*c, .1, net.layers[net.n - 2].weights, 1);
-    scal_cpu(oldn, 0, net.layers[net.n - 2].biases, 1);
-    net.layers[net.n - 2].n = 9418;
-    net.layers[net.n - 2].biases += 5;
-    net.layers[net.n - 2].weights += 5*c;
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    net.layers[net.n - 2].biases -= 5;
-    net.layers[net.n - 2].weights -= 5*c;
-    net.layers[net.n - 2].n = oldn;
-    printf("%d\n", oldn);
-    layer l = net.layers[net.n - 2];
-    copy_cpu(l.n/3, l.biases, 1, l.biases +   l.n/3, 1);
-    copy_cpu(l.n/3, l.biases, 1, l.biases + 2*l.n/3, 1);
-    copy_cpu(l.n/3*l.c, l.weights, 1, l.weights +   l.n/3*l.c, 1);
-    copy_cpu(l.n/3*l.c, l.weights, 1, l.weights + 2*l.n/3*l.c, 1);
-    *net.seen = 0;
-    save_weights(net, outfile);
-}
-
-void partial(char *cfgfile, char *weightfile, char *outfile, int max)
-{
-    gpu_index = -1;
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights_upto(&net, weightfile, max);
-    }
-    *net.seen = 0;
-    save_weights_upto(net, outfile, max);
-}
-
-#include "convolutional_layer.h"
-void rescale_net(char *cfgfile, char *weightfile, char *outfile)
-{
-    gpu_index = -1;
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    int i;
-    for(i = 0; i < net.n; ++i){
-        layer l = net.layers[i];
-        if(l.type == CONVOLUTIONAL){
-            rescale_weights(l, 2, -.5);
-            break;
-        }
-    }
-    save_weights(net, outfile);
-}
-
-void rgbgr_net(char *cfgfile, char *weightfile, char *outfile)
-{
-    gpu_index = -1;
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    int i;
-    for(i = 0; i < net.n; ++i){
-        layer l = net.layers[i];
-        if(l.type == CONVOLUTIONAL){
-            rgbgr_weights(l);
-            break;
-        }
-    }
-    save_weights(net, outfile);
-}
-
-void reset_normalize_net(char *cfgfile, char *weightfile, char *outfile)
-{
-    gpu_index = -1;
-    network net = parse_network_cfg(cfgfile);
-    if (weightfile) {
-        load_weights(&net, weightfile);
-    }
-    int i;
-    for (i = 0; i < net.n; ++i) {
-        layer l = net.layers[i];
-        if (l.type == CONVOLUTIONAL && l.batch_normalize) {
-            denormalize_convolutional_layer(l);
-        }
-        if (l.type == CONNECTED && l.batch_normalize) {
-            denormalize_connected_layer(l);
-        }
-        if (l.type == GRU && l.batch_normalize) {
-            denormalize_connected_layer(*l.input_z_layer);
-            denormalize_connected_layer(*l.input_r_layer);
-            denormalize_connected_layer(*l.input_h_layer);
-            denormalize_connected_layer(*l.state_z_layer);
-            denormalize_connected_layer(*l.state_r_layer);
-            denormalize_connected_layer(*l.state_h_layer);
-        }
-    }
-    save_weights(net, outfile);
-}
-
-layer normalize_layer(layer l, int n)
-{
-    int j;
-    l.batch_normalize=1;
-    l.scales = calloc(n, sizeof(float));
-    for(j = 0; j < n; ++j){
-        l.scales[j] = 1;
-    }
-    l.rolling_mean = calloc(n, sizeof(float));
-    l.rolling_variance = calloc(n, sizeof(float));
-    return l;
-}
-
-void normalize_net(char *cfgfile, char *weightfile, char *outfile)
-{
-    gpu_index = -1;
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    int i;
-    for(i = 0; i < net.n; ++i){
-        layer l = net.layers[i];
-        if(l.type == CONVOLUTIONAL && !l.batch_normalize){
-            net.layers[i] = normalize_layer(l, l.n);
-        }
-        if (l.type == CONNECTED && !l.batch_normalize) {
-            net.layers[i] = normalize_layer(l, l.outputs);
-        }
-        if (l.type == GRU && l.batch_normalize) {
-            *l.input_z_layer = normalize_layer(*l.input_z_layer, l.input_z_layer->outputs);
-            *l.input_r_layer = normalize_layer(*l.input_r_layer, l.input_r_layer->outputs);
-            *l.input_h_layer = normalize_layer(*l.input_h_layer, l.input_h_layer->outputs);
-            *l.state_z_layer = normalize_layer(*l.state_z_layer, l.state_z_layer->outputs);
-            *l.state_r_layer = normalize_layer(*l.state_r_layer, l.state_r_layer->outputs);
-            *l.state_h_layer = normalize_layer(*l.state_h_layer, l.state_h_layer->outputs);
-            net.layers[i].batch_normalize=1;
-        }
-    }
-    save_weights(net, outfile);
-}
-
-void statistics_net(char *cfgfile, char *weightfile)
-{
-    gpu_index = -1;
-    network net = parse_network_cfg(cfgfile);
-    if (weightfile) {
-        load_weights(&net, weightfile);
-    }
-    int i;
-    for (i = 0; i < net.n; ++i) {
-        layer l = net.layers[i];
-        if (l.type == CONNECTED && l.batch_normalize) {
-            printf("Connected Layer %d\n", i);
-            statistics_connected_layer(l);
-        }
-        if (l.type == GRU && l.batch_normalize) {
-            printf("GRU Layer %d\n", i);
-            printf("Input Z\n");
-            statistics_connected_layer(*l.input_z_layer);
-            printf("Input R\n");
-            statistics_connected_layer(*l.input_r_layer);
-            printf("Input H\n");
-            statistics_connected_layer(*l.input_h_layer);
-            printf("State Z\n");
-            statistics_connected_layer(*l.state_z_layer);
-            printf("State R\n");
-            statistics_connected_layer(*l.state_r_layer);
-            printf("State H\n");
-            statistics_connected_layer(*l.state_h_layer);
-        }
-        printf("\n");
-    }
-}
-
-void denormalize_net(char *cfgfile, char *weightfile, char *outfile)
-{
-    gpu_index = -1;
-    network net = parse_network_cfg(cfgfile);
-    if (weightfile) {
-        load_weights(&net, weightfile);
-    }
-    int i;
-    for (i = 0; i < net.n; ++i) {
-        layer l = net.layers[i];
-        if (l.type == CONVOLUTIONAL && l.batch_normalize) {
-            denormalize_convolutional_layer(l);
-            net.layers[i].batch_normalize=0;
-        }
-        if (l.type == CONNECTED && l.batch_normalize) {
-            denormalize_connected_layer(l);
-            net.layers[i].batch_normalize=0;
-        }
-        if (l.type == GRU && l.batch_normalize) {
-            denormalize_connected_layer(*l.input_z_layer);
-            denormalize_connected_layer(*l.input_r_layer);
-            denormalize_connected_layer(*l.input_h_layer);
-            denormalize_connected_layer(*l.state_z_layer);
-            denormalize_connected_layer(*l.state_r_layer);
-            denormalize_connected_layer(*l.state_h_layer);
-            l.input_z_layer->batch_normalize = 0;
-            l.input_r_layer->batch_normalize = 0;
-            l.input_h_layer->batch_normalize = 0;
-            l.state_z_layer->batch_normalize = 0;
-            l.state_r_layer->batch_normalize = 0;
-            l.state_h_layer->batch_normalize = 0;
-            net.layers[i].batch_normalize=0;
-        }
-    }
-    save_weights(net, outfile);
-}
-
-void visualize(char *cfgfile, char *weightfile)
-{
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    visualize_network(net);
-#ifdef OPENCV
-    cvWaitKey(0);
-#endif
-}
-
-int main(int argc, char **argv)
-{
-    //test_resize("data/bad.jpg");
-    //test_box();
-    //test_convolutional_layer();
-    if(argc < 2){
-        fprintf(stderr, "usage: %s <function>\n", argv[0]);
-        return 0;
-    }
-    gpu_index = find_int_arg(argc, argv, "-i", 0);
-    if(find_arg(argc, argv, "-nogpu")) {
-        gpu_index = -1;
-    }
-
-#ifndef GPU
-    gpu_index = -1;
-#else
-    if(gpu_index >= 0){
-        cuda_set_device(gpu_index);
-    }
-#endif
-
-    if (0 == strcmp(argv[1], "average")){
-        average(argc, argv);
-    } else if (0 == strcmp(argv[1], "yolo")){
-        run_yolo(argc, argv);
-    } else if (0 == strcmp(argv[1], "voxel")){
-        run_voxel(argc, argv);
-    } else if (0 == strcmp(argv[1], "super")){
-        run_super(argc, argv);
-    } else if (0 == strcmp(argv[1], "detector")){
-        run_detector(argc, argv);
-    } else if (0 == strcmp(argv[1], "detect")){
-        float thresh = find_float_arg(argc, argv, "-thresh", .24);
-        char *filename = (argc > 4) ? argv[4]: 0;
-        test_detector("cfg/coco.data", argv[2], argv[3], filename, thresh, .5);
-    } else if (0 == strcmp(argv[1], "cifar")){
-        run_cifar(argc, argv);
-    } else if (0 == strcmp(argv[1], "go")){
-        run_go(argc, argv);
-    } else if (0 == strcmp(argv[1], "rnn")){
-        run_char_rnn(argc, argv);
-    } else if (0 == strcmp(argv[1], "vid")){
-        run_vid_rnn(argc, argv);
-    } else if (0 == strcmp(argv[1], "coco")){
-        run_coco(argc, argv);
-    } else if (0 == strcmp(argv[1], "classify")){
-        predict_classifier("cfg/imagenet1k.data", argv[2], argv[3], argv[4], 5);
-    } else if (0 == strcmp(argv[1], "classifier")){
-        run_classifier(argc, argv);
-    } else if (0 == strcmp(argv[1], "art")){
-        run_art(argc, argv);
-    } else if (0 == strcmp(argv[1], "tag")){
-        run_tag(argc, argv);
-    } else if (0 == strcmp(argv[1], "compare")){
-        run_compare(argc, argv);
-    } else if (0 == strcmp(argv[1], "dice")){
-        run_dice(argc, argv);
-    } else if (0 == strcmp(argv[1], "writing")){
-        run_writing(argc, argv);
-    } else if (0 == strcmp(argv[1], "3d")){
-        composite_3d(argv[2], argv[3], argv[4], (argc > 5) ? atof(argv[5]) : 0);
-    } else if (0 == strcmp(argv[1], "test")){
-        test_resize(argv[2]);
-    } else if (0 == strcmp(argv[1], "captcha")){
-        run_captcha(argc, argv);
-    } else if (0 == strcmp(argv[1], "nightmare")){
-        run_nightmare(argc, argv);
-    } else if (0 == strcmp(argv[1], "rgbgr")){
-        rgbgr_net(argv[2], argv[3], argv[4]);
-    } else if (0 == strcmp(argv[1], "reset")){
-        reset_normalize_net(argv[2], argv[3], argv[4]);
-    } else if (0 == strcmp(argv[1], "denormalize")){
-        denormalize_net(argv[2], argv[3], argv[4]);
-    } else if (0 == strcmp(argv[1], "statistics")){
-        statistics_net(argv[2], argv[3]);
-    } else if (0 == strcmp(argv[1], "normalize")){
-        normalize_net(argv[2], argv[3], argv[4]);
-    } else if (0 == strcmp(argv[1], "rescale")){
-        rescale_net(argv[2], argv[3], argv[4]);
-    } else if (0 == strcmp(argv[1], "ops")){
-        operations(argv[2]);
-    } else if (0 == strcmp(argv[1], "speed")){
-        speed(argv[2], (argc > 3 && argv[3]) ? atoi(argv[3]) : 0);
-    } else if (0 == strcmp(argv[1], "oneoff")){
-        oneoff(argv[2], argv[3], argv[4]);
-    } else if (0 == strcmp(argv[1], "partial")){
-        partial(argv[2], argv[3], argv[4], atoi(argv[5]));
-    } else if (0 == strcmp(argv[1], "average")){
-        average(argc, argv);
-    } else if (0 == strcmp(argv[1], "visualize")){
-        visualize(argv[2], (argc > 3) ? argv[3] : 0);
-    } else if (0 == strcmp(argv[1], "imtest")){
-        test_resize(argv[2]);
-    } else {
-        fprintf(stderr, "Not an option: %s\n", argv[1]);
-    }
-    return 0;
-}
-
diff --git a/image.darknet/src/darknet.h b/image.darknet/src/darknet.h
new file mode 100644
index 0000000..4390c61
--- /dev/null
+++ b/image.darknet/src/darknet.h
@@ -0,0 +1,805 @@
+#ifndef DARKNET_API
+#define DARKNET_API
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <pthread.h>
+
+#ifdef GPU
+    #define BLOCK 512
+
+    #include "cuda_runtime.h"
+    #include "curand.h"
+    #include "cublas_v2.h"
+
+    #ifdef CUDNN
+    #include "cudnn.h"
+    #endif
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define SECRET_NUM -1234
+extern int gpu_index;
+
+typedef struct{   // value type returned by get_metadata()
+    int classes;   // presumably the number of entries in names -- confirm against get_metadata()
+    char **names;  // array of C strings
+} metadata;
+
+metadata get_metadata(char *file);
+
+typedef struct{   // set of parallel int arrays plus a name table; read_tree() returns a pointer to one
+    int *leaf;
+    int n;   // presumably the node count shared by the per-node arrays -- confirm in read_tree()
+    int *parent;
+    int *child;
+    int *group;
+    char **name;
+
+    int groups;   // presumably the length of group_size / group_offset -- confirm in read_tree()
+    int *group_size;
+    int *group_offset;
+} tree;
+tree *read_tree(char *filename);
+
+typedef enum{   // type of layer.activation; enumerator values are implicit (LOGISTIC == 0, then +1 each), so the order is significant
+    LOGISTIC, RELU, RELIE, LINEAR, RAMP, TANH, PLSE, LEAKY, ELU, LOGGY, STAIR, HARDTAN, LHTAN, SELU
+} ACTIVATION;
+
+typedef enum{   // type of the `f` argument of save_image_options(); values are implicit, starting at PNG == 0
+    PNG, BMP, TGA, JPG
+} IMTYPE;
+
+typedef enum{   // not referenced by any other declaration in this header; values are implicit, starting at MULT == 0
+    MULT, ADD, SUB, DIV
+} BINARY_ACTIVATION;
+
+typedef enum {   // type of layer.type; enumerator values are implicit (CONVOLUTIONAL == 0, then +1 each), so the order is significant
+    CONVOLUTIONAL,
+    DECONVOLUTIONAL,
+    CONNECTED,
+    MAXPOOL,
+    SOFTMAX,
+    DETECTION,
+    DROPOUT,
+    CROP,
+    ROUTE,
+    COST,
+    NORMALIZATION,
+    AVGPOOL,
+    LOCAL,
+    SHORTCUT,
+    ACTIVE,
+    RNN,
+    GRU,
+    LSTM,
+    CRNN,
+    BATCHNORM,
+    NETWORK,
+    XNOR,
+    REGION,
+    YOLO,
+    ISEG,
+    REORG,
+    UPSAMPLE,
+    LOGXENT,
+    L2NORM,
+    BLANK
+} LAYER_TYPE;
+
+typedef enum{   // type of layer.cost_type; values are implicit, starting at SSE == 0
+    SSE, MASKED, L1, SEG, SMOOTH,WGAN
+} COST_TYPE;
+
+typedef struct{   // passed by value to the layer.update / layer.update_gpu callbacks
+    int batch;
+    float learning_rate;
+    float momentum;
+    float decay;
+    int adam;   // same field names (adam, B1, B2, eps) also exist on struct network
+    float B1;
+    float B2;
+    float eps;
+    int t;
+} update_args;
+
+struct network;
+typedef struct network network;
+
+struct layer;
+typedef struct layer layer;
+
+struct layer{   // one network layer: kind tags, per-layer callbacks, scalar settings, host buffers, and (when GPU is defined) device buffers
+    LAYER_TYPE type;
+    ACTIVATION activation;
+    COST_TYPE cost_type;
+    void (*forward)   (struct layer, struct network);   // callbacks receive the layer and the network structs by value
+    void (*backward)  (struct layer, struct network);
+    void (*update)    (struct layer, update_args);
+    void (*forward_gpu)   (struct layer, struct network);   // the *_gpu pointers are declared even when GPU is not defined
+    void (*backward_gpu)  (struct layer, struct network);
+    void (*update_gpu)    (struct layer, update_args);
+    int batch_normalize;
+    int shortcut;
+    int batch;
+    int forced;
+    int flipped;
+    int inputs;
+    int outputs;
+    int nweights;
+    int nbiases;
+    int extra;
+    int truths;
+    int h,w,c;
+    int out_h, out_w, out_c;
+    int n;
+    int max_boxes;
+    int groups;
+    int size;
+    int side;
+    int stride;
+    int reverse;
+    int flatten;
+    int spatial;
+    int pad;
+    int sqrt;
+    int flip;
+    int index;
+    int binary;
+    int xnor;
+    int steps;
+    int hidden;
+    int truth;
+    float smooth;
+    float dot;
+    float angle;
+    float jitter;
+    float saturation;
+    float exposure;
+    float shift;
+    float ratio;
+    float learning_rate_scale;
+    float clip;
+    int noloss;
+    int softmax;
+    int classes;
+    int coords;
+    int background;
+    int rescore;
+    int objectness;
+    int joint;
+    int noadjust;
+    int reorg;
+    int log;
+    int tanh;
+    int *mask;
+    int total;
+
+    float alpha;
+    float beta;
+    float kappa;
+
+    float coord_scale;
+    float object_scale;
+    float noobject_scale;
+    float mask_scale;
+    float class_scale;
+    int bias_match;
+    int random;
+    float ignore_thresh;
+    float truth_thresh;
+    float thresh;
+    float focus;
+    int classfix;
+    int absolute;
+
+    int onlyforward;
+    int stopbackward;
+    int dontload;
+    int dontsave;
+    int dontloadscales;
+    int numload;
+
+    float temperature;
+    float probability;
+    float scale;
+
+    char  * cweights;   // pointer members start here; their lengths are not recorded in this header
+    int   * indexes;
+    int   * input_layers;
+    int   * input_sizes;
+    int   * map;
+    int   * counts;
+    float ** sums;
+    float * rand;
+    float * cost;
+    float * state;
+    float * prev_state;
+    float * forgot_state;
+    float * forgot_delta;
+    float * state_delta;
+    float * combine_cpu;
+    float * combine_delta_cpu;
+
+    float * concat;
+    float * concat_delta;
+
+    float * binary_weights;
+
+    float * biases;
+    float * bias_updates;
+
+    float * scales;
+    float * scale_updates;
+
+    float * weights;
+    float * weight_updates;
+
+    float * delta;
+    float * output;
+    float * loss;
+    float * squared;
+    float * norms;
+
+    float * spatial_mean;
+    float * mean;
+    float * variance;
+
+    float * mean_delta;
+    float * variance_delta;
+
+    float * rolling_mean;
+    float * rolling_variance;
+
+    float * x;
+    float * x_norm;
+
+    float * m;
+    float * v;
+    
+    float * bias_m;
+    float * bias_v;
+    float * scale_m;
+    float * scale_v;
+
+
+    float *z_cpu;
+    float *r_cpu;
+    float *h_cpu;
+    float * prev_state_cpu;
+
+    float *temp_cpu;
+    float *temp2_cpu;
+    float *temp3_cpu;
+
+    float *dh_cpu;
+    float *hh_cpu;
+    float *prev_cell_cpu;
+    float *cell_cpu;
+    float *f_cpu;
+    float *i_cpu;
+    float *g_cpu;
+    float *o_cpu;
+    float *c_cpu;
+    float *dc_cpu; 
+
+    float * binary_input;
+
+    struct layer *input_layer;   // from here on: pointers to other layer structs (presumably sub-layers of composite layer kinds -- confirm in the layer sources)
+    struct layer *self_layer;
+    struct layer *output_layer;
+
+    struct layer *reset_layer;
+    struct layer *update_layer;
+    struct layer *state_layer;
+
+    struct layer *input_gate_layer;
+    struct layer *state_gate_layer;
+    struct layer *input_save_layer;
+    struct layer *state_save_layer;
+    struct layer *input_state_layer;
+    struct layer *state_state_layer;
+
+    struct layer *input_z_layer;
+    struct layer *state_z_layer;
+
+    struct layer *input_r_layer;
+    struct layer *state_r_layer;
+
+    struct layer *input_h_layer;
+    struct layer *state_h_layer;
+	
+    struct layer *wz;
+    struct layer *uz;
+    struct layer *wr;
+    struct layer *ur;
+    struct layer *wh;
+    struct layer *uh;
+    struct layer *uo;
+    struct layer *wo;
+    struct layer *uf;
+    struct layer *wf;
+    struct layer *ui;
+    struct layer *wi;
+    struct layer *ug;
+    struct layer *wg;
+
+    tree *softmax_tree;
+
+    size_t workspace_size;
+
+#ifdef GPU   // everything up to the matching #endif exists only in GPU builds, so sizeof(layer) differs between builds
+    int *indexes_gpu;
+
+    float *z_gpu;
+    float *r_gpu;
+    float *h_gpu;
+
+    float *temp_gpu;
+    float *temp2_gpu;
+    float *temp3_gpu;
+
+    float *dh_gpu;
+    float *hh_gpu;
+    float *prev_cell_gpu;
+    float *cell_gpu;
+    float *f_gpu;
+    float *i_gpu;
+    float *g_gpu;
+    float *o_gpu;
+    float *c_gpu;
+    float *dc_gpu; 
+
+    float *m_gpu;
+    float *v_gpu;
+    float *bias_m_gpu;
+    float *scale_m_gpu;
+    float *bias_v_gpu;
+    float *scale_v_gpu;
+
+    float * combine_gpu;
+    float * combine_delta_gpu;
+
+    float * prev_state_gpu;
+    float * forgot_state_gpu;
+    float * forgot_delta_gpu;
+    float * state_gpu;
+    float * state_delta_gpu;
+    float * gate_gpu;
+    float * gate_delta_gpu;
+    float * save_gpu;
+    float * save_delta_gpu;
+    float * concat_gpu;
+    float * concat_delta_gpu;
+
+    float * binary_input_gpu;
+    float * binary_weights_gpu;
+
+    float * mean_gpu;
+    float * variance_gpu;
+
+    float * rolling_mean_gpu;
+    float * rolling_variance_gpu;
+
+    float * variance_delta_gpu;
+    float * mean_delta_gpu;
+
+    float * x_gpu;
+    float * x_norm_gpu;
+    float * weights_gpu;
+    float * weight_updates_gpu;
+    float * weight_change_gpu;
+
+    float * biases_gpu;
+    float * bias_updates_gpu;
+    float * bias_change_gpu;
+
+    float * scales_gpu;
+    float * scale_updates_gpu;
+    float * scale_change_gpu;
+
+    float * output_gpu;
+    float * loss_gpu;
+    float * delta_gpu;
+    float * rand_gpu;
+    float * squared_gpu;
+    float * norms_gpu;
+#ifdef CUDNN   // cuDNN handles: present only when both GPU and CUDNN are defined
+    cudnnTensorDescriptor_t srcTensorDesc, dstTensorDesc;
+    cudnnTensorDescriptor_t dsrcTensorDesc, ddstTensorDesc;
+    cudnnTensorDescriptor_t normTensorDesc;
+    cudnnFilterDescriptor_t weightDesc;
+    cudnnFilterDescriptor_t dweightDesc;
+    cudnnConvolutionDescriptor_t convDesc;
+    cudnnConvolutionFwdAlgo_t fw_algo;
+    cudnnConvolutionBwdDataAlgo_t bd_algo;
+    cudnnConvolutionBwdFilterAlgo_t bf_algo;
+#endif
+#endif
+};
+
+void free_layer(layer);
+
+typedef enum {   // type of network.policy; values are implicit, starting at CONSTANT == 0
+    CONSTANT, STEP, EXP, POLY, STEPS, SIG, RANDOM
+} learning_rate_policy;
+
+typedef struct network{   // a whole model: the layer array plus training settings, input geometry and working buffers
+    int n;   // presumably the number of entries in layers -- confirm in the network sources
+    int batch;
+    size_t *seen;
+    int *t;
+    float epoch;
+    int subdivisions;
+    layer *layers;
+    float *output;
+    learning_rate_policy policy;
+
+    float learning_rate;
+    float momentum;
+    float decay;
+    float gamma;
+    float scale;
+    float power;
+    int time_steps;
+    int step;
+    int max_batches;
+    float *scales;
+    int   *steps;
+    int num_steps;   // presumably the length of scales / steps -- confirm in the cfg parser
+    int burn_in;
+
+    int adam;
+    float B1;
+    float B2;
+    float eps;
+
+    int inputs;
+    int outputs;
+    int truths;
+    int notruth;
+    int h, w, c;
+    int max_crop;
+    int min_crop;
+    float max_ratio;
+    float min_ratio;
+    int center;
+    float angle;
+    float aspect;
+    float exposure;
+    float saturation;
+    float hue;
+    int random;
+
+    int gpu_index;   // per-network field; a global `extern int gpu_index` is also declared in this header
+    tree *hierarchy;
+
+    float *input;
+    float *truth;
+    float *delta;
+    float *workspace;
+    int train;
+    int index;
+    float *cost;
+    float clip;
+
+#ifdef GPU   // device-side counterparts; present only in GPU builds
+    float *input_gpu;
+    float *truth_gpu;
+    float *delta_gpu;
+    float *output_gpu;
+#endif
+
+} network;
+
+typedef struct {   // augmentation parameters; data.c forwards rad, scale, w, h, dx, dy, aspect to rotate_crop_image()
+    int w;
+    int h;
+    float scale;
+    float rad;   // presumably a rotation angle in radians -- confirm in rotate_crop_image()
+    float dx;
+    float dy;
+    float aspect;
+} augment_args;
+
+typedef struct {   // image handle passed by value throughout the API; copies share the same data pointer
+    int w;
+    int h;
+    int c;
+    float *data;   // data.c indexes this as data[w*h*channel + pixel], i.e. w*h*c floats stored one channel plane after another
+} image;
+
+typedef struct{   // four floats, in this order; whether x,y is a corner or a centre depends on the producer -- check the call site
+    float x, y, w, h;
+} box;
+
+typedef struct detection{   // element type of the arrays returned by get_network_boxes() and released with free_detections()
+    box bbox;
+    int classes;   // presumably the length of prob -- confirm in get_network_boxes()
+    float *prob;
+    float *mask;
+    float objectness;
+    int sort_class;
+} detection;
+
+typedef struct matrix{   // created by make_matrix(rows, cols), released with free_matrix()
+    int rows, cols;
+    float **vals;   // array of row pointers: data.c addresses elements as vals[row][col] and assigns whole rows via vals[i]
+} matrix;
+
+
+typedef struct{   // a batch of samples, passed by value; released with free_data()
+    int w, h;
+    matrix X;   // presumably one input sample per row -- confirm in data.c
+    matrix y;   // presumably the matching target row per sample -- confirm in data.c
+    int shallow;
+    int *num_boxes;
+    box **boxes;
+} data;
+
+typedef enum {   // type of load_args.type; values are implicit, starting at CLASSIFICATION_DATA == 0
+    CLASSIFICATION_DATA, DETECTION_DATA, CAPTCHA_DATA, REGION_DATA, IMAGE_DATA, COMPARE_DATA, WRITING_DATA, SWAG_DATA, TAG_DATA, OLD_CLASSIFICATION_DATA, STUDY_DATA, DET_DATA, SUPER_DATA, LETTERBOX_DATA, REGRESSION_DATA, SEGMENTATION_DATA, INSTANCE_DATA, ISEG_DATA
+} data_type;
+
+typedef struct load_args{   // argument bundle passed by value to load_data(), load_data_in_thread() and load_data_blocking(); get_base_args() returns one
+    int threads;
+    char **paths;
+    char *path;
+    int n;
+    int m;
+    char **labels;
+    int h;
+    int w;
+    int out_w;
+    int out_h;
+    int nh;
+    int nw;
+    int num_boxes;
+    int min, max, size;
+    int classes;
+    int background;
+    int scale;
+    int center;
+    int coords;
+    float jitter;
+    float angle;
+    float aspect;
+    float saturation;
+    float exposure;
+    float hue;
+    data *d;          // presumably where the loader stores its result -- confirm in data.c
+    image *im;
+    image *resized;
+    data_type type;   // selects the kind of data to load
+    tree *hierarchy;
+} load_args;
+
+typedef struct{   // element type of the array returned by read_boxes()
+    int id;           // read_boxes() fills id, x, y, w, h from each "%d %f %f %f %f" record of a label file
+    float x,y,w,h;
+    float left, right, top, bottom;
+} box_label;
+
+
+network *load_network(char *cfg, char *weights, int clear);
+load_args get_base_args(network *net);
+
+void free_data(data d);
+
+typedef struct node{   // element of `list`: an untyped payload pointer with forward and backward links
+    void *val;
+    struct node *next;
+    struct node *prev;
+} node;
+
+typedef struct list{   // list of `node`s; returned by read_cfg(), read_data_cfg() and get_paths(), released with free_list()
+    int size;
+    node *front;
+    node *back;
+} list;
+
+pthread_t load_data(load_args args);
+list *read_data_cfg(char *filename);
+list *read_cfg(char *filename);
+unsigned char *read_file(char *filename);
+data resize_data(data orig, int w, int h);
+data *tile_data(data orig, int divs, int size);
+data select_data(data *orig, int *inds);
+
+void forward_network(network *net);
+void backward_network(network *net);
+void update_network(network *net);
+
+
+float dot_cpu(int N, float *X, int INCX, float *Y, int INCY);
+void axpy_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY);
+void copy_cpu(int N, float *X, int INCX, float *Y, int INCY);
+void scal_cpu(int N, float ALPHA, float *X, int INCX);
+void fill_cpu(int N, float ALPHA, float * X, int INCX);
+void normalize_cpu(float *x, float *mean, float *variance, int batch, int filters, int spatial);
+void softmax(float *input, int n, float temp, int stride, float *output);
+
+int best_3d_shift_r(image a, image b, int min, int max);
+#ifdef GPU
+void axpy_gpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY);
+void fill_gpu(int N, float ALPHA, float * X, int INCX);
+void scal_gpu(int N, float ALPHA, float * X, int INCX);
+void copy_gpu(int N, float * X, int INCX, float * Y, int INCY);
+
+void cuda_set_device(int n);
+void cuda_free(float *x_gpu);
+float *cuda_make_array(float *x, size_t n);
+void cuda_pull_array(float *x_gpu, float *x, size_t n);
+float cuda_mag_array(float *x_gpu, size_t n);
+void cuda_push_array(float *x_gpu, float *x, size_t n);
+
+void forward_network_gpu(network *net);
+void backward_network_gpu(network *net);
+void update_network_gpu(network *net);
+
+float train_networks(network **nets, int n, data d, int interval);
+void sync_nets(network **nets, int n, int interval);
+void harmless_update_network_gpu(network *net);
+#endif
+image get_label(image **characters, char *string, int size);
+void draw_label(image a, int r, int c, image label, const float *rgb);
+void save_image(image im, const char *name);
+void save_image_options(image im, const char *name, IMTYPE f, int quality);
+void get_next_batch(data d, int n, int offset, float *X, float *y);
+void grayscale_image_3c(image im);
+void normalize_image(image p);
+void matrix_to_csv(matrix m);
+float train_network_sgd(network *net, data d, int n);
+void rgbgr_image(image im);
+data copy_data(data d);
+data concat_data(data d1, data d2);
+data load_cifar10_data(char *filename);
+float matrix_topk_accuracy(matrix truth, matrix guess, int k);
+void matrix_add_matrix(matrix from, matrix to);
+void scale_matrix(matrix m, float scale);
+matrix csv_to_matrix(char *filename);
+float *network_accuracies(network *net, data d, int n);
+float train_network_datum(network *net);
+image make_random_image(int w, int h, int c);
+
+void denormalize_connected_layer(layer l);
+void denormalize_convolutional_layer(layer l);
+void statistics_connected_layer(layer l);
+void rescale_weights(layer l, float scale, float trans);
+void rgbgr_weights(layer l);
+image *get_weights(layer l);
+
+void demo(char *cfgfile, char *weightfile, float thresh, int cam_index, const char *filename, char **names, int classes, int frame_skip, char *prefix, int avg, float hier_thresh, int w, int h, int fps, int fullscreen);
+void get_detection_detections(layer l, int w, int h, float thresh, detection *dets);
+
+char *option_find_str(list *l, char *key, char *def);
+int option_find_int(list *l, char *key, int def);
+int option_find_int_quiet(list *l, char *key, int def);
+
+network *parse_network_cfg(char *filename);
+void save_weights(network *net, char *filename);
+void load_weights(network *net, char *filename);
+void save_weights_upto(network *net, char *filename, int cutoff);
+void load_weights_upto(network *net, char *filename, int start, int cutoff);
+
+void zero_objectness(layer l);
+void get_region_detections(layer l, int w, int h, int netw, int neth, float thresh, int *map, float tree_thresh, int relative, detection *dets);
+int get_yolo_detections(layer l, int w, int h, int netw, int neth, float thresh, int *map, int relative, detection *dets);
+void free_network(network *net);
+void set_batch_network(network *net, int b);
+void set_temp_network(network *net, float t);
+image load_image(char *filename, int w, int h, int c);
+image load_image_color(char *filename, int w, int h);
+image make_image(int w, int h, int c);
+image resize_image(image im, int w, int h);
+void censor_image(image im, int dx, int dy, int w, int h);
+image letterbox_image(image im, int w, int h);
+image crop_image(image im, int dx, int dy, int w, int h);
+image center_crop_image(image im, int w, int h);
+image resize_min(image im, int min);
+image resize_max(image im, int max);
+image threshold_image(image im, float thresh);
+image mask_to_rgb(image mask);
+int resize_network(network *net, int w, int h);
+void free_matrix(matrix m);
+void test_resize(char *filename);
+int show_image(image p, const char *name, int ms);
+image copy_image(image p);
+void draw_box_width(image a, int x1, int y1, int x2, int y2, int w, float r, float g, float b);
+float get_current_rate(network *net);
+void composite_3d(char *f1, char *f2, char *out, int delta);
+data load_data_old(char **paths, int n, int m, char **labels, int k, int w, int h);
+size_t get_current_batch(network *net);
+void constrain_image(image im);
+image get_network_image_layer(network *net, int i);
+layer get_network_output_layer(network *net);
+void top_predictions(network *net, int n, int *index);
+void flip_image(image a);
+image float_to_image(int w, int h, int c, float *data);
+void ghost_image(image source, image dest, int dx, int dy);
+float network_accuracy(network *net, data d);
+void random_distort_image(image im, float hue, float saturation, float exposure);
+void fill_image(image m, float s);
+image grayscale_image(image im);
+void rotate_image_cw(image im, int times);
+double what_time_is_it_now();
+image rotate_image(image m, float rad);
+void visualize_network(network *net);
+float box_iou(box a, box b);
+data load_all_cifar10();
+box_label *read_boxes(char *filename, int *n);
+box float_to_box(float *f, int stride);
+void draw_detections(image im, detection *dets, int num, float thresh, char **names, image **alphabet, int classes);
+
+matrix network_predict_data(network *net, data test);
+image **load_alphabet();
+image get_network_image(network *net);
+float *network_predict(network *net, float *input);
+
+int network_width(network *net);
+int network_height(network *net);
+float *network_predict_image(network *net, image im);
+void network_detect(network *net, image im, float thresh, float hier_thresh, float nms, detection *dets);
+detection *get_network_boxes(network *net, int w, int h, float thresh, float hier, int *map, int relative, int *num);
+void free_detections(detection *dets, int n);
+
+void reset_network_state(network *net, int b);
+
+char **get_labels(char *filename);
+void do_nms_obj(detection *dets, int total, int classes, float thresh);
+void do_nms_sort(detection *dets, int total, int classes, float thresh);
+
+matrix make_matrix(int rows, int cols);
+
+#ifdef OPENCV
+void *open_video_stream(const char *f, int c, int w, int h, int fps);
+image get_image_from_stream(void *p);
+void make_window(char *name, int w, int h, int fullscreen);
+#endif
+
+void free_image(image m);
+float train_network(network *net, data d);
+pthread_t load_data_in_thread(load_args args);
+void load_data_blocking(load_args args);
+list *get_paths(char *filename);
+void hierarchy_predictions(float *predictions, int n, tree *hier, int only_leaves, int stride);
+void change_leaves(tree *t, char *leaf_list);
+
+int find_int_arg(int argc, char **argv, char *arg, int def);
+float find_float_arg(int argc, char **argv, char *arg, float def);
+int find_arg(int argc, char* argv[], char *arg);
+char *find_char_arg(int argc, char **argv, char *arg, char *def);
+char *basecfg(char *cfgfile);
+void find_replace(char *str, char *orig, char *rep, char *output);
+void free_ptrs(void **ptrs, int n);
+char *fgetl(FILE *fp);
+void strip(char *s);
+float sec(clock_t clocks);
+void **list_to_array(list *l);
+void top_k(float *a, int n, int k, int *index);
+int *read_map(char *filename);
+void error(const char *s);
+int max_index(float *a, int n);
+int max_int_index(int *a, int n);
+int sample_array(float *a, int n);
+int *random_index_order(int min, int max);
+void free_list(list *l);
+float mse_array(float *a, int n);
+float variance_array(float *a, int n);
+float mag_array(float *a, int n);
+void scale_array(float *a, int n, float s);
+float mean_array(float *a, int n);
+float sum_array(float *a, int n);
+void normalize_array(float *a, int n);
+int *read_intlist(char *s, int *n, int d);
+size_t rand_size_t();
+float rand_normal();
+float rand_uniform(float min, float max);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/image.darknet/src/data.c b/image.darknet/src/data.c
index 05e5a91..59051b4 100644
--- a/image.darknet/src/data.c
+++ b/image.darknet/src/data.c
@@ -102,7 +102,7 @@ matrix load_image_paths(char **paths, int n, int w, int h)
     return X;
 }
 
-matrix load_image_augment_paths(char **paths, int n, int min, int max, int size, float angle, float aspect, float hue, float saturation, float exposure)
+matrix load_image_augment_paths(char **paths, int n, int min, int max, int size, float angle, float aspect, float hue, float saturation, float exposure, int center)
 {
     int i;
     matrix X;
@@ -112,7 +112,12 @@ matrix load_image_augment_paths(char **paths, int n, int min, int max, int size,
 
     for(i = 0; i < n; ++i){
         image im = load_image_color(paths[i], 0, 0);
-        image crop = random_augment_image(im, angle, aspect, min, max, size);
+        image crop;
+        if(center){
+            crop = center_crop_image(im, size, size);
+        } else {
+            crop = random_augment_image(im, angle, aspect, min, max, size, size);
+        }
         int flip = rand()%2;
         if (flip) flip_image(crop);
         random_distort_image(crop, hue, saturation, exposure);
@@ -122,6 +127,7 @@ matrix load_image_augment_paths(char **paths, int n, int min, int max, int size,
         show_image(crop, "crop");
         cvWaitKey(0);
         */
+        //grayscale_image_3c(crop);
         free_image(im);
         X.vals[i] = crop.data;
         X.cols = crop.h*crop.w*crop.c;
@@ -132,14 +138,18 @@ matrix load_image_augment_paths(char **paths, int n, int min, int max, int size,
 
 box_label *read_boxes(char *filename, int *n)
 {
-    box_label *boxes = calloc(1, sizeof(box_label));
     FILE *file = fopen(filename, "r");
     if(!file) file_error(filename);
     float x, y, h, w;
     int id;
     int count = 0;
+    int size = 64;
+    box_label *boxes = calloc(size, sizeof(box_label));
     while(fscanf(file, "%d %f %f %f %f", &id, &x, &y, &w, &h) == 5){
-        boxes = realloc(boxes, (count+1)*sizeof(box_label));
+        if(count == size) {
+            size = size * 2;
+            boxes = realloc(boxes, size*sizeof(box_label));
+        }
         boxes[count].id = id;
         boxes[count].x = x;
         boxes[count].y = y;
@@ -221,7 +231,7 @@ void fill_truth_swag(char *path, float *truth, int classes, int flip, float dx,
     int id;
     int i;
 
-    for (i = 0; i < count && i < 30; ++i) {
+    for (i = 0; i < count && i < 90; ++i) {
         x =  boxes[i].x;
         y =  boxes[i].y;
         w =  boxes[i].w;
@@ -290,6 +300,150 @@ void fill_truth_region(char *path, float *truth, int classes, int num_boxes, int
     free(boxes);
 }
 
+// Decode n run lengths into im.data as alternating runs of 0 and 1 (the first run is 0), then pad the rest of the buffer. Writes never go past im.h*im.w*im.c.
+void load_rle(image im, int *rle, int n)
+{
+    int total = im.h*im.w*im.c;   // number of floats available in im.data
+    int i, j, count = 0, curr = 0;
+    for(i = 0; i < n; ++i){
+        // run lengths are parsed from label files; stop at the end of the buffer if they overshoot
+        for(j = 0; j < rle[i] && count < total; ++j) im.data[count++] = curr;
+        curr = 1 - curr;
+    }
+    // fill whatever is left with the value the next run would have had
+    for(; count < total; ++count){
+        im.data[count] = curr;
+    }
+}
+
+// Set channel c of dest to 1 wherever the first channel of src is non-zero; an out-of-range c is ignored. Assumes src and dest share w and h -- confirm at call sites.
+void or_image(image src, image dest, int c)
+{
+    int i;
+    if(c < 0 || c >= dest.c) return;   // c is a class id read from a label file; never index outside dest.data
+    for(i = 0; i < src.w*src.h; ++i) if(src.data[i]) dest.data[dest.w*dest.h*c + i] = 1;
+}
+
+// Make the channels of src mutually exclusive: at every pixel, all channels after the lowest-indexed non-zero one are set to 0.
+void exclusive_image(image src)
+{
+    int plane = src.w*src.h;   // pixels per channel
+    int px, ch;
+    for(px = 0; px < plane; ++px){
+        // advance ch to the first channel that is set at this pixel (or to src.c if none is)
+        for(ch = 0; ch < src.c && !src.data[ch*plane + px]; ++ch){}
+        // clear every channel above it
+        for(++ch; ch < src.c; ++ch){
+            src.data[ch*plane + px] = 0;
+        }
+    }
+}
+
+// Bounding box {x, y, w, h}, in pixels, of the non-zero pixels in the first channel of im. With no non-zero pixel the result is {im.w, im.h, 1-im.w, 1-im.h}.
+box bound_image(image im)
+{
+    int row, col, right = 0, bottom = 0;
+    int left = im.w, top = im.h;   // start past the far edges so the first hit pulls them in
+    box bounds;
+    for(row = 0; row < im.h; ++row){
+        for(col = 0; col < im.w; ++col){
+            if(!im.data[row*im.w + col]) continue;
+            if(col < left) left = col;
+            if(row < top) top = row;
+            if(col > right) right = col;
+            if(row > bottom) bottom = row;
+        }
+    }
+    bounds.x = left;
+    bounds.y = top;
+    bounds.w = right-left + 1;
+    bounds.h = bottom-top + 1;
+    return bounds;
+}
+
+// Fill truth with up to num_boxes records from the mask file derived from path. Record i occupies mw*mh+1 floats: the id, then the augmented mask resized to mw x mh. If fewer than num_boxes records were stored, the next id slot is set to -1. `classes` is unused.
+void fill_truth_iseg(char *path, int num_boxes, float *truth, int classes, int w, int h, augment_args aug, int flip, int mw, int mh)
+{
+    char labelpath[4096];
+    find_replace(path, "images", "mask", labelpath);
+    find_replace(labelpath, "JPEGImages", "mask", labelpath);
+    find_replace(labelpath, ".jpg", ".txt", labelpath);
+    find_replace(labelpath, ".JPG", ".txt", labelpath);
+    find_replace(labelpath, ".JPEG", ".txt", labelpath);
+    FILE *file = fopen(labelpath, "r");
+    if(!file) file_error(labelpath);
+    char buff[32788];   // holds the run-length list of one record
+    int id;
+    int i = 0, j;
+    image part = make_image(w, h, 1);   // scratch mask of size w x h, reused for every record
+    while((fscanf(file, "%d %32787s", &id, buff) == 2) && i < num_boxes){   // field width = sizeof(buff)-1, so an over-long token cannot overrun buff
+        int n = 0;
+        int *rle = read_intlist(buff, &n, 0);
+        load_rle(part, rle, n);
+        image sized = rotate_crop_image(part, aug.rad, aug.scale, aug.w, aug.h, aug.dx, aug.dy, aug.aspect);
+        if(flip) flip_image(sized);
+
+        image mask = resize_image(sized, mw, mh);
+        truth[i*(mw*mh+1)] = id;
+        for(j = 0; j < mw*mh; ++j){
+            truth[i*(mw*mh + 1) + 1 + j] = mask.data[j];
+        }
+        ++i;
+
+        free_image(mask);
+        free_image(sized);
+        free(rle);
+    }
+    if(i < num_boxes) truth[i*(mw*mh+1)] = -1;   // end-of-list marker for a short list
+    fclose(file);
+    free_image(part);
+}
+
+// Fill truth with up to num_boxes records from the mask file derived from path. Each stored record occupies 4+mw*mh+1 floats: box centre x, y and size w, h (divided by the augmented image size), the cropped mask resized to mw x mh, then the id. Records whose augmented mask has no positive-width bounding box are skipped. `classes` is unused.
+void fill_truth_mask(char *path, int num_boxes, float *truth, int classes, int w, int h, augment_args aug, int flip, int mw, int mh)
+{
+    char labelpath[4096];
+    find_replace(path, "images", "mask", labelpath);
+    find_replace(labelpath, "JPEGImages", "mask", labelpath);
+    find_replace(labelpath, ".jpg", ".txt", labelpath);
+    find_replace(labelpath, ".JPG", ".txt", labelpath);
+    find_replace(labelpath, ".JPEG", ".txt", labelpath);
+    FILE *file = fopen(labelpath, "r");
+    if(!file) file_error(labelpath);
+    char buff[32788];   // holds the run-length list of one record
+    int id, i = 0;
+    image part = make_image(w, h, 1);   // scratch mask of size w x h, reused for every record
+    while((fscanf(file, "%d %32787s", &id, buff) == 2) && i < num_boxes){   // field width = sizeof(buff)-1, so an over-long token cannot overrun buff
+        int n = 0;
+        int *rle = read_intlist(buff, &n, 0);
+        load_rle(part, rle, n);
+        image sized = rotate_crop_image(part, aug.rad, aug.scale, aug.w, aug.h, aug.dx, aug.dy, aug.aspect);
+        if(flip) flip_image(sized);
+        box b = bound_image(sized);
+        if(b.w > 0){   // only store records whose bounding box has positive width
+            image crop = crop_image(sized, b.x, b.y, b.w, b.h);
+            image mask = resize_image(crop, mw, mh);
+            truth[i*(4 + mw*mh + 1) + 0] = (b.x + b.w/2.)/sized.w;
+            truth[i*(4 + mw*mh + 1) + 1] = (b.y + b.h/2.)/sized.h;
+            truth[i*(4 + mw*mh + 1) + 2] = b.w/sized.w;
+            truth[i*(4 + mw*mh + 1) + 3] = b.h/sized.h;
+            int j;
+            for(j = 0; j < mw*mh; ++j){
+                truth[i*(4 + mw*mh + 1) + 4 + j] = mask.data[j];
+            }
+            truth[i*(4 + mw*mh + 1) + 4 + mw*mh] = id;
+            free_image(crop);
+            free_image(mask);
+            ++i;
+        }
+        free_image(sized);
+        free(rle);
+    }
+    fclose(file);
+    free_image(part);
+}
+
+
 void fill_truth_detection(char *path, int num_boxes, float *truth, int classes, int flip, float dx, float dy, float sx, float sy)
 {
     char labelpath[4096];
@@ -309,6 +463,7 @@ void fill_truth_detection(char *path, int num_boxes, float *truth, int classes,
     float x,y,w,h;
     int id;
     int i;
+    int sub = 0;
 
     for (i = 0; i < count; ++i) {
         x =  boxes[i].x;
@@ -317,13 +472,16 @@ void fill_truth_detection(char *path, int num_boxes, float *truth, int classes,
         h =  boxes[i].h;
         id = boxes[i].id;
 
-        if ((w < .005 || h < .005)) continue;
+        if ((w < .001 || h < .001)) {
+            ++sub;
+            continue;
+        }
 
-        truth[i*5+0] = x;
-        truth[i*5+1] = y;
-        truth[i*5+2] = w;
-        truth[i*5+3] = h;
-        truth[i*5+4] = id;
+        truth[(i-sub)*5+0] = x;
+        truth[(i-sub)*5+1] = y;
+        truth[(i-sub)*5+2] = w;
+        truth[(i-sub)*5+3] = h;
+        truth[(i-sub)*5+4] = id;
     }
     free(boxes);
 }
@@ -391,9 +549,10 @@ void fill_truth(char *path, char **labels, int k, float *truth)
         if(strstr(path, labels[i])){
             truth[i] = 1;
             ++count;
+            //printf("%s %s %d\n", path, labels[i], i);
         }
     }
-    if(count != 1) printf("Too many or too few labels: %d, %s\n", count, path);
+    if(count != 1 && (k != 1 || count != 0)) printf("Too many or too few labels: %d, %s\n", count, path);
 }
 
 void fill_hierarchy(float *truth, int k, tree *hierarchy)
@@ -428,6 +587,36 @@ void fill_hierarchy(float *truth, int k, tree *hierarchy)
     }
 }
 
+// Build an n x k matrix whose row i holds up to k floats read from the label file derived from paths[i] ("images"/"JPEGImages" -> "labels", image extension -> ".txt"). Entries the file does not supply keep the value make_matrix() gave them.
+matrix load_regression_labels_paths(char **paths, int n, int k)
+{
+    matrix y = make_matrix(n, k);
+    int i,j;
+    for(i = 0; i < n; ++i){
+        char labelpath[4096];
+        find_replace(paths[i], "images", "labels", labelpath);
+        find_replace(labelpath, "JPEGImages", "labels", labelpath);
+        find_replace(labelpath, ".BMP", ".txt", labelpath);
+        find_replace(labelpath, ".JPEG", ".txt", labelpath);
+        find_replace(labelpath, ".JPG", ".txt", labelpath);
+        find_replace(labelpath, ".JPeG", ".txt", labelpath);
+        find_replace(labelpath, ".Jpeg", ".txt", labelpath);
+        find_replace(labelpath, ".PNG", ".txt", labelpath);
+        find_replace(labelpath, ".TIF", ".txt", labelpath);
+        find_replace(labelpath, ".bmp", ".txt", labelpath);
+        find_replace(labelpath, ".jpeg", ".txt", labelpath);
+        find_replace(labelpath, ".jpg", ".txt", labelpath);
+        find_replace(labelpath, ".png", ".txt", labelpath);
+        find_replace(labelpath, ".tif", ".txt", labelpath);
+
+        FILE *file = fopen(labelpath, "r");
+        if(!file) file_error(labelpath);   // a missing label file would otherwise hand NULL to fscanf/fclose; same handling as the other label readers in this file
+        for(j = 0; j < k; ++j) fscanf(file, "%f", &(y.vals[i][j]));
+        fclose(file);
+    }
+    return y;
+}
+
 matrix load_labels_paths(char **paths, int n, char **labels, int k, tree *hierarchy)
 {
     matrix y = make_matrix(n, k);
@@ -445,18 +634,14 @@ matrix load_tags_paths(char **paths, int n, int k)
 {
     matrix y = make_matrix(n, k);
     int i;
-    int count = 0;
+    //int count = 0;
     for(i = 0; i < n; ++i){
         char label[4096];
-        find_replace(paths[i], "imgs", "labels", label);
-        find_replace(label, "_iconl.jpeg", ".txt", label);
+        find_replace(paths[i], "images", "labels", label);
+        find_replace(label, ".jpg", ".txt", label);
         FILE *file = fopen(label, "r");
-        if(!file){
-            find_replace(label, "labels", "labels2", label);
-            file = fopen(label, "r");
-            if(!file) continue;
-        }
-        ++count;
+        if (!file) continue;
+        //++count;
         int tag;
         while(fscanf(file, "%d", &tag) == 1){
             if(tag < k){
@@ -465,7 +650,7 @@ matrix load_tags_paths(char **paths, int n, int k)
         }
         fclose(file);
     }
-    printf("%d/%d\n", count, n);
+    //printf("%d/%d\n", count, n);
     return y;
 }
 
@@ -488,6 +673,195 @@ void free_data(data d)
     }
 }
 
+image get_segmentation_image(char *path, int w, int h, int classes)
+{   // Read the mask text file paired with image `path`; returns a w x h image with `classes` channels.
+    char labelpath[4096];
+    find_replace(path, "images", "mask", labelpath);            // image directory name -> mask directory name
+    find_replace(labelpath, "JPEGImages", "mask", labelpath);
+    find_replace(labelpath, ".jpg", ".txt", labelpath);         // image extension -> .txt
+    find_replace(labelpath, ".JPG", ".txt", labelpath);
+    find_replace(labelpath, ".JPEG", ".txt", labelpath);
+    image mask = make_image(w, h, classes);
+    FILE *file = fopen(labelpath, "r");
+    if(!file) file_error(labelpath);                            // NOTE(review): presumably does not return - confirm
+    char buff[32788];
+    int id;
+    image part = make_image(w, h, 1);                           // single-channel scratch image reused per record
+    while(fscanf(file, "%d %32787s", &id, buff) == 2){          // width = sizeof(buff)-1: token cannot overrun buff
+        int n = 0;
+        int *rle = read_intlist(buff, &n, 0);                   // parse the list token into n ints
+        load_rle(part, rle, n);
+        or_image(part, mask, id);                               // NOTE(review): id is not range-checked against classes - confirm
+        free(rle);
+    }
+    //exclusive_image(mask);
+    fclose(file);
+    free_image(part);
+    return mask;                                                // caller owns the returned image
+}
+
+image get_segmentation_image2(char *path, int w, int h, int classes)
+{   // Like get_segmentation_image, plus one extra channel (index `classes`) that stays 1 where no part is set.
+    char labelpath[4096];
+    find_replace(path, "images", "mask", labelpath);            // image directory name -> mask directory name
+    find_replace(labelpath, "JPEGImages", "mask", labelpath);
+    find_replace(labelpath, ".jpg", ".txt", labelpath);         // image extension -> .txt
+    find_replace(labelpath, ".JPG", ".txt", labelpath);
+    find_replace(labelpath, ".JPEG", ".txt", labelpath);
+    image mask = make_image(w, h, classes+1);
+    int i;
+    for(i = 0; i < w*h; ++i){
+        mask.data[w*h*classes + i] = 1;                         // extra channel starts fully set
+    }
+    FILE *file = fopen(labelpath, "r");
+    if(!file) file_error(labelpath);                            // NOTE(review): presumably does not return - confirm
+    char buff[32788];
+    int id;
+    image part = make_image(w, h, 1);                           // single-channel scratch image reused per record
+    while(fscanf(file, "%d %32787s", &id, buff) == 2){          // width = sizeof(buff)-1: token cannot overrun buff
+        int n = 0;
+        int *rle = read_intlist(buff, &n, 0);                   // parse the list token into n ints
+        load_rle(part, rle, n);
+        or_image(part, mask, id);                               // NOTE(review): id is not range-checked against classes - confirm
+        for(i = 0; i < w*h; ++i){
+            if(part.data[i]) mask.data[w*h*classes + i] = 0;    // pixel belongs to some part: clear the extra channel
+        }
+        free(rle);
+    }
+    //exclusive_image(mask);
+    fclose(file);
+    free_image(part);
+    return mask;                                                // caller owns the returned image
+}
+
+data load_data_seg(int n, char **paths, int m, int w, int h, int classes, int min, int max, float angle, float aspect, float hue, float saturation, float exposure, int div)
+{   // Build n augmented (image, segmentation mask) pairs; masks are produced at 1/div of the image resolution.
+    char **random_paths = get_random_paths(paths, n, m);    // NOTE(review): presumably n paths drawn from the m given - confirm
+    int i;
+    data d = {0};
+    d.shallow = 0;
+    // X: one row per sample; row buffers are filled in by the loop below.
+    d.X.rows = n;
+    d.X.vals = calloc(d.X.rows, sizeof(float*));
+    d.X.cols = h*w*3;
+
+    // y: one row per sample, h*w*classes values reduced by div in each spatial dimension.
+    d.y.rows = n;
+    d.y.cols = h*w*classes/div/div;
+    d.y.vals = calloc(d.X.rows, sizeof(float*));            // d.X.rows == n == d.y.rows at this point
+
+    for(i = 0; i < n; ++i){
+        image orig = load_image_color(random_paths[i], 0, 0);
+        augment_args a = random_augment_args(orig, angle, aspect, min, max, w, h);
+        image sized = rotate_crop_image(orig, a.rad, a.scale, a.w, a.h, a.dx, a.dy, a.aspect);
+        // One flip decision per sample, applied to both the image and its mask.
+        int flip = rand()%2;
+        if(flip) flip_image(sized);
+        random_distort_image(sized, hue, saturation, exposure);
+        d.X.vals[i] = sized.data;                           // buffer is kept by d; sized is not freed here
+        // The mask is loaded at the original image size and put through the same transform as the image.
+        image mask = get_segmentation_image(random_paths[i], orig.w, orig.h, classes);
+        //image mask = make_image(orig.w, orig.h, classes+1);
+        image sized_m = rotate_crop_image(mask, a.rad, a.scale/div, a.w/div, a.h/div, a.dx/div, a.dy/div, a.aspect);
+
+        if(flip) flip_image(sized_m);
+        d.y.vals[i] = sized_m.data;                         // buffer is kept by d; sized_m is not freed here
+
+        free_image(orig);
+        free_image(mask);
+
+        /*
+           image rgb = mask_to_rgb(sized_m, classes);
+           show_image(rgb, "part");
+           show_image(sized, "orig");
+           cvWaitKey(0);
+           free_image(rgb);
+         */
+    }
+    free(random_paths);                                     // frees only the pointer array, not the path strings
+    return d;
+}
+
+data load_data_iseg(int n, char **paths, int m, int w, int h, int classes, int boxes, int div, int min, int max, float angle, float aspect, float hue, float saturation, float exposure)
+{   // Build n augmented images plus truth rows written by fill_truth_iseg on a (w/div) x (h/div) grid.
+    char **random_paths = get_random_paths(paths, n, m);    // NOTE(review): presumably n paths drawn from the m given - confirm
+    int i;
+    data d = {0};
+    d.shallow = 0;
+    // X: one row per sample; row buffers are filled in by the loop below.
+    d.X.rows = n;
+    d.X.vals = calloc(d.X.rows, sizeof(float*));
+    d.X.cols = h*w*3;
+    // y: for each of `boxes` slots, (w/div)*(h/div) values plus one extra value.
+    d.y = make_matrix(n, (((w/div)*(h/div))+1)*boxes);
+
+    for(i = 0; i < n; ++i){
+        image orig = load_image_color(random_paths[i], 0, 0);
+        augment_args a = random_augment_args(orig, angle, aspect, min, max, w, h);
+        image sized = rotate_crop_image(orig, a.rad, a.scale, a.w, a.h, a.dx, a.dy, a.aspect);
+        // The flip decision is also passed to fill_truth_iseg along with the augmentation arguments.
+        int flip = rand()%2;
+        if(flip) flip_image(sized);
+        random_distort_image(sized, hue, saturation, exposure);
+        d.X.vals[i] = sized.data;                           // buffer is kept by d; sized is not freed here
+        //show_image(sized, "image");
+
+        fill_truth_iseg(random_paths[i], boxes, d.y.vals[i], classes, orig.w, orig.h, a, flip, w/div, h/div);
+
+        free_image(orig);
+
+        /*
+           image rgb = mask_to_rgb(sized_m, classes);
+           show_image(rgb, "part");
+           show_image(sized, "orig");
+           cvWaitKey(0);
+           free_image(rgb);
+         */
+    }
+    free(random_paths);                                     // frees only the pointer array, not the path strings
+    return d;
+}
+
+data load_data_mask(int n, char **paths, int m, int w, int h, int classes, int boxes, int coords, int min, int max, float angle, float aspect, float hue, float saturation, float exposure)
+{   // Build n augmented images plus truth rows written by fill_truth_mask, (coords+1) values per box slot.
+    char **random_paths = get_random_paths(paths, n, m);    // NOTE(review): presumably n paths drawn from the m given - confirm
+    int i;
+    data d = {0};
+    d.shallow = 0;
+    // X: one row per sample; row buffers are filled in by the loop below.
+    d.X.rows = n;
+    d.X.vals = calloc(d.X.rows, sizeof(float*));
+    d.X.cols = h*w*3;
+    // y: `boxes` slots of coords+1 values each.
+    d.y = make_matrix(n, (coords+1)*boxes);
+
+    for(i = 0; i < n; ++i){
+        image orig = load_image_color(random_paths[i], 0, 0);
+        augment_args a = random_augment_args(orig, angle, aspect, min, max, w, h);
+        image sized = rotate_crop_image(orig, a.rad, a.scale, a.w, a.h, a.dx, a.dy, a.aspect);
+        // The flip decision is also passed to fill_truth_mask along with the augmentation arguments.
+        int flip = rand()%2;
+        if(flip) flip_image(sized);
+        random_distort_image(sized, hue, saturation, exposure);
+        d.X.vals[i] = sized.data;                           // buffer is kept by d; sized is not freed here
+        //show_image(sized, "image");
+        // NOTE(review): the trailing 14, 14 are hard-coded; presumably the mask width/height - confirm against fill_truth_mask.
+        fill_truth_mask(random_paths[i], boxes, d.y.vals[i], classes, orig.w, orig.h, a, flip, 14, 14);
+
+        free_image(orig);
+
+        /*
+           image rgb = mask_to_rgb(sized_m, classes);
+           show_image(rgb, "part");
+           show_image(sized, "orig");
+           cvWaitKey(0);
+           free_image(rgb);
+         */
+    }
+    free(random_paths);                                     // frees only the pointer array, not the path strings
+    return d;
+}
+
 data load_data_region(int n, char **paths, int m, int w, int h, int size, int classes, float jitter, float hue, float saturation, float exposure)
 {
     char **random_paths = get_random_paths(paths, n, m);
@@ -624,7 +998,7 @@ data load_data_swag(char **paths, int n, int classes, float jitter)
     d.X.vals = calloc(d.X.rows, sizeof(float*));
     d.X.cols = h*w*3;
 
-    int k = (4+classes)*30;
+    int k = (4+classes)*90;
     d.y = make_matrix(1, k);
 
     int dw = w*jitter;
@@ -673,45 +1047,46 @@ data load_data_detection(int n, char **paths, int m, int w, int h, int boxes, in
     d.y = make_matrix(n, 5*boxes);
     for(i = 0; i < n; ++i){
         image orig = load_image_color(random_paths[i], 0, 0);
+        image sized = make_image(w, h, orig.c);
+        fill_image(sized, .5);
 
-        int oh = orig.h;
-        int ow = orig.w;
+        float dw = jitter * orig.w;
+        float dh = jitter * orig.h;
 
-        int dw = (ow*jitter);
-        int dh = (oh*jitter);
+        float new_ar = (orig.w + rand_uniform(-dw, dw)) / (orig.h + rand_uniform(-dh, dh));
+        //float scale = rand_uniform(.25, 2);
+        float scale = 1;
 
-        int pleft  = rand_uniform(-dw, dw);
-        int pright = rand_uniform(-dw, dw);
-        int ptop   = rand_uniform(-dh, dh);
-        int pbot   = rand_uniform(-dh, dh);
+        float nw, nh;
 
-        int swidth =  ow - pleft - pright;
-        int sheight = oh - ptop - pbot;
+        if(new_ar < 1){
+            nh = scale * h;
+            nw = nh * new_ar;
+        } else {
+            nw = scale * w;
+            nh = nw / new_ar;
+        }
 
-        float sx = (float)swidth  / ow;
-        float sy = (float)sheight / oh;
+        float dx = rand_uniform(0, w - nw);
+        float dy = rand_uniform(0, h - nh);
 
-        int flip = rand()%2;
-        image cropped = crop_image(orig, pleft, ptop, swidth, sheight);
+        place_image(orig, nw, nh, dx, dy, sized);
 
-        float dx = ((float)pleft/ow)/sx;
-        float dy = ((float)ptop /oh)/sy;
+        random_distort_image(sized, hue, saturation, exposure);
 
-        image sized = resize_image(cropped, w, h);
+        int flip = rand()%2;
         if(flip) flip_image(sized);
-        random_distort_image(sized, hue, saturation, exposure);
         d.X.vals[i] = sized.data;
 
-        fill_truth_detection(random_paths[i], boxes, d.y.vals[i], classes, flip, dx, dy, 1./sx, 1./sy);
+
+        fill_truth_detection(random_paths[i], boxes, d.y.vals[i], classes, flip, -dx/w, -dy/h, nw/w, nh/h);
 
         free_image(orig);
-        free_image(cropped);
     }
     free(random_paths);
     return d;
 }
 
-
 void *load_thread(void *ptr)
 {
     //printf("Loading data: %d\n", rand());
@@ -722,12 +1097,20 @@ void *load_thread(void *ptr)
 
     if (a.type == OLD_CLASSIFICATION_DATA){
         *a.d = load_data_old(a.paths, a.n, a.m, a.labels, a.classes, a.w, a.h);
+    } else if (a.type == REGRESSION_DATA){
+        *a.d = load_data_regression(a.paths, a.n, a.m, a.classes, a.min, a.max, a.size, a.angle, a.aspect, a.hue, a.saturation, a.exposure);
     } else if (a.type == CLASSIFICATION_DATA){
-        *a.d = load_data_augment(a.paths, a.n, a.m, a.labels, a.classes, a.hierarchy, a.min, a.max, a.size, a.angle, a.aspect, a.hue, a.saturation, a.exposure);
+        *a.d = load_data_augment(a.paths, a.n, a.m, a.labels, a.classes, a.hierarchy, a.min, a.max, a.size, a.angle, a.aspect, a.hue, a.saturation, a.exposure, a.center);
     } else if (a.type == SUPER_DATA){
         *a.d = load_data_super(a.paths, a.n, a.m, a.w, a.h, a.scale);
     } else if (a.type == WRITING_DATA){
         *a.d = load_data_writing(a.paths, a.n, a.m, a.w, a.h, a.out_w, a.out_h);
+    } else if (a.type == ISEG_DATA){
+        *a.d = load_data_iseg(a.n, a.paths, a.m, a.w, a.h, a.classes, a.num_boxes, a.scale, a.min, a.max, a.angle, a.aspect, a.hue, a.saturation, a.exposure);
+    } else if (a.type == INSTANCE_DATA){
+        *a.d = load_data_mask(a.n, a.paths, a.m, a.w, a.h, a.classes, a.num_boxes, a.coords, a.min, a.max, a.angle, a.aspect, a.hue, a.saturation, a.exposure);
+    } else if (a.type == SEGMENTATION_DATA){
+        *a.d = load_data_seg(a.n, a.paths, a.m, a.w, a.h, a.classes, a.min, a.max, a.angle, a.aspect, a.hue, a.saturation, a.exposure, a.scale);
     } else if (a.type == REGION_DATA){
         *a.d = load_data_region(a.n, a.paths, a.m, a.w, a.h, a.num_boxes, a.classes, a.jitter, a.hue, a.saturation, a.exposure);
     } else if (a.type == DETECTION_DATA){
@@ -739,6 +1122,9 @@ void *load_thread(void *ptr)
     } else if (a.type == IMAGE_DATA){
         *(a.im) = load_image_color(a.path, 0, 0);
         *(a.resized) = resize_image(*(a.im), a.w, a.h);
+    } else if (a.type == LETTERBOX_DATA){
+        *(a.im) = load_image_color(a.path, 0, 0);
+        *(a.resized) = letterbox_image(*(a.im), a.w, a.h);
     } else if (a.type == TAG_DATA){
         *a.d = load_data_tag(a.paths, a.n, a.m, a.classes, a.min, a.max, a.size, a.angle, a.aspect, a.hue, a.saturation, a.exposure);
     }
@@ -784,6 +1170,13 @@ void *load_threads(void *ptr)
     return 0;
 }
 
+void load_data_blocking(load_args args)
+{   // Run one load on the calling thread by invoking load_thread directly on a heap copy of args.
+    struct load_args *heap_args = calloc(1, sizeof *heap_args);
+    *heap_args = args;
+    load_thread(heap_args);   // NOTE(review): presumably load_thread releases its argument - confirm
+}
+
 pthread_t load_data(load_args args)
 {
     pthread_t thread;
@@ -863,12 +1256,95 @@ data load_data_super(char **paths, int n, int m, int w, int h, int scale)
     return d;
 }
 
-data load_data_augment(char **paths, int n, int m, char **labels, int k, tree *hierarchy, int min, int max, int size, float angle, float aspect, float hue, float saturation, float exposure)
+data load_data_regression(char **paths, int n, int m, int k, int min, int max, int size, float angle, float aspect, float hue, float saturation, float exposure)
 {
     if(m) paths = get_random_paths(paths, n, m);
     data d = {0};
     d.shallow = 0;
-    d.X = load_image_augment_paths(paths, n, min, max, size, angle, aspect, hue, saturation, exposure);
+    d.X = load_image_augment_paths(paths, n, min, max, size, angle, aspect, hue, saturation, exposure, 0);
+    d.y = load_regression_labels_paths(paths, n, k);
+    if(m) free(paths);
+    return d;
+}
+
+data select_data(data *orig, int *inds)
+{   // Shallow view over several data sets: row r is borrowed from orig[inds[r]]; sizes follow orig[0].
+    data picked = {0};
+    picked.shallow = 1;
+    picked.w = orig->w;
+    picked.h = orig->h;
+    // Both row counts are taken from the sample matrix of the first data set.
+    picked.X.rows = orig->X.rows;
+    picked.y.rows = orig->X.rows;
+    // Column counts are likewise those of the first data set.
+    picked.X.cols = orig->X.cols;
+    picked.y.cols = orig->y.cols;
+    // Only the row-pointer tables are allocated; the rows themselves stay owned by orig.
+    picked.X.vals = calloc(orig->X.rows, sizeof(float *));
+    picked.y.vals = calloc(orig->y.rows, sizeof(float *));
+    int r;
+    for(r = 0; r < picked.X.rows; ++r){
+        picked.X.vals[r] = orig[inds[r]].X.vals[r];
+        picked.y.vals[r] = orig[inds[r]].y.vals[r];
+    }
+    return picked;
+}
+
+data *tile_data(data orig, int divs, int size)
+{   // Cut every image of orig into a divs x divs grid of crops; returns divs*divs data sets (caller frees).
+    data *ds = calloc(divs*divs, sizeof(data));
+    int i, j;
+#pragma omp parallel for
+    for(i = 0; i < divs*divs; ++i){
+        data d = {0};                                       // zero-init so fields not set below are not indeterminate in ds[i]
+        d.shallow = 0;
+        d.w = orig.w/divs * size;                           // crop size = one grid cell scaled by `size`
+        d.h = orig.h/divs * size;
+        d.X.rows = orig.X.rows;
+        d.X.cols = d.w*d.h*3;
+        d.X.vals = calloc(d.X.rows, sizeof(float*));
+        // Every tile carries its own copy of the labels.
+        d.y = copy_matrix(orig.y);
+#pragma omp parallel for
+        for(j = 0; j < orig.X.rows; ++j){
+            int x = (i%divs) * orig.w / divs - (d.w - orig.w/divs)/2;   // cell origin shifted so the crop is centred on the cell
+            int y = (i/divs) * orig.h / divs - (d.h - orig.h/divs)/2;
+            image im = float_to_image(orig.w, orig.h, 3, orig.X.vals[j]);
+            d.X.vals[j] = crop_image(im, x, y, d.w, d.h).data;          // crop buffer is kept by the tile
+        }
+        ds[i] = d;
+    }
+    return ds;
+}
+
+data resize_data(data orig, int w, int h)
+{   // New data set with every 3-channel image of orig resized to w x h; labels go through copy_matrix.
+    int row;
+    data out = {0};
+    out.shallow = 0;
+    out.w = w;
+    out.h = h;
+    out.X.cols = w*h*3;
+    out.X.rows = orig.X.rows;
+    out.X.vals = calloc(out.X.rows, sizeof(float*));
+    out.y = copy_matrix(orig.y);
+#pragma omp parallel for
+    for(row = 0; row < orig.X.rows; ++row){
+        image src = float_to_image(orig.w, orig.h, 3, orig.X.vals[row]);   // wraps the existing row buffer
+        image scaled = resize_image(src, w, h);
+        out.X.vals[row] = scaled.data;                                     // resized buffer is kept by out
+    }
+    return out;
+}
+
+data load_data_augment(char **paths, int n, int m, char **labels, int k, tree *hierarchy, int min, int max, int size, float angle, float aspect, float hue, float saturation, float exposure, int center)
+{
+    if(m) paths = get_random_paths(paths, n, m);
+    data d = {0};
+    d.shallow = 0;
+    d.w=size;
+    d.h=size;
+    d.X = load_image_augment_paths(paths, n, min, max, size, angle, aspect, hue, saturation, exposure, center);
     d.y = load_labels_paths(paths, n, labels, k, hierarchy);
     if(m) free(paths);
     return d;
@@ -881,7 +1357,7 @@ data load_data_tag(char **paths, int n, int m, int k, int min, int max, int size
     d.w = size;
     d.h = size;
     d.shallow = 0;
-    d.X = load_image_augment_paths(paths, n, min, max, size, angle, aspect, hue, saturation, exposure);
+    d.X = load_image_augment_paths(paths, n, min, max, size, angle, aspect, hue, saturation, exposure, 0);
     d.y = load_tags_paths(paths, n, k);
     if(m) free(paths);
     return d;
@@ -909,6 +1385,8 @@ data concat_data(data d1, data d2)
     d.shallow = 1;
     d.X = concat_matrix(d1.X, d2.X);
     d.y = concat_matrix(d1.y, d2.y);
+    d.w = d1.w;
+    d.h = d1.h;
     return d;
 }
 
@@ -962,7 +1440,6 @@ data load_cifar10_data(char *filename)
             X.vals[i][j] = (double)bytes[j+1];
         }
     }
-    //translate_data_rows(d, -128);
     scale_data_rows(d, 1./255);
     //normalize_data_rows(d);
     fclose(fp);
@@ -985,7 +1462,7 @@ void get_next_batch(data d, int n, int offset, float *X, float *y)
     for(j = 0; j < n; ++j){
         int index = offset + j;
         memcpy(X+j*d.X.cols, d.X.vals[index], d.X.cols*sizeof(float));
-        memcpy(y+j*d.y.cols, d.y.vals[index], d.y.cols*sizeof(float));
+        if(y) memcpy(y+j*d.y.cols, d.y.vals[index], d.y.cols*sizeof(float));
     }
 }
 
@@ -1029,7 +1506,6 @@ data load_all_cifar10()
         fclose(fp);
     }
     //normalize_data_rows(d);
-    //translate_data_rows(d, -128);
     scale_data_rows(d, 1./255);
     smooth_data(d);
     return d;
@@ -1113,6 +1589,19 @@ void translate_data_rows(data d, float s)
     }
 }
 
+data copy_data(data d)
+{   // Non-shallow duplicate of d: X and y go through copy_matrix, the box fields are carried over by value.
+    data dup = {0};
+    dup.X = copy_matrix(d.X);
+    dup.y = copy_matrix(d.y);
+    dup.shallow = 0;
+    dup.w = d.w;
+    dup.h = d.h;
+    dup.boxes = d.boxes;            // plain assignment: whatever boxes refers to is shared with d
+    dup.num_boxes = d.num_boxes;    // plain assignment as well
+    return dup;
+}
+
 void normalize_data_rows(data d)
 {
     int i;
diff --git a/image.darknet/src/data.h b/image.darknet/src/data.h
index 3f6ef61..781906f 100644
--- a/image.darknet/src/data.h
+++ b/image.darknet/src/data.h
@@ -2,6 +2,7 @@
 #define DATA_H
 #include <pthread.h>
 
+#include "darknet.h"
 #include "matrix.h"
 #include "list.h"
 #include "image.h"
@@ -17,93 +18,32 @@ static inline float distance_from_edge(int x, int max)
     if (dist > 1) dist = 1;
     return dist;
 }
+void load_data_blocking(load_args args);
 
-typedef struct{
-    int w, h;
-    matrix X;
-    matrix y;
-    int shallow;
-    int *num_boxes;
-    box **boxes;
-} data;
-
-typedef enum {
-    CLASSIFICATION_DATA, DETECTION_DATA, CAPTCHA_DATA, REGION_DATA, IMAGE_DATA, COMPARE_DATA, WRITING_DATA, SWAG_DATA, TAG_DATA, OLD_CLASSIFICATION_DATA, STUDY_DATA, DET_DATA, SUPER_DATA
-} data_type;
-
-typedef struct load_args{
-    int threads;
-    char **paths;
-    char *path;
-    int n;
-    int m;
-    char **labels;
-    int h;
-    int w;
-    int out_w;
-    int out_h;
-    int nh;
-    int nw;
-    int num_boxes;
-    int min, max, size;
-    int classes;
-    int background;
-    int scale;
-    float jitter;
-    float angle;
-    float aspect;
-    float saturation;
-    float exposure;
-    float hue;
-    data *d;
-    image *im;
-    image *resized;
-    data_type type;
-    tree *hierarchy;
-} load_args;
-
-typedef struct{
-    int id;
-    float x,y,w,h;
-    float left, right, top, bottom;
-} box_label;
-
-void free_data(data d);
-
-pthread_t load_data(load_args args);
-
-pthread_t load_data_in_thread(load_args args);
 
 void print_letters(float *pred, int n);
 data load_data_captcha(char **paths, int n, int m, int k, int w, int h);
 data load_data_captcha_encode(char **paths, int n, int m, int w, int h);
-data load_data_old(char **paths, int n, int m, char **labels, int k, int w, int h);
 data load_data_detection(int n, char **paths, int m, int w, int h, int boxes, int classes, float jitter, float hue, float saturation, float exposure);
 data load_data_tag(char **paths, int n, int m, int k, int min, int max, int size, float angle, float aspect, float hue, float saturation, float exposure);
-matrix load_image_augment_paths(char **paths, int n, int min, int max, int size, float angle, float aspect, float hue, float saturation, float exposure);
+matrix load_image_augment_paths(char **paths, int n, int min, int max, int size, float angle, float aspect, float hue, float saturation, float exposure, int center);
 data load_data_super(char **paths, int n, int m, int w, int h, int scale);
-data load_data_augment(char **paths, int n, int m, char **labels, int k, tree *hierarchy, int min, int max, int size, float angle, float aspect, float hue, float saturation, float exposure);
+data load_data_augment(char **paths, int n, int m, char **labels, int k, tree *hierarchy, int min, int max, int size, float angle, float aspect, float hue, float saturation, float exposure, int center);
+data load_data_regression(char **paths, int n, int m, int classes, int min, int max, int size, float angle, float aspect, float hue, float saturation, float exposure);
 data load_go(char *filename);
 
-box_label *read_boxes(char *filename, int *n);
-data load_cifar10_data(char *filename);
-data load_all_cifar10();
 
 data load_data_writing(char **paths, int n, int m, int w, int h, int out_w, int out_h);
 
-list *get_paths(char *filename);
-char **get_labels(char *filename);
 void get_random_batch(data d, int n, float *X, float *y);
 data get_data_part(data d, int part, int total);
 data get_random_data(data d, int num);
-void get_next_batch(data d, int n, int offset, float *X, float *y);
 data load_categorical_data_csv(char *filename, int target, int k);
 void normalize_data_rows(data d);
 void scale_data_rows(data d, float s);
 void translate_data_rows(data d, float s);
 void randomize_data(data d);
 data *split_data(data d, int part, int total);
-data concat_data(data d1, data d2);
 data concat_datas(data *d, int n);
 void fill_truth(char *path, char **labels, int k, float *truth);
 
diff --git a/image.darknet/src/deconvolutional_kernels.cu b/image.darknet/src/deconvolutional_kernels.cu
index d6259fb..8267dcf 100644
--- a/image.darknet/src/deconvolutional_kernels.cu
+++ b/image.darknet/src/deconvolutional_kernels.cu
@@ -5,6 +5,7 @@
 extern "C" {
 #include "convolutional_layer.h"
 #include "deconvolutional_layer.h"
+#include "batchnorm_layer.h"
 #include "gemm.h"
 #include "blas.h"
 #include "im2col.h"
@@ -13,97 +14,126 @@ extern "C" {
 #include "cuda.h"
 }
 
-extern "C" void forward_deconvolutional_layer_gpu(deconvolutional_layer layer, network_state state)
+extern "C" void forward_deconvolutional_layer_gpu(layer l, network net)
 {
     int i;
-    int out_h = deconvolutional_out_height(layer);
-    int out_w = deconvolutional_out_width(layer);
-    int size = out_h*out_w;
 
-    int m = layer.size*layer.size*layer.n;
-    int n = layer.h*layer.w;
-    int k = layer.c;
+    int m = l.size*l.size*l.n;
+    int n = l.h*l.w;
+    int k = l.c;
 
-    fill_ongpu(layer.outputs*layer.batch, 0, layer.output_gpu, 1);
+    fill_gpu(l.outputs*l.batch, 0, l.output_gpu, 1);
 
-    for(i = 0; i < layer.batch; ++i){
-        float *a = layer.weights_gpu;
-        float *b = state.input + i*layer.c*layer.h*layer.w;
-        float *c = layer.col_image_gpu;
+    for(i = 0; i < l.batch; ++i){
+        float *a = l.weights_gpu;
+        float *b = net.input_gpu + i*l.c*l.h*l.w;
+        float *c = net.workspace;
 
-        gemm_ongpu(1,0,m,n,k,1,a,m,b,n,0,c,n);
+        gemm_gpu(1,0,m,n,k,1,a,m,b,n,0,c,n);
 
-        col2im_ongpu(c, layer.n, out_h, out_w, layer.size, layer.stride, 0, layer.output_gpu+i*layer.n*size);
+        col2im_gpu(net.workspace, l.out_c, l.out_h, l.out_w, l.size, l.stride, l.pad, l.output_gpu+i*l.outputs);
     }
-    add_bias_gpu(layer.output_gpu, layer.biases_gpu, layer.batch, layer.n, size);
-    activate_array(layer.output_gpu, layer.batch*layer.n*size, layer.activation);
+    if (l.batch_normalize) {
+        forward_batchnorm_layer_gpu(l, net);
+    } else {
+        add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h);
+    }
+    activate_array_gpu(l.output_gpu, l.batch*l.n*l.out_w*l.out_h, l.activation);
 }
 
-extern "C" void backward_deconvolutional_layer_gpu(deconvolutional_layer layer, network_state state)
+extern "C" void backward_deconvolutional_layer_gpu(layer l, network net)
 {
-    float alpha = 1./layer.batch;
-    int out_h = deconvolutional_out_height(layer);
-    int out_w = deconvolutional_out_width(layer);
-    int size = out_h*out_w;
     int i;
 
-    gradient_array(layer.output_gpu, size*layer.n*layer.batch, layer.activation, layer.delta_gpu);
-    backward_bias(layer.bias_updates_gpu, layer.delta, layer.batch, layer.n, size);
+    //constrain_gpu(l.outputs*l.batch, 1, l.delta_gpu, 1);
+    gradient_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
+
+    if(l.batch_normalize){
+        backward_batchnorm_layer_gpu(l, net);
+    } else {
+        backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.n, l.out_w*l.out_h);
+    }
 
-    if(state.delta) memset(state.delta, 0, layer.batch*layer.h*layer.w*layer.c*sizeof(float));
+    //if(net.delta_gpu) memset(net.delta_gpu, 0, l.batch*l.h*l.w*l.c*sizeof(float));
 
-    for(i = 0; i < layer.batch; ++i){
-        int m = layer.c;
-        int n = layer.size*layer.size*layer.n;
-        int k = layer.h*layer.w;
+    for(i = 0; i < l.batch; ++i){
+        int m = l.c;
+        int n = l.size*l.size*l.n;
+        int k = l.h*l.w;
 
-        float *a = state.input + i*m*n;
-        float *b = layer.col_image_gpu;
-        float *c = layer.weight_updates_gpu;
+        float *a = net.input_gpu + i*m*k;
+        float *b = net.workspace;
+        float *c = l.weight_updates_gpu;
 
-        im2col_ongpu(layer.delta_gpu + i*layer.n*size, layer.n, out_h, out_w, 
-                layer.size, layer.stride, 0, b);
-        gemm_ongpu(0,1,m,n,k,alpha,a,k,b,k,1,c,n);
+        im2col_gpu(l.delta_gpu + i*l.outputs, l.out_c, l.out_h, l.out_w, 
+                l.size, l.stride, l.pad, b);
+        gemm_gpu(0,1,m,n,k,1,a,k,b,k,1,c,n);
 
-        if(state.delta){
-            int m = layer.c;
-            int n = layer.h*layer.w;
-            int k = layer.size*layer.size*layer.n;
+        if(net.delta_gpu){
+            int m = l.c;
+            int n = l.h*l.w;
+            int k = l.size*l.size*l.n;
 
-            float *a = layer.weights_gpu;
-            float *b = layer.col_image_gpu;
-            float *c = state.delta + i*n*m;
+            float *a = l.weights_gpu;
+            float *b = net.workspace;
+            float *c = net.delta_gpu + i*n*m;
 
-            gemm(0,0,m,n,k,1,a,k,b,n,1,c,n);
+            gemm_gpu(0,0,m,n,k,1,a,k,b,n,1,c,n);
         }
     }
 }
 
-extern "C" void pull_deconvolutional_layer(deconvolutional_layer layer)
+extern "C" void pull_deconvolutional_layer(layer l)
 {
-    cuda_pull_array(layer.weights_gpu, layer.weights, layer.c*layer.n*layer.size*layer.size);
-    cuda_pull_array(layer.biases_gpu, layer.biases, layer.n);
-    cuda_pull_array(layer.weight_updates_gpu, layer.weight_updates, layer.c*layer.n*layer.size*layer.size);
-    cuda_pull_array(layer.bias_updates_gpu, layer.bias_updates, layer.n);
+    cuda_pull_array(l.weights_gpu, l.weights, l.c*l.n*l.size*l.size);
+    cuda_pull_array(l.biases_gpu, l.biases, l.n);
+    cuda_pull_array(l.weight_updates_gpu, l.weight_updates, l.c*l.n*l.size*l.size);
+    cuda_pull_array(l.bias_updates_gpu, l.bias_updates, l.n);
+    if (l.batch_normalize){
+        cuda_pull_array(l.scales_gpu, l.scales, l.n);
+        cuda_pull_array(l.rolling_mean_gpu, l.rolling_mean, l.n);
+        cuda_pull_array(l.rolling_variance_gpu, l.rolling_variance, l.n);
+    }
 }
 
-extern "C" void push_deconvolutional_layer(deconvolutional_layer layer)
+extern "C" void push_deconvolutional_layer(layer l)
 {
-    cuda_push_array(layer.weights_gpu, layer.weights, layer.c*layer.n*layer.size*layer.size);
-    cuda_push_array(layer.biases_gpu, layer.biases, layer.n);
-    cuda_push_array(layer.weight_updates_gpu, layer.weight_updates, layer.c*layer.n*layer.size*layer.size);
-    cuda_push_array(layer.bias_updates_gpu, layer.bias_updates, layer.n);
+    cuda_push_array(l.weights_gpu, l.weights, l.c*l.n*l.size*l.size);
+    cuda_push_array(l.biases_gpu, l.biases, l.n);
+    cuda_push_array(l.weight_updates_gpu, l.weight_updates, l.c*l.n*l.size*l.size);
+    cuda_push_array(l.bias_updates_gpu, l.bias_updates, l.n);
+    if (l.batch_normalize){
+        cuda_push_array(l.scales_gpu, l.scales, l.n);
+        cuda_push_array(l.rolling_mean_gpu, l.rolling_mean, l.n);
+        cuda_push_array(l.rolling_variance_gpu, l.rolling_variance, l.n);
+    }
 }
 
-extern "C" void update_deconvolutional_layer_gpu(deconvolutional_layer layer, float learning_rate, float momentum, float decay)
+void update_deconvolutional_layer_gpu(layer l, update_args a)
 {
-    int size = layer.size*layer.size*layer.c*layer.n;
+    float learning_rate = a.learning_rate*l.learning_rate_scale;
+    float momentum = a.momentum;
+    float decay = a.decay;
+    int batch = a.batch;
+
+    if(a.adam){
+        adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.nweights, batch, a.t);
+        adam_update_gpu(l.biases_gpu, l.bias_updates_gpu, l.bias_m_gpu, l.bias_v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.n, batch, a.t);
+        if(l.scales_gpu){
+            adam_update_gpu(l.scales_gpu, l.scale_updates_gpu, l.scale_m_gpu, l.scale_v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.n, batch, a.t);
+        }
+    }else{
+        axpy_gpu(l.nweights, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
+        axpy_gpu(l.nweights, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
+        scal_gpu(l.nweights, momentum, l.weight_updates_gpu, 1);
 
-    axpy_ongpu(layer.n, learning_rate, layer.bias_updates_gpu, 1, layer.biases_gpu, 1);
-    scal_ongpu(layer.n, momentum, layer.bias_updates_gpu, 1);
+        axpy_gpu(l.n, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);
+        scal_gpu(l.n, momentum, l.bias_updates_gpu, 1);
 
-    axpy_ongpu(size, -decay, layer.weights_gpu, 1, layer.weight_updates_gpu, 1);
-    axpy_ongpu(size, learning_rate, layer.weight_updates_gpu, 1, layer.weights_gpu, 1);
-    scal_ongpu(size, momentum, layer.weight_updates_gpu, 1);
+        if(l.scales_gpu){
+            axpy_gpu(l.n, learning_rate/batch, l.scale_updates_gpu, 1, l.scales_gpu, 1);
+            scal_gpu(l.n, momentum, l.scale_updates_gpu, 1);
+        }
+    }
 }
 
diff --git a/image.darknet/src/deconvolutional_layer.c b/image.darknet/src/deconvolutional_layer.c
index fbef9d5..00c0e85 100644
--- a/image.darknet/src/deconvolutional_layer.c
+++ b/image.darknet/src/deconvolutional_layer.c
@@ -1,52 +1,41 @@
 #include "deconvolutional_layer.h"
 #include "convolutional_layer.h"
+#include "batchnorm_layer.h"
 #include "utils.h"
 #include "im2col.h"
 #include "col2im.h"
 #include "blas.h"
 #include "gemm.h"
+
 #include <stdio.h>
 #include <time.h>
 
-int deconvolutional_out_height(deconvolutional_layer l)
-{
-    int h = l.stride*(l.h - 1) + l.size;
-    return h;
-}
 
-int deconvolutional_out_width(deconvolutional_layer l)
-{
-    int w = l.stride*(l.w - 1) + l.size;
-    return w;
-}
-
-int deconvolutional_out_size(deconvolutional_layer l)
-{
-    return deconvolutional_out_height(l) * deconvolutional_out_width(l);
+static size_t get_workspace_size(layer l){
+    return (size_t)l.h*l.w*l.size*l.size*l.n*sizeof(float);
 }
 
-image get_deconvolutional_image(deconvolutional_layer l)
+void bilinear_init(layer l)
 {
-    int h,w,c;
-    h = deconvolutional_out_height(l);
-    w = deconvolutional_out_width(l);
-    c = l.n;
-    return float_to_image(w,h,c,l.output);
+    int i,j,f;
+    float center = (l.size-1) / 2.;
+    for(f = 0; f < l.n; ++f){
+        for(j = 0; j < l.size; ++j){
+            for(i = 0; i < l.size; ++i){
+                float val = (1 - fabs(i - center)) * (1 - fabs(j - center));
+                int c = f%l.c;
+                int ind = f*l.size*l.size*l.c + c*l.size*l.size + j*l.size + i;
+                l.weights[ind] = val;
+            }
+        }
+    }
 }
 
-image get_deconvolutional_delta(deconvolutional_layer l)
-{
-    int h,w,c;
-    h = deconvolutional_out_height(l);
-    w = deconvolutional_out_width(l);
-    c = l.n;
-    return float_to_image(w,h,c,l.delta);
-}
 
-deconvolutional_layer make_deconvolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, ACTIVATION activation)
+layer make_deconvolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int adam)
 {
     int i;
-    deconvolutional_layer l = {0};
+    layer l = {0};
     l.type = DECONVOLUTIONAL;
 
     l.h = h;
@@ -57,82 +46,182 @@ deconvolutional_layer make_deconvolutional_layer(int batch, int h, int w, int c,
     l.stride = stride;
     l.size = size;
 
+    l.nweights = c*n*size*size;
+    l.nbiases = n;
+
     l.weights = calloc(c*n*size*size, sizeof(float));
     l.weight_updates = calloc(c*n*size*size, sizeof(float));
 
     l.biases = calloc(n, sizeof(float));
     l.bias_updates = calloc(n, sizeof(float));
-    float scale = 1./sqrt(size*size*c);
+    //float scale = n/(size*size*c);
+    //printf("scale: %f\n", scale);
+    float scale = .02;
     for(i = 0; i < c*n*size*size; ++i) l.weights[i] = scale*rand_normal();
+    //bilinear_init(l);
     for(i = 0; i < n; ++i){
-        l.biases[i] = scale;
+        l.biases[i] = 0;
     }
-    int out_h = deconvolutional_out_height(l);
-    int out_w = deconvolutional_out_width(l);
+    l.pad = padding;
 
-    l.out_h = out_h;
-    l.out_w = out_w;
+    l.out_h = (l.h - 1) * l.stride + l.size - 2*l.pad;
+    l.out_w = (l.w - 1) * l.stride + l.size - 2*l.pad;
     l.out_c = n;
     l.outputs = l.out_w * l.out_h * l.out_c;
     l.inputs = l.w * l.h * l.c;
 
-    l.col_image = calloc(h*w*size*size*n, sizeof(float));
-    l.output = calloc(l.batch*out_h * out_w * n, sizeof(float));
-    l.delta  = calloc(l.batch*out_h * out_w * n, sizeof(float));
+    scal_cpu(l.nweights, (float)l.out_w*l.out_h/(l.w*l.h), l.weights, 1);
+
+    l.output = calloc(l.batch*l.outputs, sizeof(float));
+    l.delta  = calloc(l.batch*l.outputs, sizeof(float));
 
     l.forward = forward_deconvolutional_layer;
     l.backward = backward_deconvolutional_layer;
     l.update = update_deconvolutional_layer;
 
-    #ifdef GPU
-    l.weights_gpu = cuda_make_array(l.weights, c*n*size*size);
-    l.weight_updates_gpu = cuda_make_array(l.weight_updates, c*n*size*size);
+    l.batch_normalize = batch_normalize;
+
+    if(batch_normalize){
+        l.scales = calloc(n, sizeof(float));
+        l.scale_updates = calloc(n, sizeof(float));
+        for(i = 0; i < n; ++i){
+            l.scales[i] = 1;
+        }
+
+        l.mean = calloc(n, sizeof(float));
+        l.variance = calloc(n, sizeof(float));
+
+        l.mean_delta = calloc(n, sizeof(float));
+        l.variance_delta = calloc(n, sizeof(float));
+
+        l.rolling_mean = calloc(n, sizeof(float));
+        l.rolling_variance = calloc(n, sizeof(float));
+        l.x = calloc(l.batch*l.outputs, sizeof(float));
+        l.x_norm = calloc(l.batch*l.outputs, sizeof(float));
+    }
+    if(adam){
+        l.m = calloc(c*n*size*size, sizeof(float));
+        l.v = calloc(c*n*size*size, sizeof(float));
+        l.bias_m = calloc(n, sizeof(float));
+        l.scale_m = calloc(n, sizeof(float));
+        l.bias_v = calloc(n, sizeof(float));
+        l.scale_v = calloc(n, sizeof(float));
+    }
+
+#ifdef GPU
+    l.forward_gpu = forward_deconvolutional_layer_gpu;
+    l.backward_gpu = backward_deconvolutional_layer_gpu;
+    l.update_gpu = update_deconvolutional_layer_gpu;
+
+    if(gpu_index >= 0){
+
+        if (adam) {
+            l.m_gpu = cuda_make_array(l.m, c*n*size*size);
+            l.v_gpu = cuda_make_array(l.v, c*n*size*size);
+            l.bias_m_gpu = cuda_make_array(l.bias_m, n);
+            l.bias_v_gpu = cuda_make_array(l.bias_v, n);
+            l.scale_m_gpu = cuda_make_array(l.scale_m, n);
+            l.scale_v_gpu = cuda_make_array(l.scale_v, n);
+        }
+        l.weights_gpu = cuda_make_array(l.weights, c*n*size*size);
+        l.weight_updates_gpu = cuda_make_array(l.weight_updates, c*n*size*size);
+
+        l.biases_gpu = cuda_make_array(l.biases, n);
+        l.bias_updates_gpu = cuda_make_array(l.bias_updates, n);
 
-    l.biases_gpu = cuda_make_array(l.biases, n);
-    l.bias_updates_gpu = cuda_make_array(l.bias_updates, n);
+        l.delta_gpu = cuda_make_array(l.delta, l.batch*l.out_h*l.out_w*n);
+        l.output_gpu = cuda_make_array(l.output, l.batch*l.out_h*l.out_w*n);
 
-    l.col_image_gpu = cuda_make_array(l.col_image, h*w*size*size*n);
-    l.delta_gpu = cuda_make_array(l.delta, l.batch*out_h*out_w*n);
-    l.output_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);
+        if(batch_normalize){
+            l.mean_gpu = cuda_make_array(0, n);
+            l.variance_gpu = cuda_make_array(0, n);
+
+            l.rolling_mean_gpu = cuda_make_array(0, n);
+            l.rolling_variance_gpu = cuda_make_array(0, n);
+
+            l.mean_delta_gpu = cuda_make_array(0, n);
+            l.variance_delta_gpu = cuda_make_array(0, n);
+
+            l.scales_gpu = cuda_make_array(l.scales, n);
+            l.scale_updates_gpu = cuda_make_array(0, n);
+
+            l.x_gpu = cuda_make_array(0, l.batch*l.out_h*l.out_w*n);
+            l.x_norm_gpu = cuda_make_array(0, l.batch*l.out_h*l.out_w*n);
+        }
+    }
+    #ifdef CUDNN
+        cudnnCreateTensorDescriptor(&l.dstTensorDesc);
+        cudnnCreateTensorDescriptor(&l.normTensorDesc);
+        cudnnSetTensor4dDescriptor(l.dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l.batch, l.out_c, l.out_h, l.out_w); 
+        cudnnSetTensor4dDescriptor(l.normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l.out_c, 1, 1); 
     #endif
+#endif
 
     l.activation = activation;
+    l.workspace_size = get_workspace_size(l);
 
-    fprintf(stderr, "Deconvolutional Layer: %d x %d x %d image, %d filters -> %d x %d x %d image\n", h,w,c,n, out_h, out_w, n);
+    fprintf(stderr, "deconv%5d %2d x%2d /%2d  %4d x%4d x%4d   ->  %4d x%4d x%4d\n", n, size, size, stride, w, h, c, l.out_w, l.out_h, l.out_c);
 
     return l;
 }
 
-void resize_deconvolutional_layer(deconvolutional_layer *l, int h, int w)
+void denormalize_deconvolutional_layer(layer l)
+{
+    int i, j;
+    for(i = 0; i < l.n; ++i){
+        float scale = l.scales[i]/sqrt(l.rolling_variance[i] + .00001);
+        for(j = 0; j < l.c*l.size*l.size; ++j){
+            l.weights[i*l.c*l.size*l.size + j] *= scale;
+        }
+        l.biases[i] -= l.rolling_mean[i] * scale;
+        l.scales[i] = 1;
+        l.rolling_mean[i] = 0;
+        l.rolling_variance[i] = 1;
+    }
+}
+
+void resize_deconvolutional_layer(layer *l, int h, int w)
 {
     l->h = h;
     l->w = w;
-    int out_h = deconvolutional_out_height(*l);
-    int out_w = deconvolutional_out_width(*l);
-
-    l->col_image = realloc(l->col_image,
-                                out_h*out_w*l->size*l->size*l->c*sizeof(float));
-    l->output = realloc(l->output,
-                                l->batch*out_h * out_w * l->n*sizeof(float));
-    l->delta  = realloc(l->delta,
-                                l->batch*out_h * out_w * l->n*sizeof(float));
-    #ifdef GPU
-    cuda_free(l->col_image_gpu);
+    l->out_h = (l->h - 1) * l->stride + l->size - 2*l->pad;
+    l->out_w = (l->w - 1) * l->stride + l->size - 2*l->pad;
+
+    l->outputs = l->out_h * l->out_w * l->out_c;
+    l->inputs = l->w * l->h * l->c;
+
+    l->output = realloc(l->output, l->batch*l->outputs*sizeof(float));
+    l->delta  = realloc(l->delta,  l->batch*l->outputs*sizeof(float));
+    if(l->batch_normalize){
+        l->x = realloc(l->x, l->batch*l->outputs*sizeof(float));
+        l->x_norm  = realloc(l->x_norm, l->batch*l->outputs*sizeof(float));
+    }
+
+#ifdef GPU
     cuda_free(l->delta_gpu);
     cuda_free(l->output_gpu);
 
-    l->col_image_gpu = cuda_make_array(l->col_image, out_h*out_w*l->size*l->size*l->c);
-    l->delta_gpu = cuda_make_array(l->delta, l->batch*out_h*out_w*l->n);
-    l->output_gpu = cuda_make_array(l->output, l->batch*out_h*out_w*l->n);
+    l->delta_gpu =  cuda_make_array(l->delta,  l->batch*l->outputs);
+    l->output_gpu = cuda_make_array(l->output, l->batch*l->outputs);
+
+    if(l->batch_normalize){
+        cuda_free(l->x_gpu);
+        cuda_free(l->x_norm_gpu);
+
+        l->x_gpu = cuda_make_array(l->output, l->batch*l->outputs);
+        l->x_norm_gpu = cuda_make_array(l->output, l->batch*l->outputs);
+    }
+    #ifdef CUDNN
+        cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w); 
+        cudnnSetTensor4dDescriptor(l->normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l->out_c, 1, 1); 
     #endif
+#endif
+    l->workspace_size = get_workspace_size(*l);
 }
 
-void forward_deconvolutional_layer(const deconvolutional_layer l, network_state state)
+void forward_deconvolutional_layer(const layer l, network net)
 {
     int i;
-    int out_h = deconvolutional_out_height(l);
-    int out_w = deconvolutional_out_width(l);
-    int size = out_h*out_w;
 
     int m = l.size*l.size*l.n;
     int n = l.h*l.w;
@@ -142,63 +231,80 @@ void forward_deconvolutional_layer(const deconvolutional_layer l, network_state
 
     for(i = 0; i < l.batch; ++i){
         float *a = l.weights;
-        float *b = state.input + i*l.c*l.h*l.w;
-        float *c = l.col_image;
+        float *b = net.input + i*l.c*l.h*l.w;
+        float *c = net.workspace;
 
-        gemm(1,0,m,n,k,1,a,m,b,n,0,c,n);
+        gemm_cpu(1,0,m,n,k,1,a,m,b,n,0,c,n);
 
-        col2im_cpu(c, l.n, out_h, out_w, l.size, l.stride, 0, l.output+i*l.n*size);
+        col2im_cpu(net.workspace, l.out_c, l.out_h, l.out_w, l.size, l.stride, l.pad, l.output+i*l.outputs);
+    }
+    if (l.batch_normalize) {
+        forward_batchnorm_layer(l, net);
+    } else {
+        add_bias(l.output, l.biases, l.batch, l.n, l.out_w*l.out_h);
     }
-    add_bias(l.output, l.biases, l.batch, l.n, size);
-    activate_array(l.output, l.batch*l.n*size, l.activation);
+    activate_array(l.output, l.batch*l.n*l.out_w*l.out_h, l.activation);
 }
 
-void backward_deconvolutional_layer(deconvolutional_layer l, network_state state)
+void backward_deconvolutional_layer(layer l, network net)
 {
-    float alpha = 1./l.batch;
-    int out_h = deconvolutional_out_height(l);
-    int out_w = deconvolutional_out_width(l);
-    int size = out_h*out_w;
     int i;
 
-    gradient_array(l.output, size*l.n*l.batch, l.activation, l.delta);
-    backward_bias(l.bias_updates, l.delta, l.batch, l.n, size);
+    gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);
+
+    if(l.batch_normalize){
+        backward_batchnorm_layer(l, net);
+    } else {
+        backward_bias(l.bias_updates, l.delta, l.batch, l.n, l.out_w*l.out_h);
+    }
+
+    //if(net.delta) memset(net.delta, 0, l.batch*l.h*l.w*l.c*sizeof(float));
 
     for(i = 0; i < l.batch; ++i){
         int m = l.c;
         int n = l.size*l.size*l.n;
         int k = l.h*l.w;
 
-        float *a = state.input + i*m*n;
-        float *b = l.col_image;
+        float *a = net.input + i*m*k;
+        float *b = net.workspace;
         float *c = l.weight_updates;
 
-        im2col_cpu(l.delta + i*l.n*size, l.n, out_h, out_w, 
-                l.size, l.stride, 0, b);
-        gemm(0,1,m,n,k,alpha,a,k,b,k,1,c,n);
+        im2col_cpu(l.delta + i*l.outputs, l.out_c, l.out_h, l.out_w, 
+                l.size, l.stride, l.pad, b);
+        gemm_cpu(0,1,m,n,k,1,a,k,b,k,1,c,n);
 
-        if(state.delta){
+        if(net.delta){
             int m = l.c;
             int n = l.h*l.w;
             int k = l.size*l.size*l.n;
 
             float *a = l.weights;
-            float *b = l.col_image;
-            float *c = state.delta + i*n*m;
+            float *b = net.workspace;
+            float *c = net.delta + i*n*m;
 
-            gemm(0,0,m,n,k,1,a,k,b,n,1,c,n);
+            gemm_cpu(0,0,m,n,k,1,a,k,b,n,1,c,n);
         }
     }
 }
 
-void update_deconvolutional_layer(deconvolutional_layer l, float learning_rate, float momentum, float decay)
+void update_deconvolutional_layer(layer l, update_args a)
 {
+    float learning_rate = a.learning_rate*l.learning_rate_scale;
+    float momentum = a.momentum;
+    float decay = a.decay;
+    int batch = a.batch;
+
     int size = l.size*l.size*l.c*l.n;
-    axpy_cpu(l.n, learning_rate, l.bias_updates, 1, l.biases, 1);
+    axpy_cpu(l.n, learning_rate/batch, l.bias_updates, 1, l.biases, 1);
     scal_cpu(l.n, momentum, l.bias_updates, 1);
 
-    axpy_cpu(size, -decay, l.weights, 1, l.weight_updates, 1);
-    axpy_cpu(size, learning_rate, l.weight_updates, 1, l.weights, 1);
+    if(l.scales){
+        axpy_cpu(l.n, learning_rate/batch, l.scale_updates, 1, l.scales, 1);
+        scal_cpu(l.n, momentum, l.scale_updates, 1);
+    }
+
+    axpy_cpu(size, -decay*batch, l.weights, 1, l.weight_updates, 1);
+    axpy_cpu(size, learning_rate/batch, l.weight_updates, 1, l.weights, 1);
     scal_cpu(size, momentum, l.weight_updates, 1);
 }
 
diff --git a/image.darknet/src/deconvolutional_layer.h b/image.darknet/src/deconvolutional_layer.h
index 2d36e02..b254fb9 100644
--- a/image.darknet/src/deconvolutional_layer.h
+++ b/image.darknet/src/deconvolutional_layer.h
@@ -7,28 +7,19 @@
 #include "layer.h"
 #include "network.h"
 
-typedef layer deconvolutional_layer;
-
 #ifdef GPU
-void forward_deconvolutional_layer_gpu(deconvolutional_layer layer, network_state state);
-void backward_deconvolutional_layer_gpu(deconvolutional_layer layer, network_state state);
-void update_deconvolutional_layer_gpu(deconvolutional_layer layer, float learning_rate, float momentum, float decay);
-void push_deconvolutional_layer(deconvolutional_layer layer);
-void pull_deconvolutional_layer(deconvolutional_layer layer);
+void forward_deconvolutional_layer_gpu(layer l, network net);
+void backward_deconvolutional_layer_gpu(layer l, network net);
+void update_deconvolutional_layer_gpu(layer l, update_args a);
+void push_deconvolutional_layer(layer l);
+void pull_deconvolutional_layer(layer l);
 #endif
 
-deconvolutional_layer make_deconvolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, ACTIVATION activation);
-void resize_deconvolutional_layer(deconvolutional_layer *layer, int h, int w);
-void forward_deconvolutional_layer(const deconvolutional_layer layer, network_state state);
-void update_deconvolutional_layer(deconvolutional_layer layer, float learning_rate, float momentum, float decay);
-void backward_deconvolutional_layer(deconvolutional_layer layer, network_state state);
-
-image get_deconvolutional_image(deconvolutional_layer layer);
-image get_deconvolutional_delta(deconvolutional_layer layer);
-image get_deconvolutional_filter(deconvolutional_layer layer, int i);
-
-int deconvolutional_out_height(deconvolutional_layer layer);
-int deconvolutional_out_width(deconvolutional_layer layer);
+layer make_deconvolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int adam);
+void resize_deconvolutional_layer(layer *l, int h, int w);
+void forward_deconvolutional_layer(const layer l, network net);
+void update_deconvolutional_layer(layer l, update_args a);
+void backward_deconvolutional_layer(layer l, network net);
 
 #endif
 
diff --git a/image.darknet/src/demo.c b/image.darknet/src/demo.c
index 7818bc3..b89efb8 100644
--- a/image.darknet/src/demo.c
+++ b/image.darknet/src/demo.c
@@ -9,213 +9,339 @@
 #include "demo.h"
 #include <sys/time.h>
 
-#define FRAMES 3
+#define DEMO 1
 
 #ifdef OPENCV
-#include "opencv2/highgui/highgui_c.h"
-#include "opencv2/imgproc/imgproc_c.h"
-image get_image_from_stream(CvCapture *cap);
 
 static char **demo_names;
 static image **demo_alphabet;
 static int demo_classes;
 
-static float **probs;
-static box *boxes;
-static network net;
-static image in   ;
-static image in_s ;
-static image det  ;
-static image det_s;
-static image disp = {0};
-static CvCapture * cap;
+static network *net;
+static image buff [3];
+static image buff_letter[3];
+static int buff_index = 0;
+static void * cap;
 static float fps = 0;
 static float demo_thresh = 0;
-static float demo_hier_thresh = .5;
+static float demo_hier = .5;
+static int running = 0;
 
-static float *predictions[FRAMES];
+static int demo_frame = 3;
 static int demo_index = 0;
-static image images[FRAMES];
+static float **predictions;
 static float *avg;
+static int demo_done = 0;
+static int demo_total = 0;
+double demo_time;
 
-void *fetch_in_thread(void *ptr)
+detection *get_network_boxes(network *net, int w, int h, float thresh, float hier, int *map, int relative, int *num);
+
+int size_network(network *net)
 {
-    in = get_image_from_stream(cap);
-    if(!in.data){
-        error("Stream closed.");
+    int i;
+    int count = 0;
+    for(i = 0; i < net->n; ++i){
+        layer l = net->layers[i];
+        if(l.type == YOLO || l.type == REGION || l.type == DETECTION){
+            count += l.outputs;
+        }
     }
-    in_s = resize_image(in, net.w, net.h);
-    return 0;
+    return count;
+}
+
+void remember_network(network *net)
+{
+    int i;
+    int count = 0;
+    for(i = 0; i < net->n; ++i){
+        layer l = net->layers[i];
+        if(l.type == YOLO || l.type == REGION || l.type == DETECTION){
+            memcpy(predictions[demo_index] + count, net->layers[i].output, sizeof(float) * l.outputs);
+            count += l.outputs;
+        }
+    }
+}
+
+detection *avg_predictions(network *net, int *nboxes)
+{
+    int i, j;
+    int count = 0;
+    fill_cpu(demo_total, 0, avg, 1);
+    for(j = 0; j < demo_frame; ++j){
+        axpy_cpu(demo_total, 1./demo_frame, predictions[j], 1, avg, 1);
+    }
+    for(i = 0; i < net->n; ++i){
+        layer l = net->layers[i];
+        if(l.type == YOLO || l.type == REGION || l.type == DETECTION){
+            memcpy(l.output, avg + count, sizeof(float) * l.outputs);
+            count += l.outputs;
+        }
+    }
+    detection *dets = get_network_boxes(net, buff[0].w, buff[0].h, demo_thresh, demo_hier, 0, 1, nboxes);
+    return dets;
 }
 
 void *detect_in_thread(void *ptr)
 {
+    running = 1;
     float nms = .4;
 
-    layer l = net.layers[net.n-1];
-    float *X = det_s.data;
-    float *prediction = network_predict(net, X);
-
-    memcpy(predictions[demo_index], prediction, l.outputs*sizeof(float));
-    mean_arrays(predictions, FRAMES, l.outputs, avg);
-    l.output = avg;
-
-    free_image(det_s);
-    if(l.type == DETECTION){
-        get_detection_boxes(l, 1, 1, demo_thresh, probs, boxes, 0);
-    } else if (l.type == REGION){
-        get_region_boxes(l, 1, 1, demo_thresh, probs, boxes, 0, 0, demo_hier_thresh);
-    } else {
-        error("Last layer must produce detections\n");
+    layer l = net->layers[net->n-1];
+    float *X = buff_letter[(buff_index+2)%3].data;
+    network_predict(net, X);
+
+    /*
+       if(l.type == DETECTION){
+       get_detection_boxes(l, 1, 1, demo_thresh, probs, boxes, 0);
+       } else */
+    remember_network(net);
+    detection *dets = 0;
+    int nboxes = 0;
+    dets = avg_predictions(net, &nboxes);
+
+
+    /*
+       int i,j;
+       box zero = {0};
+       int classes = l.classes;
+       for(i = 0; i < demo_detections; ++i){
+       avg[i].objectness = 0;
+       avg[i].bbox = zero;
+       memset(avg[i].prob, 0, classes*sizeof(float));
+       for(j = 0; j < demo_frame; ++j){
+       axpy_cpu(classes, 1./demo_frame, dets[j][i].prob, 1, avg[i].prob, 1);
+       avg[i].objectness += dets[j][i].objectness * 1./demo_frame;
+       avg[i].bbox.x += dets[j][i].bbox.x * 1./demo_frame;
+       avg[i].bbox.y += dets[j][i].bbox.y * 1./demo_frame;
+       avg[i].bbox.w += dets[j][i].bbox.w * 1./demo_frame;
+       avg[i].bbox.h += dets[j][i].bbox.h * 1./demo_frame;
+       }
+    //copy_cpu(classes, dets[0][i].prob, 1, avg[i].prob, 1);
+    //avg[i].objectness = dets[0][i].objectness;
     }
-    if (nms > 0) do_nms(boxes, probs, l.w*l.h*l.n, l.classes, nms);
+     */
+
+    if (nms > 0) do_nms_obj(dets, nboxes, l.classes, nms);
+
     printf("\033[2J");
     printf("\033[1;1H");
     printf("\nFPS:%.1f\n",fps);
     printf("Objects:\n\n");
+    image display = buff[(buff_index+2) % 3];
+    draw_detections(display, dets, nboxes, demo_thresh, demo_names, demo_alphabet, demo_classes);
+    free_detections(dets, nboxes);
 
-    images[demo_index] = det;
-    det = images[(demo_index + FRAMES/2 + 1)%FRAMES];
-    demo_index = (demo_index + 1)%FRAMES;
-
-    draw_detections(det, l.w*l.h*l.n, demo_thresh, boxes, probs, demo_names, demo_alphabet, demo_classes);
+    demo_index = (demo_index + 1)%demo_frame;
+    running = 0;
+    return 0;
+}
 
+void *fetch_in_thread(void *ptr)
+{
+    free_image(buff[buff_index]);
+    buff[buff_index] = get_image_from_stream(cap);
+    if(buff[buff_index].data == 0) {
+        demo_done = 1;
+        return 0;
+    }
+    letterbox_image_into(buff[buff_index], net->w, net->h, buff_letter[buff_index]);
     return 0;
 }
 
-double get_wall_time()
+void *display_in_thread(void *ptr)
 {
-    struct timeval time;
-    if (gettimeofday(&time,NULL)){
+    int c = show_image(buff[(buff_index + 1)%3], "Demo", 1);
+    if (c != -1) c = c%256;
+    if (c == 27) {
+        demo_done = 1;
         return 0;
+    } else if (c == 82) {
+        demo_thresh += .02;
+    } else if (c == 84) {
+        demo_thresh -= .02;
+        if(demo_thresh <= .02) demo_thresh = .02;
+    } else if (c == 83) {
+        demo_hier += .02;
+    } else if (c == 81) {
+        demo_hier -= .02;
+        if(demo_hier <= .0) demo_hier = .0;
     }
-    return (double)time.tv_sec + (double)time.tv_usec * .000001;
+    return 0;
 }
 
-void demo(char *cfgfile, char *weightfile, float thresh, int cam_index, const char *filename, char **names, int classes, int frame_skip, char *prefix, float hier_thresh)
+void *display_loop(void *ptr)
 {
-    //skip = frame_skip;
+    while(1){
+        display_in_thread(0);
+    }
+}
+
+void *detect_loop(void *ptr)
+{
+    while(1){
+        detect_in_thread(0);
+    }
+}
+
+void demo(char *cfgfile, char *weightfile, float thresh, int cam_index, const char *filename, char **names, int classes, int delay, char *prefix, int avg_frames, float hier, int w, int h, int frames, int fullscreen)
+{
+    //demo_frame = avg_frames;
     image **alphabet = load_alphabet();
-    int delay = frame_skip;
     demo_names = names;
     demo_alphabet = alphabet;
     demo_classes = classes;
     demo_thresh = thresh;
-    demo_hier_thresh = hier_thresh;
+    demo_hier = hier;
     printf("Demo\n");
-    net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    set_batch_network(&net, 1);
+    net = load_network(cfgfile, weightfile, 0);
+    set_batch_network(net, 1);
+    pthread_t detect_thread;
+    pthread_t fetch_thread;
 
     srand(2222222);
 
+    int i;
+    demo_total = size_network(net);
+    predictions = calloc(demo_frame, sizeof(float*));
+    for (i = 0; i < demo_frame; ++i){
+        predictions[i] = calloc(demo_total, sizeof(float));
+    }
+    avg = calloc(demo_total, sizeof(float));
+
     if(filename){
         printf("video file: %s\n", filename);
-        cap = cvCaptureFromFile(filename);
+        cap = open_video_stream(filename, 0, 0, 0, 0);
     }else{
-        cap = cvCaptureFromCAM(cam_index);
+        cap = open_video_stream(0, cam_index, w, h, frames);
     }
 
     if(!cap) error("Couldn't connect to webcam.\n");
 
-    layer l = net.layers[net.n-1];
-    int j;
-
-    avg = (float *) calloc(l.outputs, sizeof(float));
-    for(j = 0; j < FRAMES; ++j) predictions[j] = (float *) calloc(l.outputs, sizeof(float));
-    for(j = 0; j < FRAMES; ++j) images[j] = make_image(1,1,3);
-
-    boxes = (box *)calloc(l.w*l.h*l.n, sizeof(box));
-    probs = (float **)calloc(l.w*l.h*l.n, sizeof(float *));
-    for(j = 0; j < l.w*l.h*l.n; ++j) probs[j] = (float *)calloc(l.classes, sizeof(float));
-
-    pthread_t fetch_thread;
-    pthread_t detect_thread;
-
-    fetch_in_thread(0);
-    det = in;
-    det_s = in_s;
-
-    fetch_in_thread(0);
-    detect_in_thread(0);
-    disp = det;
-    det = in;
-    det_s = in_s;
-
-    for(j = 0; j < FRAMES/2; ++j){
-        fetch_in_thread(0);
-        detect_in_thread(0);
-        disp = det;
-        det = in;
-        det_s = in_s;
-    }
+    buff[0] = get_image_from_stream(cap);
+    buff[1] = copy_image(buff[0]);
+    buff[2] = copy_image(buff[0]);
+    buff_letter[0] = letterbox_image(buff[0], net->w, net->h);
+    buff_letter[1] = letterbox_image(buff[0], net->w, net->h);
+    buff_letter[2] = letterbox_image(buff[0], net->w, net->h);
 
     int count = 0;
     if(!prefix){
-        cvNamedWindow("Demo", CV_WINDOW_NORMAL); 
-        cvMoveWindow("Demo", 0, 0);
-        cvResizeWindow("Demo", 1352, 1013);
+        make_window("Demo", 1352, 1013, fullscreen);
     }
 
-    double before = get_wall_time();
-
-    while(1){
-        ++count;
-        if(1){
-            if(pthread_create(&fetch_thread, 0, fetch_in_thread, 0)) error("Thread creation failed");
-            if(pthread_create(&detect_thread, 0, detect_in_thread, 0)) error("Thread creation failed");
-
-            if(!prefix){
-                show_image(disp, "Demo");
-                int c = cvWaitKey(1);
-                if (c == 10){
-                    if(frame_skip == 0) frame_skip = 60;
-                    else if(frame_skip == 4) frame_skip = 0;
-                    else if(frame_skip == 60) frame_skip = 4;   
-                    else frame_skip = 0;
-                }
-            }else{
-                char buff[256];
-                sprintf(buff, "%s_%08d", prefix, count);
-                save_image(disp, buff);
-            }
-
-            pthread_join(fetch_thread, 0);
-            pthread_join(detect_thread, 0);
-
-            if(delay == 0){
-                free_image(disp);
-                disp  = det;
-            }
-            det   = in;
-            det_s = in_s;
-        }else {
-            fetch_in_thread(0);
-            det   = in;
-            det_s = in_s;
-            detect_in_thread(0);
-            if(delay == 0) {
-                free_image(disp);
-                disp = det;
-            }
-            show_image(disp, "Demo");
-            cvWaitKey(1);
-        }
-        --delay;
-        if(delay < 0){
-            delay = frame_skip;
-
-            double after = get_wall_time();
-            float curr = 1./(after - before);
-            fps = curr;
-            before = after;
+    demo_time = what_time_is_it_now();
+
+    while(!demo_done){
+        buff_index = (buff_index + 1) %3;
+        if(pthread_create(&fetch_thread, 0, fetch_in_thread, 0)) error("Thread creation failed");
+        if(pthread_create(&detect_thread, 0, detect_in_thread, 0)) error("Thread creation failed");
+        if(!prefix){
+            fps = 1./(what_time_is_it_now() - demo_time);
+            demo_time = what_time_is_it_now();
+            display_in_thread(0);
+        }else{
+            char name[256];
+            sprintf(name, "%s_%08d", prefix, count);
+            save_image(buff[(buff_index + 1)%3], name);
         }
+        pthread_join(fetch_thread, 0);
+        pthread_join(detect_thread, 0);
+        ++count;
     }
 }
+
+/*
+   void demo_compare(char *cfg1, char *weight1, char *cfg2, char *weight2, float thresh, int cam_index, const char *filename, char **names, int classes, int delay, char *prefix, int avg_frames, float hier, int w, int h, int frames, int fullscreen)
+   {
+   demo_frame = avg_frames;
+   predictions = calloc(demo_frame, sizeof(float*));
+   image **alphabet = load_alphabet();
+   demo_names = names;
+   demo_alphabet = alphabet;
+   demo_classes = classes;
+   demo_thresh = thresh;
+   demo_hier = hier;
+   printf("Demo\n");
+   net = load_network(cfg1, weight1, 0);
+   set_batch_network(net, 1);
+   pthread_t detect_thread;
+   pthread_t fetch_thread;
+
+   srand(2222222);
+
+   if(filename){
+   printf("video file: %s\n", filename);
+   cap = cvCaptureFromFile(filename);
+   }else{
+   cap = cvCaptureFromCAM(cam_index);
+
+   if(w){
+   cvSetCaptureProperty(cap, CV_CAP_PROP_FRAME_WIDTH, w);
+   }
+   if(h){
+   cvSetCaptureProperty(cap, CV_CAP_PROP_FRAME_HEIGHT, h);
+   }
+   if(frames){
+   cvSetCaptureProperty(cap, CV_CAP_PROP_FPS, frames);
+   }
+   }
+
+   if(!cap) error("Couldn't connect to webcam.\n");
+
+   layer l = net->layers[net->n-1];
+   demo_detections = l.n*l.w*l.h;
+   int j;
+
+   avg = (float *) calloc(l.outputs, sizeof(float));
+   for(j = 0; j < demo_frame; ++j) predictions[j] = (float *) calloc(l.outputs, sizeof(float));
+
+   boxes = (box *)calloc(l.w*l.h*l.n, sizeof(box));
+   probs = (float **)calloc(l.w*l.h*l.n, sizeof(float *));
+   for(j = 0; j < l.w*l.h*l.n; ++j) probs[j] = (float *)calloc(l.classes+1, sizeof(float));
+
+   buff[0] = get_image_from_stream(cap);
+   buff[1] = copy_image(buff[0]);
+   buff[2] = copy_image(buff[0]);
+   buff_letter[0] = letterbox_image(buff[0], net->w, net->h);
+   buff_letter[1] = letterbox_image(buff[0], net->w, net->h);
+   buff_letter[2] = letterbox_image(buff[0], net->w, net->h);
+   ipl = cvCreateImage(cvSize(buff[0].w,buff[0].h), IPL_DEPTH_8U, buff[0].c);
+
+   int count = 0;
+   if(!prefix){
+   cvNamedWindow("Demo", CV_WINDOW_NORMAL); 
+   if(fullscreen){
+   cvSetWindowProperty("Demo", CV_WND_PROP_FULLSCREEN, CV_WINDOW_FULLSCREEN);
+   } else {
+   cvMoveWindow("Demo", 0, 0);
+   cvResizeWindow("Demo", 1352, 1013);
+   }
+   }
+
+   demo_time = what_time_is_it_now();
+
+   while(!demo_done){
+buff_index = (buff_index + 1) %3;
+if(pthread_create(&fetch_thread, 0, fetch_in_thread, 0)) error("Thread creation failed");
+if(pthread_create(&detect_thread, 0, detect_in_thread, 0)) error("Thread creation failed");
+if(!prefix){
+    fps = 1./(what_time_is_it_now() - demo_time);
+    demo_time = what_time_is_it_now();
+    display_in_thread(0);
+}else{
+    char name[256];
+    sprintf(name, "%s_%08d", prefix, count);
+    save_image(buff[(buff_index + 1)%3], name);
+}
+pthread_join(fetch_thread, 0);
+pthread_join(detect_thread, 0);
+++count;
+}
+}
+*/
 #else
-void demo(char *cfgfile, char *weightfile, float thresh, int cam_index, const char *filename, char **names, int classes, int frame_skip, char *prefix, float hier_thresh)
+void demo(char *cfgfile, char *weightfile, float thresh, int cam_index, const char *filename, char **names, int classes, int delay, char *prefix, int avg, float hier, int w, int h, int frames, int fullscreen)
 {
     fprintf(stderr, "Demo needs OpenCV for webcam images.\n");
 }
diff --git a/image.darknet/src/demo.h b/image.darknet/src/demo.h
index c3d6a61..86e4654 100644
--- a/image.darknet/src/demo.h
+++ b/image.darknet/src/demo.h
@@ -1,7 +1,6 @@
-#ifndef DEMO
-#define DEMO
+#ifndef DEMO_H
+#define DEMO_H
 
 #include "image.h"
-void demo(char *cfgfile, char *weightfile, float thresh, int cam_index, const char *filename, char **names, int classes, int frame_skip, char *prefix, float hier_thresh);
 
 #endif
diff --git a/image.darknet/src/detection_layer.c b/image.darknet/src/detection_layer.c
index cd98b4b..d0e0194 100644
--- a/image.darknet/src/detection_layer.c
+++ b/image.darknet/src/detection_layer.c
@@ -5,6 +5,7 @@
 #include "box.h"
 #include "cuda.h"
 #include "utils.h"
+
 #include <stdio.h>
 #include <assert.h>
 #include <string.h>
@@ -46,11 +47,11 @@ detection_layer make_detection_layer(int batch, int inputs, int n, int side, int
     return l;
 }
 
-void forward_detection_layer(const detection_layer l, network_state state)
+void forward_detection_layer(const detection_layer l, network net)
 {
     int locations = l.side*l.side;
     int i,j;
-    memcpy(l.output, state.input, l.outputs*l.batch*sizeof(float));
+    memcpy(l.output, net.input, l.outputs*l.batch*sizeof(float));
     //if(l.reorg) reorg(l.output, l.w*l.h, size*l.n, l.batch, 1);
     int b;
     if (l.softmax){
@@ -58,12 +59,12 @@ void forward_detection_layer(const detection_layer l, network_state state)
             int index = b*l.inputs;
             for (i = 0; i < locations; ++i) {
                 int offset = i*l.classes;
-                softmax(l.output + index + offset, l.classes, 1,
+                softmax(l.output + index + offset, l.classes, 1, 1,
                         l.output + index + offset);
             }
         }
     }
-    if(state.train){
+    if(net.train){
         float avg_iou = 0;
         float avg_cat = 0;
         float avg_allcat = 0;
@@ -77,7 +78,7 @@ void forward_detection_layer(const detection_layer l, network_state state)
             int index = b*l.inputs;
             for (i = 0; i < locations; ++i) {
                 int truth_index = (b*locations + i)*(1+l.coords+l.classes);
-                int is_obj = state.truth[truth_index];
+                int is_obj = net.truth[truth_index];
                 for (j = 0; j < l.n; ++j) {
                     int p_index = index + locations*l.classes + i*l.n + j;
                     l.delta[p_index] = l.noobject_scale*(0 - l.output[p_index]);
@@ -95,19 +96,19 @@ void forward_detection_layer(const detection_layer l, network_state state)
 
                 int class_index = index + i*l.classes;
                 for(j = 0; j < l.classes; ++j) {
-                    l.delta[class_index+j] = l.class_scale * (state.truth[truth_index+1+j] - l.output[class_index+j]);
-                    *(l.cost) += l.class_scale * pow(state.truth[truth_index+1+j] - l.output[class_index+j], 2);
-                    if(state.truth[truth_index + 1 + j]) avg_cat += l.output[class_index+j];
+                    l.delta[class_index+j] = l.class_scale * (net.truth[truth_index+1+j] - l.output[class_index+j]);
+                    *(l.cost) += l.class_scale * pow(net.truth[truth_index+1+j] - l.output[class_index+j], 2);
+                    if(net.truth[truth_index + 1 + j]) avg_cat += l.output[class_index+j];
                     avg_allcat += l.output[class_index+j];
                 }
 
-                box truth = float_to_box(state.truth + truth_index + 1 + l.classes);
+                box truth = float_to_box(net.truth + truth_index + 1 + l.classes, 1);
                 truth.x /= l.side;
                 truth.y /= l.side;
 
                 for(j = 0; j < l.n; ++j){
                     int box_index = index + locations*(l.classes + l.n) + (i*l.n + j) * l.coords;
-                    box out = float_to_box(l.output + box_index);
+                    box out = float_to_box(l.output + box_index, 1);
                     out.x /= l.side;
                     out.y /= l.side;
 
@@ -139,14 +140,14 @@ void forward_detection_layer(const detection_layer l, network_state state)
                         best_index = 0;
                     }
                 }
-                if(l.random && *(state.net.seen) < 64000){
+                if(l.random && *(net.seen) < 64000){
                     best_index = rand()%l.n;
                 }
 
                 int box_index = index + locations*(l.classes + l.n) + (i*l.n + best_index) * l.coords;
                 int tbox_index = truth_index + 1 + l.classes;
 
-                box out = float_to_box(l.output + box_index);
+                box out = float_to_box(l.output + box_index, 1);
                 out.x /= l.side;
                 out.y /= l.side;
                 if (l.sqrt) {
@@ -166,13 +167,13 @@ void forward_detection_layer(const detection_layer l, network_state state)
                     l.delta[p_index] = l.object_scale * (iou - l.output[p_index]);
                 }
 
-                l.delta[box_index+0] = l.coord_scale*(state.truth[tbox_index + 0] - l.output[box_index + 0]);
-                l.delta[box_index+1] = l.coord_scale*(state.truth[tbox_index + 1] - l.output[box_index + 1]);
-                l.delta[box_index+2] = l.coord_scale*(state.truth[tbox_index + 2] - l.output[box_index + 2]);
-                l.delta[box_index+3] = l.coord_scale*(state.truth[tbox_index + 3] - l.output[box_index + 3]);
+                l.delta[box_index+0] = l.coord_scale*(net.truth[tbox_index + 0] - l.output[box_index + 0]);
+                l.delta[box_index+1] = l.coord_scale*(net.truth[tbox_index + 1] - l.output[box_index + 1]);
+                l.delta[box_index+2] = l.coord_scale*(net.truth[tbox_index + 2] - l.output[box_index + 2]);
+                l.delta[box_index+3] = l.coord_scale*(net.truth[tbox_index + 3] - l.output[box_index + 3]);
                 if(l.sqrt){
-                    l.delta[box_index+2] = l.coord_scale*(sqrt(state.truth[tbox_index + 2]) - l.output[box_index + 2]);
-                    l.delta[box_index+3] = l.coord_scale*(sqrt(state.truth[tbox_index + 3]) - l.output[box_index + 3]);
+                    l.delta[box_index+2] = l.coord_scale*(sqrt(net.truth[tbox_index + 2]) - l.output[box_index + 2]);
+                    l.delta[box_index+3] = l.coord_scale*(sqrt(net.truth[tbox_index + 3]) - l.output[box_index + 3]);
                 }
 
                 *(l.cost) += pow(1-iou, 2);
@@ -216,12 +217,12 @@ void forward_detection_layer(const detection_layer l, network_state state)
     }
 }
 
-void backward_detection_layer(const detection_layer l, network_state state)
+void backward_detection_layer(const detection_layer l, network net)
 {
-    axpy_cpu(l.batch*l.inputs, 1, l.delta, 1, state.delta, 1);
+    axpy_cpu(l.batch*l.inputs, 1, l.delta, 1, net.delta, 1);
 }
 
-void get_detection_boxes(layer l, int w, int h, float thresh, float **probs, box *boxes, int only_objectness)
+void get_detection_detections(layer l, int w, int h, float thresh, detection *dets)
 {
     int i,j,n;
     float *predictions = l.output;
@@ -234,17 +235,17 @@ void get_detection_boxes(layer l, int w, int h, float thresh, float **probs, box
             int p_index = l.side*l.side*l.classes + i*l.n + n;
             float scale = predictions[p_index];
             int box_index = l.side*l.side*(l.classes + l.n) + (i*l.n + n)*4;
-            boxes[index].x = (predictions[box_index + 0] + col) / l.side * w;
-            boxes[index].y = (predictions[box_index + 1] + row) / l.side * h;
-            boxes[index].w = pow(predictions[box_index + 2], (l.sqrt?2:1)) * w;
-            boxes[index].h = pow(predictions[box_index + 3], (l.sqrt?2:1)) * h;
+            box b;
+            b.x = (predictions[box_index + 0] + col) / l.side * w;
+            b.y = (predictions[box_index + 1] + row) / l.side * h;
+            b.w = pow(predictions[box_index + 2], (l.sqrt?2:1)) * w;
+            b.h = pow(predictions[box_index + 3], (l.sqrt?2:1)) * h;
+            dets[index].bbox = b;
+            dets[index].objectness = scale;
             for(j = 0; j < l.classes; ++j){
                 int class_index = i*l.classes;
                 float prob = scale*predictions[class_index+j];
-                probs[index][j] = (prob > thresh) ? prob : 0;
-            }
-            if(only_objectness){
-                probs[index][0] = scale;
+                dets[index].prob[j] = (prob > thresh) ? prob : 0;
             }
         }
     }
@@ -252,36 +253,23 @@ void get_detection_boxes(layer l, int w, int h, float thresh, float **probs, box
 
 #ifdef GPU
 
-void forward_detection_layer_gpu(const detection_layer l, network_state state)
+void forward_detection_layer_gpu(const detection_layer l, network net)
 {
-    if(!state.train){
-        copy_ongpu(l.batch*l.inputs, state.input, 1, l.output_gpu, 1);
+    if(!net.train){
+        copy_gpu(l.batch*l.inputs, net.input_gpu, 1, l.output_gpu, 1);
         return;
     }
 
-    float *in_cpu = calloc(l.batch*l.inputs, sizeof(float));
-    float *truth_cpu = 0;
-    if(state.truth){
-        int num_truth = l.batch*l.side*l.side*(1+l.coords+l.classes);
-        truth_cpu = calloc(num_truth, sizeof(float));
-        cuda_pull_array(state.truth, truth_cpu, num_truth);
-    }
-    cuda_pull_array(state.input, in_cpu, l.batch*l.inputs);
-    network_state cpu_state = state;
-    cpu_state.train = state.train;
-    cpu_state.truth = truth_cpu;
-    cpu_state.input = in_cpu;
-    forward_detection_layer(l, cpu_state);
+    cuda_pull_array(net.input_gpu, net.input, l.batch*l.inputs);
+    forward_detection_layer(l, net);
     cuda_push_array(l.output_gpu, l.output, l.batch*l.outputs);
     cuda_push_array(l.delta_gpu, l.delta, l.batch*l.inputs);
-    free(cpu_state.input);
-    if(cpu_state.truth) free(cpu_state.truth);
 }
 
-void backward_detection_layer_gpu(detection_layer l, network_state state)
+void backward_detection_layer_gpu(detection_layer l, network net)
 {
-    axpy_ongpu(l.batch*l.inputs, 1, l.delta_gpu, 1, state.delta, 1);
-    //copy_ongpu(l.batch*l.inputs, l.delta_gpu, 1, state.delta, 1);
+    axpy_gpu(l.batch*l.inputs, 1, l.delta_gpu, 1, net.delta_gpu, 1);
+    //copy_gpu(l.batch*l.inputs, l.delta_gpu, 1, net.delta_gpu, 1);
 }
 #endif
 
diff --git a/image.darknet/src/detection_layer.h b/image.darknet/src/detection_layer.h
index e847a09..1c81853 100644
--- a/image.darknet/src/detection_layer.h
+++ b/image.darknet/src/detection_layer.h
@@ -7,13 +7,12 @@
 typedef layer detection_layer;
 
 detection_layer make_detection_layer(int batch, int inputs, int n, int size, int classes, int coords, int rescore);
-void forward_detection_layer(const detection_layer l, network_state state);
-void backward_detection_layer(const detection_layer l, network_state state);
-void get_detection_boxes(layer l, int w, int h, float thresh, float **probs, box *boxes, int only_objectness);
+void forward_detection_layer(const detection_layer l, network net);
+void backward_detection_layer(const detection_layer l, network net);
 
 #ifdef GPU
-void forward_detection_layer_gpu(const detection_layer l, network_state state);
-void backward_detection_layer_gpu(detection_layer l, network_state state);
+void forward_detection_layer_gpu(const detection_layer l, network net);
+void backward_detection_layer_gpu(detection_layer l, network net);
 #endif
 
 #endif
diff --git a/image.darknet/src/detector.c b/image.darknet/src/detector.c
deleted file mode 100644
index 1416c05..0000000
--- a/image.darknet/src/detector.c
+++ /dev/null
@@ -1,552 +0,0 @@
-#include "network.h"
-#include "region_layer.h"
-#include "cost_layer.h"
-#include "utils.h"
-#include "parser.h"
-#include "box.h"
-#include "demo.h"
-#include "option_list.h"
-
-#ifdef OPENCV
-#include "opencv2/highgui/highgui_c.h"
-#endif
-static int coco_ids[] = {1,2,3,4,5,6,7,8,9,10,11,13,14,15,16,17,18,19,20,21,22,23,24,25,27,28,31,32,33,34,35,36,37,38,39,40,41,42,43,44,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,67,70,72,73,74,75,76,77,78,79,80,81,82,84,85,86,87,88,89,90};
-
-void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, int ngpus, int clear)
-{
-    list *options = read_data_cfg(datacfg);
-    char *train_images = option_find_str(options, "train", "data/train.list");
-    char *backup_directory = option_find_str(options, "backup", "/backup/");
-
-    srand(time(0));
-    char *base = basecfg(cfgfile);
-    printf("%s\n", base);
-    float avg_loss = -1;
-    network *nets = calloc(ngpus, sizeof(network));
-
-    srand(time(0));
-    int seed = rand();
-    int i;
-    for(i = 0; i < ngpus; ++i){
-        srand(seed);
-#ifdef GPU
-        cuda_set_device(gpus[i]);
-#endif
-        nets[i] = parse_network_cfg(cfgfile);
-        if(weightfile){
-            load_weights(&nets[i], weightfile);
-        }
-        if(clear) *nets[i].seen = 0;
-        nets[i].learning_rate *= ngpus;
-    }
-    srand(time(0));
-    network net = nets[0];
-
-    int imgs = net.batch * net.subdivisions * ngpus;
-    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
-    data train, buffer;
-
-    layer l = net.layers[net.n - 1];
-
-    int classes = l.classes;
-    float jitter = l.jitter;
-
-    list *plist = get_paths(train_images);
-    //int N = plist->size;
-    char **paths = (char **)list_to_array(plist);
-
-    load_args args = {0};
-    args.w = net.w;
-    args.h = net.h;
-    args.paths = paths;
-    args.n = imgs;
-    args.m = plist->size;
-    args.classes = classes;
-    args.jitter = jitter;
-    args.num_boxes = l.max_boxes;
-    args.d = &buffer;
-    args.type = DETECTION_DATA;
-    args.threads = 8;
-
-    args.angle = net.angle;
-    args.exposure = net.exposure;
-    args.saturation = net.saturation;
-    args.hue = net.hue;
-
-    pthread_t load_thread = load_data(args);
-    clock_t time;
-    int count = 0;
-    //while(i*imgs < N*120){
-    while(get_current_batch(net) < net.max_batches){
-        if(l.random && count++%10 == 0){
-            printf("Resizing\n");
-            int dim = (rand() % 10 + 10) * 32;
-            if (get_current_batch(net)+200 > net.max_batches) dim = 608;
-            //int dim = (rand() % 4 + 16) * 32;
-            printf("%d\n", dim);
-            args.w = dim;
-            args.h = dim;
-
-            pthread_join(load_thread, 0);
-            train = buffer;
-            free_data(train);
-            load_thread = load_data(args);
-
-            for(i = 0; i < ngpus; ++i){
-                resize_network(nets + i, dim, dim);
-            }
-            net = nets[0];
-        }
-        time=clock();
-        pthread_join(load_thread, 0);
-        train = buffer;
-        load_thread = load_data(args);
-
-        /*
-           int k;
-           for(k = 0; k < l.max_boxes; ++k){
-           box b = float_to_box(train.y.vals[10] + 1 + k*5);
-           if(!b.x) break;
-           printf("loaded: %f %f %f %f\n", b.x, b.y, b.w, b.h);
-           }
-           image im = float_to_image(448, 448, 3, train.X.vals[10]);
-           int k;
-           for(k = 0; k < l.max_boxes; ++k){
-           box b = float_to_box(train.y.vals[10] + 1 + k*5);
-           printf("%d %d %d %d\n", truth.x, truth.y, truth.w, truth.h);
-           draw_bbox(im, b, 8, 1,0,0);
-           }
-           save_image(im, "truth11");
-         */
-
-        printf("Loaded: %lf seconds\n", sec(clock()-time));
-
-        time=clock();
-        float loss = 0;
-#ifdef GPU
-        if(ngpus == 1){
-            loss = train_network(net, train);
-        } else {
-            loss = train_networks(nets, ngpus, train, 4);
-        }
-#else
-        loss = train_network(net, train);
-#endif
-        if (avg_loss < 0) avg_loss = loss;
-        avg_loss = avg_loss*.9 + loss*.1;
-
-        i = get_current_batch(net);
-        printf("%d: %f, %f avg, %f rate, %lf seconds, %d images\n", get_current_batch(net), loss, avg_loss, get_current_rate(net), sec(clock()-time), i*imgs);
-        if(i%1000==0 || (i < 1000 && i%100 == 0)){
-#ifdef GPU
-            if(ngpus != 1) sync_nets(nets, ngpus, 0);
-#endif
-            char buff[256];
-            sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i);
-            save_weights(net, buff);
-        }
-        free_data(train);
-    }
-#ifdef GPU
-    if(ngpus != 1) sync_nets(nets, ngpus, 0);
-#endif
-    char buff[256];
-    sprintf(buff, "%s/%s_final.weights", backup_directory, base);
-    save_weights(net, buff);
-}
-
-
-static int get_coco_image_id(char *filename)
-{
-    char *p = strrchr(filename, '_');
-    return atoi(p+1);
-}
-
-static void print_cocos(FILE *fp, char *image_path, box *boxes, float **probs, int num_boxes, int classes, int w, int h)
-{
-    int i, j;
-    int image_id = get_coco_image_id(image_path);
-    for(i = 0; i < num_boxes; ++i){
-        float xmin = boxes[i].x - boxes[i].w/2.;
-        float xmax = boxes[i].x + boxes[i].w/2.;
-        float ymin = boxes[i].y - boxes[i].h/2.;
-        float ymax = boxes[i].y + boxes[i].h/2.;
-
-        if (xmin < 0) xmin = 0;
-        if (ymin < 0) ymin = 0;
-        if (xmax > w) xmax = w;
-        if (ymax > h) ymax = h;
-
-        float bx = xmin;
-        float by = ymin;
-        float bw = xmax - xmin;
-        float bh = ymax - ymin;
-
-        for(j = 0; j < classes; ++j){
-            if (probs[i][j]) fprintf(fp, "{\"image_id\":%d, \"category_id\":%d, \"bbox\":[%f, %f, %f, %f], \"score\":%f},\n", image_id, coco_ids[j], bx, by, bw, bh, probs[i][j]);
-        }
-    }
-}
-
-void print_detector_detections(FILE **fps, char *id, box *boxes, float **probs, int total, int classes, int w, int h)
-{
-    int i, j;
-    for(i = 0; i < total; ++i){
-        float xmin = boxes[i].x - boxes[i].w/2.;
-        float xmax = boxes[i].x + boxes[i].w/2.;
-        float ymin = boxes[i].y - boxes[i].h/2.;
-        float ymax = boxes[i].y + boxes[i].h/2.;
-
-        if (xmin < 0) xmin = 0;
-        if (ymin < 0) ymin = 0;
-        if (xmax > w) xmax = w;
-        if (ymax > h) ymax = h;
-
-        for(j = 0; j < classes; ++j){
-            if (probs[i][j]) fprintf(fps[j], "%s %f %f %f %f %f\n", id, probs[i][j],
-                    xmin, ymin, xmax, ymax);
-        }
-    }
-}
-
-void print_imagenet_detections(FILE *fp, int id, box *boxes, float **probs, int total, int classes, int w, int h)
-{
-    int i, j;
-    for(i = 0; i < total; ++i){
-        float xmin = boxes[i].x - boxes[i].w/2.;
-        float xmax = boxes[i].x + boxes[i].w/2.;
-        float ymin = boxes[i].y - boxes[i].h/2.;
-        float ymax = boxes[i].y + boxes[i].h/2.;
-
-        if (xmin < 0) xmin = 0;
-        if (ymin < 0) ymin = 0;
-        if (xmax > w) xmax = w;
-        if (ymax > h) ymax = h;
-
-        for(j = 0; j < classes; ++j){
-            int class = j;
-            if (probs[i][class]) fprintf(fp, "%d %d %f %f %f %f %f\n", id, j+1, probs[i][class],
-                    xmin, ymin, xmax, ymax);
-        }
-    }
-}
-
-void validate_detector(char *datacfg, char *cfgfile, char *weightfile, char *outfile)
-{
-    int j;
-    list *options = read_data_cfg(datacfg);
-    char *valid_images = option_find_str(options, "valid", "data/train.list");
-    char *name_list = option_find_str(options, "names", "data/names.list");
-    char *prefix = option_find_str(options, "results", "results");
-    char **names = get_labels(name_list);
-    char *mapf = option_find_str(options, "map", 0);
-    int *map = 0;
-    if (mapf) map = read_map(mapf);
-
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    set_batch_network(&net, 1);
-    fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
-    srand(time(0));
-
-    list *plist = get_paths(valid_images);
-    char **paths = (char **)list_to_array(plist);
-
-    layer l = net.layers[net.n-1];
-    int classes = l.classes;
-
-    char buff[1024];
-    char *type = option_find_str(options, "eval", "voc");
-    FILE *fp = 0;
-    FILE **fps = 0;
-    int coco = 0;
-    int imagenet = 0;
-    if(0==strcmp(type, "coco")){
-        if(!outfile) outfile = "coco_results";
-        snprintf(buff, 1024, "%s/%s.json", prefix, outfile);
-        fp = fopen(buff, "w");
-        fprintf(fp, "[\n");
-        coco = 1;
-    } else if(0==strcmp(type, "imagenet")){
-        if(!outfile) outfile = "imagenet-detection";
-        snprintf(buff, 1024, "%s/%s.txt", prefix, outfile);
-        fp = fopen(buff, "w");
-        imagenet = 1;
-        classes = 200;
-    } else {
-        if(!outfile) outfile = "comp4_det_test_";
-        fps = calloc(classes, sizeof(FILE *));
-        for(j = 0; j < classes; ++j){
-            snprintf(buff, 1024, "%s/%s%s.txt", prefix, outfile, names[j]);
-            fps[j] = fopen(buff, "w");
-        }
-    }
-
-
-    box *boxes = calloc(l.w*l.h*l.n, sizeof(box));
-    float **probs = calloc(l.w*l.h*l.n, sizeof(float *));
-    for(j = 0; j < l.w*l.h*l.n; ++j) probs[j] = calloc(classes, sizeof(float *));
-
-    int m = plist->size;
-    int i=0;
-    int t;
-
-    float thresh = .005;
-    float nms = .45;
-
-    int nthreads = 4;
-    image *val = calloc(nthreads, sizeof(image));
-    image *val_resized = calloc(nthreads, sizeof(image));
-    image *buf = calloc(nthreads, sizeof(image));
-    image *buf_resized = calloc(nthreads, sizeof(image));
-    pthread_t *thr = calloc(nthreads, sizeof(pthread_t));
-
-    load_args args = {0};
-    args.w = net.w;
-    args.h = net.h;
-    args.type = IMAGE_DATA;
-
-    for(t = 0; t < nthreads; ++t){
-        args.path = paths[i+t];
-        args.im = &buf[t];
-        args.resized = &buf_resized[t];
-        thr[t] = load_data_in_thread(args);
-    }
-    time_t start = time(0);
-    for(i = nthreads; i < m+nthreads; i += nthreads){
-        fprintf(stderr, "%d\n", i);
-        for(t = 0; t < nthreads && i+t-nthreads < m; ++t){
-            pthread_join(thr[t], 0);
-            val[t] = buf[t];
-            val_resized[t] = buf_resized[t];
-        }
-        for(t = 0; t < nthreads && i+t < m; ++t){
-            args.path = paths[i+t];
-            args.im = &buf[t];
-            args.resized = &buf_resized[t];
-            thr[t] = load_data_in_thread(args);
-        }
-        for(t = 0; t < nthreads && i+t-nthreads < m; ++t){
-            char *path = paths[i+t-nthreads];
-            char *id = basecfg(path);
-            float *X = val_resized[t].data;
-            network_predict(net, X);
-            int w = val[t].w;
-            int h = val[t].h;
-            get_region_boxes(l, w, h, thresh, probs, boxes, 0, map, .5);
-            if (nms) do_nms_sort(boxes, probs, l.w*l.h*l.n, classes, nms);
-            if (coco){
-                print_cocos(fp, path, boxes, probs, l.w*l.h*l.n, classes, w, h);
-            } else if (imagenet){
-                print_imagenet_detections(fp, i+t-nthreads+1, boxes, probs, l.w*l.h*l.n, classes, w, h);
-            } else {
-                print_detector_detections(fps, id, boxes, probs, l.w*l.h*l.n, classes, w, h);
-            }
-            free(id);
-            free_image(val[t]);
-            free_image(val_resized[t]);
-        }
-    }
-    for(j = 0; j < classes; ++j){
-        if(fps) fclose(fps[j]);
-    }
-    if(coco){
-        fseek(fp, -2, SEEK_CUR); 
-        fprintf(fp, "\n]\n");
-        fclose(fp);
-    }
-    fprintf(stderr, "Total Detection Time: %f Seconds\n", (double)(time(0) - start));
-}
-
-void validate_detector_recall(char *cfgfile, char *weightfile)
-{
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    set_batch_network(&net, 1);
-    fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
-    srand(time(0));
-
-    list *plist = get_paths("data/voc.2007.test");
-    char **paths = (char **)list_to_array(plist);
-
-    layer l = net.layers[net.n-1];
-    int classes = l.classes;
-
-    int j, k;
-    box *boxes = calloc(l.w*l.h*l.n, sizeof(box));
-    float **probs = calloc(l.w*l.h*l.n, sizeof(float *));
-    for(j = 0; j < l.w*l.h*l.n; ++j) probs[j] = calloc(classes, sizeof(float *));
-
-    int m = plist->size;
-    int i=0;
-
-    float thresh = .001;
-    float iou_thresh = .5;
-    float nms = .4;
-
-    int total = 0;
-    int correct = 0;
-    int proposals = 0;
-    float avg_iou = 0;
-
-    for(i = 0; i < m; ++i){
-        char *path = paths[i];
-        image orig = load_image_color(path, 0, 0);
-        image sized = resize_image(orig, net.w, net.h);
-        char *id = basecfg(path);
-        network_predict(net, sized.data);
-        get_region_boxes(l, 1, 1, thresh, probs, boxes, 1, 0, .5);
-        if (nms) do_nms(boxes, probs, l.w*l.h*l.n, 1, nms);
-
-        char labelpath[4096];
-        find_replace(path, "images", "labels", labelpath);
-        find_replace(labelpath, "JPEGImages", "labels", labelpath);
-        find_replace(labelpath, ".jpg", ".txt", labelpath);
-        find_replace(labelpath, ".JPEG", ".txt", labelpath);
-
-        int num_labels = 0;
-        box_label *truth = read_boxes(labelpath, &num_labels);
-        for(k = 0; k < l.w*l.h*l.n; ++k){
-            if(probs[k][0] > thresh){
-                ++proposals;
-            }
-        }
-        for (j = 0; j < num_labels; ++j) {
-            ++total;
-            box t = {truth[j].x, truth[j].y, truth[j].w, truth[j].h};
-            float best_iou = 0;
-            for(k = 0; k < l.w*l.h*l.n; ++k){
-                float iou = box_iou(boxes[k], t);
-                if(probs[k][0] > thresh && iou > best_iou){
-                    best_iou = iou;
-                }
-            }
-            avg_iou += best_iou;
-            if(best_iou > iou_thresh){
-                ++correct;
-            }
-        }
-
-        fprintf(stderr, "%5d %5d %5d\tRPs/Img: %.2f\tIOU: %.2f%%\tRecall:%.2f%%\n", i, correct, total, (float)proposals/(i+1), avg_iou*100/total, 100.*correct/total);
-        free(id);
-        free_image(orig);
-        free_image(sized);
-    }
-}
-
-void test_detector(char *datacfg, char *cfgfile, char *weightfile, char *filename, float thresh, float hier_thresh)
-{
-    list *options = read_data_cfg(datacfg);
-    char *name_list = option_find_str(options, "names", "data/names.list");
-    char **names = get_labels(name_list);
-
-    image **alphabet = load_alphabet();
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    set_batch_network(&net, 1);
-    srand(2222222);
-    clock_t time;
-    char buff[256];
-    char *input = buff;
-    int j;
-    float nms=.4;
-    while(1){
-        if(filename){
-            strncpy(input, filename, 256);
-        } else {
-            printf("Enter Image Path: ");
-            fflush(stdout);
-            input = fgets(input, 256, stdin);
-            if(!input) return;
-            strtok(input, "\n");
-        }
-        image im = load_image_color(input,0,0);
-        image sized = resize_image(im, net.w, net.h);
-        layer l = net.layers[net.n-1];
-
-        box *boxes = calloc(l.w*l.h*l.n, sizeof(box));
-        float **probs = calloc(l.w*l.h*l.n, sizeof(float *));
-        for(j = 0; j < l.w*l.h*l.n; ++j) probs[j] = calloc(l.classes + 1, sizeof(float *));
-
-        float *X = sized.data;
-        time=clock();
-        network_predict(net, X);
-        printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time));
-        get_region_boxes(l, 1, 1, thresh, probs, boxes, 0, 0, hier_thresh);
-        if (l.softmax_tree && nms) do_nms_obj(boxes, probs, l.w*l.h*l.n, l.classes, nms);
-        else if (nms) do_nms_sort(boxes, probs, l.w*l.h*l.n, l.classes, nms);
-        draw_detections(im, l.w*l.h*l.n, thresh, boxes, probs, names, alphabet, l.classes);
-        save_image(im, "predictions");
-        show_image(im, "predictions");
-
-        free_image(im);
-        free_image(sized);
-        free(boxes);
-        free_ptrs((void **)probs, l.w*l.h*l.n);
-#ifdef OPENCV
-        cvWaitKey(0);
-        cvDestroyAllWindows();
-#endif
-        if (filename) break;
-    }
-}
-
-void run_detector(int argc, char **argv)
-{
-    char *prefix = find_char_arg(argc, argv, "-prefix", 0);
-    float thresh = find_float_arg(argc, argv, "-thresh", .24);
-    float hier_thresh = find_float_arg(argc, argv, "-hier", .5);
-    int cam_index = find_int_arg(argc, argv, "-c", 0);
-    int frame_skip = find_int_arg(argc, argv, "-s", 0);
-    if(argc < 4){
-        fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]);
-        return;
-    }
-    char *gpu_list = find_char_arg(argc, argv, "-gpus", 0);
-    char *outfile = find_char_arg(argc, argv, "-out", 0);
-    int *gpus = 0;
-    int gpu = 0;
-    int ngpus = 0;
-    if(gpu_list){
-        printf("%s\n", gpu_list);
-        int len = strlen(gpu_list);
-        ngpus = 1;
-        int i;
-        for(i = 0; i < len; ++i){
-            if (gpu_list[i] == ',') ++ngpus;
-        }
-        gpus = calloc(ngpus, sizeof(int));
-        for(i = 0; i < ngpus; ++i){
-            gpus[i] = atoi(gpu_list);
-            gpu_list = strchr(gpu_list, ',')+1;
-        }
-    } else {
-        gpu = gpu_index;
-        gpus = &gpu;
-        ngpus = 1;
-    }
-
-    int clear = find_arg(argc, argv, "-clear");
-
-    char *datacfg = argv[3];
-    char *cfg = argv[4];
-    char *weights = (argc > 5) ? argv[5] : 0;
-    char *filename = (argc > 6) ? argv[6]: 0;
-    if(0==strcmp(argv[2], "test")) test_detector(datacfg, cfg, weights, filename, thresh, hier_thresh);
-    else if(0==strcmp(argv[2], "train")) train_detector(datacfg, cfg, weights, gpus, ngpus, clear);
-    else if(0==strcmp(argv[2], "valid")) validate_detector(datacfg, cfg, weights, outfile);
-    else if(0==strcmp(argv[2], "recall")) validate_detector_recall(cfg, weights);
-    else if(0==strcmp(argv[2], "demo")) {
-        list *options = read_data_cfg(datacfg);
-        int classes = option_find_int(options, "classes", 20);
-        char *name_list = option_find_str(options, "names", "data/names.list");
-        char **names = get_labels(name_list);
-        demo(cfg, weights, thresh, cam_index, filename, names, classes, frame_skip, prefix, hier_thresh);
-    }
-}
diff --git a/image.darknet/src/dice.c b/image.darknet/src/dice.c
deleted file mode 100644
index 2286459..0000000
--- a/image.darknet/src/dice.c
+++ /dev/null
@@ -1,118 +0,0 @@
-#include "network.h"
-#include "utils.h"
-#include "parser.h"
-
-char *dice_labels[] = {"face1","face2","face3","face4","face5","face6"};
-
-void train_dice(char *cfgfile, char *weightfile)
-{
-    srand(time(0));
-    float avg_loss = -1;
-    char *base = basecfg(cfgfile);
-    char *backup_directory = "/home/pjreddie/backup/";
-    printf("%s\n", base);
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
-    int imgs = 1024;
-    int i = *net.seen/imgs;
-    char **labels = dice_labels;
-    list *plist = get_paths("data/dice/dice.train.list");
-    char **paths = (char **)list_to_array(plist);
-    printf("%d\n", plist->size);
-    clock_t time;
-    while(1){
-        ++i;
-        time=clock();
-        data train = load_data_old(paths, imgs, plist->size, labels, 6, net.w, net.h);
-        printf("Loaded: %lf seconds\n", sec(clock()-time));
-
-        time=clock();
-        float loss = train_network(net, train);
-        if(avg_loss == -1) avg_loss = loss;
-        avg_loss = avg_loss*.9 + loss*.1;
-        printf("%d: %f, %f avg, %lf seconds, %d images\n", i, loss, avg_loss, sec(clock()-time), *net.seen);
-        free_data(train);
-        if((i % 100) == 0) net.learning_rate *= .1;
-        if(i%100==0){
-            char buff[256];
-            sprintf(buff, "%s/%s_%d.weights",backup_directory,base, i);
-            save_weights(net, buff);
-        }
-    }
-}
-
-void validate_dice(char *filename, char *weightfile)
-{
-    network net = parse_network_cfg(filename);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    srand(time(0));
-
-    char **labels = dice_labels;
-    list *plist = get_paths("data/dice/dice.val.list");
-
-    char **paths = (char **)list_to_array(plist);
-    int m = plist->size;
-    free_list(plist);
-
-    data val = load_data_old(paths, m, 0, labels, 6, net.w, net.h);
-    float *acc = network_accuracies(net, val, 2);
-    printf("Validation Accuracy: %f, %d images\n", acc[0], m);
-    free_data(val);
-}
-
-void test_dice(char *cfgfile, char *weightfile, char *filename)
-{
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    set_batch_network(&net, 1);
-    srand(2222222);
-    int i = 0;
-    char **names = dice_labels;
-    char buff[256];
-    char *input = buff;
-    int indexes[6];
-    while(1){
-        if(filename){
-            strncpy(input, filename, 256);
-        }else{
-            printf("Enter Image Path: ");
-            fflush(stdout);
-            input = fgets(input, 256, stdin);
-            if(!input) return;
-            strtok(input, "\n");
-        }
-        image im = load_image_color(input, net.w, net.h);
-        float *X = im.data;
-        float *predictions = network_predict(net, X);
-        top_predictions(net, 6, indexes);
-        for(i = 0; i < 6; ++i){
-            int index = indexes[i];
-            printf("%s: %f\n", names[index], predictions[index]);
-        }
-        free_image(im);
-        if (filename) break;
-    }
-}
-
-void run_dice(int argc, char **argv)
-{
-    if(argc < 4){
-        fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]);
-        return;
-    }
-
-    char *cfg = argv[3];
-    char *weights = (argc > 4) ? argv[4] : 0;
-    char *filename = (argc > 5) ? argv[5]: 0;
-    if(0==strcmp(argv[2], "test")) test_dice(cfg, weights, filename);
-    else if(0==strcmp(argv[2], "train")) train_dice(cfg, weights);
-    else if(0==strcmp(argv[2], "valid")) validate_dice(cfg, weights);
-}
-
diff --git a/image.darknet/src/dropout_layer.c b/image.darknet/src/dropout_layer.c
index b1381e6..780554f 100644
--- a/image.darknet/src/dropout_layer.c
+++ b/image.darknet/src/dropout_layer.c
@@ -35,26 +35,26 @@ void resize_dropout_layer(dropout_layer *l, int inputs)
     #endif
 }
 
-void forward_dropout_layer(dropout_layer l, network_state state)
+void forward_dropout_layer(dropout_layer l, network net)
 {
     int i;
-    if (!state.train) return;
+    if (!net.train) return;
     for(i = 0; i < l.batch * l.inputs; ++i){
         float r = rand_uniform(0, 1);
         l.rand[i] = r;
-        if(r < l.probability) state.input[i] = 0;
-        else state.input[i] *= l.scale;
+        if(r < l.probability) net.input[i] = 0;
+        else net.input[i] *= l.scale;
     }
 }
 
-void backward_dropout_layer(dropout_layer l, network_state state)
+void backward_dropout_layer(dropout_layer l, network net)
 {
     int i;
-    if(!state.delta) return;
+    if(!net.delta) return;
     for(i = 0; i < l.batch * l.inputs; ++i){
         float r = l.rand[i];
-        if(r < l.probability) state.delta[i] = 0;
-        else state.delta[i] *= l.scale;
+        if(r < l.probability) net.delta[i] = 0;
+        else net.delta[i] *= l.scale;
     }
 }
 
diff --git a/image.darknet/src/dropout_layer.h b/image.darknet/src/dropout_layer.h
index 691cfc5..01f94d4 100644
--- a/image.darknet/src/dropout_layer.h
+++ b/image.darknet/src/dropout_layer.h
@@ -8,13 +8,13 @@ typedef layer dropout_layer;
 
 dropout_layer make_dropout_layer(int batch, int inputs, float probability);
 
-void forward_dropout_layer(dropout_layer l, network_state state);
-void backward_dropout_layer(dropout_layer l, network_state state);
+void forward_dropout_layer(dropout_layer l, network net);
+void backward_dropout_layer(dropout_layer l, network net);
 void resize_dropout_layer(dropout_layer *l, int inputs);
 
 #ifdef GPU
-void forward_dropout_layer_gpu(dropout_layer l, network_state state);
-void backward_dropout_layer_gpu(dropout_layer l, network_state state);
+void forward_dropout_layer_gpu(dropout_layer l, network net);
+void backward_dropout_layer_gpu(dropout_layer l, network net);
 
 #endif
 #endif
diff --git a/image.darknet/src/dropout_layer_kernels.cu b/image.darknet/src/dropout_layer_kernels.cu
index 7e51bd5..bd12b67 100644
--- a/image.darknet/src/dropout_layer_kernels.cu
+++ b/image.darknet/src/dropout_layer_kernels.cu
@@ -14,9 +14,9 @@ __global__ void yoloswag420blazeit360noscope(float *input, int size, float *rand
     if(id < size) input[id] = (rand[id] < prob) ? 0 : input[id]*scale;
 }
 
-void forward_dropout_layer_gpu(dropout_layer layer, network_state state)
+void forward_dropout_layer_gpu(dropout_layer layer, network net)
 {
-    if (!state.train) return;
+    if (!net.train) return;
     int size = layer.inputs*layer.batch;
     cuda_random(layer.rand_gpu, size);
     /*
@@ -27,15 +27,15 @@ void forward_dropout_layer_gpu(dropout_layer layer, network_state state)
     cuda_push_array(layer.rand_gpu, layer.rand, size);
     */
 
-    yoloswag420blazeit360noscope<<<cuda_gridsize(size), BLOCK>>>(state.input, size, layer.rand_gpu, layer.probability, layer.scale);
+    yoloswag420blazeit360noscope<<<cuda_gridsize(size), BLOCK>>>(net.input_gpu, size, layer.rand_gpu, layer.probability, layer.scale);
     check_error(cudaPeekAtLastError());
 }
 
-void backward_dropout_layer_gpu(dropout_layer layer, network_state state)
+void backward_dropout_layer_gpu(dropout_layer layer, network net)
 {
-    if(!state.delta) return;
+    if(!net.delta_gpu) return;
     int size = layer.inputs*layer.batch;
 
-    yoloswag420blazeit360noscope<<<cuda_gridsize(size), BLOCK>>>(state.delta, size, layer.rand_gpu, layer.probability, layer.scale);
+    yoloswag420blazeit360noscope<<<cuda_gridsize(size), BLOCK>>>(net.delta_gpu, size, layer.rand_gpu, layer.probability, layer.scale);
     check_error(cudaPeekAtLastError());
 }
diff --git a/image.darknet/src/gemm.c b/image.darknet/src/gemm.c
index 3003be0..648027f 100644
--- a/image.darknet/src/gemm.c
+++ b/image.darknet/src/gemm.c
@@ -77,6 +77,7 @@ void gemm_nn(int M, int N, int K, float ALPHA,
         float *C, int ldc)
 {
     int i,j,k;
+    #pragma omp parallel for private(j,k) /* j,k are declared at function scope; without private() all threads share the inner loop counters */
     for(i = 0; i < M; ++i){
         for(k = 0; k < K; ++k){
             register float A_PART = ALPHA*A[i*lda+k];
@@ -93,6 +94,7 @@ void gemm_nt(int M, int N, int K, float ALPHA,
         float *C, int ldc)
 {
     int i,j,k;
+    #pragma omp parallel for private(j,k) /* j,k are declared at function scope; without private() all threads share the inner loop counters */
     for(i = 0; i < M; ++i){
         for(j = 0; j < N; ++j){
             register float sum = 0;
@@ -110,6 +112,7 @@ void gemm_tn(int M, int N, int K, float ALPHA,
         float *C, int ldc)
 {
     int i,j,k;
+    #pragma omp parallel for private(j,k) /* j,k are declared at function scope; without private() all threads share the inner loop counters */
     for(i = 0; i < M; ++i){
         for(k = 0; k < K; ++k){
             register float A_PART = ALPHA*A[k*lda+i];
@@ -126,6 +129,7 @@ void gemm_tt(int M, int N, int K, float ALPHA,
         float *C, int ldc)
 {
     int i,j,k;
+    #pragma omp parallel for private(j,k) /* j,k are declared at function scope; without private() all threads share the inner loop counters */
     for(i = 0; i < M; ++i){
         for(j = 0; j < N; ++j){
             register float sum = 0;
@@ -165,7 +169,7 @@ void gemm_cpu(int TA, int TB, int M, int N, int K, float ALPHA,
 
 #include <math.h>
 
-void gemm_ongpu(int TA, int TB, int M, int N, int K, float ALPHA, 
+void gemm_gpu(int TA, int TB, int M, int N, int K, float ALPHA, 
         float *A_gpu, int lda, 
         float *B_gpu, int ldb,
         float BETA,
@@ -177,24 +181,6 @@ void gemm_ongpu(int TA, int TB, int M, int N, int K, float ALPHA,
     check_error(status);
 }
 
-void gemm_gpu(int TA, int TB, int M, int N, int K, float ALPHA, 
-        float *A, int lda, 
-        float *B, int ldb,
-        float BETA,
-        float *C, int ldc)
-{
-    float *A_gpu = cuda_make_array(A, (TA ? lda*K:lda*M));
-    float *B_gpu = cuda_make_array(B, (TB ? ldb*N : ldb*K));
-    float *C_gpu = cuda_make_array(C, ldc*M);
-
-    gemm_ongpu(TA, TB, M, N, K, ALPHA, A_gpu, lda, B_gpu, ldb, BETA, C_gpu, ldc);
-
-    cuda_pull_array(C_gpu, C, ldc*M);
-    cuda_free(A_gpu);
-    cuda_free(B_gpu);
-    cuda_free(C_gpu);
-}
-
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -224,7 +210,7 @@ void time_gpu_random_matrix(int TA, int TB, int m, int k, int n)
     free(c);
 }
 
-void time_ongpu(int TA, int TB, int m, int k, int n)
+void time_gpu(int TA, int TB, int m, int k, int n)
 {
     int iter = 10;
     float *a = random_matrix(m,k);
@@ -242,7 +228,7 @@ void time_ongpu(int TA, int TB, int m, int k, int n)
     int i;
     clock_t start = clock(), end;
     for(i = 0; i<iter; ++i){
-        gemm_ongpu(TA,TB,m,n,k,1,a_cl,lda,b_cl,ldb,1,c_cl,n);
+        gemm_gpu(TA,TB,m,n,k,1,a_cl,lda,b_cl,ldb,1,c_cl,n);
         cudaThreadSynchronize();
     }
     double flop = ((double)m)*n*(2.*k + 2.)*iter;
@@ -313,24 +299,24 @@ int test_gpu_blas()
 
        test_gpu_accuracy(0,0,10,10,10); 
 
-       time_ongpu(0,0,64,2916,363); 
-       time_ongpu(0,0,64,2916,363); 
-       time_ongpu(0,0,64,2916,363); 
-       time_ongpu(0,0,192,729,1600); 
-       time_ongpu(0,0,384,196,1728); 
-       time_ongpu(0,0,256,196,3456); 
-       time_ongpu(0,0,256,196,2304); 
-       time_ongpu(0,0,128,4096,12544); 
-       time_ongpu(0,0,128,4096,4096); 
+       time_gpu(0,0,64,2916,363); 
+       time_gpu(0,0,64,2916,363); 
+       time_gpu(0,0,64,2916,363); 
+       time_gpu(0,0,192,729,1600); 
+       time_gpu(0,0,384,196,1728); 
+       time_gpu(0,0,256,196,3456); 
+       time_gpu(0,0,256,196,2304); 
+       time_gpu(0,0,128,4096,12544); 
+       time_gpu(0,0,128,4096,4096); 
      */
-    time_ongpu(0,0,64,75,12544); 
-    time_ongpu(0,0,64,75,12544); 
-    time_ongpu(0,0,64,75,12544); 
-    time_ongpu(0,0,64,576,12544); 
-    time_ongpu(0,0,256,2304,784); 
-    time_ongpu(1,1,2304,256,784); 
-    time_ongpu(0,0,512,4608,196); 
-    time_ongpu(1,1,4608,512,196); 
+    time_gpu(0,0,64,75,12544); 
+    time_gpu(0,0,64,75,12544); 
+    time_gpu(0,0,64,75,12544); 
+    time_gpu(0,0,64,576,12544); 
+    time_gpu(0,0,256,2304,784); 
+    time_gpu(1,1,2304,256,784); 
+    time_gpu(0,0,512,4608,196); 
+    time_gpu(1,1,4608,512,196); 
 
     return 0;
 }
diff --git a/image.darknet/src/gemm.h b/image.darknet/src/gemm.h
index f0231bf..3ebb0eb 100644
--- a/image.darknet/src/gemm.h
+++ b/image.darknet/src/gemm.h
@@ -19,7 +19,7 @@ void gemm_cpu(int TA, int TB, int M, int N, int K, float ALPHA,
         float *C, int ldc);
 
 #ifdef GPU
-void gemm_ongpu(int TA, int TB, int M, int N, int K, float ALPHA, 
+void gemm_gpu(int TA, int TB, int M, int N, int K, float ALPHA, 
         float *A_gpu, int lda, 
         float *B_gpu, int ldb,
         float BETA,
diff --git a/image.darknet/src/go.c b/image.darknet/src/go.c
deleted file mode 100644
index 89297b5..0000000
--- a/image.darknet/src/go.c
+++ /dev/null
@@ -1,833 +0,0 @@
-#include "network.h"
-#include "utils.h"
-#include "parser.h"
-#include "option_list.h"
-#include "blas.h"
-
-#ifdef OPENCV
-#include "opencv2/highgui/highgui_c.h"
-#endif
-
-int inverted = 1;
-int noi = 1;
-static const int nind = 5;
-
-typedef struct {
-    char **data;
-    int n;
-} moves;
-
-char *fgetgo(FILE *fp)
-{
-    if(feof(fp)) return 0;
-    size_t size = 94;
-    char *line = malloc(size*sizeof(char));
-    if(size != fread(line, sizeof(char), size, fp)){
-        free(line);
-        return 0;
-    }
-
-    return line;
-}
-
-moves load_go_moves(char *filename)
-{
-    moves m;
-    m.n = 128;
-    m.data = calloc(128, sizeof(char*));
-    FILE *fp = fopen(filename, "rb");
-    int count = 0;
-    char *line = 0;
-    while((line = fgetgo(fp))){
-        if(count >= m.n){
-            m.n *= 2;
-            m.data = realloc(m.data, m.n*sizeof(char*));
-        }
-        m.data[count] = line;
-        ++count;
-    }
-    printf("%d\n", count);
-    m.n = count;
-    m.data = realloc(m.data, count*sizeof(char*));
-    return m;
-}
-
-void string_to_board(char *s, float *board)
-{
-    int i, j;
-    //memset(board, 0, 1*19*19*sizeof(float));
-    int count = 0;
-    for(i = 0; i < 91; ++i){
-        char c = s[i];
-        for(j = 0; j < 4; ++j){
-            int me = (c >> (2*j)) & 1;
-            int you = (c >> (2*j + 1)) & 1;
-            if (me) board[count] = 1;
-            else if (you) board[count] = -1;
-            else board[count] = 0;
-            ++count;
-            if(count >= 19*19) break;
-        }
-    }
-}
-
-void board_to_string(char *s, float *board)
-{
-    int i, j;
-    memset(s, 0, (19*19/4+1)*sizeof(char));
-    int count = 0;
-    for(i = 0; i < 91; ++i){
-        for(j = 0; j < 4; ++j){
-            int me = (board[count] == 1);
-            int you = (board[count] == -1);
-            if (me) s[i] = s[i] | (1<<(2*j));
-            if (you) s[i] = s[i] | (1<<(2*j + 1));
-            ++count;
-            if(count >= 19*19) break;
-        }
-    }
-}
-
-void random_go_moves(moves m, float *boards, float *labels, int n)
-{
-    int i;
-    memset(labels, 0, 19*19*n*sizeof(float));
-    for(i = 0; i < n; ++i){
-        char *b = m.data[rand()%m.n];
-        int row = b[0];
-        int col = b[1];
-        labels[col + 19*(row + i*19)] = 1;
-        string_to_board(b+2, boards+i*19*19);
-        boards[col + 19*(row + i*19)] = 0;
-
-        int flip = rand()%2;
-        int rotate = rand()%4;
-        image in = float_to_image(19, 19, 1, boards+i*19*19);
-        image out = float_to_image(19, 19, 1, labels+i*19*19);
-        if(flip){
-            flip_image(in);
-            flip_image(out);
-        }
-        rotate_image_cw(in, rotate);
-        rotate_image_cw(out, rotate);
-    }
-}
-
-
-void train_go(char *cfgfile, char *weightfile)
-{
-    srand(time(0));
-    float avg_loss = -1;
-    char *base = basecfg(cfgfile);
-    printf("%s\n", base);
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
-
-    char *backup_directory = "/home/pjreddie/backup/";
-
-    char buff[256];
-    float *board = calloc(19*19*net.batch, sizeof(float));
-    float *move = calloc(19*19*net.batch, sizeof(float));
-    moves m = load_go_moves("/home/pjreddie/backup/go.train");
-    //moves m = load_go_moves("games.txt");
-
-    int N = m.n;
-    int epoch = (*net.seen)/N;
-    while(get_current_batch(net) < net.max_batches || net.max_batches == 0){
-        clock_t time=clock();
-
-        random_go_moves(m, board, move, net.batch);
-        float loss = train_network_datum(net, board, move) / net.batch;
-        if(avg_loss == -1) avg_loss = loss;
-        avg_loss = avg_loss*.95 + loss*.05;
-        printf("%d, %.3f: %f, %f avg, %f rate, %lf seconds, %d images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen);
-        if(*net.seen/N > epoch){
-            epoch = *net.seen/N;
-            char buff[256];
-            sprintf(buff, "%s/%s_%d.weights", backup_directory,base, epoch);
-            save_weights(net, buff);
-
-        }
-        if(get_current_batch(net)%100 == 0){
-            char buff[256];
-            sprintf(buff, "%s/%s.backup",backup_directory,base);
-            save_weights(net, buff);
-        }
-        if(get_current_batch(net)%10000 == 0){
-            char buff[256];
-            sprintf(buff, "%s/%s_%d.backup",backup_directory,base,get_current_batch(net));
-            save_weights(net, buff);
-        }
-    }
-    sprintf(buff, "%s/%s.weights", backup_directory, base);
-    save_weights(net, buff);
-
-    free_network(net);
-    free(base);
-}
-
-void propagate_liberty(float *board, int *lib, int *visited, int row, int col, int side)
-{
-    if (row < 0 || row > 18 || col < 0 || col > 18) return;
-    int index = row*19 + col;
-    if (board[index] != side) return;
-    if (visited[index]) return;
-    visited[index] = 1;
-    lib[index] += 1;
-    propagate_liberty(board, lib, visited, row+1, col, side);
-    propagate_liberty(board, lib, visited, row-1, col, side);
-    propagate_liberty(board, lib, visited, row, col+1, side);
-    propagate_liberty(board, lib, visited, row, col-1, side);
-}
-
-
-int *calculate_liberties(float *board)
-{
-    int *lib = calloc(19*19, sizeof(int));
-    int visited[361];
-    int i, j;
-    for(j = 0; j < 19; ++j){
-        for(i = 0; i < 19; ++i){
-            memset(visited, 0, 19*19*sizeof(int));
-            int index = j*19 + i;
-            if(board[index] == 0){
-                if ((i > 0)  && board[index - 1]) propagate_liberty(board, lib, visited, j, i-1, board[index-1]);
-                if ((i < 18) && board[index + 1]) propagate_liberty(board, lib, visited, j, i+1, board[index+1]);
-                if ((j > 0)  && board[index - 19]) propagate_liberty(board, lib, visited, j-1, i, board[index-19]);
-                if ((j < 18) && board[index + 19]) propagate_liberty(board, lib, visited, j+1, i, board[index+19]);
-            }
-        }
-    }
-    return lib;
-}
-
-void print_board(float *board, int swap, int *indexes)
-{
-    //FILE *stream = stdout;
-    FILE *stream = stderr;
-    int i,j,n;
-    fprintf(stream, "\n\n");
-    fprintf(stream, "   ");
-    for(i = 0; i < 19; ++i){
-        fprintf(stream, "%c ", 'A' + i + 1*(i > 7 && noi));
-    }
-    fprintf(stream, "\n");
-    for(j = 0; j < 19; ++j){
-        fprintf(stream, "%2d", (inverted) ? 19-j : j+1);
-        for(i = 0; i < 19; ++i){
-            int index = j*19 + i;
-            if(indexes){
-                int found = 0;
-                for(n = 0; n < nind; ++n){
-                    if(index == indexes[n]){
-                        found = 1;
-                        /*
-                        if(n == 0) fprintf(stream, "\uff11");
-                        else if(n == 1) fprintf(stream, "\uff12");
-                        else if(n == 2) fprintf(stream, "\uff13");
-                        else if(n == 3) fprintf(stream, "\uff14");
-                        else if(n == 4) fprintf(stream, "\uff15");
-                        */
-                        if(n == 0) fprintf(stream, " 1");
-                        else if(n == 1) fprintf(stream, " 2");
-                        else if(n == 2) fprintf(stream, " 3");
-                        else if(n == 3) fprintf(stream, " 4");
-                        else if(n == 4) fprintf(stream, " 5");
-                    }
-                }
-                if(found) continue;
-            }
-            //if(board[index]*-swap > 0) fprintf(stream, "\u25C9 ");
-            //else if(board[index]*-swap < 0) fprintf(stream, "\u25EF ");
-            if(board[index]*-swap > 0) fprintf(stream, " O");
-            else if(board[index]*-swap < 0) fprintf(stream, " X");
-            else fprintf(stream, "  ");
-        }
-        fprintf(stream, "\n");
-    }
-}
-
-void flip_board(float *board)
-{
-    int i;
-    for(i = 0; i < 19*19; ++i){
-        board[i] = -board[i];
-    }
-}
-
-void predict_move(network net, float *board, float *move, int multi)
-{
-    float *output = network_predict(net, board);
-    copy_cpu(19*19, output, 1, move, 1);
-    int i;
-    if(multi){
-        image bim = float_to_image(19, 19, 1, board);
-        for(i = 1; i < 8; ++i){
-            rotate_image_cw(bim, i);
-            if(i >= 4) flip_image(bim);
-
-            float *output = network_predict(net, board);
-            image oim = float_to_image(19, 19, 1, output);
-
-            if(i >= 4) flip_image(oim);
-            rotate_image_cw(oim, -i);
-
-            axpy_cpu(19*19, 1, output, 1, move, 1);
-
-            if(i >= 4) flip_image(bim);
-            rotate_image_cw(bim, -i);
-        }
-        scal_cpu(19*19, 1./8., move, 1);
-    }
-    for(i = 0; i < 19*19; ++i){
-        if(board[i]) move[i] = 0;
-    }
-}
-
-void remove_connected(float *b, int *lib, int p, int r, int c)
-{
-    if (r < 0 || r >= 19 || c < 0 || c >= 19) return;
-    if (b[r*19 + c] != p) return;
-    if (lib[r*19 + c] != 1) return;
-    b[r*19 + c] = 0;
-    remove_connected(b, lib, p, r+1, c);
-    remove_connected(b, lib, p, r-1, c);
-    remove_connected(b, lib, p, r, c+1);
-    remove_connected(b, lib, p, r, c-1);
-}
-
-
-void move_go(float *b, int p, int r, int c)
-{
-    int *l = calculate_liberties(b);
-    b[r*19 + c] = p;
-    remove_connected(b, l, -p, r+1, c);
-    remove_connected(b, l, -p, r-1, c);
-    remove_connected(b, l, -p, r, c+1);
-    remove_connected(b, l, -p, r, c-1);
-    free(l);
-}
-
-int makes_safe_go(float *b, int *lib, int p, int r, int c){
-    if (r < 0 || r >= 19 || c < 0 || c >= 19) return 0;
-    if (b[r*19 + c] == -p){
-        if (lib[r*19 + c] > 1) return 0;
-        else return 1;
-    }
-    if (b[r*19 + c] == 0) return 1;
-    if (lib[r*19 + c] > 1) return 1;
-    return 0;
-}
-
-int suicide_go(float *b, int p, int r, int c)
-{
-    int *l = calculate_liberties(b);
-    int safe = 0;
-    safe = safe || makes_safe_go(b, l, p, r+1, c);
-    safe = safe || makes_safe_go(b, l, p, r-1, c);
-    safe = safe || makes_safe_go(b, l, p, r, c+1);
-    safe = safe || makes_safe_go(b, l, p, r, c-1);
-    free(l);
-    return !safe;
-}
-
-int legal_go(float *b, char *ko, int p, int r, int c)
-{
-    if (b[r*19 + c]) return 0;
-    char curr[91];
-    char next[91];
-    board_to_string(curr, b);
-    move_go(b, p, r, c);
-    board_to_string(next, b);
-    string_to_board(curr, b);
-    if(memcmp(next, ko, 91) == 0) return 0;
-    return 1;
-}
-
-int generate_move(network net, int player, float *board, int multi, float thresh, float temp, char *ko, int print)
-{
-    int i, j;
-    for(i = 0; i < net.n; ++i) net.layers[i].temperature = temp;
-
-    float move[361];
-    if (player < 0) flip_board(board);
-    predict_move(net, board, move, multi);
-    if (player < 0) flip_board(board);
-
-    
-    for(i = 0; i < 19; ++i){
-        for(j = 0; j < 19; ++j){
-            if (!legal_go(board, ko, player, i, j)) move[i*19 + j] = 0;
-        }
-    }
-
-    int indexes[nind];
-    top_k(move, 19*19, nind, indexes);
-    if(thresh > move[indexes[0]]) thresh = move[indexes[nind-1]];
-
-    for(i = 0; i < 19; ++i){
-        for(j = 0; j < 19; ++j){
-            if (move[i*19 + j] < thresh) move[i*19 + j] = 0;
-        }
-    }
-
-
-    int max = max_index(move, 19*19);
-    int row = max / 19;
-    int col = max % 19;
-    int index = sample_array(move, 19*19);
-
-    if(print){
-        top_k(move, 19*19, nind, indexes);
-        for(i = 0; i < nind; ++i){
-            if (!move[indexes[i]]) indexes[i] = -1;
-        }
-        print_board(board, player, indexes);
-        for(i = 0; i < nind; ++i){
-            fprintf(stderr, "%d: %f\n", i+1, move[indexes[i]]);
-        }
-    }
-
-    if(suicide_go(board, player, row, col)){
-        return -1; 
-    }
-    if(suicide_go(board, player, index/19, index%19)) index = max;
-    return index;
-}
-
-void valid_go(char *cfgfile, char *weightfile, int multi)
-{
-    srand(time(0));
-    char *base = basecfg(cfgfile);
-    printf("%s\n", base);
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    set_batch_network(&net, 1);
-    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
-
-    float *board = calloc(19*19, sizeof(float));
-    float *move = calloc(19*19, sizeof(float));
-    moves m = load_go_moves("/home/pjreddie/backup/go.test");
-
-    int N = m.n;
-    int i;
-    int correct = 0;
-    for(i = 0; i <N; ++i){
-        char *b = m.data[i];
-        int row = b[0];
-        int col = b[1];
-        int truth = col + 19*row;
-        string_to_board(b+2, board);
-        predict_move(net, board, move, multi);
-        int index = max_index(move, 19*19);
-        if(index == truth) ++correct;
-        printf("%d Accuracy %f\n", i, (float) correct/(i+1));
-    }
-}
-
-void engine_go(char *filename, char *weightfile, int multi)
-{
-    network net = parse_network_cfg(filename);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    srand(time(0));
-    set_batch_network(&net, 1);
-    float *board = calloc(19*19, sizeof(float));
-    char *one = calloc(91, sizeof(char));
-    char *two = calloc(91, sizeof(char));
-    int passed = 0;
-    while(1){
-        char buff[256];
-        int id = 0;
-        int has_id = (scanf("%d", &id) == 1);
-        scanf("%s", buff);
-        if (feof(stdin)) break;
-        char ids[256];
-        sprintf(ids, "%d", id);
-        //fprintf(stderr, "%s\n", buff);
-        if (!has_id) ids[0] = 0;
-        if (!strcmp(buff, "protocol_version")){
-            printf("=%s 2\n\n", ids);
-        } else if (!strcmp(buff, "name")){
-            printf("=%s DarkGo\n\n", ids);
-        } else if (!strcmp(buff, "version")){
-            printf("=%s 1.0\n\n", ids);
-        } else if (!strcmp(buff, "known_command")){
-            char comm[256];
-            scanf("%s", comm);
-            int known = (!strcmp(comm, "protocol_version") || 
-                    !strcmp(comm, "name") || 
-                    !strcmp(comm, "version") || 
-                    !strcmp(comm, "known_command") || 
-                    !strcmp(comm, "list_commands") || 
-                    !strcmp(comm, "quit") || 
-                    !strcmp(comm, "boardsize") || 
-                    !strcmp(comm, "clear_board") || 
-                    !strcmp(comm, "komi") || 
-                    !strcmp(comm, "final_status_list") || 
-                    !strcmp(comm, "play") || 
-                    !strcmp(comm, "genmove"));
-            if(known) printf("=%s true\n\n", ids);
-            else printf("=%s false\n\n", ids);
-        } else if (!strcmp(buff, "list_commands")){
-            printf("=%s protocol_version\nname\nversion\nknown_command\nlist_commands\nquit\nboardsize\nclear_board\nkomi\nplay\ngenmove\nfinal_status_list\n\n", ids);
-        } else if (!strcmp(buff, "quit")){
-            break;
-        } else if (!strcmp(buff, "boardsize")){
-            int boardsize = 0;
-            scanf("%d", &boardsize);
-            //fprintf(stderr, "%d\n", boardsize);
-            if(boardsize != 19){
-                printf("?%s unacceptable size\n\n", ids);
-            } else {
-                printf("=%s \n\n", ids);
-            }
-        } else if (!strcmp(buff, "clear_board")){
-            passed = 0;
-            memset(board, 0, 19*19*sizeof(float));
-            printf("=%s \n\n", ids);
-        } else if (!strcmp(buff, "komi")){
-            float komi = 0;
-            scanf("%f", &komi);
-            printf("=%s \n\n", ids);
-        } else if (!strcmp(buff, "play")){
-            char color[256];
-            scanf("%s ", color);
-            char c;
-            int r;
-            int count = scanf("%c%d", &c, &r);
-            int player = (color[0] == 'b' || color[0] == 'B') ? 1 : -1;
-            if(c == 'p' && count < 2) {
-                passed = 1;
-                printf("=%s \n\n", ids);
-                char *line = fgetl(stdin);
-                free(line);
-                fflush(stdout);
-                fflush(stderr);
-                continue;
-            } else {
-                passed = 0;
-            }
-            if(c >= 'A' && c <= 'Z') c = c - 'A';
-            if(c >= 'a' && c <= 'z') c = c - 'a';
-            if(c >= 8) --c;
-            r = 19 - r;
-            fprintf(stderr, "move: %d %d\n", r, c);
-
-            char *swap = two;
-            two = one;
-            one = swap;
-            move_go(board, player, r, c);
-            board_to_string(one, board);
-
-            printf("=%s \n\n", ids);
-            print_board(board, 1, 0);
-        } else if (!strcmp(buff, "genmove")){
-            char color[256];
-            scanf("%s", color);
-            int player = (color[0] == 'b' || color[0] == 'B') ? 1 : -1;
-
-            int index = generate_move(net, player, board, multi, .1, .7, two, 1);
-            if(passed || index < 0){
-                printf("=%s pass\n\n", ids);
-                passed = 0;
-            } else {
-                int row = index / 19;
-                int col = index % 19;
-
-                char *swap = two;
-                two = one;
-                one = swap;
-
-                move_go(board, player, row, col);
-                board_to_string(one, board);
-                row = 19 - row;
-                if (col >= 8) ++col;
-                printf("=%s %c%d\n\n", ids, 'A' + col, row);
-                print_board(board, 1, 0);
-            }
-
-        } else if (!strcmp(buff, "p")){
-            //print_board(board, 1, 0);
-        } else if (!strcmp(buff, "final_status_list")){
-            char type[256];
-            scanf("%s", type);
-            fprintf(stderr, "final_status\n");
-            char *line = fgetl(stdin);
-            free(line);
-            if(type[0] == 'd' || type[0] == 'D'){
-                FILE *f = fopen("game.txt", "w");
-                int i, j;
-                int count = 2;
-                fprintf(f, "boardsize 19\n");
-                fprintf(f, "clear_board\n");
-                for(j = 0; j < 19; ++j){
-                    for(i = 0; i < 19; ++i){
-                        if(board[j*19 + i] == 1) fprintf(f, "play black %c%d\n", 'A'+i+(i>=8), 19-j);
-                        if(board[j*19 + i] == -1) fprintf(f, "play white %c%d\n", 'A'+i+(i>=8), 19-j);
-                        if(board[j*19 + i]) ++count;
-                    }
-                }
-                fprintf(f, "final_status_list dead\n");
-                fclose(f);
-                FILE *p = popen("./gnugo --mode gtp < game.txt", "r");
-                for(i = 0; i < count; ++i){
-                    free(fgetl(p));
-                    free(fgetl(p));
-                }
-                char *l = 0;
-                while((l = fgetl(p))){
-                    printf("%s\n", l);
-                    free(l);
-                }
-            } else {
-                printf("?%s unknown command\n\n", ids);
-            }
-        } else {
-            char *line = fgetl(stdin);
-            free(line);
-            printf("?%s unknown command\n\n", ids);
-        }
-        fflush(stdout);
-        fflush(stderr);
-    }
-}
-
-void test_go(char *cfg, char *weights, int multi)
-{
-    network net = parse_network_cfg(cfg);
-    if(weights){
-        load_weights(&net, weights);
-    }
-    srand(time(0));
-    set_batch_network(&net, 1);
-    float *board = calloc(19*19, sizeof(float));
-    float *move = calloc(19*19, sizeof(float));
-    int color = 1;
-    while(1){
-        float *output = network_predict(net, board);
-        copy_cpu(19*19, output, 1, move, 1);
-        int i;
-        if(multi){
-            image bim = float_to_image(19, 19, 1, board);
-            for(i = 1; i < 8; ++i){
-                rotate_image_cw(bim, i);
-                if(i >= 4) flip_image(bim);
-
-                float *output = network_predict(net, board);
-                image oim = float_to_image(19, 19, 1, output);
-
-                if(i >= 4) flip_image(oim);
-                rotate_image_cw(oim, -i);
-
-                axpy_cpu(19*19, 1, output, 1, move, 1);
-
-                if(i >= 4) flip_image(bim);
-                rotate_image_cw(bim, -i);
-            }
-            scal_cpu(19*19, 1./8., move, 1);
-        }
-        for(i = 0; i < 19*19; ++i){
-            if(board[i]) move[i] = 0;
-        }
-
-        int indexes[nind];
-        int row, col;
-        top_k(move, 19*19, nind, indexes);
-        print_board(board, color, indexes);
-        for(i = 0; i < nind; ++i){
-            int index = indexes[i];
-            row = index / 19;
-            col = index % 19;
-            printf("%d: %c %d, %.2f%%\n", i+1, col + 'A' + 1*(col > 7 && noi), (inverted)?19 - row : row+1, move[index]*100);
-        }
-        //if(color == 1) printf("\u25EF Enter move: ");
-        //else printf("\u25C9 Enter move: ");
-        if(color == 1) printf("X Enter move: ");
-        else printf("O Enter move: ");
-
-        char c;
-        char *line = fgetl(stdin);
-        int picked = 1;
-        int dnum = sscanf(line, "%d", &picked);
-        int cnum = sscanf(line, "%c", &c);
-        if (strlen(line) == 0 || dnum) {
-            --picked;
-            if (picked < nind){
-                int index = indexes[picked];
-                row = index / 19;
-                col = index % 19;
-                board[row*19 + col] = 1;
-            }
-        } else if (cnum){
-            if (c <= 'T' && c >= 'A'){
-                int num = sscanf(line, "%c %d", &c, &row);
-                row = (inverted)?19 - row : row-1;
-                col = c - 'A';
-                if (col > 7 && noi) col -= 1;
-                if (num == 2) board[row*19 + col] = 1;
-            } else if (c == 'p') {
-                // Pass
-            } else if(c=='b' || c == 'w'){
-                char g;
-                int num = sscanf(line, "%c %c %d", &g, &c, &row);
-                row = (inverted)?19 - row : row-1;
-                col = c - 'A';
-                if (col > 7 && noi) col -= 1;
-                if (num == 3) board[row*19 + col] = (g == 'b') ? color : -color;
-            } else if(c == 'c'){
-                char g;
-                int num = sscanf(line, "%c %c %d", &g, &c, &row);
-                row = (inverted)?19 - row : row-1;
-                col = c - 'A';
-                if (col > 7 && noi) col -= 1;
-                if (num == 3) board[row*19 + col] = 0;
-            }
-        }
-        free(line);
-        flip_board(board);
-        color = -color;
-    }
-}
-
-float score_game(float *board)
-{
-    FILE *f = fopen("game.txt", "w");
-    int i, j;
-    int count = 3;
-    fprintf(f, "komi 6.5\n");
-    fprintf(f, "boardsize 19\n");
-    fprintf(f, "clear_board\n");
-    for(j = 0; j < 19; ++j){
-        for(i = 0; i < 19; ++i){
-            if(board[j*19 + i] == 1) fprintf(f, "play black %c%d\n", 'A'+i+(i>=8), 19-j);
-            if(board[j*19 + i] == -1) fprintf(f, "play white %c%d\n", 'A'+i+(i>=8), 19-j);
-            if(board[j*19 + i]) ++count;
-        }
-    }
-    fprintf(f, "final_score\n");
-    fclose(f);
-    FILE *p = popen("./gnugo --mode gtp < game.txt", "r");
-    for(i = 0; i < count; ++i){
-        free(fgetl(p));
-        free(fgetl(p));
-    }
-    char *l = 0;
-    float score = 0;
-    char player = 0;
-    while((l = fgetl(p))){
-        fprintf(stderr, "%s  \t", l);
-        int n = sscanf(l, "= %c+%f", &player, &score);
-        free(l);
-        if (n == 2) break;
-    }
-    if(player == 'W') score = -score;
-    pclose(p);
-    return score;
-}
-
-void self_go(char *filename, char *weightfile, char *f2, char *w2, int multi)
-{
-    network net = parse_network_cfg(filename);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-
-    network net2 = net;
-    if(f2){
-        net2 = parse_network_cfg(f2);
-        if(w2){
-            load_weights(&net2, w2);
-        }
-    }
-    srand(time(0));
-    char boards[300][93];
-    int count = 0;
-    set_batch_network(&net, 1);
-    set_batch_network(&net2, 1);
-    float *board = calloc(19*19, sizeof(float));
-    char *one = calloc(91, sizeof(char));
-    char *two = calloc(91, sizeof(char));
-    int done = 0;
-    int player = 1;
-    int p1 = 0;
-    int p2 = 0;
-    int total = 0;
-    while(1){
-        if (done || count >= 300){
-            float score = score_game(board);
-            int i = (score > 0)? 0 : 1;
-            if((score > 0) == (total%2==0)) ++p1;
-            else ++p2;
-            ++total;
-            fprintf(stderr, "Total: %d, Player 1: %f, Player 2: %f\n", total, (float)p1/total, (float)p2/total);
-            int j;
-            for(; i < count; i += 2){
-                for(j = 0; j < 93; ++j){
-                    printf("%c", boards[i][j]);
-                }
-                printf("\n");
-            }
-            memset(board, 0, 19*19*sizeof(float));
-            player = 1;
-            done = 0;
-            count = 0;
-            fflush(stdout);
-            fflush(stderr);
-        }
-        //print_board(board, 1, 0);
-        //sleep(1);
-        network use = ((total%2==0) == (player==1)) ? net : net2;
-        int index = generate_move(use, player, board, multi, .1, .7, two, 0);
-        if(index < 0){
-            done = 1;
-            continue;
-        }
-        int row = index / 19;
-        int col = index % 19;
-
-        char *swap = two;
-        two = one;
-        one = swap;
-
-        if(player < 0) flip_board(board);
-        boards[count][0] = row;
-        boards[count][1] = col;
-        board_to_string(boards[count] + 2, board);
-        if(player < 0) flip_board(board);
-        ++count;
-
-        move_go(board, player, row, col);
-        board_to_string(one, board);
-
-        player = -player;
-    }
-}
-
-void run_go(int argc, char **argv)
-{
-    //boards_go();
-    if(argc < 4){
-        fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]);
-        return;
-    }
-
-    char *cfg = argv[3];
-    char *weights = (argc > 4) ? argv[4] : 0;
-    char *c2 = (argc > 5) ? argv[5] : 0;
-    char *w2 = (argc > 6) ? argv[6] : 0;
-    int multi = find_arg(argc, argv, "-multi");
-    if(0==strcmp(argv[2], "train")) train_go(cfg, weights);
-    else if(0==strcmp(argv[2], "valid")) valid_go(cfg, weights, multi);
-    else if(0==strcmp(argv[2], "self")) self_go(cfg, weights, c2, w2, multi);
-    else if(0==strcmp(argv[2], "test")) test_go(cfg, weights, multi);
-    else if(0==strcmp(argv[2], "engine")) engine_go(cfg, weights, multi);
-}
-
-
diff --git a/image.darknet/src/gru_layer.c b/image.darknet/src/gru_layer.c
index b78e868..b6601d8 100644
--- a/image.darknet/src/gru_layer.c
+++ b/image.darknet/src/gru_layer.c
@@ -26,7 +26,7 @@ static void increment_layer(layer *l, int steps)
 #endif
 }
 
-layer make_gru_layer(int batch, int inputs, int outputs, int steps, int batch_normalize)
+layer make_gru_layer(int batch, int inputs, int outputs, int steps, int batch_normalize, int adam)
 {
     fprintf(stderr, "GRU Layer: %d inputs, %d outputs\n", inputs, outputs);
     batch = batch / steps;
@@ -36,39 +36,37 @@ layer make_gru_layer(int batch, int inputs, int outputs, int steps, int batch_no
     l.steps = steps;
     l.inputs = inputs;
 
-    l.input_z_layer = malloc(sizeof(layer));
+    l.uz = malloc(sizeof(layer));
     fprintf(stderr, "\t\t");
-    *(l.input_z_layer) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize);
-    l.input_z_layer->batch = batch;
+    *(l.uz) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize, adam);
+    l.uz->batch = batch;
 
-    l.state_z_layer = malloc(sizeof(layer));
+    l.wz = malloc(sizeof(layer));
     fprintf(stderr, "\t\t");
-    *(l.state_z_layer) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize);
-    l.state_z_layer->batch = batch;
+    *(l.wz) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam);
+    l.wz->batch = batch;
 
-
-
-    l.input_r_layer = malloc(sizeof(layer));
+    l.ur = malloc(sizeof(layer));
     fprintf(stderr, "\t\t");
-    *(l.input_r_layer) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize);
-    l.input_r_layer->batch = batch;
+    *(l.ur) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize, adam);
+    l.ur->batch = batch;
 
-    l.state_r_layer = malloc(sizeof(layer));
+    l.wr = malloc(sizeof(layer));
     fprintf(stderr, "\t\t");
-    *(l.state_r_layer) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize);
-    l.state_r_layer->batch = batch;
+    *(l.wr) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam);
+    l.wr->batch = batch;
 
 
 
-    l.input_h_layer = malloc(sizeof(layer));
+    l.uh = malloc(sizeof(layer));
     fprintf(stderr, "\t\t");
-    *(l.input_h_layer) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize);
-    l.input_h_layer->batch = batch;
+    *(l.uh) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize, adam);
+    l.uh->batch = batch;
 
-    l.state_h_layer = malloc(sizeof(layer));
+    l.wh = malloc(sizeof(layer));
     fprintf(stderr, "\t\t");
-    *(l.state_h_layer) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize);
-    l.state_h_layer->batch = batch;
+    *(l.wh) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam);
+    l.wh->batch = batch;
 
     l.batch_normalize = batch_normalize;
 
@@ -94,68 +92,80 @@ layer make_gru_layer(int batch, int inputs, int outputs, int steps, int batch_no
     l.backward_gpu = backward_gru_layer_gpu;
     l.update_gpu = update_gru_layer_gpu;
 
-    l.forgot_state_gpu = cuda_make_array(l.output, batch*outputs);
-    l.forgot_delta_gpu = cuda_make_array(l.output, batch*outputs);
-    l.prev_state_gpu = cuda_make_array(l.output, batch*outputs);
-    l.state_gpu = cuda_make_array(l.output, batch*outputs);
-    l.output_gpu = cuda_make_array(l.output, batch*outputs*steps);
-    l.delta_gpu = cuda_make_array(l.delta, batch*outputs*steps);
-    l.r_gpu = cuda_make_array(l.output_gpu, batch*outputs);
-    l.z_gpu = cuda_make_array(l.output_gpu, batch*outputs);
-    l.h_gpu = cuda_make_array(l.output_gpu, batch*outputs);
+    l.forgot_state_gpu = cuda_make_array(0, batch*outputs);
+    l.forgot_delta_gpu = cuda_make_array(0, batch*outputs);
+    l.prev_state_gpu = cuda_make_array(0, batch*outputs);
+    l.state_gpu = cuda_make_array(0, batch*outputs);
+    l.output_gpu = cuda_make_array(0, batch*outputs*steps);
+    l.delta_gpu = cuda_make_array(0, batch*outputs*steps);
+    l.r_gpu = cuda_make_array(0, batch*outputs);
+    l.z_gpu = cuda_make_array(0, batch*outputs);
+    l.h_gpu = cuda_make_array(0, batch*outputs);
+
+#ifdef CUDNN
+    cudnnSetTensor4dDescriptor(l.uz->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.uz->out_c, l.uz->out_h, l.uz->out_w); 
+    cudnnSetTensor4dDescriptor(l.uh->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.uh->out_c, l.uh->out_h, l.uh->out_w); 
+    cudnnSetTensor4dDescriptor(l.ur->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.ur->out_c, l.ur->out_h, l.ur->out_w); 
+    cudnnSetTensor4dDescriptor(l.wz->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wz->out_c, l.wz->out_h, l.wz->out_w); 
+    cudnnSetTensor4dDescriptor(l.wh->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wh->out_c, l.wh->out_h, l.wh->out_w); 
+    cudnnSetTensor4dDescriptor(l.wr->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wr->out_c, l.wr->out_h, l.wr->out_w); 
+#endif
 #endif
 
     return l;
 }
 
-void update_gru_layer(layer l, int batch, float learning_rate, float momentum, float decay)
+void update_gru_layer(layer l, update_args a)
 {
-    update_connected_layer(*(l.input_layer), batch, learning_rate, momentum, decay);
-    update_connected_layer(*(l.self_layer), batch, learning_rate, momentum, decay);
-    update_connected_layer(*(l.output_layer), batch, learning_rate, momentum, decay);
+    update_connected_layer(*(l.ur), a);
+    update_connected_layer(*(l.uz), a);
+    update_connected_layer(*(l.uh), a);
+    update_connected_layer(*(l.wr), a);
+    update_connected_layer(*(l.wz), a);
+    update_connected_layer(*(l.wh), a);
 }
 
-void forward_gru_layer(layer l, network_state state)
+void forward_gru_layer(layer l, network net)
 {
-    network_state s = {0};
-    s.train = state.train;
+    network s = net;
+    s.train = net.train;
     int i;
-    layer input_z_layer = *(l.input_z_layer);
-    layer input_r_layer = *(l.input_r_layer);
-    layer input_h_layer = *(l.input_h_layer);
-
-    layer state_z_layer = *(l.state_z_layer);
-    layer state_r_layer = *(l.state_r_layer);
-    layer state_h_layer = *(l.state_h_layer);
-
-    fill_cpu(l.outputs * l.batch * l.steps, 0, input_z_layer.delta, 1);
-    fill_cpu(l.outputs * l.batch * l.steps, 0, input_r_layer.delta, 1);
-    fill_cpu(l.outputs * l.batch * l.steps, 0, input_h_layer.delta, 1);
-
-    fill_cpu(l.outputs * l.batch * l.steps, 0, state_z_layer.delta, 1);
-    fill_cpu(l.outputs * l.batch * l.steps, 0, state_r_layer.delta, 1);
-    fill_cpu(l.outputs * l.batch * l.steps, 0, state_h_layer.delta, 1);
-    if(state.train) {
+    layer uz = *(l.uz);
+    layer ur = *(l.ur);
+    layer uh = *(l.uh);
+
+    layer wz = *(l.wz);
+    layer wr = *(l.wr);
+    layer wh = *(l.wh);
+
+    fill_cpu(l.outputs * l.batch * l.steps, 0, uz.delta, 1);
+    fill_cpu(l.outputs * l.batch * l.steps, 0, ur.delta, 1);
+    fill_cpu(l.outputs * l.batch * l.steps, 0, uh.delta, 1);
+
+    fill_cpu(l.outputs * l.batch * l.steps, 0, wz.delta, 1);
+    fill_cpu(l.outputs * l.batch * l.steps, 0, wr.delta, 1);
+    fill_cpu(l.outputs * l.batch * l.steps, 0, wh.delta, 1);
+    if(net.train) {
         fill_cpu(l.outputs * l.batch * l.steps, 0, l.delta, 1);
         copy_cpu(l.outputs*l.batch, l.state, 1, l.prev_state, 1);
     }
 
     for (i = 0; i < l.steps; ++i) {
         s.input = l.state;
-        forward_connected_layer(state_z_layer, s);
-        forward_connected_layer(state_r_layer, s);
+        forward_connected_layer(wz, s);
+        forward_connected_layer(wr, s);
 
-        s.input = state.input;
-        forward_connected_layer(input_z_layer, s);
-        forward_connected_layer(input_r_layer, s);
-        forward_connected_layer(input_h_layer, s);
+        s.input = net.input;
+        forward_connected_layer(uz, s);
+        forward_connected_layer(ur, s);
+        forward_connected_layer(uh, s);
 
 
-        copy_cpu(l.outputs*l.batch, input_z_layer.output, 1, l.z_cpu, 1);
-        axpy_cpu(l.outputs*l.batch, 1, state_z_layer.output, 1, l.z_cpu, 1);
+        copy_cpu(l.outputs*l.batch, uz.output, 1, l.z_cpu, 1);
+        axpy_cpu(l.outputs*l.batch, 1, wz.output, 1, l.z_cpu, 1);
 
-        copy_cpu(l.outputs*l.batch, input_r_layer.output, 1, l.r_cpu, 1);
-        axpy_cpu(l.outputs*l.batch, 1, state_r_layer.output, 1, l.r_cpu, 1);
+        copy_cpu(l.outputs*l.batch, ur.output, 1, l.r_cpu, 1);
+        axpy_cpu(l.outputs*l.batch, 1, wr.output, 1, l.r_cpu, 1);
 
         activate_array(l.z_cpu, l.outputs*l.batch, LOGISTIC);
         activate_array(l.r_cpu, l.outputs*l.batch, LOGISTIC);
@@ -164,34 +174,34 @@ void forward_gru_layer(layer l, network_state state)
         mul_cpu(l.outputs*l.batch, l.r_cpu, 1, l.forgot_state, 1);
 
         s.input = l.forgot_state;
-        forward_connected_layer(state_h_layer, s);
+        forward_connected_layer(wh, s);
 
-        copy_cpu(l.outputs*l.batch, input_h_layer.output, 1, l.h_cpu, 1);
-        axpy_cpu(l.outputs*l.batch, 1, state_h_layer.output, 1, l.h_cpu, 1);
+        copy_cpu(l.outputs*l.batch, uh.output, 1, l.h_cpu, 1);
+        axpy_cpu(l.outputs*l.batch, 1, wh.output, 1, l.h_cpu, 1);
 
-        #ifdef USET
-        activate_array(l.h_cpu, l.outputs*l.batch, TANH);
-        #else
-        activate_array(l.h_cpu, l.outputs*l.batch, LOGISTIC);
-        #endif
+        if(l.tanh){
+            activate_array(l.h_cpu, l.outputs*l.batch, TANH);
+        } else {
+            activate_array(l.h_cpu, l.outputs*l.batch, LOGISTIC);
+        }
 
         weighted_sum_cpu(l.state, l.h_cpu, l.z_cpu, l.outputs*l.batch, l.output);
 
         copy_cpu(l.outputs*l.batch, l.output, 1, l.state, 1);
 
-        state.input += l.inputs*l.batch;
+        net.input += l.inputs*l.batch;
         l.output += l.outputs*l.batch;
-        increment_layer(&input_z_layer, 1);
-        increment_layer(&input_r_layer, 1);
-        increment_layer(&input_h_layer, 1);
+        increment_layer(&uz, 1);
+        increment_layer(&ur, 1);
+        increment_layer(&uh, 1);
 
-        increment_layer(&state_z_layer, 1);
-        increment_layer(&state_r_layer, 1);
-        increment_layer(&state_h_layer, 1);
+        increment_layer(&wz, 1);
+        increment_layer(&wr, 1);
+        increment_layer(&wh, 1);
     }
 }
 
-void backward_gru_layer(layer l, network_state state)
+void backward_gru_layer(layer l, network net)
 {
 }
 
@@ -205,191 +215,192 @@ void push_gru_layer(layer l)
 {
 }
 
-void update_gru_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay)
+void update_gru_layer_gpu(layer l, update_args a)
 {
-    update_connected_layer_gpu(*(l.input_r_layer), batch, learning_rate, momentum, decay);
-    update_connected_layer_gpu(*(l.input_z_layer), batch, learning_rate, momentum, decay);
-    update_connected_layer_gpu(*(l.input_h_layer), batch, learning_rate, momentum, decay);
-    update_connected_layer_gpu(*(l.state_r_layer), batch, learning_rate, momentum, decay);
-    update_connected_layer_gpu(*(l.state_z_layer), batch, learning_rate, momentum, decay);
-    update_connected_layer_gpu(*(l.state_h_layer), batch, learning_rate, momentum, decay);
+    update_connected_layer_gpu(*(l.ur), a);
+    update_connected_layer_gpu(*(l.uz), a);
+    update_connected_layer_gpu(*(l.uh), a);
+    update_connected_layer_gpu(*(l.wr), a);
+    update_connected_layer_gpu(*(l.wz), a);
+    update_connected_layer_gpu(*(l.wh), a);
 }
 
-void forward_gru_layer_gpu(layer l, network_state state)
+void forward_gru_layer_gpu(layer l, network net)
 {
-    network_state s = {0};
-    s.train = state.train;
+    network s = {0};
+    s.train = net.train;
     int i;
-    layer input_z_layer = *(l.input_z_layer);
-    layer input_r_layer = *(l.input_r_layer);
-    layer input_h_layer = *(l.input_h_layer);
-
-    layer state_z_layer = *(l.state_z_layer);
-    layer state_r_layer = *(l.state_r_layer);
-    layer state_h_layer = *(l.state_h_layer);
-
-    fill_ongpu(l.outputs * l.batch * l.steps, 0, input_z_layer.delta_gpu, 1);
-    fill_ongpu(l.outputs * l.batch * l.steps, 0, input_r_layer.delta_gpu, 1);
-    fill_ongpu(l.outputs * l.batch * l.steps, 0, input_h_layer.delta_gpu, 1);
-
-    fill_ongpu(l.outputs * l.batch * l.steps, 0, state_z_layer.delta_gpu, 1);
-    fill_ongpu(l.outputs * l.batch * l.steps, 0, state_r_layer.delta_gpu, 1);
-    fill_ongpu(l.outputs * l.batch * l.steps, 0, state_h_layer.delta_gpu, 1);
-    if(state.train) {
-        fill_ongpu(l.outputs * l.batch * l.steps, 0, l.delta_gpu, 1);
-        copy_ongpu(l.outputs*l.batch, l.state_gpu, 1, l.prev_state_gpu, 1);
+    layer uz = *(l.uz);
+    layer ur = *(l.ur);
+    layer uh = *(l.uh);
+
+    layer wz = *(l.wz);
+    layer wr = *(l.wr);
+    layer wh = *(l.wh);
+
+    fill_gpu(l.outputs * l.batch * l.steps, 0, uz.delta_gpu, 1);
+    fill_gpu(l.outputs * l.batch * l.steps, 0, ur.delta_gpu, 1);
+    fill_gpu(l.outputs * l.batch * l.steps, 0, uh.delta_gpu, 1);
+
+    fill_gpu(l.outputs * l.batch * l.steps, 0, wz.delta_gpu, 1);
+    fill_gpu(l.outputs * l.batch * l.steps, 0, wr.delta_gpu, 1);
+    fill_gpu(l.outputs * l.batch * l.steps, 0, wh.delta_gpu, 1);
+    if(net.train) {
+        fill_gpu(l.outputs * l.batch * l.steps, 0, l.delta_gpu, 1);
+        copy_gpu(l.outputs*l.batch, l.state_gpu, 1, l.prev_state_gpu, 1);
     }
 
     for (i = 0; i < l.steps; ++i) {
-        s.input = l.state_gpu;
-        forward_connected_layer_gpu(state_z_layer, s);
-        forward_connected_layer_gpu(state_r_layer, s);
+        s.input_gpu = l.state_gpu;
+        forward_connected_layer_gpu(wz, s);
+        forward_connected_layer_gpu(wr, s);
 
-        s.input = state.input;
-        forward_connected_layer_gpu(input_z_layer, s);
-        forward_connected_layer_gpu(input_r_layer, s);
-        forward_connected_layer_gpu(input_h_layer, s);
+        s.input_gpu = net.input_gpu;
+        forward_connected_layer_gpu(uz, s);
+        forward_connected_layer_gpu(ur, s);
+        forward_connected_layer_gpu(uh, s);
 
+        copy_gpu(l.outputs*l.batch, uz.output_gpu, 1, l.z_gpu, 1);
+        axpy_gpu(l.outputs*l.batch, 1, wz.output_gpu, 1, l.z_gpu, 1);
 
-        copy_ongpu(l.outputs*l.batch, input_z_layer.output_gpu, 1, l.z_gpu, 1);
-        axpy_ongpu(l.outputs*l.batch, 1, state_z_layer.output_gpu, 1, l.z_gpu, 1);
+        copy_gpu(l.outputs*l.batch, ur.output_gpu, 1, l.r_gpu, 1);
+        axpy_gpu(l.outputs*l.batch, 1, wr.output_gpu, 1, l.r_gpu, 1);
 
-        copy_ongpu(l.outputs*l.batch, input_r_layer.output_gpu, 1, l.r_gpu, 1);
-        axpy_ongpu(l.outputs*l.batch, 1, state_r_layer.output_gpu, 1, l.r_gpu, 1);
+        activate_array_gpu(l.z_gpu, l.outputs*l.batch, LOGISTIC);
+        activate_array_gpu(l.r_gpu, l.outputs*l.batch, LOGISTIC);
 
-        activate_array_ongpu(l.z_gpu, l.outputs*l.batch, LOGISTIC);
-        activate_array_ongpu(l.r_gpu, l.outputs*l.batch, LOGISTIC);
+        copy_gpu(l.outputs*l.batch, l.state_gpu, 1, l.forgot_state_gpu, 1);
+        mul_gpu(l.outputs*l.batch, l.r_gpu, 1, l.forgot_state_gpu, 1);
 
-        copy_ongpu(l.outputs*l.batch, l.state_gpu, 1, l.forgot_state_gpu, 1);
-        mul_ongpu(l.outputs*l.batch, l.r_gpu, 1, l.forgot_state_gpu, 1);
+        s.input_gpu = l.forgot_state_gpu;
+        forward_connected_layer_gpu(wh, s);
 
-        s.input = l.forgot_state_gpu;
-        forward_connected_layer_gpu(state_h_layer, s);
+        copy_gpu(l.outputs*l.batch, uh.output_gpu, 1, l.h_gpu, 1);
+        axpy_gpu(l.outputs*l.batch, 1, wh.output_gpu, 1, l.h_gpu, 1);
 
-        copy_ongpu(l.outputs*l.batch, input_h_layer.output_gpu, 1, l.h_gpu, 1);
-        axpy_ongpu(l.outputs*l.batch, 1, state_h_layer.output_gpu, 1, l.h_gpu, 1);
-
-        #ifdef USET
-        activate_array_ongpu(l.h_gpu, l.outputs*l.batch, TANH);
-        #else
-        activate_array_ongpu(l.h_gpu, l.outputs*l.batch, LOGISTIC);
-        #endif
+        if(l.tanh){
+            activate_array_gpu(l.h_gpu, l.outputs*l.batch, TANH);
+        } else {
+            activate_array_gpu(l.h_gpu, l.outputs*l.batch, LOGISTIC);
+        }
 
         weighted_sum_gpu(l.state_gpu, l.h_gpu, l.z_gpu, l.outputs*l.batch, l.output_gpu);
+        copy_gpu(l.outputs*l.batch, l.output_gpu, 1, l.state_gpu, 1);
 
-        copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.state_gpu, 1);
-
-        state.input += l.inputs*l.batch;
+        net.input_gpu += l.inputs*l.batch;
         l.output_gpu += l.outputs*l.batch;
-        increment_layer(&input_z_layer, 1);
-        increment_layer(&input_r_layer, 1);
-        increment_layer(&input_h_layer, 1);
+        increment_layer(&uz, 1);
+        increment_layer(&ur, 1);
+        increment_layer(&uh, 1);
 
-        increment_layer(&state_z_layer, 1);
-        increment_layer(&state_r_layer, 1);
-        increment_layer(&state_h_layer, 1);
+        increment_layer(&wz, 1);
+        increment_layer(&wr, 1);
+        increment_layer(&wh, 1);
     }
 }
 
-void backward_gru_layer_gpu(layer l, network_state state)
+void backward_gru_layer_gpu(layer l, network net)
 {
-    network_state s = {0};
-    s.train = state.train;
+    network s = {0};
+    s.train = net.train;
     int i;
-    layer input_z_layer = *(l.input_z_layer);
-    layer input_r_layer = *(l.input_r_layer);
-    layer input_h_layer = *(l.input_h_layer);
+    layer uz = *(l.uz);
+    layer ur = *(l.ur);
+    layer uh = *(l.uh);
 
-    layer state_z_layer = *(l.state_z_layer);
-    layer state_r_layer = *(l.state_r_layer);
-    layer state_h_layer = *(l.state_h_layer);
+    layer wz = *(l.wz);
+    layer wr = *(l.wr);
+    layer wh = *(l.wh);
 
-    increment_layer(&input_z_layer, l.steps - 1);
-    increment_layer(&input_r_layer, l.steps - 1);
-    increment_layer(&input_h_layer, l.steps - 1);
+    increment_layer(&uz, l.steps - 1);
+    increment_layer(&ur, l.steps - 1);
+    increment_layer(&uh, l.steps - 1);
 
-    increment_layer(&state_z_layer, l.steps - 1);
-    increment_layer(&state_r_layer, l.steps - 1);
-    increment_layer(&state_h_layer, l.steps - 1);
+    increment_layer(&wz, l.steps - 1);
+    increment_layer(&wr, l.steps - 1);
+    increment_layer(&wh, l.steps - 1);
 
-    state.input += l.inputs*l.batch*(l.steps-1);
-    if(state.delta) state.delta += l.inputs*l.batch*(l.steps-1);
+    net.input_gpu += l.inputs*l.batch*(l.steps-1);
+    if(net.delta_gpu) net.delta_gpu += l.inputs*l.batch*(l.steps-1);
     l.output_gpu += l.outputs*l.batch*(l.steps-1);
     l.delta_gpu += l.outputs*l.batch*(l.steps-1);
+    float *end_state = l.output_gpu;
     for (i = l.steps-1; i >= 0; --i) {
-        if(i != 0) copy_ongpu(l.outputs*l.batch, l.output_gpu - l.outputs*l.batch, 1, l.prev_state_gpu, 1);
+        if(i != 0) copy_gpu(l.outputs*l.batch, l.output_gpu - l.outputs*l.batch, 1, l.state_gpu, 1);
+        else copy_gpu(l.outputs*l.batch, l.prev_state_gpu, 1, l.state_gpu, 1);
         float *prev_delta_gpu = (i == 0) ? 0 : l.delta_gpu - l.outputs*l.batch;
 
-        copy_ongpu(l.outputs*l.batch, input_z_layer.output_gpu, 1, l.z_gpu, 1);
-        axpy_ongpu(l.outputs*l.batch, 1, state_z_layer.output_gpu, 1, l.z_gpu, 1);
-
-        copy_ongpu(l.outputs*l.batch, input_r_layer.output_gpu, 1, l.r_gpu, 1);
-        axpy_ongpu(l.outputs*l.batch, 1, state_r_layer.output_gpu, 1, l.r_gpu, 1);
-
-        activate_array_ongpu(l.z_gpu, l.outputs*l.batch, LOGISTIC);
-        activate_array_ongpu(l.r_gpu, l.outputs*l.batch, LOGISTIC);
-
-        copy_ongpu(l.outputs*l.batch, input_h_layer.output_gpu, 1, l.h_gpu, 1);
-        axpy_ongpu(l.outputs*l.batch, 1, state_h_layer.output_gpu, 1, l.h_gpu, 1);
-
-        #ifdef USET
-        activate_array_ongpu(l.h_gpu, l.outputs*l.batch, TANH);
-        #else
-        activate_array_ongpu(l.h_gpu, l.outputs*l.batch, LOGISTIC);
-        #endif
-        
-        weighted_delta_gpu(l.prev_state_gpu, l.h_gpu, l.z_gpu, prev_delta_gpu, input_h_layer.delta_gpu, input_z_layer.delta_gpu, l.outputs*l.batch, l.delta_gpu);
-
-        #ifdef USET
-        gradient_array_ongpu(l.h_gpu, l.outputs*l.batch, TANH, input_h_layer.delta_gpu);
-        #else
-        gradient_array_ongpu(l.h_gpu, l.outputs*l.batch, LOGISTIC, input_h_layer.delta_gpu);
-        #endif
-
-        copy_ongpu(l.outputs*l.batch, input_h_layer.delta_gpu, 1, state_h_layer.delta_gpu, 1);
-        
-        copy_ongpu(l.outputs*l.batch, l.prev_state_gpu, 1, l.forgot_state_gpu, 1);
-        mul_ongpu(l.outputs*l.batch, l.r_gpu, 1, l.forgot_state_gpu, 1);
-        fill_ongpu(l.outputs*l.batch, 0, l.forgot_delta_gpu, 1);
-
-        s.input = l.forgot_state_gpu;
-        s.delta = l.forgot_delta_gpu;
-        
-        backward_connected_layer_gpu(state_h_layer, s);
+        copy_gpu(l.outputs*l.batch, uz.output_gpu, 1, l.z_gpu, 1);
+        axpy_gpu(l.outputs*l.batch, 1, wz.output_gpu, 1, l.z_gpu, 1);
+
+        copy_gpu(l.outputs*l.batch, ur.output_gpu, 1, l.r_gpu, 1);
+        axpy_gpu(l.outputs*l.batch, 1, wr.output_gpu, 1, l.r_gpu, 1);
+
+        activate_array_gpu(l.z_gpu, l.outputs*l.batch, LOGISTIC);
+        activate_array_gpu(l.r_gpu, l.outputs*l.batch, LOGISTIC);
+
+        copy_gpu(l.outputs*l.batch, uh.output_gpu, 1, l.h_gpu, 1);
+        axpy_gpu(l.outputs*l.batch, 1, wh.output_gpu, 1, l.h_gpu, 1);
+
+        if(l.tanh){
+            activate_array_gpu(l.h_gpu, l.outputs*l.batch, TANH);
+        } else {
+            activate_array_gpu(l.h_gpu, l.outputs*l.batch, LOGISTIC);
+        }
+
+        weighted_delta_gpu(l.state_gpu, l.h_gpu, l.z_gpu, prev_delta_gpu, uh.delta_gpu, uz.delta_gpu, l.outputs*l.batch, l.delta_gpu);
+
+        if(l.tanh){
+            gradient_array_gpu(l.h_gpu, l.outputs*l.batch, TANH, uh.delta_gpu);
+        } else {
+            gradient_array_gpu(l.h_gpu, l.outputs*l.batch, LOGISTIC, uh.delta_gpu);
+        }
+
+        copy_gpu(l.outputs*l.batch, uh.delta_gpu, 1, wh.delta_gpu, 1);
+
+        copy_gpu(l.outputs*l.batch, l.state_gpu, 1, l.forgot_state_gpu, 1);
+        mul_gpu(l.outputs*l.batch, l.r_gpu, 1, l.forgot_state_gpu, 1);
+        fill_gpu(l.outputs*l.batch, 0, l.forgot_delta_gpu, 1);
+
+        s.input_gpu = l.forgot_state_gpu;
+        s.delta_gpu = l.forgot_delta_gpu;
+
+        backward_connected_layer_gpu(wh, s);
         if(prev_delta_gpu) mult_add_into_gpu(l.outputs*l.batch, l.forgot_delta_gpu, l.r_gpu, prev_delta_gpu);
-        mult_add_into_gpu(l.outputs*l.batch, l.forgot_delta_gpu, l.prev_state_gpu, input_r_layer.delta_gpu);
-
-        gradient_array_ongpu(l.r_gpu, l.outputs*l.batch, LOGISTIC, input_r_layer.delta_gpu);
-        copy_ongpu(l.outputs*l.batch, input_r_layer.delta_gpu, 1, state_r_layer.delta_gpu, 1);
-
-        gradient_array_ongpu(l.z_gpu, l.outputs*l.batch, LOGISTIC, input_z_layer.delta_gpu);
-        copy_ongpu(l.outputs*l.batch, input_z_layer.delta_gpu, 1, state_z_layer.delta_gpu, 1);
-        
-        s.input = l.prev_state_gpu;
-        s.delta = prev_delta_gpu;
-        
-        backward_connected_layer_gpu(state_r_layer, s);
-        backward_connected_layer_gpu(state_z_layer, s);
-
-        s.input = state.input;
-        s.delta = state.delta;
-        
-        backward_connected_layer_gpu(input_h_layer, s);
-        backward_connected_layer_gpu(input_r_layer, s);
-        backward_connected_layer_gpu(input_z_layer, s);
-
-
-        state.input -= l.inputs*l.batch;
-        if(state.delta) state.delta -= l.inputs*l.batch;
+        mult_add_into_gpu(l.outputs*l.batch, l.forgot_delta_gpu, l.state_gpu, ur.delta_gpu);
+
+        gradient_array_gpu(l.r_gpu, l.outputs*l.batch, LOGISTIC, ur.delta_gpu);
+        copy_gpu(l.outputs*l.batch, ur.delta_gpu, 1, wr.delta_gpu, 1);
+
+        gradient_array_gpu(l.z_gpu, l.outputs*l.batch, LOGISTIC, uz.delta_gpu);
+        copy_gpu(l.outputs*l.batch, uz.delta_gpu, 1, wz.delta_gpu, 1);
+
+        s.input_gpu = l.state_gpu;
+        s.delta_gpu = prev_delta_gpu;
+
+        backward_connected_layer_gpu(wr, s);
+        backward_connected_layer_gpu(wz, s);
+
+        s.input_gpu = net.input_gpu;
+        s.delta_gpu = net.delta_gpu;
+
+        backward_connected_layer_gpu(uh, s);
+        backward_connected_layer_gpu(ur, s);
+        backward_connected_layer_gpu(uz, s);
+
+
+        net.input_gpu -= l.inputs*l.batch;
+        if(net.delta_gpu) net.delta_gpu -= l.inputs*l.batch;
         l.output_gpu -= l.outputs*l.batch;
         l.delta_gpu -= l.outputs*l.batch;
-        increment_layer(&input_z_layer, -1);
-        increment_layer(&input_r_layer, -1);
-        increment_layer(&input_h_layer, -1);
+        increment_layer(&uz, -1);
+        increment_layer(&ur, -1);
+        increment_layer(&uh, -1);
 
-        increment_layer(&state_z_layer, -1);
-        increment_layer(&state_r_layer, -1);
-        increment_layer(&state_h_layer, -1);
+        increment_layer(&wz, -1);
+        increment_layer(&wr, -1);
+        increment_layer(&wh, -1);
     }
+    copy_gpu(l.outputs*l.batch, end_state, 1, l.state_gpu, 1);
 }
 #endif
diff --git a/image.darknet/src/gru_layer.h b/image.darknet/src/gru_layer.h
index 9e19cee..9067942 100644
--- a/image.darknet/src/gru_layer.h
+++ b/image.darknet/src/gru_layer.h
@@ -6,16 +6,16 @@
 #include "layer.h"
 #include "network.h"
 
-layer make_gru_layer(int batch, int inputs, int outputs, int steps, int batch_normalize);
+layer make_gru_layer(int batch, int inputs, int outputs, int steps, int batch_normalize, int adam);
 
-void forward_gru_layer(layer l, network_state state);
-void backward_gru_layer(layer l, network_state state);
-void update_gru_layer(layer l, int batch, float learning_rate, float momentum, float decay);
+void forward_gru_layer(layer l, network state);
+void backward_gru_layer(layer l, network state);
+void update_gru_layer(layer l, update_args a);
 
 #ifdef GPU
-void forward_gru_layer_gpu(layer l, network_state state);
-void backward_gru_layer_gpu(layer l, network_state state);
-void update_gru_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay);
+void forward_gru_layer_gpu(layer l, network state);
+void backward_gru_layer_gpu(layer l, network state);
+void update_gru_layer_gpu(layer l, update_args a);
 void push_gru_layer(layer l);
 void pull_gru_layer(layer l);
 #endif
diff --git a/image.darknet/src/im2col.h b/image.darknet/src/im2col.h
index f0ddeee..02c4247 100644
--- a/image.darknet/src/im2col.h
+++ b/image.darknet/src/im2col.h
@@ -7,7 +7,7 @@ void im2col_cpu(float* data_im,
 
 #ifdef GPU
 
-void im2col_ongpu(float *im,
+void im2col_gpu(float *im,
          int channels, int height, int width,
          int ksize, int stride, int pad,float *data_col);
 
diff --git a/image.darknet/src/im2col_kernels.cu b/image.darknet/src/im2col_kernels.cu
index d42d600..07b5e67 100644
--- a/image.darknet/src/im2col_kernels.cu
+++ b/image.darknet/src/im2col_kernels.cu
@@ -45,7 +45,7 @@ __global__ void im2col_gpu_kernel(const int n, const float* data_im,
     }
 }
 
-void im2col_ongpu(float *im,
+void im2col_gpu(float *im,
          int channels, int height, int width,
          int ksize, int stride, int pad, float *data_col){
     // We are going to launch channels * height_col * width_col kernels, each
diff --git a/image.darknet/src/image.c b/image.darknet/src/image.c
index 5a90efd..4a2c6ba 100644
--- a/image.darknet/src/image.c
+++ b/image.darknet/src/image.c
@@ -10,12 +10,6 @@
 #define STB_IMAGE_WRITE_IMPLEMENTATION
 #include "stb_image_write.h"
 
-#ifdef OPENCV
-#include "opencv2/highgui/highgui_c.h"
-#include "opencv2/imgproc/imgproc_c.h"
-#endif
-
-
 int windows = 0;
 
 float colors[6][3] = { {1,0,1}, {0,0,1},{0,1,1},{0,1,0},{1,1,0},{1,0,0} };
@@ -31,6 +25,70 @@ float get_color(int c, int x, int max)
     return r;
 }
 
+/* Fold an n-channel mask into a 3-channel image: each mask channel is added in, scaled by an RGB tint from get_color. */
+image mask_to_rgb(image mask)
+{
+    int channels = mask.c;
+    image rgb = make_image(mask.w, mask.h, 3);
+    int plane = rgb.w*rgb.h;
+    int ch, px, k;
+    for(ch = 0; ch < channels; ++ch){
+        int hue = ch*123457 % channels;
+        /* tint[0], tint[1], tint[2] are the red, green and blue weights for this mask channel */
+        float tint[3] = { get_color(2,hue,channels), get_color(1,hue,channels), get_color(0,hue,channels) };
+        for(px = 0; px < plane; ++px){
+            float m = mask.data[ch*plane + px];
+            for(k = 0; k < 3; ++k) rgb.data[px + k*plane] += m*tint[k];
+        }
+    }
+    return rgb;
+}
+
+static float get_pixel(image m, int x, int y, int c)
+{
+    assert(x < m.w && y < m.h && c < m.c);   /* upper bounds only; negative indices are not checked here */
+    return m.data[(c*m.h + y)*m.w + x];      /* planar storage: channel, then row, then column */
+}
+/* Bounds-tolerant pixel read: returns 0 when x, y or c lies outside the image,
+ * otherwise defers to get_pixel. (A clamp-to-edge variant for x/y used to sit
+ * here as commented-out code; returning 0 is the behaviour actually in effect.) */
+static float get_pixel_extend(image m, int x, int y, int c)
+{
+    int x_ok = (x >= 0 && x < m.w);
+    int y_ok = (y >= 0 && y < m.h);
+    int c_ok = (c >= 0 && c < m.c);
+
+    if(!(x_ok && y_ok && c_ok)) return 0;
+    return get_pixel(m, x, y, c);
+}
+/* Store val in channel c of pixel (x, y); any index outside the image makes the call a silent no-op. */
+static void set_pixel(image m, int x, int y, int c, float val)
+{
+    int in_range = x >= 0 && y >= 0 && c >= 0 && x < m.w && y < m.h && c < m.c;
+    if(in_range) m.data[(c*m.h + y)*m.w + x] = val;
+}
+static void add_pixel(image m, int x, int y, int c, float val)
+{
+    assert(x < m.w && y < m.h && c < m.c);   /* upper bounds only; negative indices are not checked here */
+    m.data[(c*m.h + y)*m.w + x] += val;      /* accumulate into channel c of pixel (x, y) */
+}
+
+/* Sample channel c at fractional position (x, y) by blending the four
+ * surrounding pixels; neighbours outside the image contribute 0. */
+static float bilinear_interpolate(image im, float x, float y, int c)
+{
+    int x0 = (int) floorf(x);
+    int y0 = (int) floorf(y);
+    float fx = x - x0;   /* weight given to column x0+1 */
+    float fy = y - y0;   /* weight given to row y0+1 */
+    float val = (1-fy) * (1-fx) * get_pixel_extend(im, x0,   y0,   c) +
+        fy     * (1-fx) * get_pixel_extend(im, x0,   y0+1, c) +
+        (1-fy) *   fx   * get_pixel_extend(im, x0+1, y0,   c) +
+        fy     *   fx   * get_pixel_extend(im, x0+1, y0+1, c);
+    return val;
+}
+
+
 void composite_image(image source, image dest, int dx, int dy)
 {
     int x,y,k;
@@ -73,6 +131,7 @@ image tile_images(image a, image b, int dx)
 
 image get_label(image **characters, char *string, int size)
 {
+    size = size/10;
     if(size > 7) size = 7;
     image label = make_empty_image(0,0,0);
     while(*string){
@@ -177,23 +236,36 @@ image **load_alphabet()
     return alphabets;
 }
 
-void draw_detections(image im, int num, float thresh, box *boxes, float **probs, char **names, image **alphabet, int classes)
+void draw_detections(image im, detection *dets, int num, float thresh, char **names, image **alphabet, int classes)
 {
-    int i;
+    int i,j;
 
     for(i = 0; i < num; ++i){
-        int class = max_index(probs[i], classes);
-        float prob = probs[i][class];
-        if(prob > thresh){
-
-            int width = im.h * .012;
-
-            if(0){
-                width = pow(prob, 1./2.)*10+1;
-                alphabet = 0;
+        char labelstr[4096] = {0};
+        int class = -1;
+        for(j = 0; j < classes; ++j){
+            if (dets[i].prob[j] > thresh){
+                if (class < 0) {
+                    strcat(labelstr, names[j]);
+                    class = j;
+                } else {
+                    strcat(labelstr, ", ");
+                    strcat(labelstr, names[j]);
+                }
+                printf("%s: %.0f%%\n", names[j], dets[i].prob[j]*100);
             }
+        }
+        if(class >= 0){
+            int width = im.h * .006;
 
-            printf("%s: %.0f%%\n", names[class], prob*100);
+            /*
+               if(0){
+               width = pow(prob, 1./2.)*10+1;
+               alphabet = 0;
+               }
+             */
+
+            //printf("%d %s: %.0f%%\n", i, names[class], prob*100);
             int offset = class*123457 % classes;
             float red = get_color(2,offset,classes);
             float green = get_color(1,offset,classes);
@@ -205,7 +277,8 @@ void draw_detections(image im, int num, float thresh, box *boxes, float **probs,
             rgb[0] = red;
             rgb[1] = green;
             rgb[2] = blue;
-            box b = boxes[i];
+            box b = dets[i].bbox;
+            //printf("%f %f %f %f\n", b.x, b.y, b.w, b.h);
 
             int left  = (b.x-b.w/2.)*im.w;
             int right = (b.x+b.w/2.)*im.w;
@@ -219,8 +292,18 @@ void draw_detections(image im, int num, float thresh, box *boxes, float **probs,
 
             draw_box_width(im, left, top, right, bot, width, red, green, blue);
             if (alphabet) {
-                image label = get_label(alphabet, names[class], (im.h*.03)/10);
+                image label = get_label(alphabet, labelstr, (im.h*.03));
                 draw_label(im, top + width, left, label, rgb);
+                free_image(label);
+            }
+            if (dets[i].mask){
+                image mask = float_to_image(14, 14, 1, dets[i].mask);
+                image resized_mask = resize_image(mask, b.w*im.w, b.h*im.h);
+                image tmask = threshold_image(resized_mask, .5);
+                embed_image(tmask, im, left, top);
+                free_image(mask);
+                free_image(resized_mask);
+                free_image(tmask);
             }
         }
     }
@@ -294,6 +377,54 @@ image image_distance(image a, image b)
     return dist;
 }
 
+/* Blend source onto dest at offset (dx,dy); source opacity drops linearly with distance from its centre, clamped at 0. */
+void ghost_image(image source, image dest, int dx, int dy)
+{
+    int col, row, ch;
+    float max_dist = sqrt((-source.w/2. + .5)*(-source.w/2. + .5));
+    for(ch = 0; ch < source.c; ++ch){
+        for(row = 0; row < source.h; ++row){
+            for(col = 0; col < source.w; ++col){
+                double off_x = col - source.w/2. + .5, off_y = row - source.h/2. + .5;
+                float dist = sqrt(off_x*off_x + off_y*off_y);
+                float alpha = (1 - dist/max_dist);
+                if(alpha < 0) alpha = 0;
+                float blended = alpha*get_pixel(source, col,row,ch) + (1-alpha)*get_pixel(dest, dx+col,dy+row,ch);
+                set_pixel(dest, dx+col, dy+row, ch, blended);
+            }
+        }
+    }
+}
+
+/* Pixelate im in place: each pixel takes the value at the top-left corner of its s-by-s block. */
+void blocky_image(image im, int s)
+{
+    int i,j,k;
+    if(s < 1) return; /* a block size below 1 would divide by zero in i/s and j/s */
+    for(k = 0; k < im.c; ++k){
+        for(j = 0; j < im.h; ++j){
+            for(i = 0; i < im.w; ++i) im.data[i + im.w*(j + im.h*k)] = im.data[i/s*s + im.w*(j/s*s + im.h*k)];
+        }
+    }
+}
+
+/* Pixelate the w-by-h window of im starting at (dx,dy) with 32-pixel blocks; negative offsets are clamped to 0. */
+void censor_image(image im, int dx, int dy, int w, int h)
+{
+    int col, row, ch, block = 32;
+    if(dx < 0) dx = 0;
+    if(dy < 0) dy = 0;
+    int col_end = (dx + w < im.w) ? dx + w : im.w;
+    int row_end = (dy + h < im.h) ? dy + h : im.h;
+    for(ch = 0; ch < im.c; ++ch){
+        for(row = dy; row < row_end; ++row){
+            for(col = dx; col < col_end; ++col){
+                im.data[col + im.w*(row + im.h*ch)] = im.data[col/block*block + im.w*(row/block*block + im.h*ch)];
+            }
+        }
+    }
+}
+
 void embed_image(image source, image dest, int dx, int dy)
 {
     int x,y,k;
@@ -380,6 +511,11 @@ void normalize_image2(image p)
     free(max);
 }
 
+void copy_image_into(image src, image dest)
+{   /* Raw copy of src's h*w*c floats into dest.data; the size comes from src only, so dest's buffer must be at least that big. */
+    memcpy(dest.data, src.data, (src.h*src.w*src.c)*sizeof(float));
+}
+
 image copy_image(image p)
 {
     image copy = p;
@@ -398,145 +534,27 @@ void rgbgr_image(image im)
     }
 }
 
-#ifdef OPENCV
-void show_image_cv(image p, const char *name)
-{
-    int x,y,k;
-    image copy = copy_image(p);
-    constrain_image(copy);
-    if(p.c == 3) rgbgr_image(copy);
-    //normalize_image(copy);
-
-    char buff[256];
-    //sprintf(buff, "%s (%d)", name, windows);
-    sprintf(buff, "%s", name);
-
-    IplImage *disp = cvCreateImage(cvSize(p.w,p.h), IPL_DEPTH_8U, p.c);
-    int step = disp->widthStep;
-    cvNamedWindow(buff, CV_WINDOW_NORMAL); 
-    //cvMoveWindow(buff, 100*(windows%10) + 200*(windows/10), 100*(windows%10));
-    ++windows;
-    for(y = 0; y < p.h; ++y){
-        for(x = 0; x < p.w; ++x){
-            for(k= 0; k < p.c; ++k){
-                disp->imageData[y*step + x*p.c + k] = (unsigned char)(get_pixel(copy,x,y,k)*255);
-            }
-        }
-    }
-    free_image(copy);
-    if(0){
-        int w = 448;
-        int h = w*p.h/p.w;
-        if(h > 1000){
-            h = 1000;
-            w = h*p.w/p.h;
-        }
-        IplImage *buffer = disp;
-        disp = cvCreateImage(cvSize(w, h), buffer->depth, buffer->nChannels);
-        cvResize(buffer, disp, CV_INTER_LINEAR);
-        cvReleaseImage(&buffer);
-    }
-    cvShowImage(buff, disp);
-    cvReleaseImage(&disp);
-}
-#endif
-
-void show_image(image p, const char *name)
+int show_image(image p, const char *name, int ms)
 {
 #ifdef OPENCV
-    show_image_cv(p, name);
+    int c = show_image_cv(p, name, ms);
+    return c;
 #else
     fprintf(stderr, "Not compiled with OpenCV, saving to %s.png instead\n", name);
     save_image(p, name);
+    return -1;
 #endif
 }
 
-#ifdef OPENCV
-
-image ipl_to_image(IplImage* src)
-{
-    unsigned char *data = (unsigned char *)src->imageData;
-    int h = src->height;
-    int w = src->width;
-    int c = src->nChannels;
-    int step = src->widthStep;
-    image out = make_image(w, h, c);
-    int i, j, k, count=0;;
-
-    for(k= 0; k < c; ++k){
-        for(i = 0; i < h; ++i){
-            for(j = 0; j < w; ++j){
-                out.data[count++] = data[i*step + j*c + k]/255.;
-            }
-        }
-    }
-    return out;
-}
-
-image load_image_cv(char *filename, int channels)
-{
-    IplImage* src = 0;
-    int flag = -1;
-    if (channels == 0) flag = -1;
-    else if (channels == 1) flag = 0;
-    else if (channels == 3) flag = 1;
-    else {
-        fprintf(stderr, "OpenCV can't force load with %d channels\n", channels);
-    }
-
-    if( (src = cvLoadImage(filename, flag)) == 0 )
-    {
-        fprintf(stderr, "Cannot load image \"%s\"\n", filename);
-        char buff[256];
-        sprintf(buff, "echo %s >> bad.list", filename);
-        system(buff);
-        return make_image(10,10,3);
-        //exit(0);
-    }
-    image out = ipl_to_image(src);
-    cvReleaseImage(&src);
-    rgbgr_image(out);
-    return out;
-}
-
-image get_image_from_stream(CvCapture *cap)
-{
-    IplImage* src = cvQueryFrame(cap);
-    if (!src) return make_empty_image(0,0,0);
-    image im = ipl_to_image(src);
-    rgbgr_image(im);
-    return im;
-}
-
-void save_image_jpg(image p, const char *name)
-{
-    image copy = copy_image(p);
-    if(p.c == 3) rgbgr_image(copy);
-    int x,y,k;
-
-    char buff[256];
-    sprintf(buff, "%s.jpg", name);
-
-    IplImage *disp = cvCreateImage(cvSize(p.w,p.h), IPL_DEPTH_8U, p.c);
-    int step = disp->widthStep;
-    for(y = 0; y < p.h; ++y){
-        for(x = 0; x < p.w; ++x){
-            for(k= 0; k < p.c; ++k){
-                disp->imageData[y*step + x*p.c + k] = (unsigned char)(get_pixel(copy,x,y,k)*255);
-            }
-        }
-    }
-    cvSaveImage(buff, disp,0);
-    cvReleaseImage(&disp);
-    free_image(copy);
-}
-#endif
-
-void save_image_png(image im, const char *name)
+void save_image_options(image im, const char *name, IMTYPE f, int quality)
 {
     char buff[256];
     //sprintf(buff, "%s (%d)", name, windows);
-    sprintf(buff, "%s.png", name);
+    if(f == PNG)       sprintf(buff, "%s.png", name);
+    else if (f == BMP) sprintf(buff, "%s.bmp", name);
+    else if (f == TGA) sprintf(buff, "%s.tga", name);
+    else if (f == JPG) sprintf(buff, "%s.jpg", name);
+    else               sprintf(buff, "%s.png", name);
     unsigned char *data = calloc(im.w*im.h*im.c, sizeof(char));
     int i,k;
     for(k = 0; k < im.c; ++k){
@@ -544,21 +562,20 @@ void save_image_png(image im, const char *name)
             data[i*im.c+k] = (unsigned char) (255*im.data[i + k*im.w*im.h]);
         }
     }
-    int success = stbi_write_png(buff, im.w, im.h, im.c, data, im.w*im.c);
+    int success = 0;
+    if(f == PNG)       success = stbi_write_png(buff, im.w, im.h, im.c, data, im.w*im.c);
+    else if (f == BMP) success = stbi_write_bmp(buff, im.w, im.h, im.c, data);
+    else if (f == TGA) success = stbi_write_tga(buff, im.w, im.h, im.c, data);
+    else if (f == JPG) success = stbi_write_jpg(buff, im.w, im.h, im.c, data, quality);
     free(data);
     if(!success) fprintf(stderr, "Failed to write image %s\n", buff);
 }
 
 void save_image(image im, const char *name)
 {
-#ifdef OPENCV
-    save_image_jpg(im, name);
-#else
-    save_image_png(im, name);
-#endif
+    save_image_options(im, name, JPG, 80);
 }
 
-
 void show_image_layers(image p, char *name)
 {
     int i;
@@ -566,7 +583,7 @@ void show_image_layers(image p, char *name)
     for(i = 0; i < p.c; ++i){
         sprintf(buff, "%s - Layer %d", name, i);
         image layer = get_image_layer(p, i);
-        show_image(layer, buff);
+        show_image(layer, buff, 1);
         free_image(layer);
     }
 }
@@ -574,7 +591,7 @@ void show_image_layers(image p, char *name)
 void show_image_collapsed(image p, char *name)
 {
     image c = collapse_image_layers(p, 1);
-    show_image(c, name);
+    show_image(c, name, 1);
     free_image(c);
 }
 
@@ -613,6 +630,29 @@ image float_to_image(int w, int h, int c, float *data)
     return out;
 }
 
+/* Draw im, bilinearly resampled to w-by-h, onto canvas with its top-left corner at (dx,dy). */
+void place_image(image im, int w, int h, int dx, int dy, image canvas)
+{
+    int col, row, ch;
+    for(ch = 0; ch < im.c; ++ch){
+        for(row = 0; row < h; ++row){
+            float src_y = ((float)row / h) * im.h;
+            for(col = 0; col < w; ++col){
+                float src_x = ((float)col / w) * im.w;
+                set_pixel(canvas, col + dx, row + dy, ch, bilinear_interpolate(im, src_x, src_y, ch));
+            }
+        }
+    }
+}
+
+image center_crop_image(image im, int w, int h)
+{   /* Crop the largest centred square out of im, resize it to w-by-h and return that resized image. */
+    int side = (im.h > im.w) ? im.w : im.h;
+    image square = crop_image(im, (im.w - side) / 2, (im.h - side)/2, side, side);
+    image out = resize_image(square, w, h);
+    free_image(square);
+    return out;
+}
 
 image rotate_crop_image(image im, float rad, float s, int w, int h, float dx, float dy, float aspect)
 {
@@ -652,6 +692,12 @@ image rotate_image(image im, float rad)
     return rot;
 }
 
+void fill_image(image m, float s)
+{   /* Overwrite every value of m, across all channels, with the constant s. */
+    int idx, total = m.h*m.w*m.c;
+    for(idx = 0; idx < total; ++idx) m.data[idx] = s;
+}
+
 void translate_image(image m, float s)
 {
     int i;
@@ -676,9 +722,7 @@ image crop_image(image im, int dx, int dy, int w, int h)
                 float val = 0;
                 r = constrain_int(r, 0, im.h-1);
                 c = constrain_int(c, 0, im.w-1);
-                if (r >= 0 && r < im.h && c >= 0 && c < im.w) {
-                    val = get_pixel(im, c, r, k);
-                }
+                val = get_pixel(im, c, r, k);
                 set_pixel(cropped, i, j, k, val);
             }
         }
@@ -746,11 +790,44 @@ void composite_3d(char *f1, char *f2, char *out, int delta)
     for(i = 0; i < c.w*c.h; ++i){
         c.data[i] = a.data[i];
     }
-#ifdef OPENCV
-    save_image_jpg(c, out);
-#else
     save_image(c, out);
-#endif
+}
+
+/*
+ * Scale im to the largest size that fits inside w-by-h without changing its
+ * aspect ratio, then embed the result centred in the caller-supplied boxed image.
+ */
+void letterbox_image_into(image im, int w, int h, image boxed)
+{
+    /* The tighter of the two scale factors decides which side fills the box. */
+    int width_bound = ((float)w/im.w) < ((float)h/im.h);
+    int fit_w = width_bound ? w : (im.w * h)/im.h;
+    int fit_h = width_bound ? (im.h * w)/im.w : h;
+    image scaled = resize_image(im, fit_w, fit_h);
+    /* Offsets centre the scaled copy inside the w-by-h box. */
+    embed_image(scaled, boxed, (w-fit_w)/2, (h-fit_h)/2);
+    free_image(scaled);
+}
+
+/*
+ * Return a new w-by-h image, pre-filled with .5, that holds im scaled to fit
+ * with its aspect ratio preserved and centred. The result is a fresh make_image.
+ */
+image letterbox_image(image im, int w, int h)
+{
+    /* The tighter of the two scale factors decides which side fills the box. */
+    int width_bound = ((float)w/im.w) < ((float)h/im.h);
+    int fit_w = width_bound ? w : (im.w * h)/im.h;
+    int fit_h = width_bound ? (im.h * w)/im.w : h;
+    image scaled = resize_image(im, fit_w, fit_h);
+
+    /* Fill the canvas with .5 before the scaled copy is embedded. */
+    image canvas = make_image(w, h, im.c);
+    fill_image(canvas, .5);
+    /* Offsets centre the scaled copy inside the w-by-h canvas. */
+    embed_image(scaled, canvas, (w-fit_w)/2, (h-fit_h)/2);
+    free_image(scaled);
+    return canvas;
 }
 
 image resize_max(image im, int max)
@@ -793,8 +870,9 @@ image random_crop_image(image im, int w, int h)
     return crop;
 }
 
-image random_augment_image(image im, float angle, float aspect, int low, int high, int size)
+augment_args random_augment_args(image im, float angle, float aspect, int low, int high, int w, int h)
 {
+    augment_args a = {0}; // filled in below and returned by value
     aspect = rand_scale(aspect);
     int r = rand_int(low, high);
     int min = (im.h < im.w*aspect) ? im.h : im.w*aspect;
@@ -802,15 +880,27 @@ image random_augment_image(image im, float angle, float aspect, int low, int hig
 
     float rad = rand_uniform(-angle, angle) * TWO_PI / 360.;
 
-    float dx = (im.w*scale/aspect - size) / 2.;
-    float dy = (im.h*scale - size) / 2.;
-    if(dx < 0) dx = 0;
-    if(dy < 0) dy = 0;
+    float dx = (im.w*scale/aspect - w) / 2.; // horizontal slack against the crop width w
+    float dy = (im.h*scale - h) / 2.; // vertical slack against the crop height h (was w, wrong for non-square crops)
+    // The slack is no longer clamped at 0: a crop larger than the scaled image
+    // gives a negative dx/dy, which is passed to rand_uniform unchanged.
     dx = rand_uniform(-dx, dx);
     dy = rand_uniform(-dy, dy);
 
-    image crop = rotate_crop_image(im, rad, scale, size, size, dx, dy, aspect);
+    a.rad = rad;
+    a.scale = scale;
+    a.w = w;
+    a.h = h;
+    a.dx = dx;
+    a.dy = dy;
+    a.aspect = aspect;
+    return a;
+}
 
+image random_augment_image(image im, float angle, float aspect, int low, int high, int w, int h)
+{
+    augment_args a = random_augment_args(im, angle, aspect, low, high, w, h);
+    image crop = rotate_crop_image(im, a.rad, a.scale, a.w, a.h, a.dx, a.dy, a.aspect);
     return crop;
 }
 
@@ -824,6 +914,52 @@ float three_way_min(float a, float b, float c)
     return (a < b) ? ( (a < c) ? a : c) : ( (b < c) ? b : c) ;
 }
 
+/* Convert a 3-channel image from YUV to RGB in place, one pixel at a time. */
+void yuv_to_rgb(image im)
+{
+    assert(im.c == 3);
+    int col, row;
+    for(row = 0; row < im.h; ++row){
+        for(col = 0; col < im.w; ++col){
+            /* Read all three planes before any of them is overwritten. */
+            float y = get_pixel(im, col, row, 0);
+            float u = get_pixel(im, col, row, 1);
+            float v = get_pixel(im, col, row, 2);
+
+            float r = y + 1.13983*v;
+            float g = y + -.39465*u + -.58060*v;
+            float b = y + 2.03211*u;
+
+            set_pixel(im, col, row, 0, r);
+            set_pixel(im, col, row, 1, g);
+            set_pixel(im, col, row, 2, b);
+        }
+    }
+}
+
+/* Convert a 3-channel image from RGB to YUV in place, one pixel at a time. */
+void rgb_to_yuv(image im)
+{
+    assert(im.c == 3);
+    int col, row;
+    for(row = 0; row < im.h; ++row){
+        for(col = 0; col < im.w; ++col){
+            /* Read all three planes before any of them is overwritten. */
+            float r = get_pixel(im, col, row, 0);
+            float g = get_pixel(im, col, row, 1);
+            float b = get_pixel(im, col, row, 2);
+
+            float y = .299*r + .587*g + .114*b;
+            float u = -.14713*r + -.28886*g + .436*b;
+            float v = .615*r + -.51499*g + -.10001*b;
+
+            set_pixel(im, col, row, 0, y);
+            set_pixel(im, col, row, 1, u);
+            set_pixel(im, col, row, 2, v);
+        }
+    }
+}
+
 // http://www.cs.rit.edu/~ncs/color/t_convert.html
 void rgb_to_hsv(image im)
 {
@@ -903,12 +1039,30 @@ void hsv_to_rgb(image im)
     }
 }
 
+/* Replace each pixel's three channels, in place, with their weighted sum (.299, .587, .114). */
+void grayscale_image_3c(image im)
+{
+    assert(im.c == 3);
+    int col, row, ch;
+    float weight[] = {0.299, 0.587, 0.114};
+    for(row = 0; row < im.h; ++row){
+        for(col = 0; col < im.w; ++col){
+            /* Accumulate the weighted channels in channel order 0, 1, 2. */
+            float luma = 0;
+            for(ch = 0; ch < 3; ++ch) luma += weight[ch]*get_pixel(im, col, row, ch);
+
+            /* Write the same value into every plane so the image keeps three channels. */
+            for(ch = 0; ch < 3; ++ch) im.data[ch*im.h*im.w + im.w*row + col] = luma;
+        }
+    }
+}
+
 image grayscale_image(image im)
 {
     assert(im.c == 3);
     int i, j, k;
     image gray = make_image(im.w, im.h, 1);
-    float scale[] = {0.587, 0.299, 0.114};
+    float scale[] = {0.299, 0.587, 0.114};
     for(k = 0; k < im.c; ++k){
         for(j = 0; j < im.h; ++j){
             for(i = 0; i < im.w; ++i){
@@ -1042,21 +1196,6 @@ void saturate_exposure_image(image im, float sat, float exposure)
     constrain_image(im);
 }
 
-float bilinear_interpolate(image im, float x, float y, int c)
-{
-    int ix = (int) floorf(x);
-    int iy = (int) floorf(y);
-
-    float dx = x - ix;
-    float dy = y - iy;
-
-    float val = (1-dy) * (1-dx) * get_pixel_extend(im, ix, iy, c) + 
-        dy     * (1-dx) * get_pixel_extend(im, ix, iy+1, c) + 
-        (1-dy) *   dx   * get_pixel_extend(im, ix+1, iy, c) +
-        dy     *   dx   * get_pixel_extend(im, ix+1, iy+1, c);
-    return val;
-}
-
 image resize_image(image im, int w, int h)
 {
     image resized = make_image(w, h, im.c);   
@@ -1119,16 +1258,16 @@ void test_resize(char *filename)
     distort_image(c4, .1, .66666, 1.5);
 
 
-    show_image(im,   "Original");
-    show_image(gray, "Gray");
-    show_image(c1, "C1");
-    show_image(c2, "C2");
-    show_image(c3, "C3");
-    show_image(c4, "C4");
+    show_image(im,   "Original", 1);
+    show_image(gray, "Gray", 1);
+    show_image(c1, "C1", 1);
+    show_image(c2, "C2", 1);
+    show_image(c3, "C3", 1);
+    show_image(c4, "C4", 1);
 #ifdef OPENCV
     while(1){
-        image aug = random_augment_image(im, 0, .75, 320, 448, 320);
-        show_image(aug, "aug");
+        image aug = random_augment_image(im, 0, .75, 320, 448, 320, 320);
+        show_image(aug, "aug", 1);
         free_image(aug);
 
 
@@ -1143,10 +1282,9 @@ void test_resize(char *filename)
         float dhue = rand_uniform(-hue, hue);
 
         distort_image(c, dhue, dsat, dexp);
-        show_image(c, "rand");
+        show_image(c, "rand", 1);
         printf("%f %f %f\n", dhue, dsat, dexp);
         free_image(c);
-        cvWaitKey(0);
     }
 #endif
 }
@@ -1206,33 +1344,6 @@ image get_image_layer(image m, int l)
     }
     return out;
 }
-
-float get_pixel(image m, int x, int y, int c)
-{
-    assert(x < m.w && y < m.h && c < m.c);
-    return m.data[c*m.h*m.w + y*m.w + x];
-}
-float get_pixel_extend(image m, int x, int y, int c)
-{
-    if(x < 0) x = 0;
-    if(x >= m.w) x = m.w-1;
-    if(y < 0) y = 0;
-    if(y >= m.h) y = m.h-1;
-    if(c < 0 || c >= m.c) return 0;
-    return get_pixel(m, x, y, c);
-}
-void set_pixel(image m, int x, int y, int c, float val)
-{
-    if (x < 0 || y < 0 || c < 0 || x >= m.w || y >= m.h || c >= m.c) return;
-    assert(x < m.w && y < m.h && c < m.c);
-    m.data[c*m.h*m.w + y*m.w + x] = val;
-}
-void add_pixel(image m, int x, int y, int c, float val)
-{
-    assert(x < m.w && y < m.h && c < m.c);
-    m.data[c*m.h*m.w + y*m.w + x] += val;
-}
-
 void print_image(image m)
 {
     int i, j, k;
@@ -1325,7 +1436,7 @@ void show_image_normalized(image im, const char *name)
 {
     image c = copy_image(im);
     normalize_image(c);
-    show_image(c, name);
+    show_image(c, name, 1);
     free_image(c);
 }
 
@@ -1343,7 +1454,7 @@ void show_images(image *ims, int n, char *window)
      */
     normalize_image(m);
     save_image(m, window);
-    show_image(m, window);
+    show_image(m, window, 1);
     free_image(m);
 }
 
diff --git a/image.darknet/src/image.h b/image.darknet/src/image.h
index 39c3962..3392bb9 100644
--- a/image.darknet/src/image.h
+++ b/image.darknet/src/image.h
@@ -7,81 +7,63 @@
 #include <string.h>
 #include <math.h>
 #include "box.h"
+#include "darknet.h"
 
-typedef struct {
-    int h;
-    int w;
-    int c;
-    float *data;
-} image;
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef OPENCV
+void *open_video_stream(const char *f, int c, int w, int h, int fps);
+image get_image_from_stream(void *p);
+image load_image_cv(char *filename, int channels);
+int show_image_cv(image im, const char* name, int ms);
+#endif
 
 float get_color(int c, int x, int max);
-void flip_image(image a);
 void draw_box(image a, int x1, int y1, int x2, int y2, float r, float g, float b);
-void draw_box_width(image a, int x1, int y1, int x2, int y2, int w, float r, float g, float b);
 void draw_bbox(image a, box bbox, int w, float r, float g, float b);
-void draw_label(image a, int r, int c, image label, const float *rgb);
 void write_label(image a, int r, int c, image *characters, char *string, float *rgb);
-void draw_detections(image im, int num, float thresh, box *boxes, float **probs, char **names, image **labels, int classes);
 image image_distance(image a, image b);
 void scale_image(image m, float s);
-image crop_image(image im, int dx, int dy, int w, int h);
+image rotate_crop_image(image im, float rad, float s, int w, int h, float dx, float dy, float aspect);
 image random_crop_image(image im, int w, int h);
-image random_augment_image(image im, float angle, float aspect, int low, int high, int size);
-void random_distort_image(image im, float hue, float saturation, float exposure);
-image resize_image(image im, int w, int h);
-image resize_min(image im, int min);
+image random_augment_image(image im, float angle, float aspect, int low, int high, int w, int h);
+augment_args random_augment_args(image im, float angle, float aspect, int low, int high, int w, int h);
+void letterbox_image_into(image im, int w, int h, image boxed);
 image resize_max(image im, int max);
 void translate_image(image m, float s);
-void normalize_image(image p);
-image rotate_image(image m, float rad);
-void rotate_image_cw(image im, int times);
 void embed_image(image source, image dest, int dx, int dy);
+void place_image(image im, int w, int h, int dx, int dy, image canvas);
 void saturate_image(image im, float sat);
 void exposure_image(image im, float sat);
 void distort_image(image im, float hue, float sat, float val);
 void saturate_exposure_image(image im, float sat, float exposure);
+void rgb_to_hsv(image im);
 void hsv_to_rgb(image im);
-void rgbgr_image(image im);
-void constrain_image(image im);
-void composite_3d(char *f1, char *f2, char *out, int delta);
-int best_3d_shift_r(image a, image b, int min, int max);
+void yuv_to_rgb(image im);
+void rgb_to_yuv(image im);
 
-image grayscale_image(image im);
-image threshold_image(image im, float thresh);
 
 image collapse_image_layers(image source, int border);
 image collapse_images_horz(image *ims, int n);
 image collapse_images_vert(image *ims, int n);
 
-void show_image(image p, const char *name);
 void show_image_normalized(image im, const char *name);
-void save_image_png(image im, const char *name);
-void save_image(image p, const char *name);
 void show_images(image *ims, int n, char *window);
 void show_image_layers(image p, char *name);
 void show_image_collapsed(image p, char *name);
 
 void print_image(image m);
 
-image make_image(int w, int h, int c);
-image make_random_image(int w, int h, int c);
 image make_empty_image(int w, int h, int c);
-image float_to_image(int w, int h, int c, float *data);
-image copy_image(image p);
-image load_image(char *filename, int w, int h, int c);
-image load_image_color(char *filename, int w, int h);
-image **load_alphabet();
-
-float get_pixel(image m, int x, int y, int c);
-float get_pixel_extend(image m, int x, int y, int c);
-void set_pixel(image m, int x, int y, int c, float val);
-void add_pixel(image m, int x, int y, int c, float val);
-float bilinear_interpolate(image im, float x, float y, int c);
+void copy_image_into(image src, image dest);
 
 image get_image_layer(image m, int l);
 
-void free_image(image m);
-void test_resize(char *filename);
+#ifdef __cplusplus
+}
+#endif
+
 #endif
 
diff --git a/image.darknet/src/image_opencv.cpp b/image.darknet/src/image_opencv.cpp
new file mode 100644
index 0000000..7511280
--- /dev/null
+++ b/image.darknet/src/image_opencv.cpp
@@ -0,0 +1,135 @@
+#ifdef OPENCV
+
+#include "stdio.h"
+#include "stdlib.h"
+#include "opencv2/opencv.hpp"
+#include "image.h"
+
+using namespace cv;
+
+extern "C" {
+
+IplImage *image_to_ipl(image im)
+{   /* Pack the planar float image into a newly created interleaved 8-bit IplImage, scaling values by 255. */
+    int col, row, ch;
+    IplImage *out = cvCreateImage(cvSize(im.w,im.h), IPL_DEPTH_8U, im.c);
+    int stride = out->widthStep;
+    for(row = 0; row < im.h; ++row){
+        char *dst_row = out->imageData + row*stride;
+        for(col = 0; col < im.w; ++col){
+            for(ch = 0; ch < im.c; ++ch){
+                dst_row[col*im.c + ch] = (unsigned char)(im.data[ch*im.h*im.w + row*im.w + col]*255);
+            }
+        }
+    }
+    return out;
+}
+
+image ipl_to_image(IplImage* src)
+{   /* Unpack an interleaved 8-bit IplImage into a new planar float image, dividing each byte by 255. */
+    int rows = src->height;
+    int cols = src->width;
+    int chans = src->nChannels;
+    image im = make_image(cols, rows, chans);
+    unsigned char *bytes = (unsigned char *)src->imageData;
+    int stride = src->widthStep;
+    int r, c, ch;
+
+    for(r = 0; r < rows; ++r){
+        unsigned char *src_row = bytes + r*stride;
+        for(ch = 0; ch < chans; ++ch){
+            float *dst_row = im.data + ch*cols*rows + r*cols;
+            for(c = 0; c < cols; ++c) dst_row[c] = src_row[c*chans + ch]/255.;
+        }
+    }
+    return im;
+}
+
+/* Build a cv::Mat from a copy of im that has been passed through constrain_image and, for 3 channels, rgbgr_image. */
+Mat image_to_mat(image im)
+{
+    image work = copy_image(im);
+    constrain_image(work);
+    if(im.c == 3) rgbgr_image(work);
+    IplImage *tmp = image_to_ipl(work);
+    free_image(work);
+    Mat out = cvarrToMat(tmp, true); /* copyData is true, so tmp can be released right away */
+    cvReleaseImage(&tmp);
+    return out;
+}
+
+image mat_to_image(Mat m)
+{   /* View m through an IplImage header, convert that to a new image, then apply rgbgr_image to the result. */
+    IplImage header = m;
+    image out = ipl_to_image(&header);
+    rgbgr_image(out);
+    return out;
+}
+
+/* Open video file f (or camera index c when f is null); returns a heap VideoCapture as void*, or 0 on failure. */
+void *open_video_stream(const char *f, int c, int w, int h, int fps)
+{
+    VideoCapture *cap = f ? new VideoCapture(f) : new VideoCapture(c);
+    if(!cap->isOpened()){ delete cap; return 0; } /* do not leak a capture the caller never receives */
+    /* A zero argument leaves that property alone; height and fps now get their own values, not the width. */
+    if(w) cap->set(CV_CAP_PROP_FRAME_WIDTH, w);
+    if(h) cap->set(CV_CAP_PROP_FRAME_HEIGHT, h);
+    if(fps) cap->set(CV_CAP_PROP_FPS, fps);
+    return (void *) cap;
+}
+
+image get_image_from_stream(void *p)
+{   /* p is used as a VideoCapture*; an empty frame yields make_empty_image(0,0,0), otherwise the converted frame. */
+    VideoCapture *stream = (VideoCapture *)p;
+    Mat frame;
+    *stream >> frame;
+    if(frame.empty()) return make_empty_image(0,0,0);
+    return mat_to_image(frame);
+}
+
+/* Load filename with imread: channels 0 -> flag -1, 1 -> flag 0, 3 -> flag 1; other values warn and use -1. */
+image load_image_cv(char *filename, int channels)
+{
+    int flag = -1;
+    if (channels == 1) flag = 0;
+    else if (channels == 3) flag = 1;
+    else if (channels != 0) {
+        fprintf(stderr, "OpenCV can't force load with %d channels\n", channels);
+    }
+    Mat m = imread(filename, flag);
+    if(!m.data){
+        fprintf(stderr, "Cannot load image \"%s\"\n", filename);
+        /* Append to bad.list directly: formatting the name into a 256-byte shell
+           command overflowed on long paths and let the shell interpret the name. */
+        FILE *bad = fopen("bad.list", "a");
+        if(bad){ fprintf(bad, "%s\n", filename); fclose(bad); }
+        /* Callers still receive a 10x10x3 placeholder image rather than an error. */
+        return make_image(10,10,3);
+    }
+    image im = mat_to_image(m);
+    return im;
+}
+
+int show_image_cv(image im, const char* name, int ms)
+{   /* Show im in the window called name, call waitKey(ms), and return its result modulo 256 (or -1 unchanged). */
+    Mat frame = image_to_mat(im);
+    imshow(name, frame);
+    int key = waitKey(ms);
+    if (key == -1) return -1;
+    return key%256;
+}
+
+void make_window(char *name, int w, int h, int fullscreen)
+{   /* Create a WINDOW_NORMAL window: fullscreen if requested, otherwise sized w-by-h, with "Demo" moved to (0,0). */
+    namedWindow(name, WINDOW_NORMAL);
+    if (fullscreen) {
+        setWindowProperty(name, CV_WND_PROP_FULLSCREEN, CV_WINDOW_FULLSCREEN);
+        return;
+    }
+    resizeWindow(name, w, h);
+    if(!strcmp(name, "Demo")) moveWindow(name, 0, 0);
+}
+
+}
+
+#endif
diff --git a/image.darknet/src/iseg_layer.c b/image.darknet/src/iseg_layer.c
new file mode 100644
index 0000000..2bf03a8
--- /dev/null
+++ b/image.darknet/src/iseg_layer.c
@@ -0,0 +1,225 @@
+#include "iseg_layer.h"
+#include "activations.h"
+#include "blas.h"
+#include "box.h"
+#include "cuda.h"
+#include "utils.h"
+
+#include <stdio.h>
+#include <assert.h>
+#include <string.h>
+#include <stdlib.h>
+
+layer make_iseg_layer(int batch, int w, int h, int classes, int ids)
+{
+    layer l = {0};
+    l.type = ISEG;
+    // Build an ISEG layer: `classes` class channels followed by `ids` embedding channels, output the same shape as input.
+    l.h = h;
+    l.w = w;
+    l.c = classes + ids;
+    l.out_w = l.w;
+    l.out_h = l.h;
+    l.out_c = l.c;
+    l.classes = classes;
+    l.batch = batch;
+    l.extra = ids; // forward_iseg_layer reads the embedding channel count back from l.extra
+    l.cost = calloc(1, sizeof(float));
+    l.outputs = h*w*l.c;
+    l.inputs = l.outputs;
+    l.truths = 90*(l.w*l.h+1); // 90 slots per sample, each one leading value plus w*h values
+    l.delta = calloc(batch*l.outputs, sizeof(float));
+    l.output = calloc(batch*l.outputs, sizeof(float));
+
+    l.counts = calloc(90, sizeof(int)); // one counter per slot
+    l.sums = calloc(90, sizeof(float*)); // one pointer per slot; left null when ids is 0
+    if(ids){
+        int i;
+        for(i = 0; i < 90; ++i){
+            l.sums[i] = calloc(ids, sizeof(float)); // ids floats per slot
+        }
+    }
+
+    l.forward = forward_iseg_layer;
+    l.backward = backward_iseg_layer;
+#ifdef GPU
+    l.forward_gpu = forward_iseg_layer_gpu;
+    l.backward_gpu = backward_iseg_layer_gpu;
+    l.output_gpu = cuda_make_array(l.output, batch*l.outputs);
+    l.delta_gpu = cuda_make_array(l.delta, batch*l.outputs);
+#endif
+
+    fprintf(stderr, "iseg\n");
+    srand(0); // NOTE(review): reseeds the process-wide rand() state as a side effect of building the layer
+
+    return l;
+}
+
+/* Resize the layer to w-by-h: recompute the derived sizes and reallocate the host (and GPU) output/delta buffers. */
+void resize_iseg_layer(layer *l, int w, int h)
+{
+    l->w = w;
+    l->h = h;
+    l->outputs = h*w*l->c;
+    l->inputs = l->outputs;
+    l->truths = 90*(l->w*l->h+1); /* same formula as make_iseg_layer; forward_iseg_layer strides net.truth by it */
+
+    l->output = realloc(l->output, l->batch*l->outputs*sizeof(float));
+    l->delta = realloc(l->delta, l->batch*l->outputs*sizeof(float));
+
+#ifdef GPU
+    cuda_free(l->delta_gpu);
+    cuda_free(l->output_gpu);
+    l->delta_gpu =     cuda_make_array(l->delta, l->batch*l->outputs);
+    l->output_gpu =    cuda_make_array(l->output, l->batch*l->outputs);
+#endif
+}
+
+void forward_iseg_layer(const layer l, network net)
+{
+    // Forward pass that also fills l.delta and *(l.cost) from net.truth; 90 is the fixed slot count used throughout.
+    double time = what_time_is_it_now(); // start time, reported by the printf at the end
+    int i,b,j,k;
+    int ids = l.extra; // number of embedding channels that follow the class channels
+    memcpy(l.output, net.input, l.outputs*l.batch*sizeof(float));
+    memset(l.delta, 0, l.outputs * l.batch * sizeof(float));
+
+#ifndef GPU
+    for (b = 0; b < l.batch; ++b){
+        int index = b*l.outputs;
+        activate_array(l.output + index, l.classes*l.w*l.h, LOGISTIC); // only the class channels are activated
+    }
+#endif
+
+    for (b = 0; b < l.batch; ++b){
+        // a priori, each pixel has no class
+        for(i = 0; i < l.classes; ++i){
+            for(k = 0; k < l.w*l.h; ++k){
+                int index = b*l.outputs + i*l.w*l.h + k;
+                l.delta[index] = 0 - l.output[index];
+            }
+        }
+
+        // a priori, embedding should be small magnitude
+        for(i = 0; i < ids; ++i){
+            for(k = 0; k < l.w*l.h; ++k){
+                int index = b*l.outputs + (i+l.classes)*l.w*l.h + k;
+                l.delta[index] = .1 * (0 - l.output[index]);
+            }
+        }
+
+
+        memset(l.counts, 0, 90*sizeof(int));
+        for(i = 0; i < 90; ++i){
+            fill_cpu(ids, 0, l.sums[i], 1);
+            // each truth slot starts with a class id; a negative id ends the list for this sample
+            int c = net.truth[b*l.truths + i*(l.w*l.h+1)];
+            if(c < 0) break; // slots after the break keep counts of 0 and are skipped by the later loops
+            // add up metric embeddings for each instance
+            for(k = 0; k < l.w*l.h; ++k){
+                int index = b*l.outputs + c*l.w*l.h + k;
+                float v = net.truth[b*l.truths + i*(l.w*l.h + 1) + 1 + k]; // per-pixel truth value for slot i
+                if(v){
+                    l.delta[index] = v - l.output[index];
+                    axpy_cpu(ids, 1, l.output + b*l.outputs + l.classes*l.w*l.h + k, l.w*l.h, l.sums[i], 1); // stride w*h walks pixel k across the embedding channels
+                    ++l.counts[i];
+                }
+            }
+        }
+
+        float *mse = calloc(90, sizeof(float)); // per-slot squared distance to the mean embedding, only used for the printout
+        for(i = 0; i < 90; ++i){
+            int c = net.truth[b*l.truths + i*(l.w*l.h+1)];
+            if(c < 0) break;
+            for(k = 0; k < l.w*l.h; ++k){
+                float v = net.truth[b*l.truths + i*(l.w*l.h + 1) + 1 + k];
+                if(v){
+                    int z;
+                    float sum = 0;
+                    for(z = 0; z < ids; ++z){
+                        int index = b*l.outputs + (l.classes + z)*l.w*l.h + k;
+                        sum += pow(l.sums[i][z]/l.counts[i] - l.output[index], 2); // l.sums is still a raw sum here, so divide inline
+                    }
+                    mse[i] += sum;
+                }
+            }
+            mse[i] /= l.counts[i]; // NOTE(review): 0/0 when a slot has no set pixels; that value is never printed because of the counts check below
+        }
+
+        // Calculate average embedding
+        for(i = 0; i < 90; ++i){
+            if(!l.counts[i]) continue;
+            scal_cpu(ids, 1.f/l.counts[i], l.sums[i], 1); // from here on l.sums[i] holds the mean, not the sum
+            if(b == 0 && net.gpu_index == 0){
+                printf("%4d, %6.3f, ", l.counts[i], mse[i]);
+                for(j = 0; j < ids; ++j){
+                    printf("%6.3f,", l.sums[i][j]);
+                }
+                printf("\n");
+            }
+        }
+        free(mse);
+
+        // Calculate embedding loss
+        for(i = 0; i < 90; ++i){
+            if(!l.counts[i]) continue;
+            for(k = 0; k < l.w*l.h; ++k){
+                float v = net.truth[b*l.truths + i*(l.w*l.h + 1) + 1 + k];
+                if(v){
+                    for(j = 0; j < 90; ++j){
+                        if(!l.counts[j])continue;
+                        int z;
+                        for(z = 0; z < ids; ++z){
+                            int index = b*l.outputs + (l.classes + z)*l.w*l.h + k;
+                            float diff = l.sums[j][z] - l.output[index];
+                            if (j == i) l.delta[index] +=   diff < 0? -.1 : .1; // step of .1 toward the pixel's own slot mean
+                            else        l.delta[index] += -(diff < 0? -.1 : .1); // step of .1 away from every other slot mean
+                        }
+                    }
+                }
+            }
+        }
+
+        for(i = 0; i < ids; ++i){
+            for(k = 0; k < l.w*l.h; ++k){
+                int index = b*l.outputs + (i+l.classes)*l.w*l.h + k;
+                l.delta[index] *= .01; // embedding deltas are scaled down by 100; class deltas are not
+            }
+        }
+    }
+
+    *(l.cost) = pow(mag_array(l.delta, l.outputs * l.batch), 2); // cost is the square of mag_array over the whole delta buffer
+    printf("took %lf sec\n", what_time_is_it_now() - time);
+}
+
+void backward_iseg_layer(const layer l, network net)
+{   /* axpy with factor 1 from l.delta into net.delta over every input of the batch. */
+    axpy_cpu(l.inputs*l.batch, 1, l.delta, 1, net.delta, 1);
+}
+
+#ifdef GPU
+
+void forward_iseg_layer_gpu(const layer l, network net)
+{
+    int sample;
+    copy_gpu(l.batch*l.inputs, net.input_gpu, 1, l.output_gpu, 1);
+    /* Only the class channels of each sample are activated; the embedding channels are left as copied. */
+    for (sample = 0; sample < l.batch; ++sample){
+        activate_array_gpu(l.output_gpu + sample*l.outputs, l.classes*l.w*l.h, LOGISTIC);
+    }
+    /* The loss runs on the host: pull the activations into net.input, run the CPU pass, push l.delta back. */
+    cuda_pull_array(l.output_gpu, net.input, l.batch*l.inputs);
+    forward_iseg_layer(l, net);
+    cuda_push_array(l.delta_gpu, l.delta, l.batch*l.outputs);
+}
+
+void backward_iseg_layer_gpu(const layer l, network net)
+{
+    /*
+     * No per-sample gradient step is applied here (the only candidate was
+     * commented out), so l.delta_gpu is added to net.delta_gpu as it stands.
+     */
+    axpy_gpu(l.inputs*l.batch, 1, l.delta_gpu, 1, net.delta_gpu, 1);
+}
+#endif
+
diff --git a/image.darknet/src/iseg_layer.h b/image.darknet/src/iseg_layer.h
new file mode 100644
index 0000000..dd8e64e
--- /dev/null
+++ b/image.darknet/src/iseg_layer.h
@@ -0,0 +1,19 @@
+#ifndef ISEG_LAYER_H
+#define ISEG_LAYER_H
+
+#include "darknet.h"
+#include "layer.h"
+#include "network.h"
+/* Interface of the iseg layer: constructor, CPU passes, resize and detection count. */
+layer make_iseg_layer(int batch, int w, int h, int classes, int ids);
+void forward_iseg_layer(const layer l, network net);
+void backward_iseg_layer(const layer l, network net);
+void resize_iseg_layer(layer *l, int w, int h);
+int iseg_num_detections(layer l, float thresh);
+/* GPU passes, declared only when GPU is defined; both take the layer by const value, as their definitions do. */
+#ifdef GPU
+void forward_iseg_layer_gpu(const layer l, network net);
+void backward_iseg_layer_gpu(const layer l, network net);
+#endif /* GPU */
+
+#endif /* ISEG_LAYER_H */
diff --git a/image.darknet/src/l2norm_layer.c b/image.darknet/src/l2norm_layer.c
new file mode 100644
index 0000000..d099479
--- /dev/null
+++ b/image.darknet/src/l2norm_layer.c
@@ -0,0 +1,63 @@
+#include "l2norm_layer.h"
+#include "activations.h"
+#include "blas.h"
+#include "cuda.h"
+
+#include <float.h>
+#include <math.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+
+layer make_l2norm_layer(int batch, int inputs)
+{   /* Builds an L2NORM layer: outputs == inputs, with zeroed output/scales/delta buffers of inputs*batch floats each. */
+    fprintf(stderr, "l2norm                                         %4d\n",  inputs);
+    layer l = {0};
+    l.type = L2NORM;
+    l.batch = batch;
+    l.inputs = inputs;
+    l.outputs = inputs;
+    l.output = calloc(inputs*batch, sizeof(float));
+    l.scales = calloc(inputs*batch, sizeof(float));
+    l.delta = calloc(inputs*batch, sizeof(float));
+    /* NOTE(review): out_c/out_w/out_h, which the forward passes read, are not set here -- presumably the caller fills them in. */
+    l.forward = forward_l2norm_layer;
+    l.backward = backward_l2norm_layer;
+    #ifdef GPU
+    l.forward_gpu = forward_l2norm_layer_gpu;
+    l.backward_gpu = backward_l2norm_layer_gpu;
+    /* Each device buffer is created from its own host buffer; scales_gpu was previously seeded from l.output. */
+    l.output_gpu = cuda_make_array(l.output, inputs*batch);
+    l.scales_gpu = cuda_make_array(l.scales, inputs*batch);
+    l.delta_gpu = cuda_make_array(l.delta, inputs*batch);
+    #endif
+    return l;
+}
+
+void forward_l2norm_layer(const layer l, network net)
+{   /* CPU forward pass: copy_cpu from net.input into l.output, then l2normalize_cpu on l.output. */
+    copy_cpu(l.outputs*l.batch, net.input, 1, l.output, 1);
+    l2normalize_cpu(l.output, l.scales, l.batch, l.out_c, l.out_w*l.out_h);   /* presumably normalizes l.output in place and fills l.scales -- confirm in blas.c */
+}
+
+void backward_l2norm_layer(const layer l, network net)
+{   /* CPU backward pass: one axpy_cpu from l.delta into net.delta; the l.scales step below is disabled. */
+    //axpy_cpu(l.inputs*l.batch, 1, l.scales, 1, l.delta, 1);   NOTE(review): backward_l2norm_layer_gpu still runs this step with l.scales_gpu -- confirm which path is intended
+    axpy_cpu(l.inputs*l.batch, 1, l.delta, 1, net.delta, 1);   /* presumably net.delta += 1*l.delta -- confirm in blas.h */
+}
+
+#ifdef GPU
+
+void forward_l2norm_layer_gpu(const layer l, network net)
+{   /* GPU forward pass: the same two calls as forward_l2norm_layer, on net.input_gpu / l.output_gpu / l.scales_gpu. */
+    copy_gpu(l.outputs*l.batch, net.input_gpu, 1, l.output_gpu, 1);
+    l2normalize_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_w*l.out_h);   /* presumably in place on l.output_gpu -- confirm in blas_kernels.cu */
+}
+
+void backward_l2norm_layer_gpu(const layer l, network net)
+{   /* GPU backward pass: two axpy_gpu calls (presumably y += 1*x each -- confirm in blas.h). */
+    axpy_gpu(l.batch*l.inputs, 1, l.scales_gpu, 1, l.delta_gpu, 1);   /* l.scales_gpu into l.delta_gpu; NOTE(review): the CPU path has this step commented out */
+    axpy_gpu(l.batch*l.inputs, 1, l.delta_gpu, 1, net.delta_gpu, 1);   /* l.delta_gpu into net.delta_gpu */
+}
+
+#endif
diff --git a/image.darknet/src/l2norm_layer.h b/image.darknet/src/l2norm_layer.h
new file mode 100644
index 0000000..1ca6f71
--- /dev/null
+++ b/image.darknet/src/l2norm_layer.h
@@ -0,0 +1,15 @@
+#ifndef L2NORM_LAYER_H
+#define L2NORM_LAYER_H
+#include "layer.h"
+#include "network.h"
+/* L2NORM layer: constructor plus CPU forward and backward passes. */
+layer make_l2norm_layer(int batch, int inputs);
+void forward_l2norm_layer(const layer l, network net);
+void backward_l2norm_layer(const layer l, network net);
+/* GPU variants, declared only when GPU is defined. */
+#ifdef GPU
+void forward_l2norm_layer_gpu(const layer l, network net);
+void backward_l2norm_layer_gpu(const layer l, network net);
+#endif /* GPU */
+
+#endif /* L2NORM_LAYER_H */
diff --git a/image.darknet/src/layer.c b/image.darknet/src/layer.c
index 622cf26..c27b477 100644
--- a/image.darknet/src/layer.c
+++ b/image.darknet/src/layer.c
@@ -1,5 +1,6 @@
 #include "layer.h"
 #include "cuda.h"
+
 #include <stdlib.h>
 
 void free_layer(layer l)
@@ -32,7 +33,6 @@ void free_layer(layer l)
     if(l.scale_updates)      free(l.scale_updates);
     if(l.weights)            free(l.weights);
     if(l.weight_updates)     free(l.weight_updates);
-    if(l.col_image)          free(l.col_image);
     if(l.delta)              free(l.delta);
     if(l.output)             free(l.output);
     if(l.squared)            free(l.squared);
@@ -80,7 +80,6 @@ void free_layer(layer l)
     if(l.rolling_variance_gpu)    cuda_free(l.rolling_variance_gpu);
     if(l.variance_delta_gpu)      cuda_free(l.variance_delta_gpu);
     if(l.mean_delta_gpu)          cuda_free(l.mean_delta_gpu);
-    if(l.col_image_gpu)           cuda_free(l.col_image_gpu);
     if(l.x_gpu)                   cuda_free(l.x_gpu);
     if(l.x_norm_gpu)              cuda_free(l.x_norm_gpu);
     if(l.weights_gpu)             cuda_free(l.weights_gpu);
diff --git a/image.darknet/src/layer.h b/image.darknet/src/layer.h
index 806542b..af6cd2a 100644
--- a/image.darknet/src/layer.h
+++ b/image.darknet/src/layer.h
@@ -1,271 +1 @@
-#ifndef BASE_LAYER_H
-#define BASE_LAYER_H
-
-#include "activations.h"
-#include "stddef.h"
-#include "tree.h"
-
-struct network_state;
-
-struct layer;
-typedef struct layer layer;
-
-typedef enum {
-    CONVOLUTIONAL,
-    DECONVOLUTIONAL,
-    CONNECTED,
-    MAXPOOL,
-    SOFTMAX,
-    DETECTION,
-    DROPOUT,
-    CROP,
-    ROUTE,
-    COST,
-    NORMALIZATION,
-    AVGPOOL,
-    LOCAL,
-    SHORTCUT,
-    ACTIVE,
-    RNN,
-    GRU,
-    CRNN,
-    BATCHNORM,
-    NETWORK,
-    XNOR,
-    REGION,
-    REORG,
-    BLANK
-} LAYER_TYPE;
-
-typedef enum{
-    SSE, MASKED, SMOOTH
-} COST_TYPE;
-
-struct layer{
-    LAYER_TYPE type;
-    ACTIVATION activation;
-    COST_TYPE cost_type;
-    void (*forward)   (struct layer, struct network_state);
-    void (*backward)  (struct layer, struct network_state);
-    void (*update)    (struct layer, int, float, float, float);
-    void (*forward_gpu)   (struct layer, struct network_state);
-    void (*backward_gpu)  (struct layer, struct network_state);
-    void (*update_gpu)    (struct layer, int, float, float, float);
-    int batch_normalize;
-    int shortcut;
-    int batch;
-    int forced;
-    int flipped;
-    int inputs;
-    int outputs;
-    int truths;
-    int h,w,c;
-    int out_h, out_w, out_c;
-    int n;
-    int max_boxes;
-    int groups;
-    int size;
-    int side;
-    int stride;
-    int reverse;
-    int pad;
-    int sqrt;
-    int flip;
-    int index;
-    int binary;
-    int xnor;
-    int steps;
-    int hidden;
-    float dot;
-    float angle;
-    float jitter;
-    float saturation;
-    float exposure;
-    float shift;
-    float ratio;
-    int softmax;
-    int classes;
-    int coords;
-    int background;
-    int rescore;
-    int objectness;
-    int does_cost;
-    int joint;
-    int noadjust;
-    int reorg;
-    int log;
-
-    int adam;
-    float B1;
-    float B2;
-    float eps;
-    int t;
-
-    float alpha;
-    float beta;
-    float kappa;
-
-    float coord_scale;
-    float object_scale;
-    float noobject_scale;
-    float class_scale;
-    int bias_match;
-    int random;
-    float thresh;
-    int classfix;
-    int absolute;
-
-    int dontload;
-    int dontloadscales;
-
-    float temperature;
-    float probability;
-    float scale;
-
-    char  * cweights;
-    int   * indexes;
-    int   * input_layers;
-    int   * input_sizes;
-    int   * map;
-    float * rand;
-    float * cost;
-    float * state;
-    float * prev_state;
-    float * forgot_state;
-    float * forgot_delta;
-    float * state_delta;
-
-    float * concat;
-    float * concat_delta;
-
-    float * binary_weights;
-
-    float * biases;
-    float * bias_updates;
-
-    float * scales;
-    float * scale_updates;
-
-    float * weights;
-    float * weight_updates;
-
-    float * col_image;
-    float * delta;
-    float * output;
-    float * squared;
-    float * norms;
-
-    float * spatial_mean;
-    float * mean;
-    float * variance;
-
-    float * mean_delta;
-    float * variance_delta;
-
-    float * rolling_mean;
-    float * rolling_variance;
-
-    float * x;
-    float * x_norm;
-
-    float * m;
-    float * v;
-
-    float * z_cpu;
-    float * r_cpu;
-    float * h_cpu;
-
-    float * binary_input;
-
-    struct layer *input_layer;
-    struct layer *self_layer;
-    struct layer *output_layer;
-
-    struct layer *input_gate_layer;
-    struct layer *state_gate_layer;
-    struct layer *input_save_layer;
-    struct layer *state_save_layer;
-    struct layer *input_state_layer;
-    struct layer *state_state_layer;
-
-    struct layer *input_z_layer;
-    struct layer *state_z_layer;
-
-    struct layer *input_r_layer;
-    struct layer *state_r_layer;
-
-    struct layer *input_h_layer;
-    struct layer *state_h_layer;
-
-    tree *softmax_tree;
-
-    size_t workspace_size;
-
-    #ifdef GPU
-    int *indexes_gpu;
-
-    float *z_gpu;
-    float *r_gpu;
-    float *h_gpu;
-
-    float *m_gpu;
-    float *v_gpu;
-
-    float * prev_state_gpu;
-    float * forgot_state_gpu;
-    float * forgot_delta_gpu;
-    float * state_gpu;
-    float * state_delta_gpu;
-    float * gate_gpu;
-    float * gate_delta_gpu;
-    float * save_gpu;
-    float * save_delta_gpu;
-    float * concat_gpu;
-    float * concat_delta_gpu;
-
-    float *binary_input_gpu;
-    float *binary_weights_gpu;
-
-    float * mean_gpu;
-    float * variance_gpu;
-
-    float * rolling_mean_gpu;
-    float * rolling_variance_gpu;
-
-    float * variance_delta_gpu;
-    float * mean_delta_gpu;
-
-    float * col_image_gpu;
-
-    float * x_gpu;
-    float * x_norm_gpu;
-    float * weights_gpu;
-    float * weight_updates_gpu;
-
-    float * biases_gpu;
-    float * bias_updates_gpu;
-
-    float * scales_gpu;
-    float * scale_updates_gpu;
-
-    float * output_gpu;
-    float * delta_gpu;
-    float * rand_gpu;
-    float * squared_gpu;
-    float * norms_gpu;
-    #ifdef CUDNN
-    cudnnTensorDescriptor_t srcTensorDesc, dstTensorDesc;
-    cudnnTensorDescriptor_t dsrcTensorDesc, ddstTensorDesc;
-    cudnnFilterDescriptor_t weightDesc;
-    cudnnFilterDescriptor_t dweightDesc;
-    cudnnConvolutionDescriptor_t convDesc;
-    cudnnConvolutionFwdAlgo_t fw_algo;
-    cudnnConvolutionBwdDataAlgo_t bd_algo;
-    cudnnConvolutionBwdFilterAlgo_t bf_algo;
-    #endif
-    #endif
-};
-
-void free_layer(layer);
-
-#endif
+#include "darknet.h"
diff --git a/image.darknet/src/list.h b/image.darknet/src/list.h
index fb818c2..6b445c7 100644
--- a/image.darknet/src/list.h
+++ b/image.darknet/src/list.h
@@ -1,26 +1,13 @@
 #ifndef LIST_H
 #define LIST_H
-
-typedef struct node{
-    void *val;
-    struct node *next;
-    struct node *prev;
-} node;
-
-typedef struct list{
-    int size;
-    node *front;
-    node *back;
-} list;
+#include "darknet.h"
 
 list *make_list();
 int list_find(list *l, void *val);
 
 void list_insert(list *, void *);
 
-void **list_to_array(list *l);
 
-void free_list(list *l);
 void free_list_contents(list *l);
 
 #endif
diff --git a/image.darknet/src/local_layer.c b/image.darknet/src/local_layer.c
index 31f0ca6..74f6910 100644
--- a/image.darknet/src/local_layer.c
+++ b/image.darknet/src/local_layer.c
@@ -57,9 +57,10 @@ local_layer make_local_layer(int batch, int h, int w, int c, int n, int size, in
     float scale = sqrt(2./(size*size*c));
     for(i = 0; i < c*n*size*size; ++i) l.weights[i] = scale*rand_uniform(-1,1);
 
-    l.col_image = calloc(out_h*out_w*size*size*c, sizeof(float));
     l.output = calloc(l.batch*out_h * out_w * n, sizeof(float));
     l.delta  = calloc(l.batch*out_h * out_w * n, sizeof(float));
+
+    l.workspace_size = out_h*out_w*size*size*c;
     
     l.forward = forward_local_layer;
     l.backward = backward_local_layer;
@@ -76,7 +77,6 @@ local_layer make_local_layer(int batch, int h, int w, int c, int n, int size, in
     l.biases_gpu = cuda_make_array(l.biases, l.outputs);
     l.bias_updates_gpu = cuda_make_array(l.bias_updates, l.outputs);
 
-    l.col_image_gpu = cuda_make_array(l.col_image, out_h*out_w*size*size*c);
     l.delta_gpu = cuda_make_array(l.delta, l.batch*out_h*out_w*n);
     l.output_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);
 
@@ -88,7 +88,7 @@ local_layer make_local_layer(int batch, int h, int w, int c, int n, int size, in
     return l;
 }
 
-void forward_local_layer(const local_layer l, network_state state)
+void forward_local_layer(const local_layer l, network net)
 {
     int out_h = local_out_height(l);
     int out_w = local_out_width(l);
@@ -100,13 +100,13 @@ void forward_local_layer(const local_layer l, network_state state)
     }
 
     for(i = 0; i < l.batch; ++i){
-        float *input = state.input + i*l.w*l.h*l.c;
+        float *input = net.input + i*l.w*l.h*l.c;
         im2col_cpu(input, l.c, l.h, l.w, 
-                l.size, l.stride, l.pad, l.col_image);
+                l.size, l.stride, l.pad, net.workspace);
         float *output = l.output + i*l.outputs;
         for(j = 0; j < locations; ++j){
             float *a = l.weights + j*l.size*l.size*l.c*l.n;
-            float *b = l.col_image + j;
+            float *b = net.workspace + j;
             float *c = output + j;
 
             int m = l.n;
@@ -119,7 +119,7 @@ void forward_local_layer(const local_layer l, network_state state)
     activate_array(l.output, l.outputs*l.batch, l.activation);
 }
 
-void backward_local_layer(local_layer l, network_state state)
+void backward_local_layer(local_layer l, network net)
 {
     int i, j;
     int locations = l.out_w*l.out_h;
@@ -131,13 +131,13 @@ void backward_local_layer(local_layer l, network_state state)
     }
 
     for(i = 0; i < l.batch; ++i){
-        float *input = state.input + i*l.w*l.h*l.c;
+        float *input = net.input + i*l.w*l.h*l.c;
         im2col_cpu(input, l.c, l.h, l.w, 
-                l.size, l.stride, l.pad, l.col_image);
+                l.size, l.stride, l.pad, net.workspace);
 
         for(j = 0; j < locations; ++j){ 
             float *a = l.delta + i*l.outputs + j;
-            float *b = l.col_image + j;
+            float *b = net.workspace + j;
             float *c = l.weight_updates + j*l.size*l.size*l.c*l.n;
             int m = l.n;
             int n = l.size*l.size*l.c;
@@ -146,11 +146,11 @@ void backward_local_layer(local_layer l, network_state state)
             gemm(0,1,m,n,k,1,a,locations,b,locations,1,c,n);
         }
 
-        if(state.delta){
+        if(net.delta){
             for(j = 0; j < locations; ++j){ 
                 float *a = l.weights + j*l.size*l.size*l.c*l.n;
                 float *b = l.delta + i*l.outputs + j;
-                float *c = l.col_image + j;
+                float *c = net.workspace + j;
 
                 int m = l.size*l.size*l.c;
                 int n = 1;
@@ -159,13 +159,18 @@ void backward_local_layer(local_layer l, network_state state)
                 gemm(1,0,m,n,k,1,a,m,b,locations,0,c,locations);
             }
 
-            col2im_cpu(l.col_image, l.c,  l.h,  l.w,  l.size,  l.stride, l.pad, state.delta+i*l.c*l.h*l.w);
+            col2im_cpu(net.workspace, l.c,  l.h,  l.w,  l.size,  l.stride, l.pad, net.delta+i*l.c*l.h*l.w);
         }
     }
 }
 
-void update_local_layer(local_layer l, int batch, float learning_rate, float momentum, float decay)
+void update_local_layer(local_layer l, update_args a)
 {
+    float learning_rate = a.learning_rate*l.learning_rate_scale;
+    float momentum = a.momentum;
+    float decay = a.decay;
+    int batch = a.batch;
+
     int locations = l.out_w*l.out_h;
     int size = l.size*l.size*l.c*l.n*locations;
     axpy_cpu(l.outputs, learning_rate/batch, l.bias_updates, 1, l.biases, 1);
@@ -178,7 +183,7 @@ void update_local_layer(local_layer l, int batch, float learning_rate, float mom
 
 #ifdef GPU
 
-void forward_local_layer_gpu(const local_layer l, network_state state)
+void forward_local_layer_gpu(const local_layer l, network net)
 {
     int out_h = local_out_height(l);
     int out_w = local_out_width(l);
@@ -186,83 +191,88 @@ void forward_local_layer_gpu(const local_layer l, network_state state)
     int locations = out_h * out_w;
 
     for(i = 0; i < l.batch; ++i){
-        copy_ongpu(l.outputs, l.biases_gpu, 1, l.output_gpu + i*l.outputs, 1);
+        copy_gpu(l.outputs, l.biases_gpu, 1, l.output_gpu + i*l.outputs, 1);
     }
 
     for(i = 0; i < l.batch; ++i){
-        float *input = state.input + i*l.w*l.h*l.c;
-        im2col_ongpu(input, l.c, l.h, l.w, 
-                l.size, l.stride, l.pad, l.col_image_gpu);
+        float *input = net.input_gpu + i*l.w*l.h*l.c;
+        im2col_gpu(input, l.c, l.h, l.w, 
+                l.size, l.stride, l.pad, net.workspace);
         float *output = l.output_gpu + i*l.outputs;
         for(j = 0; j < locations; ++j){
             float *a = l.weights_gpu + j*l.size*l.size*l.c*l.n;
-            float *b = l.col_image_gpu + j;
+            float *b = net.workspace + j;
             float *c = output + j;
 
             int m = l.n;
             int n = 1;
             int k = l.size*l.size*l.c;
 
-            gemm_ongpu(0,0,m,n,k,1,a,k,b,locations,1,c,locations);
+            gemm_gpu(0,0,m,n,k,1,a,k,b,locations,1,c,locations);
         }
     }
-    activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
+    activate_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation);
 }
 
-void backward_local_layer_gpu(local_layer l, network_state state)
+void backward_local_layer_gpu(local_layer l, network net)
 {
     int i, j;
     int locations = l.out_w*l.out_h;
 
-    gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
+    gradient_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
     for(i = 0; i < l.batch; ++i){
-        axpy_ongpu(l.outputs, 1, l.delta_gpu + i*l.outputs, 1, l.bias_updates_gpu, 1);
+        axpy_gpu(l.outputs, 1, l.delta_gpu + i*l.outputs, 1, l.bias_updates_gpu, 1);
     }
 
     for(i = 0; i < l.batch; ++i){
-        float *input = state.input + i*l.w*l.h*l.c;
-        im2col_ongpu(input, l.c, l.h, l.w, 
-                l.size, l.stride, l.pad, l.col_image_gpu);
+        float *input = net.input_gpu + i*l.w*l.h*l.c;
+        im2col_gpu(input, l.c, l.h, l.w, 
+                l.size, l.stride, l.pad, net.workspace);
 
         for(j = 0; j < locations; ++j){ 
             float *a = l.delta_gpu + i*l.outputs + j;
-            float *b = l.col_image_gpu + j;
+            float *b = net.workspace + j;
             float *c = l.weight_updates_gpu + j*l.size*l.size*l.c*l.n;
             int m = l.n;
             int n = l.size*l.size*l.c;
             int k = 1;
 
-            gemm_ongpu(0,1,m,n,k,1,a,locations,b,locations,1,c,n);
+            gemm_gpu(0,1,m,n,k,1,a,locations,b,locations,1,c,n);
         }
 
-        if(state.delta){
+        if(net.delta_gpu){
             for(j = 0; j < locations; ++j){ 
                 float *a = l.weights_gpu + j*l.size*l.size*l.c*l.n;
                 float *b = l.delta_gpu + i*l.outputs + j;
-                float *c = l.col_image_gpu + j;
+                float *c = net.workspace + j;
 
                 int m = l.size*l.size*l.c;
                 int n = 1;
                 int k = l.n;
 
-                gemm_ongpu(1,0,m,n,k,1,a,m,b,locations,0,c,locations);
+                gemm_gpu(1,0,m,n,k,1,a,m,b,locations,0,c,locations);
             }
 
-            col2im_ongpu(l.col_image_gpu, l.c,  l.h,  l.w,  l.size,  l.stride, l.pad, state.delta+i*l.c*l.h*l.w);
+            col2im_gpu(net.workspace, l.c,  l.h,  l.w,  l.size,  l.stride, l.pad, net.delta_gpu+i*l.c*l.h*l.w);
         }
     }
 }
 
-void update_local_layer_gpu(local_layer l, int batch, float learning_rate, float momentum, float decay)
+void update_local_layer_gpu(local_layer l, update_args a)
 {
+    float learning_rate = a.learning_rate*l.learning_rate_scale;
+    float momentum = a.momentum;
+    float decay = a.decay;
+    int batch = a.batch;
+
     int locations = l.out_w*l.out_h;
     int size = l.size*l.size*l.c*l.n*locations;
-    axpy_ongpu(l.outputs, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);
-    scal_ongpu(l.outputs, momentum, l.bias_updates_gpu, 1);
+    axpy_gpu(l.outputs, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);
+    scal_gpu(l.outputs, momentum, l.bias_updates_gpu, 1);
 
-    axpy_ongpu(size, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
-    axpy_ongpu(size, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
-    scal_ongpu(size, momentum, l.weight_updates_gpu, 1);
+    axpy_gpu(size, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
+    axpy_gpu(size, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
+    scal_gpu(size, momentum, l.weight_updates_gpu, 1);
 }
 
 void pull_local_layer(local_layer l)
diff --git a/image.darknet/src/local_layer.h b/image.darknet/src/local_layer.h
index 28915d8..776e572 100644
--- a/image.darknet/src/local_layer.h
+++ b/image.darknet/src/local_layer.h
@@ -10,9 +10,9 @@
 typedef layer local_layer;
 
 #ifdef GPU
-void forward_local_layer_gpu(local_layer layer, network_state state);
-void backward_local_layer_gpu(local_layer layer, network_state state);
-void update_local_layer_gpu(local_layer layer, int batch, float learning_rate, float momentum, float decay);
+void forward_local_layer_gpu(local_layer layer, network net);
+void backward_local_layer_gpu(local_layer layer, network net);
+void update_local_layer_gpu(local_layer layer, update_args a);
 
 void push_local_layer(local_layer layer);
 void pull_local_layer(local_layer layer);
@@ -20,9 +20,9 @@ void pull_local_layer(local_layer layer);
 
 local_layer make_local_layer(int batch, int h, int w, int c, int n, int size, int stride, int pad, ACTIVATION activation);
 
-void forward_local_layer(const local_layer layer, network_state state);
-void backward_local_layer(local_layer layer, network_state state);
-void update_local_layer(local_layer layer, int batch, float learning_rate, float momentum, float decay);
+void forward_local_layer(const local_layer layer, network net);
+void backward_local_layer(local_layer layer, network net);
+void update_local_layer(local_layer layer, update_args a);
 
 void bias_output(float *output, float *biases, int batch, int n, int size);
 void backward_bias(float *bias_updates, float *delta, int batch, int n, int size);
diff --git a/image.darknet/src/logistic_layer.c b/image.darknet/src/logistic_layer.c
new file mode 100644
index 0000000..b2b3d6b
--- /dev/null
+++ b/image.darknet/src/logistic_layer.c
@@ -0,0 +1,71 @@
+#include "logistic_layer.h"
+#include "activations.h"
+#include "blas.h"
+#include "cuda.h"
+
+#include <float.h>
+#include <math.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+
+layer make_logistic_layer(int batch, int inputs)
+{
+    /* Constructor for the LOGXENT layer: outputs == inputs; loss, output and delta each hold inputs*batch floats. */
+    const int total = inputs*batch;
+    fprintf(stderr, "logistic x entropy                             %4d\n",  inputs);
+    layer l = {0};
+    l.type = LOGXENT;
+    l.batch = batch;
+    l.inputs = l.outputs = inputs;
+    l.loss = calloc(total, sizeof(float));
+    l.output = calloc(total, sizeof(float));
+    l.delta = calloc(total, sizeof(float));
+    l.cost = calloc(1, sizeof(float));   /* single float that receives the summed loss */
+
+    l.forward = forward_logistic_layer;
+    l.backward = backward_logistic_layer;
+    #ifdef GPU
+    l.forward_gpu = forward_logistic_layer_gpu;
+    l.backward_gpu = backward_logistic_layer_gpu;
+    l.output_gpu = cuda_make_array(l.output, total);
+    l.loss_gpu = cuda_make_array(l.loss, total);
+    l.delta_gpu = cuda_make_array(l.delta, total);
+    #endif
+    return l;
+}
+
+void forward_logistic_layer(const layer l, network net)
+{   /* CPU forward pass: LOGISTIC activation of the input, plus loss and cost when a truth buffer is present. */
+    const int total = l.batch*l.inputs;
+    copy_cpu(l.outputs*l.batch, net.input, 1, l.output, 1);
+    activate_array(l.output, l.outputs*l.batch, LOGISTIC);
+    if(!net.truth) return;   /* no truth supplied: delta, loss and cost are left untouched */
+    logistic_x_ent_cpu(total, l.output, net.truth, l.delta, l.loss);
+    *l.cost = sum_array(l.loss, total);
+}
+
+void backward_logistic_layer(const layer l, network net)
+{   /* CPU backward pass: a single axpy_cpu call over l.inputs*l.batch elements. */
+    axpy_cpu(l.inputs*l.batch, 1, l.delta, 1, net.delta, 1);   /* presumably net.delta += 1*l.delta (BLAS-style axpy) -- confirm in blas.h */
+}
+
+#ifdef GPU
+
+void forward_logistic_layer_gpu(const layer l, network net)
+{   /* GPU forward pass: LOGISTIC activation on the device; the loss is pulled back and summed via sum_array. */
+    const int total = l.batch*l.inputs;
+    copy_gpu(l.outputs*l.batch, net.input_gpu, 1, l.output_gpu, 1);
+    activate_array_gpu(l.output_gpu, l.outputs*l.batch, LOGISTIC);
+    if(!net.truth) return;   /* the gate is the net.truth pointer, although net.truth_gpu is what gets passed on */
+    logistic_x_ent_gpu(total, l.output_gpu, net.truth_gpu, l.delta_gpu, l.loss_gpu);
+    cuda_pull_array(l.loss_gpu, l.loss, total);
+    *l.cost = sum_array(l.loss, total);
+}
+
+void backward_logistic_layer_gpu(const layer l, network net)
+{   /* GPU backward pass: a single axpy_gpu call over l.batch*l.inputs elements. */
+    axpy_gpu(l.batch*l.inputs, 1, l.delta_gpu, 1, net.delta_gpu, 1);   /* presumably net.delta_gpu += 1*l.delta_gpu -- confirm in blas.h */
+}
+
+#endif
diff --git a/image.darknet/src/logistic_layer.h b/image.darknet/src/logistic_layer.h
new file mode 100644
index 0000000..9c25bee
--- /dev/null
+++ b/image.darknet/src/logistic_layer.h
@@ -0,0 +1,15 @@
+#ifndef LOGISTIC_LAYER_H
+#define LOGISTIC_LAYER_H
+#include "layer.h"
+#include "network.h"
+/* Logistic layer: constructor plus CPU forward and backward passes. */
+layer make_logistic_layer(int batch, int inputs);
+void forward_logistic_layer(const layer l, network net);
+void backward_logistic_layer(const layer l, network net);
+/* GPU variants, declared only when GPU is defined. */
+#ifdef GPU
+void forward_logistic_layer_gpu(const layer l, network net);
+void backward_logistic_layer_gpu(const layer l, network net);
+#endif /* GPU */
+
+#endif /* LOGISTIC_LAYER_H */
diff --git a/image.darknet/src/lstm_layer.c b/image.darknet/src/lstm_layer.c
new file mode 100644
index 0000000..fb07de2
--- /dev/null
+++ b/image.darknet/src/lstm_layer.c
@@ -0,0 +1,626 @@
+#include "lstm_layer.h"
+#include "connected_layer.h"
+#include "utils.h"
+#include "cuda.h"
+#include "blas.h"
+#include "gemm.h"
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+static void increment_layer(layer *l, int steps)
+{
+    /* Move the layer's per-time-step buffer pointers by `steps` steps of outputs*batch elements; negative values rewind. */
+    const int shift = l->outputs*l->batch*steps;
+    l->x_norm += shift;
+    l->x += shift;
+    l->delta += shift;
+    l->output += shift;
+#ifdef GPU
+    l->x_norm_gpu += shift;
+    l->x_gpu += shift;
+    l->delta_gpu += shift;
+    l->output_gpu += shift;
+#endif
+}
+void backward_lstm_layer(layer l, network state);   /* CPU backward pass, defined further down in this file */
+/* Build an LSTM layer. `batch` is the total batch size and is divided by `steps`; eight LINEAR connected
+ * sub-layers are created: u* take the layer input (`inputs` wide), w* take the hidden state (`outputs` wide). */
+layer make_lstm_layer(int batch, int inputs, int outputs, int steps, int batch_normalize, int adam)
+{
+    fprintf(stderr, "LSTM Layer: %d inputs, %d outputs\n", inputs, outputs);
+    batch = batch / steps;
+    layer l = { 0 };
+    l.batch = batch;
+    l.type = LSTM;
+    l.steps = steps;
+    l.inputs = inputs;
+
+    l.uf = malloc(sizeof(layer));
+    fprintf(stderr, "\t\t");
+    *(l.uf) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize, adam);
+    l.uf->batch = batch;
+
+    l.ui = malloc(sizeof(layer));
+    fprintf(stderr, "\t\t");
+    *(l.ui) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize, adam);
+    l.ui->batch = batch;
+
+    l.ug = malloc(sizeof(layer));
+    fprintf(stderr, "\t\t");
+    *(l.ug) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize, adam);
+    l.ug->batch = batch;
+
+    l.uo = malloc(sizeof(layer));
+    fprintf(stderr, "\t\t");
+    *(l.uo) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize, adam);
+    l.uo->batch = batch;
+
+    l.wf = malloc(sizeof(layer));
+    fprintf(stderr, "\t\t");
+    *(l.wf) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam);
+    l.wf->batch = batch;
+
+    l.wi = malloc(sizeof(layer));
+    fprintf(stderr, "\t\t");
+    *(l.wi) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam);
+    l.wi->batch = batch;
+
+    l.wg = malloc(sizeof(layer));
+    fprintf(stderr, "\t\t");
+    *(l.wg) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam);
+    l.wg->batch = batch;
+
+    l.wo = malloc(sizeof(layer));
+    fprintf(stderr, "\t\t");
+    *(l.wo) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam);
+    l.wo->batch = batch;
+
+    l.batch_normalize = batch_normalize;
+    l.outputs = outputs;
+
+    l.output = calloc(outputs*batch*steps, sizeof(float));
+    l.state = calloc(outputs*batch, sizeof(float));
+    l.delta = calloc(outputs*batch*steps, sizeof(float));   /* forward zero-fills it when training, backward reads it */
+
+    l.forward = forward_lstm_layer;
+    l.backward = backward_lstm_layer;   /* previously only the GPU backward pass was registered */
+    l.update = update_lstm_layer;
+
+    l.prev_state_cpu =  calloc(batch*outputs, sizeof(float));
+    l.prev_cell_cpu =   calloc(batch*outputs, sizeof(float));
+    l.cell_cpu =        calloc(batch*outputs*steps, sizeof(float));
+    l.f_cpu =           calloc(batch*outputs, sizeof(float));
+    l.i_cpu =           calloc(batch*outputs, sizeof(float));
+    l.g_cpu =           calloc(batch*outputs, sizeof(float));
+    l.o_cpu =           calloc(batch*outputs, sizeof(float));
+    l.c_cpu =           calloc(batch*outputs, sizeof(float));
+    l.h_cpu =           calloc(batch*outputs, sizeof(float));
+    l.temp_cpu =        calloc(batch*outputs, sizeof(float));
+    l.temp2_cpu =       calloc(batch*outputs, sizeof(float));
+    l.temp3_cpu =       calloc(batch*outputs, sizeof(float));
+    l.dc_cpu =          calloc(batch*outputs, sizeof(float));
+    l.dh_cpu =          calloc(batch*outputs, sizeof(float));
+
+#ifdef GPU
+    l.forward_gpu = forward_lstm_layer_gpu;
+    l.backward_gpu = backward_lstm_layer_gpu;
+    l.update_gpu = update_lstm_layer_gpu;
+
+    l.output_gpu = cuda_make_array(0, batch*outputs*steps);
+    l.delta_gpu = cuda_make_array(0, batch*l.outputs*steps);
+
+    l.prev_state_gpu = cuda_make_array(0, batch*outputs);
+    l.prev_cell_gpu = cuda_make_array(0, batch*outputs);
+    l.cell_gpu = cuda_make_array(0, batch*outputs*steps);
+    l.f_gpu = cuda_make_array(0, batch*outputs);
+    l.i_gpu = cuda_make_array(0, batch*outputs);
+    l.g_gpu = cuda_make_array(0, batch*outputs);
+    l.o_gpu = cuda_make_array(0, batch*outputs);
+    l.c_gpu = cuda_make_array(0, batch*outputs);
+    l.h_gpu = cuda_make_array(0, batch*outputs);
+    l.temp_gpu =  cuda_make_array(0, batch*outputs);
+    l.temp2_gpu = cuda_make_array(0, batch*outputs);
+    l.temp3_gpu = cuda_make_array(0, batch*outputs);
+    l.dc_gpu = cuda_make_array(0, batch*outputs);
+    l.dh_gpu = cuda_make_array(0, batch*outputs);
+#ifdef CUDNN
+        cudnnSetTensor4dDescriptor(l.wf->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wf->out_c, l.wf->out_h, l.wf->out_w); 
+        cudnnSetTensor4dDescriptor(l.wi->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wi->out_c, l.wi->out_h, l.wi->out_w); 
+        cudnnSetTensor4dDescriptor(l.wg->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wg->out_c, l.wg->out_h, l.wg->out_w); 
+        cudnnSetTensor4dDescriptor(l.wo->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wo->out_c, l.wo->out_h, l.wo->out_w); 
+        cudnnSetTensor4dDescriptor(l.uf->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.uf->out_c, l.uf->out_h, l.uf->out_w); 
+        cudnnSetTensor4dDescriptor(l.ui->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.ui->out_c, l.ui->out_h, l.ui->out_w); 
+        cudnnSetTensor4dDescriptor(l.ug->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.ug->out_c, l.ug->out_h, l.ug->out_w); 
+        cudnnSetTensor4dDescriptor(l.uo->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.uo->out_c, l.uo->out_h, l.uo->out_w); 
+#endif
+#endif
+
+    return l;
+}
+
+void update_lstm_layer(layer l, update_args a)
+{
+    /* Run the connected-layer update on each of the eight sub-layers,
+     * in the order wf, wi, wg, wo, uf, ui, ug, uo. */
+    layer *parts[] = { l.wf, l.wi, l.wg, l.wo, l.uf, l.ui, l.ug, l.uo };
+    const int count = sizeof(parts)/sizeof(parts[0]);
+    int k;
+    for (k = 0; k < count; ++k) {
+        update_connected_layer(*parts[k], a);
+    }
+}
+
+void forward_lstm_layer(layer l, network state)
+{   // CPU forward pass: runs the cell for l.steps time steps, reading state.input and filling l.output and l.cell_cpu.
+    network s = { 0 };
+    s.train = state.train;
+    int i;
+    layer wf = *(l.wf);
+    layer wi = *(l.wi);
+    layer wg = *(l.wg);
+    layer wo = *(l.wo);
+    // By-value copies: increment_layer() below moves their buffer pointers without modifying the stored sub-layers.
+    layer uf = *(l.uf);
+    layer ui = *(l.ui);
+    layer ug = *(l.ug);
+    layer uo = *(l.uo);
+    // Fill the delta buffers of all eight sub-layers with 0 over every time step.
+    fill_cpu(l.outputs * l.batch * l.steps, 0, wf.delta, 1);
+    fill_cpu(l.outputs * l.batch * l.steps, 0, wi.delta, 1);
+    fill_cpu(l.outputs * l.batch * l.steps, 0, wg.delta, 1);
+    fill_cpu(l.outputs * l.batch * l.steps, 0, wo.delta, 1);
+
+    fill_cpu(l.outputs * l.batch * l.steps, 0, uf.delta, 1);
+    fill_cpu(l.outputs * l.batch * l.steps, 0, ui.delta, 1);
+    fill_cpu(l.outputs * l.batch * l.steps, 0, ug.delta, 1);
+    fill_cpu(l.outputs * l.batch * l.steps, 0, uo.delta, 1);
+    if (state.train) {
+        fill_cpu(l.outputs * l.batch * l.steps, 0, l.delta, 1);
+    }
+    // NOTE(review): the comments below assume copy_cpu copies, axpy_cpu accumulates and mul_cpu multiplies in place (inferred from names; confirm in blas.h).
+    for (i = 0; i < l.steps; ++i) {
+        s.input = l.h_cpu;   // the w* sub-layers are fed the hidden state currently held in l.h_cpu
+        forward_connected_layer(wf, s);							
+        forward_connected_layer(wi, s);							
+        forward_connected_layer(wg, s);							
+        forward_connected_layer(wo, s);							
+
+        s.input = state.input;   // the u* sub-layers are fed this step's slice of the layer input
+        forward_connected_layer(uf, s);							
+        forward_connected_layer(ui, s);							
+        forward_connected_layer(ug, s);							
+        forward_connected_layer(uo, s);							
+        // Gate pre-activations: each gate buffer receives the w* output, then the matching u* output is added.
+        copy_cpu(l.outputs*l.batch, wf.output, 1, l.f_cpu, 1);
+        axpy_cpu(l.outputs*l.batch, 1, uf.output, 1, l.f_cpu, 1);
+
+        copy_cpu(l.outputs*l.batch, wi.output, 1, l.i_cpu, 1);	
+        axpy_cpu(l.outputs*l.batch, 1, ui.output, 1, l.i_cpu, 1);	
+
+        copy_cpu(l.outputs*l.batch, wg.output, 1, l.g_cpu, 1);	
+        axpy_cpu(l.outputs*l.batch, 1, ug.output, 1, l.g_cpu, 1);	
+
+        copy_cpu(l.outputs*l.batch, wo.output, 1, l.o_cpu, 1);	
+        axpy_cpu(l.outputs*l.batch, 1, uo.output, 1, l.o_cpu, 1);	
+
+        activate_array(l.f_cpu, l.outputs*l.batch, LOGISTIC);		
+        activate_array(l.i_cpu, l.outputs*l.batch, LOGISTIC);		
+        activate_array(l.g_cpu, l.outputs*l.batch, TANH);			
+        activate_array(l.o_cpu, l.outputs*l.batch, LOGISTIC);		
+        // Cell update, presumably c = f*c + i*g; l.c_cpu still holds the value left by the previous step.
+        copy_cpu(l.outputs*l.batch, l.i_cpu, 1, l.temp_cpu, 1);		
+        mul_cpu(l.outputs*l.batch, l.g_cpu, 1, l.temp_cpu, 1);		
+        mul_cpu(l.outputs*l.batch, l.f_cpu, 1, l.c_cpu, 1);			
+        axpy_cpu(l.outputs*l.batch, 1, l.temp_cpu, 1, l.c_cpu, 1);	
+        // Hidden state, presumably h = o * tanh(c).
+        copy_cpu(l.outputs*l.batch, l.c_cpu, 1, l.h_cpu, 1);			
+        activate_array(l.h_cpu, l.outputs*l.batch, TANH);		
+        mul_cpu(l.outputs*l.batch, l.o_cpu, 1, l.h_cpu, 1);	
+        // Record this step's cell and hidden state in the per-step buffers.
+        copy_cpu(l.outputs*l.batch, l.c_cpu, 1, l.cell_cpu, 1);		
+        copy_cpu(l.outputs*l.batch, l.h_cpu, 1, l.output, 1);
+        // Advance every cursor to the next time step; l and state are by-value, so the caller's pointers are untouched.
+        state.input += l.inputs*l.batch;
+        l.output    += l.outputs*l.batch;
+        l.cell_cpu      += l.outputs*l.batch;
+
+        increment_layer(&wf, 1);
+        increment_layer(&wi, 1);
+        increment_layer(&wg, 1);
+        increment_layer(&wo, 1);
+
+        increment_layer(&uf, 1);
+        increment_layer(&ui, 1);
+        increment_layer(&ug, 1);
+        increment_layer(&uo, 1);
+    }
+}
+
+void backward_lstm_layer(layer l, network state)
+{   // CPU backward pass: walks the l.steps time steps from last to first and back-propagates l.delta through the eight connected sub-layers.
+    network s = { 0 };
+    s.train = state.train;
+    int i;
+    layer wf = *(l.wf);
+    layer wi = *(l.wi);
+    layer wg = *(l.wg);
+    layer wo = *(l.wo);
+    // The sub-layers (like l and state) are by-value copies, so the pointer stepping below stays local to this call.
+    layer uf = *(l.uf);
+    layer ui = *(l.ui);
+    layer ug = *(l.ug);
+    layer uo = *(l.uo);
+    // Put every sub-layer cursor on the last time step.
+    increment_layer(&wf, l.steps - 1);
+    increment_layer(&wi, l.steps - 1);
+    increment_layer(&wg, l.steps - 1);
+    increment_layer(&wo, l.steps - 1);
+
+    increment_layer(&uf, l.steps - 1);
+    increment_layer(&ui, l.steps - 1);
+    increment_layer(&ug, l.steps - 1);
+    increment_layer(&uo, l.steps - 1);
+    // Same for the input, input-delta (when present), output, cell and delta cursors.
+    state.input += l.inputs*l.batch*(l.steps - 1);
+    if (state.delta) state.delta += l.inputs*l.batch*(l.steps - 1);
+
+    l.output += l.outputs*l.batch*(l.steps - 1);
+    l.cell_cpu += l.outputs*l.batch*(l.steps - 1);
+    l.delta += l.outputs*l.batch*(l.steps - 1);
+    // NOTE(review): loop comments assume mul_cpu multiplies in place, axpy_cpu accumulates and gradient_array scales its last argument by the activation derivative (confirm in blas.h / activations.h).
+    for (i = l.steps - 1; i >= 0; --i) {
+        if (i != 0) copy_cpu(l.outputs*l.batch, l.cell_cpu - l.outputs*l.batch, 1, l.prev_cell_cpu, 1);
+        copy_cpu(l.outputs*l.batch, l.cell_cpu, 1, l.c_cpu, 1);
+        if (i != 0) copy_cpu(l.outputs*l.batch, l.output - l.outputs*l.batch, 1, l.prev_state_cpu, 1);
+        copy_cpu(l.outputs*l.batch, l.output, 1, l.h_cpu, 1);
+        // Hidden-state gradient target: the previous step's slice of l.delta, or NULL at step 0. NOTE(review): at step 0 prev_cell/prev_state are not refreshed.
+        l.dh_cpu = (i == 0) ? 0 : l.delta - l.outputs*l.batch;
+        // Rebuild this step's gate activations from the stored sub-layer outputs.
+        copy_cpu(l.outputs*l.batch, wf.output, 1, l.f_cpu, 1);			
+        axpy_cpu(l.outputs*l.batch, 1, uf.output, 1, l.f_cpu, 1);			
+
+        copy_cpu(l.outputs*l.batch, wi.output, 1, l.i_cpu, 1);			
+        axpy_cpu(l.outputs*l.batch, 1, ui.output, 1, l.i_cpu, 1);			
+
+        copy_cpu(l.outputs*l.batch, wg.output, 1, l.g_cpu, 1);			
+        axpy_cpu(l.outputs*l.batch, 1, ug.output, 1, l.g_cpu, 1);			
+
+        copy_cpu(l.outputs*l.batch, wo.output, 1, l.o_cpu, 1);			
+        axpy_cpu(l.outputs*l.batch, 1, uo.output, 1, l.o_cpu, 1);			
+
+        activate_array(l.f_cpu, l.outputs*l.batch, LOGISTIC);			
+        activate_array(l.i_cpu, l.outputs*l.batch, LOGISTIC);		
+        activate_array(l.g_cpu, l.outputs*l.batch, TANH);			
+        activate_array(l.o_cpu, l.outputs*l.batch, LOGISTIC);		
+        // temp3 = this step's slice of l.delta.
+        copy_cpu(l.outputs*l.batch, l.delta, 1, l.temp3_cpu, 1);		
+        // temp = tanh(c).
+        copy_cpu(l.outputs*l.batch, l.c_cpu, 1, l.temp_cpu, 1);			
+        activate_array(l.temp_cpu, l.outputs*l.batch, TANH);			
+        // temp2 = temp3 * o.
+        copy_cpu(l.outputs*l.batch, l.temp3_cpu, 1, l.temp2_cpu, 1);		
+        mul_cpu(l.outputs*l.batch, l.o_cpu, 1, l.temp2_cpu, 1);			
+        // Apply the TANH gradient and add l.dc_cpu. NOTE(review): l.dc_cpu is not reset in this function, so the first iteration uses whatever it already held.
+        gradient_array(l.temp_cpu, l.outputs*l.batch, TANH, l.temp2_cpu);
+        axpy_cpu(l.outputs*l.batch, 1, l.dc_cpu, 1, l.temp2_cpu, 1);		
+        // Output gate o: temp = tanh(c) * temp3 through the LOGISTIC gradient, then back through wo (input: previous hidden state).
+        copy_cpu(l.outputs*l.batch, l.c_cpu, 1, l.temp_cpu, 1);			
+        activate_array(l.temp_cpu, l.outputs*l.batch, TANH);			
+        mul_cpu(l.outputs*l.batch, l.temp3_cpu, 1, l.temp_cpu, 1);		
+        gradient_array(l.o_cpu, l.outputs*l.batch, LOGISTIC, l.temp_cpu);
+        copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, wo.delta, 1);
+        s.input = l.prev_state_cpu;
+        s.delta = l.dh_cpu;															
+        backward_connected_layer(wo, s);	
+        // The same delta goes through uo, whose input and delta are the layer's own input slices.
+        copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, uo.delta, 1);
+        s.input = state.input;
+        s.delta = state.delta;
+        backward_connected_layer(uo, s);									
+        // Gate g: temp = temp2 * i through the TANH gradient, then back through wg and ug.
+        copy_cpu(l.outputs*l.batch, l.temp2_cpu, 1, l.temp_cpu, 1);			
+        mul_cpu(l.outputs*l.batch, l.i_cpu, 1, l.temp_cpu, 1);				
+        gradient_array(l.g_cpu, l.outputs*l.batch, TANH, l.temp_cpu);		
+        copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, wg.delta, 1);
+        s.input = l.prev_state_cpu;
+        s.delta = l.dh_cpu;														
+        backward_connected_layer(wg, s);	
+
+        copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, ug.delta, 1);
+        s.input = state.input;
+        s.delta = state.delta;
+        backward_connected_layer(ug, s);																
+        // Gate i: temp = temp2 * g through the LOGISTIC gradient, then back through wi and ui.
+        copy_cpu(l.outputs*l.batch, l.temp2_cpu, 1, l.temp_cpu, 1);			
+        mul_cpu(l.outputs*l.batch, l.g_cpu, 1, l.temp_cpu, 1);				
+        gradient_array(l.i_cpu, l.outputs*l.batch, LOGISTIC, l.temp_cpu);	
+        copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, wi.delta, 1);
+        s.input = l.prev_state_cpu;
+        s.delta = l.dh_cpu;
+        backward_connected_layer(wi, s);						
+
+        copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, ui.delta, 1);
+        s.input = state.input;
+        s.delta = state.delta;
+        backward_connected_layer(ui, s);									
+        // Gate f: temp = temp2 * prev_cell through the LOGISTIC gradient, then back through wf and uf.
+        copy_cpu(l.outputs*l.batch, l.temp2_cpu, 1, l.temp_cpu, 1);		
+        mul_cpu(l.outputs*l.batch, l.prev_cell_cpu, 1, l.temp_cpu, 1);
+        gradient_array(l.f_cpu, l.outputs*l.batch, LOGISTIC, l.temp_cpu);
+        copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, wf.delta, 1);
+        s.input = l.prev_state_cpu;
+        s.delta = l.dh_cpu;
+        backward_connected_layer(wf, s);						
+
+        copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, uf.delta, 1);
+        s.input = state.input;
+        s.delta = state.delta;
+        backward_connected_layer(uf, s);									
+        // Value kept in l.dc_cpu for the next (earlier) iteration: temp2 * f.
+        copy_cpu(l.outputs*l.batch, l.temp2_cpu, 1, l.temp_cpu, 1);			
+        mul_cpu(l.outputs*l.batch, l.f_cpu, 1, l.temp_cpu, 1);				
+        copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, l.dc_cpu, 1);				
+        // Step every cursor back by one time step.
+        state.input -= l.inputs*l.batch;
+        if (state.delta) state.delta -= l.inputs*l.batch;
+        l.output -= l.outputs*l.batch;
+        l.cell_cpu -= l.outputs*l.batch;
+        l.delta -= l.outputs*l.batch;
+
+        increment_layer(&wf, -1);
+        increment_layer(&wi, -1);
+        increment_layer(&wg, -1);
+        increment_layer(&wo, -1);
+
+        increment_layer(&uf, -1);
+        increment_layer(&ui, -1);
+        increment_layer(&ug, -1);
+        increment_layer(&uo, -1);
+    }
+}
+
+#ifdef GPU
+void update_lstm_layer_gpu(layer l, update_args a)
+{
+    /* Run the GPU connected-layer update on each of the eight sub-layers,
+     * in the order wf, wi, wg, wo, uf, ui, ug, uo. */
+    layer *parts[] = { l.wf, l.wi, l.wg, l.wo, l.uf, l.ui, l.ug, l.uo };
+    const int count = sizeof(parts)/sizeof(parts[0]);
+    int k;
+    for (k = 0; k < count; ++k) {
+        update_connected_layer_gpu(*parts[k], a);
+    }
+}
+
+void forward_lstm_layer_gpu(layer l, network state)
+{   // GPU forward pass: runs the cell for l.steps time steps, reading state.input_gpu and filling l.output_gpu and l.cell_gpu.
+    network s = { 0 };
+    s.train = state.train;
+    int i;
+    layer wf = *(l.wf);
+    layer wi = *(l.wi);
+    layer wg = *(l.wg);
+    layer wo = *(l.wo);
+    // By-value copies: increment_layer() below moves their buffer pointers without modifying the stored sub-layers.
+    layer uf = *(l.uf);
+    layer ui = *(l.ui);
+    layer ug = *(l.ug);
+    layer uo = *(l.uo);
+    // Fill the device delta buffers of all eight sub-layers with 0 over every time step.
+    fill_gpu(l.outputs * l.batch * l.steps, 0, wf.delta_gpu, 1);
+    fill_gpu(l.outputs * l.batch * l.steps, 0, wi.delta_gpu, 1);
+    fill_gpu(l.outputs * l.batch * l.steps, 0, wg.delta_gpu, 1);
+    fill_gpu(l.outputs * l.batch * l.steps, 0, wo.delta_gpu, 1);
+
+    fill_gpu(l.outputs * l.batch * l.steps, 0, uf.delta_gpu, 1);
+    fill_gpu(l.outputs * l.batch * l.steps, 0, ui.delta_gpu, 1);
+    fill_gpu(l.outputs * l.batch * l.steps, 0, ug.delta_gpu, 1);
+    fill_gpu(l.outputs * l.batch * l.steps, 0, uo.delta_gpu, 1);
+    if (state.train) {
+        fill_gpu(l.outputs * l.batch * l.steps, 0, l.delta_gpu, 1);
+    }
+    // NOTE(review): the comments below assume copy_gpu copies, axpy_gpu accumulates and mul_gpu multiplies in place (inferred from names; confirm in blas.h).
+    for (i = 0; i < l.steps; ++i) {
+        s.input_gpu = l.h_gpu;   // the w* sub-layers are fed the hidden state currently held in l.h_gpu
+        forward_connected_layer_gpu(wf, s);							
+        forward_connected_layer_gpu(wi, s);							
+        forward_connected_layer_gpu(wg, s);							
+        forward_connected_layer_gpu(wo, s);							
+
+        s.input_gpu = state.input_gpu;   // the u* sub-layers are fed this step's slice of the layer input
+        forward_connected_layer_gpu(uf, s);							
+        forward_connected_layer_gpu(ui, s);							
+        forward_connected_layer_gpu(ug, s);							
+        forward_connected_layer_gpu(uo, s);							
+        // Gate pre-activations: each gate buffer receives the w* output, then the matching u* output is added.
+        copy_gpu(l.outputs*l.batch, wf.output_gpu, 1, l.f_gpu, 1);
+        axpy_gpu(l.outputs*l.batch, 1, uf.output_gpu, 1, l.f_gpu, 1);
+
+        copy_gpu(l.outputs*l.batch, wi.output_gpu, 1, l.i_gpu, 1);	
+        axpy_gpu(l.outputs*l.batch, 1, ui.output_gpu, 1, l.i_gpu, 1);	
+
+        copy_gpu(l.outputs*l.batch, wg.output_gpu, 1, l.g_gpu, 1);	
+        axpy_gpu(l.outputs*l.batch, 1, ug.output_gpu, 1, l.g_gpu, 1);	
+
+        copy_gpu(l.outputs*l.batch, wo.output_gpu, 1, l.o_gpu, 1);	
+        axpy_gpu(l.outputs*l.batch, 1, uo.output_gpu, 1, l.o_gpu, 1);	
+
+        activate_array_gpu(l.f_gpu, l.outputs*l.batch, LOGISTIC);		
+        activate_array_gpu(l.i_gpu, l.outputs*l.batch, LOGISTIC);		
+        activate_array_gpu(l.g_gpu, l.outputs*l.batch, TANH);			
+        activate_array_gpu(l.o_gpu, l.outputs*l.batch, LOGISTIC);		
+        // Cell update, presumably c = f*c + i*g; l.c_gpu still holds the value left by the previous step.
+        copy_gpu(l.outputs*l.batch, l.i_gpu, 1, l.temp_gpu, 1);		
+        mul_gpu(l.outputs*l.batch, l.g_gpu, 1, l.temp_gpu, 1);		
+        mul_gpu(l.outputs*l.batch, l.f_gpu, 1, l.c_gpu, 1);			
+        axpy_gpu(l.outputs*l.batch, 1, l.temp_gpu, 1, l.c_gpu, 1);	
+        // Hidden state, presumably h = o * tanh(c).
+        copy_gpu(l.outputs*l.batch, l.c_gpu, 1, l.h_gpu, 1);			
+        activate_array_gpu(l.h_gpu, l.outputs*l.batch, TANH);		
+        mul_gpu(l.outputs*l.batch, l.o_gpu, 1, l.h_gpu, 1);	
+        // Record this step's cell and hidden state in the per-step device buffers.
+        copy_gpu(l.outputs*l.batch, l.c_gpu, 1, l.cell_gpu, 1);		
+        copy_gpu(l.outputs*l.batch, l.h_gpu, 1, l.output_gpu, 1);
+        // Advance every cursor to the next time step; l and state are by-value, so the caller's pointers are untouched.
+        state.input_gpu += l.inputs*l.batch;
+        l.output_gpu    += l.outputs*l.batch;
+        l.cell_gpu      += l.outputs*l.batch;
+
+        increment_layer(&wf, 1);
+        increment_layer(&wi, 1);
+        increment_layer(&wg, 1);
+        increment_layer(&wo, 1);
+
+        increment_layer(&uf, 1);
+        increment_layer(&ui, 1);
+        increment_layer(&ug, 1);
+        increment_layer(&uo, 1);
+    }
+}
+
+void backward_lstm_layer_gpu(layer l, network state)
+{   // GPU backward pass: walks the l.steps time steps from last to first and back-propagates l.delta_gpu through the eight connected sub-layers.
+    network s = { 0 };
+    s.train = state.train;
+    int i;
+    layer wf = *(l.wf);
+    layer wi = *(l.wi);
+    layer wg = *(l.wg);
+    layer wo = *(l.wo);
+    // The sub-layers (like l and state) are by-value copies, so the pointer stepping below stays local to this call.
+    layer uf = *(l.uf);
+    layer ui = *(l.ui);
+    layer ug = *(l.ug);
+    layer uo = *(l.uo);
+    // Put every sub-layer cursor on the last time step.
+    increment_layer(&wf, l.steps - 1);
+    increment_layer(&wi, l.steps - 1);
+    increment_layer(&wg, l.steps - 1);
+    increment_layer(&wo, l.steps - 1);
+
+    increment_layer(&uf, l.steps - 1);
+    increment_layer(&ui, l.steps - 1);
+    increment_layer(&ug, l.steps - 1);
+    increment_layer(&uo, l.steps - 1);
+    // Same for the input, input-delta (when present), output, cell and delta cursors.
+    state.input_gpu += l.inputs*l.batch*(l.steps - 1);
+    if (state.delta_gpu) state.delta_gpu += l.inputs*l.batch*(l.steps - 1);
+
+    l.output_gpu += l.outputs*l.batch*(l.steps - 1);
+    l.cell_gpu += l.outputs*l.batch*(l.steps - 1);
+    l.delta_gpu += l.outputs*l.batch*(l.steps - 1);
+    // NOTE(review): loop comments assume mul_gpu multiplies in place, axpy_gpu accumulates and gradient_array_gpu scales its last argument by the activation derivative (confirm in blas.h / activations.h).
+    for (i = l.steps - 1; i >= 0; --i) {
+        if (i != 0) copy_gpu(l.outputs*l.batch, l.cell_gpu - l.outputs*l.batch, 1, l.prev_cell_gpu, 1);
+        copy_gpu(l.outputs*l.batch, l.cell_gpu, 1, l.c_gpu, 1);
+        if (i != 0) copy_gpu(l.outputs*l.batch, l.output_gpu - l.outputs*l.batch, 1, l.prev_state_gpu, 1);
+        copy_gpu(l.outputs*l.batch, l.output_gpu, 1, l.h_gpu, 1);
+        // Hidden-state gradient target: the previous step's slice of l.delta_gpu, or NULL at step 0. NOTE(review): at step 0 prev_cell/prev_state are not refreshed.
+        l.dh_gpu = (i == 0) ? 0 : l.delta_gpu - l.outputs*l.batch;
+        // Rebuild this step's gate activations from the stored sub-layer outputs.
+        copy_gpu(l.outputs*l.batch, wf.output_gpu, 1, l.f_gpu, 1);			
+        axpy_gpu(l.outputs*l.batch, 1, uf.output_gpu, 1, l.f_gpu, 1);			
+
+        copy_gpu(l.outputs*l.batch, wi.output_gpu, 1, l.i_gpu, 1);			
+        axpy_gpu(l.outputs*l.batch, 1, ui.output_gpu, 1, l.i_gpu, 1);			
+
+        copy_gpu(l.outputs*l.batch, wg.output_gpu, 1, l.g_gpu, 1);			
+        axpy_gpu(l.outputs*l.batch, 1, ug.output_gpu, 1, l.g_gpu, 1);			
+
+        copy_gpu(l.outputs*l.batch, wo.output_gpu, 1, l.o_gpu, 1);			
+        axpy_gpu(l.outputs*l.batch, 1, uo.output_gpu, 1, l.o_gpu, 1);			
+
+        activate_array_gpu(l.f_gpu, l.outputs*l.batch, LOGISTIC);			
+        activate_array_gpu(l.i_gpu, l.outputs*l.batch, LOGISTIC);		
+        activate_array_gpu(l.g_gpu, l.outputs*l.batch, TANH);			
+        activate_array_gpu(l.o_gpu, l.outputs*l.batch, LOGISTIC);		
+        // temp3 = this step's slice of l.delta_gpu.
+        copy_gpu(l.outputs*l.batch, l.delta_gpu, 1, l.temp3_gpu, 1);		
+        // temp = tanh(c).
+        copy_gpu(l.outputs*l.batch, l.c_gpu, 1, l.temp_gpu, 1);			
+        activate_array_gpu(l.temp_gpu, l.outputs*l.batch, TANH);			
+        // temp2 = temp3 * o.
+        copy_gpu(l.outputs*l.batch, l.temp3_gpu, 1, l.temp2_gpu, 1);		
+        mul_gpu(l.outputs*l.batch, l.o_gpu, 1, l.temp2_gpu, 1);			
+        // Apply the TANH gradient and add l.dc_gpu. NOTE(review): l.dc_gpu is not reset in this function, so the first iteration uses whatever it already held.
+        gradient_array_gpu(l.temp_gpu, l.outputs*l.batch, TANH, l.temp2_gpu);
+        axpy_gpu(l.outputs*l.batch, 1, l.dc_gpu, 1, l.temp2_gpu, 1);		
+        // Output gate o: temp = tanh(c) * temp3 through the LOGISTIC gradient, then back through wo (input: previous hidden state).
+        copy_gpu(l.outputs*l.batch, l.c_gpu, 1, l.temp_gpu, 1);			
+        activate_array_gpu(l.temp_gpu, l.outputs*l.batch, TANH);			
+        mul_gpu(l.outputs*l.batch, l.temp3_gpu, 1, l.temp_gpu, 1);		
+        gradient_array_gpu(l.o_gpu, l.outputs*l.batch, LOGISTIC, l.temp_gpu);
+        copy_gpu(l.outputs*l.batch, l.temp_gpu, 1, wo.delta_gpu, 1);
+        s.input_gpu = l.prev_state_gpu;
+        s.delta_gpu = l.dh_gpu;															
+        backward_connected_layer_gpu(wo, s);	
+        // The same delta goes through uo, whose input and delta are the layer's own input slices.
+        copy_gpu(l.outputs*l.batch, l.temp_gpu, 1, uo.delta_gpu, 1);
+        s.input_gpu = state.input_gpu;
+        s.delta_gpu = state.delta_gpu;
+        backward_connected_layer_gpu(uo, s);									
+        // Gate g: temp = temp2 * i through the TANH gradient, then back through wg and ug.
+        copy_gpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1);			
+        mul_gpu(l.outputs*l.batch, l.i_gpu, 1, l.temp_gpu, 1);				
+        gradient_array_gpu(l.g_gpu, l.outputs*l.batch, TANH, l.temp_gpu);		
+        copy_gpu(l.outputs*l.batch, l.temp_gpu, 1, wg.delta_gpu, 1);
+        s.input_gpu = l.prev_state_gpu;
+        s.delta_gpu = l.dh_gpu;														
+        backward_connected_layer_gpu(wg, s);	
+
+        copy_gpu(l.outputs*l.batch, l.temp_gpu, 1, ug.delta_gpu, 1);
+        s.input_gpu = state.input_gpu;
+        s.delta_gpu = state.delta_gpu;
+        backward_connected_layer_gpu(ug, s);																
+        // Gate i: temp = temp2 * g through the LOGISTIC gradient, then back through wi and ui.
+        copy_gpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1);			
+        mul_gpu(l.outputs*l.batch, l.g_gpu, 1, l.temp_gpu, 1);				
+        gradient_array_gpu(l.i_gpu, l.outputs*l.batch, LOGISTIC, l.temp_gpu);	
+        copy_gpu(l.outputs*l.batch, l.temp_gpu, 1, wi.delta_gpu, 1);
+        s.input_gpu = l.prev_state_gpu;
+        s.delta_gpu = l.dh_gpu;
+        backward_connected_layer_gpu(wi, s);						
+
+        copy_gpu(l.outputs*l.batch, l.temp_gpu, 1, ui.delta_gpu, 1);
+        s.input_gpu = state.input_gpu;
+        s.delta_gpu = state.delta_gpu;
+        backward_connected_layer_gpu(ui, s);									
+        // Gate f: temp = temp2 * prev_cell through the LOGISTIC gradient, then back through wf and uf.
+        copy_gpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1);		
+        mul_gpu(l.outputs*l.batch, l.prev_cell_gpu, 1, l.temp_gpu, 1);
+        gradient_array_gpu(l.f_gpu, l.outputs*l.batch, LOGISTIC, l.temp_gpu);
+        copy_gpu(l.outputs*l.batch, l.temp_gpu, 1, wf.delta_gpu, 1);
+        s.input_gpu = l.prev_state_gpu;
+        s.delta_gpu = l.dh_gpu;
+        backward_connected_layer_gpu(wf, s);						
+
+        copy_gpu(l.outputs*l.batch, l.temp_gpu, 1, uf.delta_gpu, 1);
+        s.input_gpu = state.input_gpu;
+        s.delta_gpu = state.delta_gpu;
+        backward_connected_layer_gpu(uf, s);									
+        // Value kept in l.dc_gpu for the next (earlier) iteration: temp2 * f.
+        copy_gpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1);			
+        mul_gpu(l.outputs*l.batch, l.f_gpu, 1, l.temp_gpu, 1);				
+        copy_gpu(l.outputs*l.batch, l.temp_gpu, 1, l.dc_gpu, 1);				
+        // Step every cursor back by one time step.
+        state.input_gpu -= l.inputs*l.batch;
+        if (state.delta_gpu) state.delta_gpu -= l.inputs*l.batch;
+        l.output_gpu -= l.outputs*l.batch;
+        l.cell_gpu -= l.outputs*l.batch;
+        l.delta_gpu -= l.outputs*l.batch;
+
+        increment_layer(&wf, -1);
+        increment_layer(&wi, -1);
+        increment_layer(&wg, -1);
+        increment_layer(&wo, -1);
+
+        increment_layer(&uf, -1);
+        increment_layer(&ui, -1);
+        increment_layer(&ug, -1);
+        increment_layer(&uo, -1);
+    }
+}
+#endif
diff --git a/image.darknet/src/lstm_layer.h b/image.darknet/src/lstm_layer.h
new file mode 100644
index 0000000..b9f07e6
--- /dev/null
+++ b/image.darknet/src/lstm_layer.h
@@ -0,0 +1,20 @@
+#ifndef LSTM_LAYER_H
+#define LSTM_LAYER_H
+/* Public interface of the LSTM layer implemented in lstm_layer.c. */
+#include "activations.h"
+#include "layer.h"
+#include "network.h"
+#define USET
+/* Builds an LSTM layer; `batch` is the total batch size and is divided by `steps` inside the constructor. */
+layer make_lstm_layer(int batch, int inputs, int outputs, int steps, int batch_normalize, int adam);
+/* CPU passes. backward_lstm_layer has external linkage in lstm_layer.c, so it is declared here as well. */
+void forward_lstm_layer(layer l, network net);
+void backward_lstm_layer(layer l, network net);
+void update_lstm_layer(layer l, update_args a);
+
+#ifdef GPU
+void forward_lstm_layer_gpu(layer l, network net);
+void backward_lstm_layer_gpu(layer l, network net);
+void update_lstm_layer_gpu(layer l, update_args a);
+#endif
+#endif
diff --git a/image.darknet/src/matrix.c b/image.darknet/src/matrix.c
index ee14979..799916b 100644
--- a/image.darknet/src/matrix.c
+++ b/image.darknet/src/matrix.c
@@ -1,5 +1,6 @@
 #include "matrix.h"
 #include "utils.h"
+#include "blas.h"
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -73,6 +74,20 @@ void matrix_add_matrix(matrix from, matrix to)
     }
 }
 
+matrix copy_matrix(matrix m)
+{   /* Deep copy: a new row-pointer table plus one newly allocated row per source row, filled via copy_cpu. */
+    int r;
+    matrix dup = {0};
+    dup.rows = m.rows;
+    dup.cols = m.cols;
+    dup.vals = calloc(m.rows, sizeof(float *));
+    for(r = 0; r < m.rows; ++r){
+        dup.vals[r] = calloc(m.cols, sizeof(float));
+        copy_cpu(m.cols, m.vals[r], 1, dup.vals[r], 1);
+    }
+    return dup;
+}
+
 matrix make_matrix(int rows, int cols)
 {
     int i;
diff --git a/image.darknet/src/matrix.h b/image.darknet/src/matrix.h
index 641b596..879acd7 100644
--- a/image.darknet/src/matrix.h
+++ b/image.darknet/src/matrix.h
@@ -1,20 +1,11 @@
 #ifndef MATRIX_H
 #define MATRIX_H
-typedef struct matrix{
-    int rows, cols;
-    float **vals;
-} matrix;
+#include "darknet.h"
 
-matrix make_matrix(int rows, int cols);
-void free_matrix(matrix m);
+matrix copy_matrix(matrix m);
 void print_matrix(matrix m);
 
-matrix csv_to_matrix(char *filename);
-void matrix_to_csv(matrix m);
 matrix hold_out_matrix(matrix *m, int n);
-float matrix_topk_accuracy(matrix truth, matrix guess, int k);
-void matrix_add_matrix(matrix from, matrix to);
-void scale_matrix(matrix m, float scale);
 matrix resize_matrix(matrix m, int size);
 
 float *pop_column(matrix *m, int c);
diff --git a/image.darknet/src/maxpool_layer.c b/image.darknet/src/maxpool_layer.c
index 031d116..fb05635 100644
--- a/image.darknet/src/maxpool_layer.c
+++ b/image.darknet/src/maxpool_layer.c
@@ -27,8 +27,8 @@ maxpool_layer make_maxpool_layer(int batch, int h, int w, int c, int size, int s
     l.w = w;
     l.c = c;
     l.pad = padding;
-    l.out_w = (w + 2*padding)/stride;
-    l.out_h = (h + 2*padding)/stride;
+    l.out_w = (w + padding - size)/stride + 1;
+    l.out_h = (h + padding - size)/stride + 1;
     l.out_c = c;
     l.outputs = l.out_h * l.out_w * l.out_c;
     l.inputs = h*w*c;
@@ -43,7 +43,7 @@ maxpool_layer make_maxpool_layer(int batch, int h, int w, int c, int size, int s
     #ifdef GPU
     l.forward_gpu = forward_maxpool_layer_gpu;
     l.backward_gpu = backward_maxpool_layer_gpu;
-    l.indexes_gpu = cuda_make_int_array(output_size);
+    l.indexes_gpu = cuda_make_int_array(0, output_size);
     l.output_gpu  = cuda_make_array(l.output, output_size);
     l.delta_gpu   = cuda_make_array(l.delta, output_size);
     #endif
@@ -57,8 +57,8 @@ void resize_maxpool_layer(maxpool_layer *l, int w, int h)
     l->w = w;
     l->inputs = h*w*l->c;
 
-    l->out_w = (w + 2*l->pad)/l->stride;
-    l->out_h = (h + 2*l->pad)/l->stride;
+    l->out_w = (w + l->pad - l->size)/l->stride + 1;
+    l->out_h = (h + l->pad - l->size)/l->stride + 1;
     l->outputs = l->out_w * l->out_h * l->c;
     int output_size = l->outputs * l->batch;
 
@@ -70,17 +70,17 @@ void resize_maxpool_layer(maxpool_layer *l, int w, int h)
     cuda_free((float *)l->indexes_gpu);
     cuda_free(l->output_gpu);
     cuda_free(l->delta_gpu);
-    l->indexes_gpu = cuda_make_int_array(output_size);
+    l->indexes_gpu = cuda_make_int_array(0, output_size);
     l->output_gpu  = cuda_make_array(l->output, output_size);
     l->delta_gpu   = cuda_make_array(l->delta,  output_size);
     #endif
 }
 
-void forward_maxpool_layer(const maxpool_layer l, network_state state)
+void forward_maxpool_layer(const maxpool_layer l, network net)
 {
     int b,i,j,k,m,n;
-    int w_offset = -l.pad;
-    int h_offset = -l.pad;
+    int w_offset = -l.pad/2;
+    int h_offset = -l.pad/2;
 
     int h = l.out_h;
     int w = l.out_w;
@@ -100,7 +100,7 @@ void forward_maxpool_layer(const maxpool_layer l, network_state state)
                             int index = cur_w + l.w*(cur_h + l.h*(k + b*l.c));
                             int valid = (cur_h >= 0 && cur_h < l.h &&
                                          cur_w >= 0 && cur_w < l.w);
-                            float val = (valid != 0) ? state.input[index] : -FLT_MAX;
+                            float val = (valid != 0) ? net.input[index] : -FLT_MAX;
                             max_i = (val > max) ? index : max_i;
                             max   = (val > max) ? val   : max;
                         }
@@ -113,7 +113,7 @@ void forward_maxpool_layer(const maxpool_layer l, network_state state)
     }
 }
 
-void backward_maxpool_layer(const maxpool_layer l, network_state state)
+void backward_maxpool_layer(const maxpool_layer l, network net)
 {
     int i;
     int h = l.out_h;
@@ -121,7 +121,7 @@ void backward_maxpool_layer(const maxpool_layer l, network_state state)
     int c = l.c;
     for(i = 0; i < h*w*c*l.batch; ++i){
         int index = l.indexes[i];
-        state.delta[index] += l.delta[i];
+        net.delta[index] += l.delta[i];
     }
 }
 
diff --git a/image.darknet/src/maxpool_layer.h b/image.darknet/src/maxpool_layer.h
index ce56dd8..ceb5190 100644
--- a/image.darknet/src/maxpool_layer.h
+++ b/image.darknet/src/maxpool_layer.h
@@ -11,12 +11,12 @@ typedef layer maxpool_layer;
 image get_maxpool_image(maxpool_layer l);
 maxpool_layer make_maxpool_layer(int batch, int h, int w, int c, int size, int stride, int padding);
 void resize_maxpool_layer(maxpool_layer *l, int w, int h);
-void forward_maxpool_layer(const maxpool_layer l, network_state state);
-void backward_maxpool_layer(const maxpool_layer l, network_state state);
+void forward_maxpool_layer(const maxpool_layer l, network net);
+void backward_maxpool_layer(const maxpool_layer l, network net);
 
 #ifdef GPU
-void forward_maxpool_layer_gpu(maxpool_layer l, network_state state);
-void backward_maxpool_layer_gpu(maxpool_layer l, network_state state);
+void forward_maxpool_layer_gpu(maxpool_layer l, network net);
+void backward_maxpool_layer_gpu(maxpool_layer l, network net);
 #endif
 
 #endif
diff --git a/image.darknet/src/maxpool_layer_kernels.cu b/image.darknet/src/maxpool_layer_kernels.cu
index 6381cc1..869ef46 100644
--- a/image.darknet/src/maxpool_layer_kernels.cu
+++ b/image.darknet/src/maxpool_layer_kernels.cu
@@ -9,8 +9,8 @@ extern "C" {
 
 __global__ void forward_maxpool_layer_kernel(int n, int in_h, int in_w, int in_c, int stride, int size, int pad, float *input, float *output, int *indexes)
 {
-    int h = (in_h + 2*pad)/stride;
-    int w = (in_w + 2*pad)/stride;
+    int h = (in_h + pad - size)/stride + 1;
+    int w = (in_w + pad - size)/stride + 1;
     int c = in_c;
 
     int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
@@ -24,8 +24,8 @@ __global__ void forward_maxpool_layer_kernel(int n, int in_h, int in_w, int in_c
     id /= c;
     int b = id;
 
-    int w_offset = -pad;
-    int h_offset = -pad;
+    int w_offset = -pad/2;
+    int h_offset = -pad/2;
 
     int out_index = j + w*(i + h*(k + c*b));
     float max = -INFINITY;
@@ -49,8 +49,8 @@ __global__ void forward_maxpool_layer_kernel(int n, int in_h, int in_w, int in_c
 
 __global__ void backward_maxpool_layer_kernel(int n, int in_h, int in_w, int in_c, int stride, int size, int pad, float *delta, float *prev_delta, int *indexes)
 {
-    int h = (in_h + 2*pad)/stride;
-    int w = (in_w + 2*pad)/stride;
+    int h = (in_h + pad - size)/stride + 1;
+    int w = (in_w + pad - size)/stride + 1;
     int c = in_c;
     int area = (size-1)/stride;
 
@@ -66,8 +66,8 @@ __global__ void backward_maxpool_layer_kernel(int n, int in_h, int in_w, int in_
     id /= in_c;
     int b = id;
 
-    int w_offset = -pad;
-    int h_offset = -pad;
+    int w_offset = -pad/2;
+    int h_offset = -pad/2;
 
     float d = 0;
     int l, m;
@@ -84,7 +84,7 @@ __global__ void backward_maxpool_layer_kernel(int n, int in_h, int in_w, int in_
     prev_delta[index] += d;
 }
 
-extern "C" void forward_maxpool_layer_gpu(maxpool_layer layer, network_state state)
+extern "C" void forward_maxpool_layer_gpu(maxpool_layer layer, network net)
 {
     int h = layer.out_h;
     int w = layer.out_w;
@@ -92,15 +92,15 @@ extern "C" void forward_maxpool_layer_gpu(maxpool_layer layer, network_state sta
 
     size_t n = h*w*c*layer.batch;
 
-    forward_maxpool_layer_kernel<<<cuda_gridsize(n), BLOCK>>>(n, layer.h, layer.w, layer.c, layer.stride, layer.size, layer.pad, state.input, layer.output_gpu, layer.indexes_gpu);
+    forward_maxpool_layer_kernel<<<cuda_gridsize(n), BLOCK>>>(n, layer.h, layer.w, layer.c, layer.stride, layer.size, layer.pad, net.input_gpu, layer.output_gpu, layer.indexes_gpu);
     check_error(cudaPeekAtLastError());
 }
 
-extern "C" void backward_maxpool_layer_gpu(maxpool_layer layer, network_state state)
+extern "C" void backward_maxpool_layer_gpu(maxpool_layer layer, network net)
 {
     size_t n = layer.h*layer.w*layer.c*layer.batch;
 
-    backward_maxpool_layer_kernel<<<cuda_gridsize(n), BLOCK>>>(n, layer.h, layer.w, layer.c, layer.stride, layer.size, layer.pad, layer.delta_gpu, state.delta, layer.indexes_gpu);
+    backward_maxpool_layer_kernel<<<cuda_gridsize(n), BLOCK>>>(n, layer.h, layer.w, layer.c, layer.stride, layer.size, layer.pad, layer.delta_gpu, net.delta_gpu, layer.indexes_gpu);
     check_error(cudaPeekAtLastError());
 }
 
diff --git a/image.darknet/src/network.c b/image.darknet/src/network.c
index 0914e37..aaab799 100644
--- a/image.darknet/src/network.c
+++ b/image.darknet/src/network.c
@@ -17,6 +17,7 @@
 #include "activation_layer.h"
 #include "detection_layer.h"
 #include "region_layer.h"
+#include "yolo_layer.h"
 #include "normalization_layer.h"
 #include "batchnorm_layer.h"
 #include "maxpool_layer.h"
@@ -26,55 +27,95 @@
 #include "softmax_layer.h"
 #include "dropout_layer.h"
 #include "route_layer.h"
+#include "upsample_layer.h"
 #include "shortcut_layer.h"
+#include "parser.h"
+#include "data.h"
+
+load_args get_base_args(network *net)  // build a load_args pre-filled from fields of *net
+{
+    load_args args = {0};  // every field not assigned below stays zero
+    args.w = net->w;
+    args.h = net->h;
+    args.size = net->w;  // note: size is taken from the width, not the height
+
+    args.min = net->min_crop;
+    args.max = net->max_crop;
+    args.angle = net->angle;
+    args.aspect = net->aspect;
+    args.exposure = net->exposure;
+    args.center = net->center;
+    args.saturation = net->saturation;
+    args.hue = net->hue;
+    return args;  // returned by value
+}
+
+network *load_network(char *cfg, char *weights, int clear)  // parse cfg, optionally load weights, optionally reset the seen counter
+{
+    network *net = parse_network_cfg(cfg);  // NOTE(review): result is dereferenced below without a NULL check
+    if(weights && weights[0] != 0){  // skip loading when weights is NULL or an empty string
+        load_weights(net, weights);
+    }
+    if(clear) (*net->seen) = 0;  // non-zero clear zeroes *net->seen
+    return net;
+}
 
-int get_current_batch(network net)
+size_t get_current_batch(network *net)
 {
-    int batch_num = (*net.seen)/(net.batch*net.subdivisions);
+    size_t batch_num = (*net->seen)/(net->batch*net->subdivisions);
     return batch_num;
 }
 
-void reset_momentum(network net)
+void reset_network_state(network *net, int b)
 {
-    if (net.momentum == 0) return;
-    net.learning_rate = 0;
-    net.momentum = 0;
-    net.decay = 0;
-    #ifdef GPU
-        //if(net.gpu_index >= 0) update_network_gpu(net);
-    #endif
+    int i;
+    for (i = 0; i < net->n; ++i) {
+        #ifdef GPU
+        layer l = net->layers[i];
+        if(l.state_gpu){
+            fill_gpu(l.outputs, 0, l.state_gpu + l.outputs*b, 1);
+        }
+        if(l.h_gpu){
+            fill_gpu(l.outputs, 0, l.h_gpu + l.outputs*b, 1);
+        }
+        #endif
+    }
 }
 
-float get_current_rate(network net)
+void reset_rnn(network *net)
 {
-    int batch_num = get_current_batch(net);
+    reset_network_state(net, 0);
+}
+
+float get_current_rate(network *net)  // learning rate for the current batch under net->policy
+{
+    size_t batch_num = get_current_batch(net);  // unsigned: cast before any subtraction that may go negative
     int i;
     float rate;
-    switch (net.policy) {
+    if (batch_num < net->burn_in) return net->learning_rate * pow((float)batch_num / net->burn_in, net->power);  // burn-in ramp now precedes every policy
+    switch (net->policy) {
         case CONSTANT:
-            return net.learning_rate;
+            return net->learning_rate;
         case STEP:
-            return net.learning_rate * pow(net.scale, batch_num/net.step);
+            return net->learning_rate * pow(net->scale, batch_num/net->step);
         case STEPS:
-            rate = net.learning_rate;
-            for(i = 0; i < net.num_steps; ++i){
-                if(net.steps[i] > batch_num) return rate;
-                rate *= net.scales[i];
-                //if(net.steps[i] > batch_num - 1 && net.scales[i] > 1) reset_momentum(net);
+            rate = net->learning_rate;
+            for(i = 0; i < net->num_steps; ++i){
+                if(net->steps[i] > batch_num) return rate;  // stop at the first step not yet reached
+                rate *= net->scales[i];
             }
             return rate;
         case EXP:
-            return net.learning_rate * pow(net.gamma, batch_num);
+            return net->learning_rate * pow(net->gamma, batch_num);
         case POLY:
-            if (batch_num < net.burn_in) return net.learning_rate * pow((float)batch_num / net.burn_in, net.power);
-            return net.learning_rate * pow(1 - (float)batch_num / net.max_batches, net.power);
+            return net->learning_rate * pow(1 - (float)batch_num / net->max_batches, net->power);
         case RANDOM:
-            return net.learning_rate * pow(rand_uniform(0,1), net.power);
+            return net->learning_rate * pow(rand_uniform(0,1), net->power);
         case SIG:
-            return net.learning_rate * (1./(1.+exp(net.gamma*(batch_num - net.step))));
+            return net->learning_rate * (1./(1.+exp(net->gamma*((double)batch_num - net->step))));  // cast first: size_t minus step would wrap when batch_num < step
         default:
             fprintf(stderr, "Policy is weird!\n");
-            return net.learning_rate;
+            return net->learning_rate;  // unknown policy falls back to the base rate
     }
 }
 
@@ -95,6 +136,8 @@ char *get_layer_string(LAYER_TYPE a)
             return "rnn";
         case GRU:
             return "gru";
+        case LSTM:
+	    return "lstm";
         case CRNN:
             return "crnn";
         case MAXPOOL:
@@ -109,6 +152,8 @@ char *get_layer_string(LAYER_TYPE a)
             return "detection";
         case REGION:
             return "region";
+        case YOLO:
+            return "yolo";
         case DROPOUT:
             return "dropout";
         case CROP:
@@ -129,59 +174,75 @@ char *get_layer_string(LAYER_TYPE a)
     return "none";
 }
 
-network make_network(int n)
+network *make_network(int n)
 {
-    network net = {0};
-    net.n = n;
-    net.layers = calloc(net.n, sizeof(layer));
-    net.seen = calloc(1, sizeof(int));
-    #ifdef GPU
-    net.input_gpu = calloc(1, sizeof(float *));
-    net.truth_gpu = calloc(1, sizeof(float *));
-    #endif
+    network *net = calloc(1, sizeof(network));
+    net->n = n;
+    net->layers = calloc(net->n, sizeof(layer));
+    net->seen = calloc(1, sizeof(size_t));
+    net->t    = calloc(1, sizeof(int));
+    net->cost = calloc(1, sizeof(float));
     return net;
 }
 
-void forward_network(network net, network_state state)
+void forward_network(network *netp)
 {
-    state.workspace = net.workspace;
+#ifdef GPU
+    if(netp->gpu_index >= 0){
+        forward_network_gpu(netp);   
+        return;
+    }
+#endif
+    network net = *netp;
     int i;
     for(i = 0; i < net.n; ++i){
-        state.index = i;
+        net.index = i;
         layer l = net.layers[i];
         if(l.delta){
-            scal_cpu(l.outputs * l.batch, 0, l.delta, 1);
+            fill_cpu(l.outputs * l.batch, 0, l.delta, 1);
+        }
+        l.forward(l, net);
+        net.input = l.output;
+        if(l.truth) {
+            net.truth = l.output;
         }
-        l.forward(l, state);
-        state.input = l.output;
     }
+    calc_network_cost(netp);
 }
 
-void update_network(network net)
+void update_network(network *netp)
 {
+#ifdef GPU
+    if(netp->gpu_index >= 0){
+        update_network_gpu(netp);   
+        return;
+    }
+#endif
+    network net = *netp;
     int i;
-    int update_batch = net.batch*net.subdivisions;
-    float rate = get_current_rate(net);
+    update_args a = {0};
+    a.batch = net.batch*net.subdivisions;
+    a.learning_rate = get_current_rate(netp);
+    a.momentum = net.momentum;
+    a.decay = net.decay;
+    a.adam = net.adam;
+    a.B1 = net.B1;
+    a.B2 = net.B2;
+    a.eps = net.eps;
+    ++*net.t;
+    a.t = *net.t;
+
     for(i = 0; i < net.n; ++i){
         layer l = net.layers[i];
         if(l.update){
-            l.update(l, update_batch, rate, net.momentum, net.decay);
+            l.update(l, a);
         }
     }
 }
 
-float *get_network_output(network net)
-{
-#ifdef GPU
-    if (gpu_index >= 0) return get_network_output_gpu(net);
-#endif 
-    int i;
-    for(i = net.n-1; i > 0; --i) if(net.layers[i].type != COST) break;
-    return net.layers[i].output;
-}
-
-float get_network_cost(network net)
+void calc_network_cost(network *netp)
 {
+    network net = *netp;
     int i;
     float sum = 0;
     int count = 0;
@@ -191,120 +252,90 @@ float get_network_cost(network net)
             ++count;
         }
     }
-    return sum/count;
+    *net.cost = sum/count;
 }
 
-int get_predicted_class_network(network net)
+int get_predicted_class_network(network *net)
 {
-    float *out = get_network_output(net);
-    int k = get_network_output_size(net);
-    return max_index(out, k);
+    return max_index(net->output, net->outputs);
 }
 
-void backward_network(network net, network_state state)
+void backward_network(network *netp)
 {
+#ifdef GPU
+    if(netp->gpu_index >= 0){
+        backward_network_gpu(netp);   
+        return;
+    }
+#endif
+    network net = *netp;
     int i;
-    float *original_input = state.input;
-    float *original_delta = state.delta;
-    state.workspace = net.workspace;
+    network orig = net;
     for(i = net.n-1; i >= 0; --i){
-        state.index = i;
+        layer l = net.layers[i];
+        if(l.stopbackward) break;
         if(i == 0){
-            state.input = original_input;
-            state.delta = original_delta;
+            net = orig;
         }else{
             layer prev = net.layers[i-1];
-            state.input = prev.output;
-            state.delta = prev.delta;
+            net.input = prev.output;
+            net.delta = prev.delta;
         }
-        layer l = net.layers[i];
-        l.backward(l, state);
+        net.index = i;
+        l.backward(l, net);
     }
 }
 
-float train_network_datum(network net, float *x, float *y)
+float train_network_datum(network *net)
 {
-#ifdef GPU
-    if(gpu_index >= 0) return train_network_datum_gpu(net, x, y);
-#endif
-    network_state state;
-    *net.seen += net.batch;
-    state.index = 0;
-    state.net = net;
-    state.input = x;
-    state.delta = 0;
-    state.truth = y;
-    state.train = 1;
-    forward_network(net, state);
-    backward_network(net, state);
-    float error = get_network_cost(net);
-    if(((*net.seen)/net.batch)%net.subdivisions == 0) update_network(net);
+    *net->seen += net->batch;
+    net->train = 1;
+    forward_network(net);
+    backward_network(net);
+    float error = *net->cost;
+    if(((*net->seen)/net->batch)%net->subdivisions == 0) update_network(net);
     return error;
 }
 
-float train_network_sgd(network net, data d, int n)
+float train_network_sgd(network *net, data d, int n)
 {
-    int batch = net.batch;
-    float *X = calloc(batch*d.X.cols, sizeof(float));
-    float *y = calloc(batch*d.y.cols, sizeof(float));
+    int batch = net->batch;
 
     int i;
     float sum = 0;
     for(i = 0; i < n; ++i){
-        get_random_batch(d, batch, X, y);
-        float err = train_network_datum(net, X, y);
+        get_random_batch(d, batch, net->input, net->truth);
+        float err = train_network_datum(net);
         sum += err;
     }
-    free(X);
-    free(y);
     return (float)sum/(n*batch);
 }
 
-float train_network(network net, data d)
+float train_network(network *net, data d)
 {
-    assert(d.X.rows % net.batch == 0);
-    int batch = net.batch;
+    assert(d.X.rows % net->batch == 0);
+    int batch = net->batch;
     int n = d.X.rows / batch;
-    float *X = calloc(batch*d.X.cols, sizeof(float));
-    float *y = calloc(batch*d.y.cols, sizeof(float));
 
     int i;
     float sum = 0;
     for(i = 0; i < n; ++i){
-        get_next_batch(d, batch, i*batch, X, y);
-        float err = train_network_datum(net, X, y);
+        get_next_batch(d, batch, i*batch, net->input, net->truth);
+        float err = train_network_datum(net);
         sum += err;
     }
-    free(X);
-    free(y);
     return (float)sum/(n*batch);
 }
 
-
-float train_network_batch(network net, data d, int n)
+void set_temp_network(network *net, float t)
 {
-    int i,j;
-    network_state state;
-    state.index = 0;
-    state.net = net;
-    state.train = 1;
-    state.delta = 0;
-    float sum = 0;
-    int batch = 2;
-    for(i = 0; i < n; ++i){
-        for(j = 0; j < batch; ++j){
-            int index = rand()%d.X.rows;
-            state.input = d.X.vals[index];
-            state.truth = d.y.vals[index];
-            forward_network(net, state);
-            backward_network(net, state);
-            sum += get_network_cost(net);
-        }
-        update_network(net);
+    int i;
+    for(i = 0; i < net->n; ++i){
+        net->layers[i].temperature = t;
     }
-    return (float)sum/(n*batch);
 }
 
+
 void set_batch_network(network *net, int b)
 {
     net->batch = b;
@@ -315,6 +346,11 @@ void set_batch_network(network *net, int b)
         if(net->layers[i].type == CONVOLUTIONAL){
             cudnn_convolutional_setup(net->layers + i);
         }
+        if(net->layers[i].type == DECONVOLUTIONAL){
+            layer *l = net->layers + i;
+            cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l->out_c, l->out_h, l->out_w);
+            cudnnSetTensor4dDescriptor(l->normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l->out_c, 1, 1); 
+        }
 #endif
     }
 }
@@ -323,9 +359,7 @@ int resize_network(network *net, int w, int h)
 {
 #ifdef GPU
     cuda_set_device(net->gpu_index);
-    if(gpu_index >= 0){
-        cuda_free(net->workspace);
-    }
+    cuda_free(net->workspace);
 #endif
     int i;
     //if(w == net->w && h == net->h) return 0;
@@ -345,8 +379,14 @@ int resize_network(network *net, int w, int h)
             resize_maxpool_layer(&l, w, h);
         }else if(l.type == REGION){
             resize_region_layer(&l, w, h);
+        }else if(l.type == YOLO){
+            resize_yolo_layer(&l, w, h);
         }else if(l.type == ROUTE){
             resize_route_layer(&l, net);
+        }else if(l.type == SHORTCUT){
+            resize_shortcut_layer(&l, w, h);
+        }else if(l.type == UPSAMPLE){
+            resize_upsample_layer(&l, w, h);
         }else if(l.type == REORG){
             resize_reorg_layer(&l, w, h);
         }else if(l.type == AVGPOOL){
@@ -359,21 +399,32 @@ int resize_network(network *net, int w, int h)
             error("Cannot resize this type of layer");
         }
         if(l.workspace_size > workspace_size) workspace_size = l.workspace_size;
+        if(l.workspace_size > 2000000000) assert(0);
         inputs = l.outputs;
         net->layers[i] = l;
         w = l.out_w;
         h = l.out_h;
         if(l.type == AVGPOOL) break;
     }
+    layer out = get_network_output_layer(net);
+    net->inputs = net->layers[0].inputs;
+    net->outputs = out.outputs;
+    net->truths = out.outputs;
+    if(net->layers[net->n-1].truths) net->truths = net->layers[net->n-1].truths;
+    net->output = out.output;
+    free(net->input);
+    free(net->truth);
+    net->input = calloc(net->inputs*net->batch, sizeof(float));
+    net->truth = calloc(net->truths*net->batch, sizeof(float));
 #ifdef GPU
     if(gpu_index >= 0){
-        if(net->input_gpu) {
-            cuda_free(*net->input_gpu);
-            *net->input_gpu = 0;
-            cuda_free(*net->truth_gpu);
-            *net->truth_gpu = 0;
+        cuda_free(net->input_gpu);
+        cuda_free(net->truth_gpu);
+        net->input_gpu = cuda_make_array(net->input, net->inputs*net->batch);
+        net->truth_gpu = cuda_make_array(net->truth, net->truths*net->batch);
+        if(workspace_size){
+            net->workspace = cuda_make_array(0, (workspace_size-1)/sizeof(float)+1);
         }
-        net->workspace = cuda_make_array(0, (workspace_size-1)/sizeof(float)+1);
     }else {
         free(net->workspace);
         net->workspace = calloc(1, workspace_size);
@@ -386,34 +437,25 @@ int resize_network(network *net, int w, int h)
     return 0;
 }
 
-int get_network_output_size(network net)
+layer get_network_detection_layer(network *net)
 {
     int i;
-    for(i = net.n-1; i > 0; --i) if(net.layers[i].type != COST) break;
-    return net.layers[i].outputs;
-}
-
-int get_network_input_size(network net)
-{
-    return net.layers[0].inputs;
-}
-
-detection_layer get_network_detection_layer(network net)
-{
-    int i;
-    for(i = 0; i < net.n; ++i){
-        if(net.layers[i].type == DETECTION){
-            return net.layers[i];
+    for(i = 0; i < net->n; ++i){
+        if(net->layers[i].type == DETECTION){
+            return net->layers[i];
         }
     }
     fprintf(stderr, "Detection layer not found!!\n");
-    detection_layer l = {0};
+    layer l = {0};
     return l;
 }
 
-image get_network_image_layer(network net, int i)
+image get_network_image_layer(network *net, int i)
 {
-    layer l = net.layers[i];
+    layer l = net->layers[i];
+#ifdef GPU
+    //cuda_pull_array(l.output_gpu, l.output, l.outputs);
+#endif
     if (l.out_w && l.out_h && l.out_c){
         return float_to_image(l.out_w, l.out_h, l.out_c, l.output);
     }
@@ -421,10 +463,10 @@ image get_network_image_layer(network net, int i)
     return def;
 }
 
-image get_network_image(network net)
+image get_network_image(network *net)
 {
     int i;
-    for(i = net.n-1; i >= 0; --i){
+    for(i = net->n-1; i >= 0; --i){
         image m = get_network_image_layer(net, i);
         if(m.h != 0) return m;
     }
@@ -432,60 +474,134 @@ image get_network_image(network net)
     return def;
 }
 
-void visualize_network(network net)
+void visualize_network(network *net)
 {
     image *prev = 0;
     int i;
     char buff[256];
-    for(i = 0; i < net.n; ++i){
+    for(i = 0; i < net->n; ++i){
         sprintf(buff, "Layer %d", i);
-        layer l = net.layers[i];
+        layer l = net->layers[i];
         if(l.type == CONVOLUTIONAL){
             prev = visualize_convolutional_layer(l, buff, prev);
         }
     } 
 }
 
-void top_predictions(network net, int k, int *index)
+void top_predictions(network *net, int k, int *index)
 {
-    int size = get_network_output_size(net);
-    float *out = get_network_output(net);
-    top_k(out, size, k, index);
+    top_k(net->output, net->outputs, k, index);
 }
 
 
-float *network_predict(network net, float *input)
+float *network_predict(network *net, float *input)
 {
-#ifdef GPU
-    if(gpu_index >= 0)  return network_predict_gpu(net, input);
-#endif
-
-    network_state state;
-    state.net = net;
-    state.index = 0;
-    state.input = input;
-    state.truth = 0;
-    state.train = 0;
-    state.delta = 0;
-    forward_network(net, state);
-    float *out = get_network_output(net);
+    network orig = *net;
+    net->input = input;
+    net->truth = 0;
+    net->train = 0;
+    net->delta = 0;
+    forward_network(net);
+    float *out = net->output;
+    *net = orig;
     return out;
 }
 
-matrix network_predict_data_multi(network net, data test, int n)
+int num_detections(network *net, float thresh)  // total detection slots over all YOLO, DETECTION and REGION layers
+{
+    int i;
+    int s = 0;  // running total
+    for(i = 0; i < net->n; ++i){
+        layer l = net->layers[i];
+        if(l.type == YOLO){
+            s += yolo_num_detections(l, thresh);  // thresh is only forwarded on this path
+        }
+        if(l.type == DETECTION || l.type == REGION){
+            s += l.w*l.h*l.n;  // fixed w*h*n slots per layer, independent of thresh
+        }
+    }
+    return s;
+}
+
+detection *make_network_boxes(network *net, float thresh, int *num)  // allocate one zeroed detection per slot counted by num_detections
+{
+    layer l = net->layers[net->n - 1];  // prob/mask sizes are taken from the LAST layer only
+    int i;
+    int nboxes = num_detections(net, thresh);
+    if(num) *num = nboxes;  // num is optional; when given it receives the box count
+    detection *dets = calloc(nboxes, sizeof(detection));  // NOTE(review): result is used below without a NULL check
+    for(i = 0; i < nboxes; ++i){
+        dets[i].prob = calloc(l.classes, sizeof(float));  // one float per class
+        if(l.coords > 4){
+            dets[i].mask = calloc(l.coords-4, sizeof(float));  // otherwise mask keeps its zeroed value from calloc
+        }
+    }
+    return dets;  // caller releases with free_detections(dets, nboxes)
+}
+
+void fill_network_boxes(network *net, int w, int h, float thresh, float hier, int *map, int relative, detection *dets)  // write each detection-type layer's boxes into dets, in layer order
+{
+    int j;
+    for(j = 0; j < net->n; ++j){
+        layer l = net->layers[j];
+        if(l.type == YOLO){
+            int count = get_yolo_detections(l, w, h, net->w, net->h, thresh, map, relative, dets);
+            dets += count;  // advance past what this layer reported writing
+        }
+        if(l.type == REGION){
+            get_region_detections(l, w, h, net->w, net->h, thresh, map, hier, relative, dets);  // hier is only used on this path
+            dets += l.w*l.h*l.n;  // fixed advance, same w*h*n as num_detections counts
+        }
+        if(l.type == DETECTION){
+            get_detection_detections(l, w, h, thresh, dets);
+            dets += l.w*l.h*l.n;  // fixed advance, same w*h*n as num_detections counts
+        }
+    }
+}
+
+detection *get_network_boxes(network *net, int w, int h, float thresh, float hier, int *map, int relative, int *num)  // allocate then fill the detection array in one call
+{
+    detection *dets = make_network_boxes(net, thresh, num);  // *num (if non-NULL) receives the box count
+    fill_network_boxes(net, w, h, thresh, hier, map, relative, dets);
+    return dets;  // caller owns the array; release with free_detections
+}
+
+void free_detections(detection *dets, int n)  // release n detections and the array holding them
+{
+    int i;
+    for(i = 0; i < n; ++i){
+        free(dets[i].prob);
+        if(dets[i].mask) free(dets[i].mask);  // mask is only allocated for some layers
+    }
+    free(dets);  // the array itself goes last
+}
+
+float *network_predict_image(network *net, image im)  // letterbox im to the network input size and run one prediction
+{
+    image imr = letterbox_image(im, net->w, net->h);  // temporary copy; im itself is not freed here
+    set_batch_network(net, 1);  // side effect: the batch size set on net is not restored afterwards
+    float *p = network_predict(net, imr.data);
+    free_image(imr);  // release only the letterboxed copy
+    return p;  // pointer returned by network_predict
+}
+
+int network_width(network *net){return net->w;}
+int network_height(network *net){return net->h;}
+
+matrix network_predict_data_multi(network *net, data test, int n)  // average n predictions per row of test.X
 {
     int i,j,b,m;
-    int k = get_network_output_size(net);
+    int k = net->outputs;  // number of outputs per sample
     matrix pred = make_matrix(test.X.rows, k);
-    float *X = calloc(net.batch*test.X.rows, sizeof(float));
-    for(i = 0; i < test.X.rows; i += net.batch){
-        for(b = 0; b < net.batch; ++b){
+    float *X = calloc(net->batch*test.X.cols, sizeof(float));  // one batch of inputs: batch rows of cols floats (was sized by rows, which could overflow)
+    for(i = 0; i < test.X.rows; i += net->batch){
+        for(b = 0; b < net->batch; ++b){
             if(i+b == test.X.rows) break;
             memcpy(X+b*test.X.cols, test.X.vals[i+b], test.X.cols*sizeof(float));
         }
         for(m = 0; m < n; ++m){
             float *out = network_predict(net, X);
-            for(b = 0; b < net.batch; ++b){
+            for(b = 0; b < net->batch; ++b){  // accumulate each pass's share (out/n) into pred
                 if(i+b == test.X.rows) break;
                 for(j = 0; j < k; ++j){
                     pred.vals[i+b][j] += out[j+b*k]/n;
@@ -497,19 +613,19 @@ matrix network_predict_data_multi(network net, data test, int n)
     return pred;   
 }
 
-matrix network_predict_data(network net, data test)
+matrix network_predict_data(network *net, data test)
 {
     int i,j,b;
-    int k = get_network_output_size(net);
+    int k = net->outputs;
     matrix pred = make_matrix(test.X.rows, k);
-    float *X = calloc(net.batch*test.X.cols, sizeof(float));
-    for(i = 0; i < test.X.rows; i += net.batch){
-        for(b = 0; b < net.batch; ++b){
+    float *X = calloc(net->batch*test.X.cols, sizeof(float));
+    for(i = 0; i < test.X.rows; i += net->batch){
+        for(b = 0; b < net->batch; ++b){
             if(i+b == test.X.rows) break;
             memcpy(X+b*test.X.cols, test.X.vals[i+b], test.X.cols*sizeof(float));
         }
         float *out = network_predict(net, X);
-        for(b = 0; b < net.batch; ++b){
+        for(b = 0; b < net->batch; ++b){
             if(i+b == test.X.rows) break;
             for(j = 0; j < k; ++j){
                 pred.vals[i+b][j] = out[j+b*k];
@@ -520,11 +636,11 @@ matrix network_predict_data(network net, data test)
     return pred;   
 }
 
-void print_network(network net)
+void print_network(network *net)
 {
     int i,j;
-    for(i = 0; i < net.n; ++i){
-        layer l = net.layers[i];
+    for(i = 0; i < net->n; ++i){
+        layer l = net->layers[i];
         float *output = l.output;
         int n = l.outputs;
         float mean = mean_array(output, n);
@@ -537,7 +653,7 @@ void print_network(network net)
     }
 }
 
-void compare_networks(network n1, network n2, data test)
+void compare_networks(network *n1, network *n2, data test)
 {
     matrix g1 = network_predict_data(n1, test);
     matrix g2 = network_predict_data(n2, test);
@@ -562,7 +678,7 @@ void compare_networks(network n1, network n2, data test)
     printf("%f\n", num/den); 
 }
 
-float network_accuracy(network net, data d)
+float network_accuracy(network *net, data d)
 {
     matrix guess = network_predict_data(net, d);
     float acc = matrix_topk_accuracy(d.y, guess,1);
@@ -570,7 +686,7 @@ float network_accuracy(network net, data d)
     return acc;
 }
 
-float *network_accuracies(network net, data d, int n)
+float *network_accuracies(network *net, data d, int n)
 {
     static float acc[2];
     matrix guess = network_predict_data(net, d);
@@ -580,7 +696,16 @@ float *network_accuracies(network net, data d, int n)
     return acc;
 }
 
-float network_accuracy_multi(network net, data d, int n)
+layer get_network_output_layer(network *net)  // last layer whose type is not COST (layer 0 if none); assumes net->n >= 1
+{
+    int i;
+    for(i = net->n - 1; i > 0; --i){  // stop at 0 so an all-COST network cannot index layers[-1]
+        if(net->layers[i].type != COST) break;
+    }
+    return net->layers[i];  // returned by value
+}
+
+float network_accuracy_multi(network *net, data d, int n)
 {
     matrix guess = network_predict_data_multi(net, d, n);
     float acc = matrix_topk_accuracy(d.y, guess,1);
@@ -588,17 +713,417 @@ float network_accuracy_multi(network net, data d, int n)
     return acc;
 }
 
-void free_network(network net)
+void free_network(network *net) // Releases every layer, the layer array, the input/truth buffers and finally the network struct itself.
 {
     int i;
-    for(i = 0; i < net.n; ++i){
-        free_layer(net.layers[i]);
+    for(i = 0; i < net->n; ++i){
+        free_layer(net->layers[i]);
     }
-    free(net.layers);
+    free(net->layers);
+    if(net->input) free(net->input);
+    if(net->truth) free(net->truth);
 #ifdef GPU
-    if(*net.input_gpu) cuda_free(*net.input_gpu);
-    if(*net.truth_gpu) cuda_free(*net.truth_gpu);
-    if(net.input_gpu) free(net.input_gpu);
-    if(net.truth_gpu) free(net.truth_gpu);
+    if(net->input_gpu) cuda_free(net->input_gpu); // GPU-side buffers are only released when the pointer is non-null
+    if(net->truth_gpu) cuda_free(net->truth_gpu);
 #endif
+    free(net); // the struct itself is freed here, so the caller's pointer must not be used afterwards
+}
+
+// Some day...
+// ^ NOTE: the purpose of the "Some day..." placeholder comment above is unclear.
+
+
+layer network_output_layer(network *net) // Returns (by value) the last layer whose type is not COST; assumes net->n >= 1.
+{
+    int i;
+    for(i = net->n - 1; i > 0; --i){ // stop at index 0: if every later layer is COST we fall back to layers[0] instead of reading layers[-1]
+        if(net->layers[i].type != COST) break;
+    }
+    return net->layers[i];
 }
+
+int network_inputs(network *net) // Returns the `inputs` field of the first layer (layers[0]).
+{
+    return net->layers[0].inputs;
+}
+
+int network_outputs(network *net) // Returns the `outputs` field of the layer chosen by network_output_layer().
+{
+    return network_output_layer(net).outputs;
+}
+
+float *network_output(network *net) // Returns the `output` pointer of the layer chosen by network_output_layer(); the buffer is not copied.
+{
+    return network_output_layer(net).output;
+}
+
+#ifdef GPU
+
+void forward_network_gpu(network *netp) // Calls forward_gpu on every layer in order, then pulls the network output and computes the cost.
+{
+    network net = *netp; // by-value copy: the input/truth rewiring below changes only this local struct, not *netp's fields
+    cuda_set_device(net.gpu_index);
+    cuda_push_array(net.input_gpu, net.input, net.inputs*net.batch); // presumably host -> device copy of the input batch
+    if(net.truth){
+        cuda_push_array(net.truth_gpu, net.truth, net.truths*net.batch); // truth is only pushed when a host truth buffer exists
+    }
+
+    int i;
+    for(i = 0; i < net.n; ++i){
+        net.index = i; // lets the layer see its own position in the network
+        layer l = net.layers[i];
+        if(l.delta_gpu){
+            fill_gpu(l.outputs * l.batch, 0, l.delta_gpu, 1); // fill the layer's delta buffer with 0 before its forward pass
+        }
+        l.forward_gpu(l, net);
+        net.input_gpu = l.output_gpu; // this layer's output becomes the input handed to the next layer
+        net.input = l.output;
+        if(l.truth) {
+            net.truth_gpu = l.output_gpu; // a layer with `truth` set also supplies the truth for the layers after it
+            net.truth = l.output;
+        }
+    }
+    pull_network_output(netp); // uses the original pointer, not the locally rewired copy
+    calc_network_cost(netp);
+}
+
+void backward_network_gpu(network *netp) // Calls backward_gpu on the layers from last to first, stopping at the first layer with stopbackward set.
+{
+    int i;
+    network net = *netp; // by-value working copy whose input/delta pointers are rewired per layer
+    network orig = net; // untouched copy, restored when layer 0 is reached
+    cuda_set_device(net.gpu_index);
+    for(i = net.n-1; i >= 0; --i){
+        layer l = net.layers[i];
+        if(l.stopbackward) break; // checked before the call, so this layer's backward_gpu is not run either
+        if(i == 0){
+            net = orig; // the first layer gets the network's own input/delta pointers
+        }else{
+            layer prev = net.layers[i-1];
+            net.input = prev.output; // every other layer gets the preceding layer's output and delta buffers
+            net.delta = prev.delta;
+            net.input_gpu = prev.output_gpu;
+            net.delta_gpu = prev.delta_gpu;
+        }
+        net.index = i; // set after the possible `net = orig` reset so the index is always current
+        l.backward_gpu(l, net);
+    }
+}
+
+void update_network_gpu(network *netp) // Builds one update_args from the network's hyper-parameters and calls update_gpu on every layer that provides it.
+{
+    network net = *netp;
+    cuda_set_device(net.gpu_index);
+    int i;
+    update_args a = {0};
+    a.batch = net.batch*net.subdivisions; // batch size for the update is batch * subdivisions
+    a.learning_rate = get_current_rate(netp);
+    a.momentum = net.momentum;
+    a.decay = net.decay;
+    a.adam = net.adam;
+    a.B1 = net.B1;
+    a.B2 = net.B2;
+    a.eps = net.eps;
+    ++*net.t; // net is a copy but t is a pointer, so this increments the counter shared with *netp
+    a.t = (*net.t);
+
+    for(i = 0; i < net.n; ++i){
+        layer l = net.layers[i];
+        if(l.update_gpu){ // layers without an update_gpu callback are skipped
+            l.update_gpu(l, a);
+        }
+    }
+}
+
+void harmless_update_network_gpu(network *netp) // Fills each layer's GPU weight/bias/scale update buffers with 0; no update_gpu callback is invoked.
+{
+    network net = *netp;
+    cuda_set_device(net.gpu_index);
+    int i;
+    for(i = 0; i < net.n; ++i){
+        layer l = net.layers[i];
+        if(l.weight_updates_gpu) fill_gpu(l.nweights, 0, l.weight_updates_gpu, 1); // each buffer is touched only if it exists
+        if(l.bias_updates_gpu) fill_gpu(l.nbiases, 0, l.bias_updates_gpu, 1);
+        if(l.scale_updates_gpu) fill_gpu(l.nbiases, 0, l.scale_updates_gpu, 1); // scale updates use nbiases as their length
+    }
+}
+
+typedef struct { // Argument bundle handed to train_thread through pthread_create's void* parameter.
+    network *net; // network to train
+    data d; // data passed to train_network
+    float *err; // where the thread stores train_network's return value
+} train_args;
+
+void *train_thread(void *ptr) // Thread entry point: trains args.net on args.d and writes the result to *args.err.
+{
+    train_args args = *(train_args*)ptr; // copy the bundle so the heap block can be released immediately
+    free(ptr); // ptr was allocated by train_network_in_thread; this thread owns and frees it
+    cuda_set_device(args.net->gpu_index);
+    *args.err = train_network(args.net, args.d);
+    return 0;
+}
+
+pthread_t train_network_in_thread(network *net, data d, float *err) // Starts train_thread for net/d and returns the thread handle; the result lands in *err.
+{
+    pthread_t thread;
+    train_args *ptr = (train_args *)calloc(1, sizeof(train_args)); // NOTE(review): the calloc result is not checked for NULL
+    ptr->net = net;
+    ptr->d = d;
+    ptr->err = err;
+    if(pthread_create(&thread, 0, train_thread, ptr)) error("Thread creation failed"); // on success train_thread frees ptr
+    return thread;
+}
+
+void merge_weights(layer l, layer base) // Adds l's *_updates staging buffers (filled by pull_weights) onto base's host biases/weights/scales.
+{
+    if (l.type == CONVOLUTIONAL || l.type == DECONVOLUTIONAL) { // cover the same layer types that pull_weights/distribute_weights handle
+        axpy_cpu(l.n, 1, l.bias_updates, 1, base.biases, 1);
+        axpy_cpu(l.nweights, 1, l.weight_updates, 1, base.weights, 1);
+        if (l.scales) { // scales are optional on a layer
+            axpy_cpu(l.n, 1, l.scale_updates, 1, base.scales, 1);
+        }
+    } else if(l.type == CONNECTED) {
+        axpy_cpu(l.outputs, 1, l.bias_updates, 1, base.biases, 1);
+        axpy_cpu(l.outputs*l.inputs, 1, l.weight_updates, 1, base.weights, 1);
+    }
+}
+
+void scale_weights(layer l, float s) // Scales l's host biases/weights/scales by s in place via scal_cpu; other layer types are left untouched.
+{
+    if (l.type == CONVOLUTIONAL || l.type == DECONVOLUTIONAL) { // must match merge_weights, otherwise the sum is never reset or averaged
+        scal_cpu(l.n, s, l.biases, 1);
+        scal_cpu(l.nweights, s, l.weights, 1);
+        if (l.scales) { // scales are optional on a layer
+            scal_cpu(l.n, s, l.scales, 1);
+        }
+    } else if(l.type == CONNECTED) {
+        scal_cpu(l.outputs, s, l.biases, 1);
+        scal_cpu(l.outputs*l.inputs, s, l.weights, 1);
+    }
+}
+
+
+void pull_weights(layer l) // Pulls the layer's GPU biases/weights/scales into its host *_updates buffers (not into l.biases/l.weights).
+{
+    if(l.type == CONVOLUTIONAL || l.type == DECONVOLUTIONAL){
+        cuda_pull_array(l.biases_gpu, l.bias_updates, l.n); // destination is the updates buffer, which is used as staging space
+        cuda_pull_array(l.weights_gpu, l.weight_updates, l.nweights);
+        if(l.scales) cuda_pull_array(l.scales_gpu, l.scale_updates, l.n); // scales are optional
+    } else if(l.type == CONNECTED){
+        cuda_pull_array(l.biases_gpu, l.bias_updates, l.outputs);
+        cuda_pull_array(l.weights_gpu, l.weight_updates, l.outputs*l.inputs);
+    }
+}
+
+void push_weights(layer l) // Pushes the layer's own host biases/weights/scales to its GPU buffers; other layer types are ignored.
+{
+    if(l.type == CONVOLUTIONAL || l.type == DECONVOLUTIONAL){
+        cuda_push_array(l.biases_gpu, l.biases, l.n);
+        cuda_push_array(l.weights_gpu, l.weights, l.nweights);
+        if(l.scales) cuda_push_array(l.scales_gpu, l.scales, l.n); // scales are optional
+    } else if(l.type == CONNECTED){
+        cuda_push_array(l.biases_gpu, l.biases, l.outputs);
+        cuda_push_array(l.weights_gpu, l.weights, l.outputs*l.inputs);
+    }
+}
+
+void distribute_weights(layer l, layer base) // Pushes base's host biases/weights/scales into l's GPU buffers, using l's sizes.
+{
+    if (l.type == CONVOLUTIONAL || l.type == DECONVOLUTIONAL) {
+        cuda_push_array(l.biases_gpu, base.biases, l.n);
+        cuda_push_array(l.weights_gpu, base.weights, l.nweights);
+        if (base.scales) cuda_push_array(l.scales_gpu, base.scales, l.n); // presence of scales is tested on base, not on l
+    } else if (l.type == CONNECTED) {
+        cuda_push_array(l.biases_gpu, base.biases, l.outputs);
+        cuda_push_array(l.weights_gpu, base.weights, l.outputs*l.inputs);
+    }
+}
+
+
+/*
+
+   void pull_updates(layer l)
+   {
+   if(l.type == CONVOLUTIONAL){
+   cuda_pull_array(l.bias_updates_gpu, l.bias_updates, l.n);
+   cuda_pull_array(l.weight_updates_gpu, l.weight_updates, l.nweights);
+   if(l.scale_updates) cuda_pull_array(l.scale_updates_gpu, l.scale_updates, l.n);
+   } else if(l.type == CONNECTED){
+   cuda_pull_array(l.bias_updates_gpu, l.bias_updates, l.outputs);
+   cuda_pull_array(l.weight_updates_gpu, l.weight_updates, l.outputs*l.inputs);
+   }
+   }
+
+   void push_updates(layer l)
+   {
+   if(l.type == CONVOLUTIONAL){
+   cuda_push_array(l.bias_updates_gpu, l.bias_updates, l.n);
+   cuda_push_array(l.weight_updates_gpu, l.weight_updates, l.nweights);
+   if(l.scale_updates) cuda_push_array(l.scale_updates_gpu, l.scale_updates, l.n);
+   } else if(l.type == CONNECTED){
+   cuda_push_array(l.bias_updates_gpu, l.bias_updates, l.outputs);
+   cuda_push_array(l.weight_updates_gpu, l.weight_updates, l.outputs*l.inputs);
+   }
+   }
+
+   void update_layer(layer l, network net)
+   {
+   int update_batch = net.batch*net.subdivisions;
+   float rate = get_current_rate(net);
+   l.t = get_current_batch(net);
+   if(l.update_gpu){
+   l.update_gpu(l, update_batch, rate*l.learning_rate_scale, net.momentum, net.decay);
+   }
+   }
+   void merge_updates(layer l, layer base)
+   {
+   if (l.type == CONVOLUTIONAL) {
+   axpy_cpu(l.n, 1, l.bias_updates, 1, base.bias_updates, 1);
+   axpy_cpu(l.nweights, 1, l.weight_updates, 1, base.weight_updates, 1);
+   if (l.scale_updates) {
+   axpy_cpu(l.n, 1, l.scale_updates, 1, base.scale_updates, 1);
+   }
+   } else if(l.type == CONNECTED) {
+   axpy_cpu(l.outputs, 1, l.bias_updates, 1, base.bias_updates, 1);
+   axpy_cpu(l.outputs*l.inputs, 1, l.weight_updates, 1, base.weight_updates, 1);
+   }
+   }
+
+   void distribute_updates(layer l, layer base)
+   {
+   if(l.type == CONVOLUTIONAL || l.type == DECONVOLUTIONAL){
+   cuda_push_array(l.bias_updates_gpu, base.bias_updates, l.n);
+   cuda_push_array(l.weight_updates_gpu, base.weight_updates, l.nweights);
+   if(base.scale_updates) cuda_push_array(l.scale_updates_gpu, base.scale_updates, l.n);
+   } else if(l.type == CONNECTED){
+   cuda_push_array(l.bias_updates_gpu, base.bias_updates, l.outputs);
+   cuda_push_array(l.weight_updates_gpu, base.weight_updates, l.outputs*l.inputs);
+   }
+   }
+ */
+
+/*
+   void sync_layer(network *nets, int n, int j)
+   {
+   int i;
+   network net = nets[0];
+   layer base = net.layers[j];
+   scale_weights(base, 0);
+   for (i = 0; i < n; ++i) {
+   cuda_set_device(nets[i].gpu_index);
+   layer l = nets[i].layers[j];
+   pull_weights(l);
+   merge_weights(l, base);
+   }
+   scale_weights(base, 1./n);
+   for (i = 0; i < n; ++i) {
+   cuda_set_device(nets[i].gpu_index);
+   layer l = nets[i].layers[j];
+   distribute_weights(l, base);
+   }
+   }
+ */
+
+void sync_layer(network **nets, int n, int j) // Averages layer j across the n networks, using nets[0]'s host buffers as the accumulator, then redistributes it.
+{
+    int i;
+    network *net = nets[0];
+    layer base = net->layers[j]; // struct copy; its buffer pointers still refer to nets[0]'s layer data
+    scale_weights(base, 0); // scale the accumulator by 0 so the sum below starts from zero
+    for (i = 0; i < n; ++i) {
+        cuda_set_device(nets[i]->gpu_index);
+        layer l = nets[i]->layers[j];
+        pull_weights(l); // GPU parameters -> l's *_updates staging buffers
+        merge_weights(l, base); // add the staged copy onto base
+    }
+    scale_weights(base, 1./n); // turn the sum into a mean
+    for (i = 0; i < n; ++i) {
+        cuda_set_device(nets[i]->gpu_index);
+        layer l = nets[i]->layers[j];
+        distribute_weights(l, base); // push the averaged host parameters to each network's GPU buffers
+    }
+}
+
+typedef struct{ // Argument bundle handed to sync_layer_thread through pthread_create's void* parameter.
+    network **nets; // array of network pointers to synchronise
+    int n; // number of networks in nets
+    int j; // index of the layer to synchronise
+} sync_args;
+
+void *sync_layer_thread(void *ptr) // Thread entry point: runs sync_layer with the bundled arguments, then frees the bundle.
+{
+    sync_args args = *(sync_args*)ptr;
+    sync_layer(args.nets, args.n, args.j);
+    free(ptr); // ptr was allocated by sync_layer_in_thread; this thread owns and frees it
+    return 0;
+}
+
+pthread_t sync_layer_in_thread(network **nets, int n, int j) // Starts sync_layer_thread for layer j and returns the thread handle.
+{
+    pthread_t thread;
+    sync_args *ptr = (sync_args *)calloc(1, sizeof(sync_args)); // NOTE(review): the calloc result is not checked for NULL
+    ptr->nets = nets;
+    ptr->n = n;
+    ptr->j = j;
+    if(pthread_create(&thread, 0, sync_layer_thread, ptr)) error("Thread creation failed"); // on success sync_layer_thread frees ptr
+    return thread;
+}
+
+void sync_nets(network **nets, int n, int interval) // Aligns the `seen` counters of all n networks, then synchronises every layer in its own thread.
+{
+    int j;
+    int layers = nets[0]->n; // layer count is taken from the first network
+    pthread_t *threads = (pthread_t *) calloc(layers, sizeof(pthread_t));
+
+    *(nets[0]->seen) += interval * (n-1) * nets[0]->batch * nets[0]->subdivisions; // presumably credits nets[0] with what the other n-1 networks processed since the last sync -- TODO confirm
+    for (j = 0; j < n; ++j){
+        *(nets[j]->seen) = *(nets[0]->seen); // every network ends up with the same counter value
+    }
+    for (j = 0; j < layers; ++j) {
+        threads[j] = sync_layer_in_thread(nets, n, j); // one thread per layer
+    }
+    for (j = 0; j < layers; ++j) {
+        pthread_join(threads[j], 0); // wait for all layers before returning
+    }
+    free(threads);
+}
+
+float train_networks(network **nets, int n, data d, int interval) // Trains n networks in parallel threads on n parts of d and returns the mean of their errors.
+{
+    int i;
+    int batch = nets[0]->batch;
+    int subdivisions = nets[0]->subdivisions;
+    assert(batch * subdivisions * n == d.X.rows); // d must hold exactly batch*subdivisions rows per network
+    pthread_t *threads = (pthread_t *) calloc(n, sizeof(pthread_t));
+    float *errors = (float *) calloc(n, sizeof(float));
+
+    float sum = 0;
+    for(i = 0; i < n; ++i){
+        data p = get_data_part(d, i, n); // part i of n of the data
+        threads[i] = train_network_in_thread(nets[i], p, errors + i); // each thread writes its result into errors[i]
+    }
+    for(i = 0; i < n; ++i){
+        pthread_join(threads[i], 0);
+        //printf("%f\n", errors[i]);
+        sum += errors[i];
+    }
+    //cudaDeviceSynchronize();
+    if (get_current_batch(nets[0]) % interval == 0) { // sync every `interval` batches; NOTE(review): interval == 0 would divide by zero here
+        printf("Syncing... ");
+        fflush(stdout);
+        sync_nets(nets, n, interval);
+        printf("Done!\n");
+    }
+    //cudaDeviceSynchronize();
+    free(threads);
+    free(errors);
+    return (float)sum/(n);
+}
+
+void pull_network_output(network *net) // Pulls the output layer's GPU output buffer into its host `output` buffer.
+{
+    layer l = get_network_output_layer(net); // last non-COST layer
+    cuda_pull_array(l.output_gpu, l.output, l.outputs*l.batch); // outputs*batch elements are transferred
+}
+
+#endif
diff --git a/image.darknet/src/network.h b/image.darknet/src/network.h
index e48cbc2..1b0dfd1 100644
--- a/image.darknet/src/network.h
+++ b/image.darknet/src/network.h
@@ -1,129 +1,29 @@
 // Oh boy, why am I about to do this....
 #ifndef NETWORK_H
 #define NETWORK_H
+#include "darknet.h"
 
 #include "image.h"
 #include "layer.h"
 #include "data.h"
 #include "tree.h"
 
-typedef enum {
-    CONSTANT, STEP, EXP, POLY, STEPS, SIG, RANDOM
-} learning_rate_policy;
-
-typedef struct network{
-    float *workspace;
-    int n;
-    int batch;
-    int *seen;
-    float epoch;
-    int subdivisions;
-    float momentum;
-    float decay;
-    layer *layers;
-    int outputs;
-    float *output;
-    learning_rate_policy policy;
-
-    float learning_rate;
-    float gamma;
-    float scale;
-    float power;
-    int time_steps;
-    int step;
-    int max_batches;
-    float *scales;
-    int   *steps;
-    int num_steps;
-    int burn_in;
-
-    int adam;
-    float B1;
-    float B2;
-    float eps;
-
-    int inputs;
-    int h, w, c;
-    int max_crop;
-    int min_crop;
-    float angle;
-    float aspect;
-    float exposure;
-    float saturation;
-    float hue;
-
-    int gpu_index;
-    tree *hierarchy;
-
-    #ifdef GPU
-    float **input_gpu;
-    float **truth_gpu;
-    #endif
-} network;
-
-typedef struct network_state {
-    float *truth;
-    float *input;
-    float *delta;
-    float *workspace;
-    int train;
-    int index;
-    network net;
-} network_state;
 
 #ifdef GPU
-float train_networks(network *nets, int n, data d, int interval);
-void sync_nets(network *nets, int n, int interval);
-float train_network_datum_gpu(network net, float *x, float *y);
-float *network_predict_gpu(network net, float *input);
-float * get_network_output_gpu_layer(network net, int i);
-float * get_network_delta_gpu_layer(network net, int i);
-float *get_network_output_gpu(network net);
-void forward_network_gpu(network net, network_state state);
-void backward_network_gpu(network net, network_state state);
-void update_network_gpu(network net);
+void pull_network_output(network *net);
 #endif
 
-float get_current_rate(network net);
-int get_current_batch(network net);
-void free_network(network net);
-void compare_networks(network n1, network n2, data d);
+void compare_networks(network *n1, network *n2, data d);
 char *get_layer_string(LAYER_TYPE a);
 
-network make_network(int n);
-void forward_network(network net, network_state state);
-void backward_network(network net, network_state state);
-void update_network(network net);
+network *make_network(int n);
 
-float train_network(network net, data d);
-float train_network_batch(network net, data d, int n);
-float train_network_sgd(network net, data d, int n);
-float train_network_datum(network net, float *x, float *y);
 
-matrix network_predict_data(network net, data test);
-float *network_predict(network net, float *input);
-float network_accuracy(network net, data d);
-float *network_accuracies(network net, data d, int n);
-float network_accuracy_multi(network net, data d, int n);
-void top_predictions(network net, int n, int *index);
-float *get_network_output(network net);
-float *get_network_output_layer(network net, int i);
-float *get_network_delta_layer(network net, int i);
-float *get_network_delta(network net);
-int get_network_output_size_layer(network net, int i);
-int get_network_output_size(network net);
-image get_network_image(network net);
-image get_network_image_layer(network net, int i);
-int get_predicted_class_network(network net);
-void print_network(network net);
-void visualize_network(network net);
+float network_accuracy_multi(network *net, data d, int n);
+int get_predicted_class_network(network *net);
+void print_network(network *net);
 int resize_network(network *net, int w, int h);
-void set_batch_network(network *net, int b);
-int get_network_input_size(network net);
-float get_network_cost(network net);
-
-int get_network_nuisance(network net);
-int get_network_background(network net);
+void calc_network_cost(network *net);
 
 #endif
 
diff --git a/image.darknet/src/network_kernels.cu b/image.darknet/src/network_kernels.cu
deleted file mode 100644
index 313cd6d..0000000
--- a/image.darknet/src/network_kernels.cu
+++ /dev/null
@@ -1,408 +0,0 @@
-#include "cuda_runtime.h"
-#include "curand.h"
-#include "cublas_v2.h"
-
-extern "C" {
-#include <stdio.h>
-#include <time.h>
-#include <assert.h>
-
-#include "network.h"
-#include "image.h"
-#include "data.h"
-#include "utils.h"
-#include "parser.h"
-
-#include "crop_layer.h"
-#include "connected_layer.h"
-#include "rnn_layer.h"
-#include "gru_layer.h"
-#include "crnn_layer.h"
-#include "detection_layer.h"
-#include "region_layer.h"
-#include "convolutional_layer.h"
-#include "activation_layer.h"
-#include "maxpool_layer.h"
-#include "reorg_layer.h"
-#include "avgpool_layer.h"
-#include "normalization_layer.h"
-#include "batchnorm_layer.h"
-#include "cost_layer.h"
-#include "local_layer.h"
-#include "softmax_layer.h"
-#include "dropout_layer.h"
-#include "route_layer.h"
-#include "shortcut_layer.h"
-#include "blas.h"
-}
-
-float * get_network_output_gpu_layer(network net, int i);
-float * get_network_delta_gpu_layer(network net, int i);
-float * get_network_output_gpu(network net);
-
-void forward_network_gpu(network net, network_state state)
-{
-    state.workspace = net.workspace;
-    int i;
-    for(i = 0; i < net.n; ++i){
-        state.index = i;
-        layer l = net.layers[i];
-        if(l.delta_gpu){
-            fill_ongpu(l.outputs * l.batch, 0, l.delta_gpu, 1);
-        }
-        l.forward_gpu(l, state);
-        state.input = l.output_gpu;
-    }
-}
-
-void backward_network_gpu(network net, network_state state)
-{
-    state.workspace = net.workspace;
-    int i;
-    float * original_input = state.input;
-    float * original_delta = state.delta;
-    for(i = net.n-1; i >= 0; --i){
-        state.index = i;
-        layer l = net.layers[i];
-        if(i == 0){
-            state.input = original_input;
-            state.delta = original_delta;
-        }else{
-            layer prev = net.layers[i-1];
-            state.input = prev.output_gpu;
-            state.delta = prev.delta_gpu;
-        }
-        l.backward_gpu(l, state);
-    }
-}
-
-void update_network_gpu(network net)
-{
-    cuda_set_device(net.gpu_index);
-    int i;
-    int update_batch = net.batch*net.subdivisions;
-    float rate = get_current_rate(net);
-    for(i = 0; i < net.n; ++i){
-        layer l = net.layers[i];
-        l.t = get_current_batch(net);
-        if(l.update_gpu){
-            l.update_gpu(l, update_batch, rate, net.momentum, net.decay);
-        }
-    }
-}
-
-void forward_backward_network_gpu(network net, float *x, float *y)
-{
-    network_state state;
-    state.index = 0;
-    state.net = net;
-    int x_size = get_network_input_size(net)*net.batch;
-    int y_size = get_network_output_size(net)*net.batch;
-    if(net.layers[net.n-1].truths) y_size = net.layers[net.n-1].truths*net.batch;
-    if(!*net.input_gpu){
-        *net.input_gpu = cuda_make_array(x, x_size);
-        *net.truth_gpu = cuda_make_array(y, y_size);
-    }else{
-        cuda_push_array(*net.input_gpu, x, x_size);
-        cuda_push_array(*net.truth_gpu, y, y_size);
-    }
-    state.input = *net.input_gpu;
-    state.delta = 0;
-    state.truth = *net.truth_gpu;
-    state.train = 1;
-    forward_network_gpu(net, state);
-    backward_network_gpu(net, state);
-}
-
-float train_network_datum_gpu(network net, float *x, float *y)
-{
-    *net.seen += net.batch;
-    forward_backward_network_gpu(net, x, y);
-    float error = get_network_cost(net);
-    if (((*net.seen) / net.batch) % net.subdivisions == 0) update_network_gpu(net);
-
-    return error;
-}
-
-typedef struct {
-    network net;
-    data d;
-    float *err;
-} train_args;
-
-void *train_thread(void *ptr)
-{
-    train_args args = *(train_args*)ptr;
-    free(ptr);
-    cuda_set_device(args.net.gpu_index);
-    *args.err = train_network(args.net, args.d);
-    return 0;
-}
-
-pthread_t train_network_in_thread(network net, data d, float *err)
-{
-    pthread_t thread;
-    train_args *ptr = (train_args *)calloc(1, sizeof(train_args));
-    ptr->net = net;
-    ptr->d = d;
-    ptr->err = err;
-    if(pthread_create(&thread, 0, train_thread, ptr)) error("Thread creation failed");
-    return thread;
-}
-
-void pull_updates(layer l)
-{
-    if(l.type == CONVOLUTIONAL){
-        cuda_pull_array(l.bias_updates_gpu, l.bias_updates, l.n);
-        cuda_pull_array(l.weight_updates_gpu, l.weight_updates, l.n*l.size*l.size*l.c);
-        if(l.scale_updates) cuda_pull_array(l.scale_updates_gpu, l.scale_updates, l.n);
-    } else if(l.type == CONNECTED){
-        cuda_pull_array(l.bias_updates_gpu, l.bias_updates, l.outputs);
-        cuda_pull_array(l.weight_updates_gpu, l.weight_updates, l.outputs*l.inputs);
-    }
-}
-
-void push_updates(layer l)
-{
-    if(l.type == CONVOLUTIONAL){
-        cuda_push_array(l.bias_updates_gpu, l.bias_updates, l.n);
-        cuda_push_array(l.weight_updates_gpu, l.weight_updates, l.n*l.size*l.size*l.c);
-        if(l.scale_updates) cuda_push_array(l.scale_updates_gpu, l.scale_updates, l.n);
-    } else if(l.type == CONNECTED){
-        cuda_push_array(l.bias_updates_gpu, l.bias_updates, l.outputs);
-        cuda_push_array(l.weight_updates_gpu, l.weight_updates, l.outputs*l.inputs);
-    }
-}
-
-void update_layer(layer l, network net)
-{
-    int update_batch = net.batch*net.subdivisions;
-    float rate = get_current_rate(net);
-    l.t = get_current_batch(net);
-    if(l.update_gpu){
-        l.update_gpu(l, update_batch, rate, net.momentum, net.decay);
-    }
-}
-
-void merge_weights(layer l, layer base)
-{
-    if (l.type == CONVOLUTIONAL) {
-        axpy_cpu(l.n, 1, l.biases, 1, base.biases, 1);
-        axpy_cpu(l.n*l.size*l.size*l.c, 1, l.weights, 1, base.weights, 1);
-        if (l.scales) {
-            axpy_cpu(l.n, 1, l.scales, 1, base.scales, 1);
-        }
-    } else if(l.type == CONNECTED) {
-        axpy_cpu(l.outputs, 1, l.biases, 1, base.biases, 1);
-        axpy_cpu(l.outputs*l.inputs, 1, l.weights, 1, base.weights, 1);
-    }
-}
-
-void scale_weights(layer l, float s)
-{
-    if (l.type == CONVOLUTIONAL) {
-        scal_cpu(l.n, s, l.biases, 1);
-        scal_cpu(l.n*l.size*l.size*l.c, s, l.weights, 1);
-        if (l.scales) {
-            scal_cpu(l.n, s, l.scales, 1);
-        }
-    } else if(l.type == CONNECTED) {
-        scal_cpu(l.outputs, s, l.biases, 1);
-        scal_cpu(l.outputs*l.inputs, s, l.weights, 1);
-    }
-}
-
-
-void pull_weights(layer l)
-{
-    if(l.type == CONVOLUTIONAL){
-        cuda_pull_array(l.biases_gpu, l.biases, l.n);
-        cuda_pull_array(l.weights_gpu, l.weights, l.n*l.size*l.size*l.c);
-        if(l.scales) cuda_pull_array(l.scales_gpu, l.scales, l.n);
-    } else if(l.type == CONNECTED){
-        cuda_pull_array(l.biases_gpu, l.biases, l.outputs);
-        cuda_pull_array(l.weights_gpu, l.weights, l.outputs*l.inputs);
-    }
-}
-
-void push_weights(layer l)
-{
-    if(l.type == CONVOLUTIONAL){
-        cuda_push_array(l.biases_gpu, l.biases, l.n);
-        cuda_push_array(l.weights_gpu, l.weights, l.n*l.size*l.size*l.c);
-        if(l.scales) cuda_push_array(l.scales_gpu, l.scales, l.n);
-    } else if(l.type == CONNECTED){
-        cuda_push_array(l.biases_gpu, l.biases, l.outputs);
-        cuda_push_array(l.weights_gpu, l.weights, l.outputs*l.inputs);
-    }
-}
-
-void distribute_weights(layer l, layer base)
-{
-    if(l.type == CONVOLUTIONAL){
-        cuda_push_array(l.biases_gpu, base.biases, l.n);
-        cuda_push_array(l.weights_gpu, base.weights, l.n*l.size*l.size*l.c);
-        if(base.scales) cuda_push_array(l.scales_gpu, base.scales, l.n);
-    } else if(l.type == CONNECTED){
-        cuda_push_array(l.biases_gpu, base.biases, l.outputs);
-        cuda_push_array(l.weights_gpu, base.weights, l.outputs*l.inputs);
-    }
-}
-
-
-void merge_updates(layer l, layer base)
-{
-    if (l.type == CONVOLUTIONAL) {
-        axpy_cpu(l.n, 1, l.bias_updates, 1, base.bias_updates, 1);
-        axpy_cpu(l.n*l.size*l.size*l.c, 1, l.weight_updates, 1, base.weight_updates, 1);
-        if (l.scale_updates) {
-            axpy_cpu(l.n, 1, l.scale_updates, 1, base.scale_updates, 1);
-        }
-    } else if(l.type == CONNECTED) {
-        axpy_cpu(l.outputs, 1, l.bias_updates, 1, base.bias_updates, 1);
-        axpy_cpu(l.outputs*l.inputs, 1, l.weight_updates, 1, base.weight_updates, 1);
-    }
-}
-
-void distribute_updates(layer l, layer base)
-{
-    if(l.type == CONVOLUTIONAL){
-        cuda_push_array(l.bias_updates_gpu, base.bias_updates, l.n);
-        cuda_push_array(l.weight_updates_gpu, base.weight_updates, l.n*l.size*l.size*l.c);
-        if(base.scale_updates) cuda_push_array(l.scale_updates_gpu, base.scale_updates, l.n);
-    } else if(l.type == CONNECTED){
-        cuda_push_array(l.bias_updates_gpu, base.bias_updates, l.outputs);
-        cuda_push_array(l.weight_updates_gpu, base.weight_updates, l.outputs*l.inputs);
-    }
-}
-
-void sync_layer(network *nets, int n, int j)
-{
-    //printf("Syncing layer %d\n", j);
-    int i;
-    network net = nets[0];
-    layer base = net.layers[j];
-    cuda_set_device(net.gpu_index);
-    pull_weights(base);
-    for (i = 1; i < n; ++i) {
-        cuda_set_device(nets[i].gpu_index);
-        layer l = nets[i].layers[j];
-        pull_weights(l);
-        merge_weights(l, base);
-    }
-    scale_weights(base, 1./n);
-    for (i = 0; i < n; ++i) {
-        cuda_set_device(nets[i].gpu_index);
-        layer l = nets[i].layers[j];
-        distribute_weights(l, base);
-    }
-    //printf("Done syncing layer %d\n", j);
-}
-
-typedef struct{
-    network *nets;
-    int n;
-    int j;
-} sync_args;
-
-void *sync_layer_thread(void *ptr)
-{
-    sync_args args = *(sync_args*)ptr;
-    sync_layer(args.nets, args.n, args.j);
-    free(ptr);
-    return 0;
-}
-
-pthread_t sync_layer_in_thread(network *nets, int n, int j)
-{
-    pthread_t thread;
-    sync_args *ptr = (sync_args *)calloc(1, sizeof(sync_args));
-    ptr->nets = nets;
-    ptr->n = n;
-    ptr->j = j;
-    if(pthread_create(&thread, 0, sync_layer_thread, ptr)) error("Thread creation failed");
-    return thread;
-}
-
-void sync_nets(network *nets, int n, int interval)
-{
-    int j;
-    int layers = nets[0].n;
-    pthread_t *threads = (pthread_t *) calloc(layers, sizeof(pthread_t));
-
-    *nets[0].seen += interval * (n-1) * nets[0].batch * nets[0].subdivisions;
-    for (j = 0; j < n; ++j){
-        *nets[j].seen = *nets[0].seen;
-    }
-    for (j = 0; j < layers; ++j) {
-        threads[j] = sync_layer_in_thread(nets, n, j);
-    }
-    for (j = 0; j < layers; ++j) {
-        pthread_join(threads[j], 0);
-    }
-    free(threads);
-}
-
-float train_networks(network *nets, int n, data d, int interval)
-{
-    int i;
-    int batch = nets[0].batch;
-    int subdivisions = nets[0].subdivisions;
-    assert(batch * subdivisions * n == d.X.rows);
-    pthread_t *threads = (pthread_t *) calloc(n, sizeof(pthread_t));
-    float *errors = (float *) calloc(n, sizeof(float));
-
-    float sum = 0;
-    for(i = 0; i < n; ++i){
-        data p = get_data_part(d, i, n);
-        threads[i] = train_network_in_thread(nets[i], p, errors + i);
-    }
-    for(i = 0; i < n; ++i){
-        pthread_join(threads[i], 0);
-        //printf("%f\n", errors[i]);
-        sum += errors[i];
-    }
-    //cudaDeviceSynchronize();
-    if (get_current_batch(nets[0]) % interval == 0) {
-        printf("Syncing... ");
-        fflush(stdout);
-        sync_nets(nets, n, interval);
-        printf("Done!\n");
-    }
-    //cudaDeviceSynchronize();
-    free(threads);
-    free(errors);
-    return (float)sum/(n);
-}
-
-float *get_network_output_layer_gpu(network net, int i)
-{
-    layer l = net.layers[i];
-    if(l.type != REGION) cuda_pull_array(l.output_gpu, l.output, l.outputs*l.batch);
-    return l.output;
-}
-
-float *get_network_output_gpu(network net)
-{
-    int i;
-    for(i = net.n-1; i > 0; --i) if(net.layers[i].type != COST) break;
-    return get_network_output_layer_gpu(net, i);
-}
-
-float *network_predict_gpu(network net, float *input)
-{
-    cuda_set_device(net.gpu_index);
-    int size = get_network_input_size(net) * net.batch;
-    network_state state;
-    state.index = 0;
-    state.net = net;
-    state.input = cuda_make_array(input, size);
-    state.truth = 0;
-    state.train = 0;
-    state.delta = 0;
-    forward_network_gpu(net, state);
-    float *out = get_network_output_gpu(net);
-    cuda_free(state.input);
-    return out;
-}
-
diff --git a/image.darknet/src/nightmare.c b/image.darknet/src/nightmare.c
deleted file mode 100644
index ec7166c..0000000
--- a/image.darknet/src/nightmare.c
+++ /dev/null
@@ -1,308 +0,0 @@
-
-#include "network.h"
-#include "parser.h"
-#include "blas.h"
-#include "utils.h"
-
-#ifdef OPENCV
-#include "opencv2/highgui/highgui_c.h"
-#endif
-
-// ./darknet nightmare cfg/extractor.recon.cfg ~/trained/yolo-coco.conv frame6.png -reconstruct -iters 500 -i 3 -lambda .1 -rate .01 -smooth 2
-
-float abs_mean(float *x, int n)
-{
-    int i;
-    float sum = 0;
-    for (i = 0; i < n; ++i){
-        sum += fabs(x[i]);
-    }
-    return sum/n;
-}
-
-void calculate_loss(float *output, float *delta, int n, float thresh)
-{
-    int i;
-    float mean = mean_array(output, n); 
-    float var = variance_array(output, n);
-    for(i = 0; i < n; ++i){
-        if(delta[i] > mean + thresh*sqrt(var)) delta[i] = output[i];
-        else delta[i] = 0;
-    }
-}
-
-void optimize_picture(network *net, image orig, int max_layer, float scale, float rate, float thresh, int norm)
-{
-    //scale_image(orig, 2);
-    //translate_image(orig, -1);
-    net->n = max_layer + 1;
-
-    int dx = rand()%16 - 8;
-    int dy = rand()%16 - 8;
-    int flip = rand()%2;
-
-    image crop = crop_image(orig, dx, dy, orig.w, orig.h);
-    image im = resize_image(crop, (int)(orig.w * scale), (int)(orig.h * scale));
-    if(flip) flip_image(im);
-
-    resize_network(net, im.w, im.h);
-    layer last = net->layers[net->n-1];
-    //net->layers[net->n - 1].activation = LINEAR;
-
-    image delta = make_image(im.w, im.h, im.c);
-
-    network_state state = {0};
-
-#ifdef GPU
-    state.input = cuda_make_array(im.data, im.w*im.h*im.c);
-    state.delta = cuda_make_array(im.data, im.w*im.h*im.c);
-
-    forward_network_gpu(*net, state);
-    copy_ongpu(last.outputs, last.output_gpu, 1, last.delta_gpu, 1);
-
-    cuda_pull_array(last.delta_gpu, last.delta, last.outputs);
-    calculate_loss(last.delta, last.delta, last.outputs, thresh);
-    cuda_push_array(last.delta_gpu, last.delta, last.outputs);
-
-    backward_network_gpu(*net, state);
-
-    cuda_pull_array(state.delta, delta.data, im.w*im.h*im.c);
-    cuda_free(state.input);
-    cuda_free(state.delta);
-#else
-    state.input = im.data;
-    state.delta = delta.data;
-    forward_network(*net, state);
-    copy_cpu(last.outputs, last.output, 1, last.delta, 1);
-    calculate_loss(last.output, last.delta, last.outputs, thresh);
-    backward_network(*net, state);
-#endif
-
-    if(flip) flip_image(delta);
-    //normalize_array(delta.data, delta.w*delta.h*delta.c);
-    image resized = resize_image(delta, orig.w, orig.h);
-    image out = crop_image(resized, -dx, -dy, orig.w, orig.h);
-
-    /*
-       image g = grayscale_image(out);
-       free_image(out);
-       out = g;
-     */
-
-    //rate = rate / abs_mean(out.data, out.w*out.h*out.c);
-
-    if(norm) normalize_array(out.data, out.w*out.h*out.c);
-    axpy_cpu(orig.w*orig.h*orig.c, rate, out.data, 1, orig.data, 1);
-
-    /*
-       normalize_array(orig.data, orig.w*orig.h*orig.c);
-       scale_image(orig, sqrt(var));
-       translate_image(orig, mean);
-     */
-
-    //translate_image(orig, 1);
-    //scale_image(orig, .5);
-    //normalize_image(orig);
-
-    constrain_image(orig);
-
-    free_image(crop);
-    free_image(im);
-    free_image(delta);
-    free_image(resized);
-    free_image(out);
-
-}
-
-void smooth(image recon, image update, float lambda, int num)
-{
-    int i, j, k;
-    int ii, jj;
-    for(k = 0; k < recon.c; ++k){
-        for(j = 0; j < recon.h; ++j){
-            for(i = 0; i < recon.w; ++i){
-                int out_index = i + recon.w*(j + recon.h*k);
-                for(jj = j-num; jj <= j + num && jj < recon.h; ++jj){
-                    if (jj < 0) continue;
-                    for(ii = i-num; ii <= i + num && ii < recon.w; ++ii){
-                        if (ii < 0) continue;
-                        int in_index = ii + recon.w*(jj + recon.h*k);
-                        update.data[out_index] += lambda * (recon.data[in_index] - recon.data[out_index]);
-                    }
-                }
-            }
-        }
-    }
-}
-
-void reconstruct_picture(network net, float *features, image recon, image update, float rate, float momentum, float lambda, int smooth_size, int iters)
-{
-    int iter = 0;
-    for (iter = 0; iter < iters; ++iter) {
-        image delta = make_image(recon.w, recon.h, recon.c);
-
-        network_state state = {0};
-#ifdef GPU
-        state.input = cuda_make_array(recon.data, recon.w*recon.h*recon.c);
-        state.delta = cuda_make_array(delta.data, delta.w*delta.h*delta.c);
-        state.truth = cuda_make_array(features, get_network_output_size(net));
-
-        forward_network_gpu(net, state);
-        backward_network_gpu(net, state);
-
-        cuda_pull_array(state.delta, delta.data, delta.w*delta.h*delta.c);
-
-        cuda_free(state.input);
-        cuda_free(state.delta);
-        cuda_free(state.truth);
-#else
-        state.input = recon.data;
-        state.delta = delta.data;
-        state.truth = features;
-
-        forward_network(net, state);
-        backward_network(net, state);
-#endif
-
-        axpy_cpu(recon.w*recon.h*recon.c, 1, delta.data, 1, update.data, 1);
-        smooth(recon, update, lambda, smooth_size);
-
-        axpy_cpu(recon.w*recon.h*recon.c, rate, update.data, 1, recon.data, 1);
-        scal_cpu(recon.w*recon.h*recon.c, momentum, update.data, 1);
-
-        //float mag = mag_array(recon.data, recon.w*recon.h*recon.c);
-        //scal_cpu(recon.w*recon.h*recon.c, 600/mag, recon.data, 1);
-
-        constrain_image(recon);
-        free_image(delta);
-    }
-}
-
-
-void run_nightmare(int argc, char **argv)
-{
-    srand(0);
-    if(argc < 4){
-        fprintf(stderr, "usage: %s %s [cfg] [weights] [image] [layer] [options! (optional)]\n", argv[0], argv[1]);
-        return;
-    }
-
-    char *cfg = argv[2];
-    char *weights = argv[3];
-    char *input = argv[4];
-    int max_layer = atoi(argv[5]);
-
-    int range = find_int_arg(argc, argv, "-range", 1);
-    int norm = find_int_arg(argc, argv, "-norm", 1);
-    int rounds = find_int_arg(argc, argv, "-rounds", 1);
-    int iters = find_int_arg(argc, argv, "-iters", 10);
-    int octaves = find_int_arg(argc, argv, "-octaves", 4);
-    float zoom = find_float_arg(argc, argv, "-zoom", 1.);
-    float rate = find_float_arg(argc, argv, "-rate", .04);
-    float thresh = find_float_arg(argc, argv, "-thresh", 1.);
-    float rotate = find_float_arg(argc, argv, "-rotate", 0);
-    float momentum = find_float_arg(argc, argv, "-momentum", .9);
-    float lambda = find_float_arg(argc, argv, "-lambda", .01);
-    char *prefix = find_char_arg(argc, argv, "-prefix", 0);
-    int reconstruct = find_arg(argc, argv, "-reconstruct");
-    int smooth_size = find_int_arg(argc, argv, "-smooth", 1);
-
-    network net = parse_network_cfg(cfg);
-    load_weights(&net, weights);
-    char *cfgbase = basecfg(cfg);
-    char *imbase = basecfg(input);
-
-    set_batch_network(&net, 1);
-    image im = load_image_color(input, 0, 0);
-    if(0){
-        float scale = 1;
-        if(im.w > 512 || im.h > 512){
-            if(im.w > im.h) scale = 512.0/im.w;
-            else scale = 512.0/im.h;
-        }
-        image resized = resize_image(im, scale*im.w, scale*im.h);
-        free_image(im);
-        im = resized;
-    }
-
-    float *features = 0;
-    image update;
-    if (reconstruct){
-        resize_network(&net, im.w, im.h);
-
-        int zz = 0;
-        network_predict(net, im.data);
-        image out_im = get_network_image(net);
-        image crop = crop_image(out_im, zz, zz, out_im.w-2*zz, out_im.h-2*zz);
-        //flip_image(crop);
-        image f_im = resize_image(crop, out_im.w, out_im.h);
-        free_image(crop);
-        printf("%d features\n", out_im.w*out_im.h*out_im.c);
-
-
-        im = resize_image(im, im.w, im.h);
-        f_im = resize_image(f_im, f_im.w, f_im.h);
-        features = f_im.data;
-
-        int i;
-        for(i = 0; i < 14*14*512; ++i){
-            features[i] += rand_uniform(-.19, .19);
-        }
-
-        free_image(im);
-        im = make_random_image(im.w, im.h, im.c);
-        update = make_image(im.w, im.h, im.c);
-
-    }
-
-    int e;
-    int n;
-    for(e = 0; e < rounds; ++e){
-        fprintf(stderr, "Iteration: ");
-        fflush(stderr);
-        for(n = 0; n < iters; ++n){  
-            fprintf(stderr, "%d, ", n);
-            fflush(stderr);
-            if(reconstruct){
-                reconstruct_picture(net, features, im, update, rate, momentum, lambda, smooth_size, 1);
-                //if ((n+1)%30 == 0) rate *= .5;
-                show_image(im, "reconstruction");
-#ifdef OPENCV
-                cvWaitKey(10);
-#endif
-            }else{
-                int layer = max_layer + rand()%range - range/2;
-                int octave = rand()%octaves;
-                optimize_picture(&net, im, layer, 1/pow(1.33333333, octave), rate, thresh, norm);
-            }
-        }
-        fprintf(stderr, "done\n");
-        if(0){
-            image g = grayscale_image(im);
-            free_image(im);
-            im = g;
-        }
-        char buff[256];
-        if (prefix){
-            sprintf(buff, "%s/%s_%s_%d_%06d",prefix, imbase, cfgbase, max_layer, e);
-        }else{
-            sprintf(buff, "%s_%s_%d_%06d",imbase, cfgbase, max_layer, e);
-        }
-        printf("%d %s\n", e, buff);
-        save_image(im, buff);
-        //show_image(im, buff);
-        //cvWaitKey(0);
-
-        if(rotate){
-            image rot = rotate_image(im, rotate);
-            free_image(im);
-            im = rot;
-        }
-        image crop = crop_image(im, im.w * (1. - zoom)/2., im.h * (1.-zoom)/2., im.w*zoom, im.h*zoom);
-        image resized = resize_image(crop, im.w, im.h);
-        free_image(im);
-        free_image(crop);
-        im = resized;
-    }
-}
-
diff --git a/image.darknet/src/normalization_layer.c b/image.darknet/src/normalization_layer.c
index 069a079..424714f 100644
--- a/image.darknet/src/normalization_layer.c
+++ b/image.darknet/src/normalization_layer.c
@@ -1,5 +1,6 @@
 #include "normalization_layer.h"
 #include "blas.h"
+
 #include <stdio.h>
 
 layer make_normalization_layer(int batch, int w, int h, int c, int size, float alpha, float beta, float kappa)
@@ -62,7 +63,7 @@ void resize_normalization_layer(layer *layer, int w, int h)
 #endif
 }
 
-void forward_normalization_layer(const layer layer, network_state state)
+void forward_normalization_layer(const layer layer, network net)
 {
     int k,b;
     int w = layer.w;
@@ -73,7 +74,7 @@ void forward_normalization_layer(const layer layer, network_state state)
     for(b = 0; b < layer.batch; ++b){
         float *squared = layer.squared + w*h*c*b;
         float *norms   = layer.norms + w*h*c*b;
-        float *input   = state.input + w*h*c*b;
+        float *input   = net.input + w*h*c*b;
         pow_cpu(w*h*c, 2, input, 1, squared, 1);
 
         const_cpu(w*h, layer.kappa, norms, 1);
@@ -90,10 +91,10 @@ void forward_normalization_layer(const layer layer, network_state state)
         }
     }
     pow_cpu(w*h*c*layer.batch, -layer.beta, layer.norms, 1, layer.output, 1);
-    mul_cpu(w*h*c*layer.batch, state.input, 1, layer.output, 1);
+    mul_cpu(w*h*c*layer.batch, net.input, 1, layer.output, 1);
 }
 
-void backward_normalization_layer(const layer layer, network_state state)
+void backward_normalization_layer(const layer layer, network net)
 {
     // TODO This is approximate ;-)
     // Also this should add in to delta instead of overwritting.
@@ -101,50 +102,50 @@ void backward_normalization_layer(const layer layer, network_state state)
     int w = layer.w;
     int h = layer.h;
     int c = layer.c;
-    pow_cpu(w*h*c*layer.batch, -layer.beta, layer.norms, 1, state.delta, 1);
-    mul_cpu(w*h*c*layer.batch, layer.delta, 1, state.delta, 1);
+    pow_cpu(w*h*c*layer.batch, -layer.beta, layer.norms, 1, net.delta, 1);
+    mul_cpu(w*h*c*layer.batch, layer.delta, 1, net.delta, 1);
 }
 
 #ifdef GPU
-void forward_normalization_layer_gpu(const layer layer, network_state state)
+void forward_normalization_layer_gpu(const layer layer, network net)
 {
     int k,b;
     int w = layer.w;
     int h = layer.h;
     int c = layer.c;
-    scal_ongpu(w*h*c*layer.batch, 0, layer.squared_gpu, 1);
+    scal_gpu(w*h*c*layer.batch, 0, layer.squared_gpu, 1);
 
     for(b = 0; b < layer.batch; ++b){
         float *squared = layer.squared_gpu + w*h*c*b;
         float *norms   = layer.norms_gpu + w*h*c*b;
-        float *input   = state.input + w*h*c*b;
-        pow_ongpu(w*h*c, 2, input, 1, squared, 1);
+        float *input   = net.input_gpu + w*h*c*b;
+        pow_gpu(w*h*c, 2, input, 1, squared, 1);
 
-        const_ongpu(w*h, layer.kappa, norms, 1);
+        const_gpu(w*h, layer.kappa, norms, 1);
         for(k = 0; k < layer.size/2; ++k){
-            axpy_ongpu(w*h, layer.alpha, squared + w*h*k, 1, norms, 1);
+            axpy_gpu(w*h, layer.alpha, squared + w*h*k, 1, norms, 1);
         }
 
         for(k = 1; k < layer.c; ++k){
-            copy_ongpu(w*h, norms + w*h*(k-1), 1, norms + w*h*k, 1);
+            copy_gpu(w*h, norms + w*h*(k-1), 1, norms + w*h*k, 1);
             int prev = k - ((layer.size-1)/2) - 1;
             int next = k + (layer.size/2);
-            if(prev >= 0)      axpy_ongpu(w*h, -layer.alpha, squared + w*h*prev, 1, norms + w*h*k, 1);
-            if(next < layer.c) axpy_ongpu(w*h,  layer.alpha, squared + w*h*next, 1, norms + w*h*k, 1);
+            if(prev >= 0)      axpy_gpu(w*h, -layer.alpha, squared + w*h*prev, 1, norms + w*h*k, 1);
+            if(next < layer.c) axpy_gpu(w*h,  layer.alpha, squared + w*h*next, 1, norms + w*h*k, 1);
         }
     }
-    pow_ongpu(w*h*c*layer.batch, -layer.beta, layer.norms_gpu, 1, layer.output_gpu, 1);
-    mul_ongpu(w*h*c*layer.batch, state.input, 1, layer.output_gpu, 1);
+    pow_gpu(w*h*c*layer.batch, -layer.beta, layer.norms_gpu, 1, layer.output_gpu, 1);
+    mul_gpu(w*h*c*layer.batch, net.input_gpu, 1, layer.output_gpu, 1);
 }
 
-void backward_normalization_layer_gpu(const layer layer, network_state state)
+void backward_normalization_layer_gpu(const layer layer, network net)
 {
     // TODO This is approximate ;-)
 
     int w = layer.w;
     int h = layer.h;
     int c = layer.c;
-    pow_ongpu(w*h*c*layer.batch, -layer.beta, layer.norms_gpu, 1, state.delta, 1);
-    mul_ongpu(w*h*c*layer.batch, layer.delta_gpu, 1, state.delta, 1);
+    pow_gpu(w*h*c*layer.batch, -layer.beta, layer.norms_gpu, 1, net.delta_gpu, 1);
+    mul_gpu(w*h*c*layer.batch, layer.delta_gpu, 1, net.delta_gpu, 1);
 }
 #endif
diff --git a/image.darknet/src/normalization_layer.h b/image.darknet/src/normalization_layer.h
index ab32776..665baa5 100644
--- a/image.darknet/src/normalization_layer.h
+++ b/image.darknet/src/normalization_layer.h
@@ -7,13 +7,13 @@
 
 layer make_normalization_layer(int batch, int w, int h, int c, int size, float alpha, float beta, float kappa);
 void resize_normalization_layer(layer *layer, int h, int w);
-void forward_normalization_layer(const layer layer, network_state state);
-void backward_normalization_layer(const layer layer, network_state state);
+void forward_normalization_layer(const layer layer, network net);
+void backward_normalization_layer(const layer layer, network net);
 void visualize_normalization_layer(layer layer, char *window);
 
 #ifdef GPU
-void forward_normalization_layer_gpu(const layer layer, network_state state);
-void backward_normalization_layer_gpu(const layer layer, network_state state);
+void forward_normalization_layer_gpu(const layer layer, network net);
+void backward_normalization_layer_gpu(const layer layer, network net);
 #endif
 
 #endif
diff --git a/image.darknet/src/option_list.c b/image.darknet/src/option_list.c
index f935af3..2f52781 100644
--- a/image.darknet/src/option_list.c
+++ b/image.darknet/src/option_list.c
@@ -32,6 +32,23 @@ list *read_data_cfg(char *filename)
     return options;
 }
 
+metadata get_metadata(char *file)  // Build a metadata record (label names + class count) from the data cfg `file`.
+{
+    metadata m = {0};  // zero-initialised, so m.names stays 0 unless a label list is found below
+    list *options = read_data_cfg(file);
+
+    char *name_list = option_find_str(options, "names", 0);  // look up the "names" key first...
+    if(!name_list) name_list = option_find_str(options, "labels", 0);  // ...then fall back to "labels"
+    if(!name_list) {
+        fprintf(stderr, "No names or labels found\n");  // reported but not fatal; m.names is left at 0
+    } else {
+        m.names = get_labels(name_list);
+    }
+    m.classes = option_find_int(options, "classes", 2);  // 2 is passed as the default argument
+    free_list(options);  // called only after every lookup above has finished
+    return m;  // returned by value
+}
+
 int read_option(char *s, list *options)
 {
     size_t i;
diff --git a/image.darknet/src/option_list.h b/image.darknet/src/option_list.h
index 054b3fd..844bd87 100644
--- a/image.darknet/src/option_list.h
+++ b/image.darknet/src/option_list.h
@@ -9,13 +9,9 @@ typedef struct{
 } kvp;
 
 
-list *read_data_cfg(char *filename);
 int read_option(char *s, list *options);
 void option_insert(list *l, char *key, char *val);
 char *option_find(list *l, char *key);
-char *option_find_str(list *l, char *key, char *def);
-int option_find_int(list *l, char *key, int def);
-int option_find_int_quiet(list *l, char *key, int def);
 float option_find_float(list *l, char *key, float def);
 float option_find_float_quiet(list *l, char *key, float def);
 void option_unused(list *l);
diff --git a/image.darknet/src/parser.c b/image.darknet/src/parser.c
index 3f39a13..c8141c9 100644
--- a/image.darknet/src/parser.c
+++ b/image.darknet/src/parser.c
@@ -1,14 +1,17 @@
 #include <stdio.h>
 #include <string.h>
 #include <stdlib.h>
+#include <assert.h>
 
 #include "activation_layer.h"
+#include "logistic_layer.h"
+#include "l2norm_layer.h"
 #include "activations.h"
-#include "assert.h"
 #include "avgpool_layer.h"
 #include "batchnorm_layer.h"
 #include "blas.h"
 #include "connected_layer.h"
+#include "deconvolutional_layer.h"
 #include "convolutional_layer.h"
 #include "cost_layer.h"
 #include "crnn_layer.h"
@@ -23,11 +26,15 @@
 #include "option_list.h"
 #include "parser.h"
 #include "region_layer.h"
+#include "yolo_layer.h"
+#include "iseg_layer.h"
 #include "reorg_layer.h"
 #include "rnn_layer.h"
 #include "route_layer.h"
+#include "upsample_layer.h"
 #include "shortcut_layer.h"
 #include "softmax_layer.h"
+#include "lstm_layer.h"
 #include "utils.h"
 
 typedef struct{
@@ -45,14 +52,21 @@ LAYER_TYPE string_to_layer_type(char * type)
     if (strcmp(type, "[cost]")==0) return COST;
     if (strcmp(type, "[detection]")==0) return DETECTION;
     if (strcmp(type, "[region]")==0) return REGION;
+    if (strcmp(type, "[yolo]")==0) return YOLO;
+    if (strcmp(type, "[iseg]")==0) return ISEG;
     if (strcmp(type, "[local]")==0) return LOCAL;
     if (strcmp(type, "[conv]")==0
             || strcmp(type, "[convolutional]")==0) return CONVOLUTIONAL;
+    if (strcmp(type, "[deconv]")==0
+            || strcmp(type, "[deconvolutional]")==0) return DECONVOLUTIONAL;
     if (strcmp(type, "[activation]")==0) return ACTIVE;
+    if (strcmp(type, "[logistic]")==0) return LOGXENT;
+    if (strcmp(type, "[l2norm]")==0) return L2NORM;
     if (strcmp(type, "[net]")==0
             || strcmp(type, "[network]")==0) return NETWORK;
     if (strcmp(type, "[crnn]")==0) return CRNN;
     if (strcmp(type, "[gru]")==0) return GRU;
+    if (strcmp(type, "[lstm]") == 0) return LSTM;
     if (strcmp(type, "[rnn]")==0) return RNN;
     if (strcmp(type, "[conn]")==0
             || strcmp(type, "[connected]")==0) return CONNECTED;
@@ -68,6 +82,7 @@ LAYER_TYPE string_to_layer_type(char * type)
     if (strcmp(type, "[soft]")==0
             || strcmp(type, "[softmax]")==0) return SOFTMAX;
     if (strcmp(type, "[route]")==0) return ROUTE;
+    if (strcmp(type, "[upsample]")==0) return UPSAMPLE;
     return BLANK;
 }
 
@@ -111,7 +126,7 @@ typedef struct size_params{
     int c;
     int index;
     int time_steps;
-    network net;
+    network *net;
 } size_params;
 
 local_layer parse_local(list *options, size_params params)
@@ -135,6 +150,32 @@ local_layer parse_local(list *options, size_params params)
     return layer;
 }
 
+layer parse_deconvolutional(list *options, size_params params)  // Build a deconvolutional layer from a cfg section and the running size_params.
+{
+    int n = option_find_int(options, "filters",1);  // passed to the constructor as the filter count
+    int size = option_find_int(options, "size",1);
+    int stride = option_find_int(options, "stride",1);
+
+    char *activation_s = option_find_str(options, "activation", "logistic");  // "logistic" is the fallback string
+    ACTIVATION activation = get_activation(activation_s);
+
+    int batch,h,w,c;
+    h = params.h;
+    w = params.w;
+    c = params.c;
+    batch=params.batch;
+    if(!(h && w && c)) error("Layer before deconvolutional layer must output image.");  // any zero dimension is rejected
+    int batch_normalize = option_find_int_quiet(options, "batch_normalize", 0);
+    int pad = option_find_int_quiet(options, "pad",0);
+    int padding = option_find_int_quiet(options, "padding",0);
+    if(pad) padding = size/2;  // a non-zero "pad" overrides whatever "padding" supplied
+
+    layer l = make_deconvolutional_layer(batch,h,w,c,n,size,stride,padding, activation, batch_normalize, params.net->adam);  // adam flag comes from the owning network
+
+    return l;
+}
+
+
 convolutional_layer parse_convolutional(list *options, size_params params)
 {
     int n = option_find_int(options, "filters",1);
@@ -142,6 +183,7 @@ convolutional_layer parse_convolutional(list *options, size_params params)
     int stride = option_find_int(options, "stride",1);
     int pad = option_find_int_quiet(options, "pad",0);
     int padding = option_find_int_quiet(options, "padding",0);
+    int groups = option_find_int_quiet(options, "groups", 1);
     if(pad) padding = size/2;
 
     char *activation_s = option_find_str(options, "activation", "logistic");
@@ -157,14 +199,9 @@ convolutional_layer parse_convolutional(list *options, size_params params)
     int binary = option_find_int_quiet(options, "binary", 0);
     int xnor = option_find_int_quiet(options, "xnor", 0);
 
-    convolutional_layer layer = make_convolutional_layer(batch,h,w,c,n,size,stride,padding,activation, batch_normalize, binary, xnor, params.net.adam);
+    convolutional_layer layer = make_convolutional_layer(batch,h,w,c,n,groups,size,stride,padding,activation, batch_normalize, binary, xnor, params.net->adam);
     layer.flipped = option_find_int_quiet(options, "flipped", 0);
     layer.dot = option_find_float_quiet(options, "dot", 0);
-    if(params.net.adam){
-        layer.B1 = params.net.B1;
-        layer.B2 = params.net.B2;
-        layer.eps = params.net.eps;
-    }
 
     return layer;
 }
@@ -187,13 +224,11 @@ layer parse_crnn(list *options, size_params params)
 layer parse_rnn(list *options, size_params params)
 {
     int output = option_find_int(options, "output",1);
-    int hidden = option_find_int(options, "hidden",1);
     char *activation_s = option_find_str(options, "activation", "logistic");
     ACTIVATION activation = get_activation(activation_s);
     int batch_normalize = option_find_int_quiet(options, "batch_normalize", 0);
-    int logistic = option_find_int_quiet(options, "logistic", 0);
 
-    layer l = make_rnn_layer(params.batch, params.inputs, hidden, output, params.time_steps, activation, batch_normalize, logistic);
+    layer l = make_rnn_layer(params.batch, params.inputs, output, params.time_steps, activation, batch_normalize, params.net->adam);
 
     l.shortcut = option_find_int_quiet(options, "shortcut", 0);
 
@@ -205,31 +240,114 @@ layer parse_gru(list *options, size_params params)
     int output = option_find_int(options, "output",1);
     int batch_normalize = option_find_int_quiet(options, "batch_normalize", 0);
 
-    layer l = make_gru_layer(params.batch, params.inputs, output, params.time_steps, batch_normalize);
+    layer l = make_gru_layer(params.batch, params.inputs, output, params.time_steps, batch_normalize, params.net->adam);
+    l.tanh = option_find_int_quiet(options, "tanh", 0);
+
+    return l;
+}
+
+layer parse_lstm(list *options, size_params params)
+{
+    int output = option_find_int(options, "output", 1);
+    int batch_normalize = option_find_int_quiet(options, "batch_normalize", 0);
+
+    layer l = make_lstm_layer(params.batch, params.inputs, output, params.time_steps, batch_normalize, params.net->adam);
 
     return l;
 }
 
-connected_layer parse_connected(list *options, size_params params)
+layer parse_connected(list *options, size_params params)
 {
     int output = option_find_int(options, "output",1);
     char *activation_s = option_find_str(options, "activation", "logistic");
     ACTIVATION activation = get_activation(activation_s);
     int batch_normalize = option_find_int_quiet(options, "batch_normalize", 0);
 
-    connected_layer layer = make_connected_layer(params.batch, params.inputs, output, activation, batch_normalize);
-
-    return layer;
+    layer l = make_connected_layer(params.batch, params.inputs, output, activation, batch_normalize, params.net->adam);
+    return l;
 }
 
-softmax_layer parse_softmax(list *options, size_params params)
+layer parse_softmax(list *options, size_params params)
 {
     int groups = option_find_int_quiet(options, "groups",1);
-    softmax_layer layer = make_softmax_layer(params.batch, params.inputs, groups);
-    layer.temperature = option_find_float_quiet(options, "temperature", 1);
+    layer l = make_softmax_layer(params.batch, params.inputs, groups);
+    l.temperature = option_find_float_quiet(options, "temperature", 1);
     char *tree_file = option_find_str(options, "tree", 0);
-    if (tree_file) layer.softmax_tree = read_tree(tree_file);
-    return layer;
+    if (tree_file) l.softmax_tree = read_tree(tree_file);
+    l.w = params.w;
+    l.h = params.h;
+    l.c = params.c;
+    l.spatial = option_find_float_quiet(options, "spatial", 0);
+    l.noloss =  option_find_int_quiet(options, "noloss", 0);
+    return l;
+}
+
+/* Parse comma-separated ints ("0,1,2") into a calloc'd array; count goes to *num. If a is 0: returns 0, *num untouched. */
+int *parse_yolo_mask(char *a, int *num)
+{
+    int *mask = 0;
+    if(a){
+        int len = strlen(a);
+        int n = 1;
+        int i;
+        for(i = 0; i < len; ++i) if (a[i] == ',') ++n;  /* entries = commas + 1 */
+        mask = calloc(n, sizeof(int));
+        for(i = 0; i < n; ++i){
+            mask[i] = atoi(a);
+            a = strchr(a, ',');
+            if(!a) break;  /* last entry: no comma follows, so stop instead of forming NULL+1 (undefined) */
+            ++a;           /* step past the comma to the next entry */
+        }
+        *num = n;
+    }
+    return mask;
+}
+
+/* Build a yolo layer from a cfg section: reads classes/num/mask, thresholds, optional map file and anchors. */
+layer parse_yolo(list *options, size_params params)
+{
+    int classes = option_find_int(options, "classes", 20);
+    int total = option_find_int(options, "num", 1);
+    int num = total;
+
+    char *a = option_find_str(options, "mask", 0);
+    int *mask = parse_yolo_mask(a, &num);  /* num keeps the value of total when no mask is given */
+    layer l = make_yolo_layer(params.batch, params.w, params.h, num, total, mask, classes);
+    assert(l.outputs == params.inputs);
+
+    l.max_boxes = option_find_int_quiet(options, "max",90);
+    l.jitter = option_find_float(options, "jitter", .2);
+
+    l.ignore_thresh = option_find_float(options, "ignore_thresh", .5);
+    l.truth_thresh = option_find_float(options, "truth_thresh", 1);
+    l.random = option_find_int_quiet(options, "random", 0);
+
+    char *map_file = option_find_str(options, "map", 0);
+    if (map_file) l.map = read_map(map_file);
+
+    a = option_find_str(options, "anchors", 0);
+    if(a){
+        int len = strlen(a);
+        int n = 1;
+        int i;
+        for(i = 0; i < len; ++i) if (a[i] == ',') ++n;  /* values = commas + 1 */
+        for(i = 0; i < n; ++i){
+            l.biases[i] = atof(a);  /* NOTE(review): n is not checked against the size of l.biases - confirm allocation in make_yolo_layer */
+            a = strchr(a, ',');
+            if(!a) break;  /* last value: no comma follows, so stop instead of forming NULL+1 (undefined) */
+            ++a;
+        }
+    }
+    return l;
+}
+
+layer parse_iseg(list *options, size_params params)
+{
+    int classes = option_find_int(options, "classes", 20);
+    int ids = option_find_int(options, "ids", 32);
+    layer l = make_iseg_layer(params.batch, params.w, params.h, classes, ids);
+    assert(l.outputs == params.inputs);
+    return l;
 }
 
 layer parse_region(list *options, size_params params)
@@ -245,6 +363,7 @@ layer parse_region(list *options, size_params params)
     l.sqrt = option_find_int_quiet(options, "sqrt", 0);
 
     l.softmax = option_find_int(options, "softmax", 0);
+    l.background = option_find_int_quiet(options, "background", 0);
     l.max_boxes = option_find_int_quiet(options, "max",30);
     l.jitter = option_find_float(options, "jitter", .2);
     l.rescore = option_find_int_quiet(options, "rescore",0);
@@ -257,6 +376,7 @@ layer parse_region(list *options, size_params params)
     l.coord_scale = option_find_float(options, "coord_scale", 1);
     l.object_scale = option_find_float(options, "object_scale", 1);
     l.noobject_scale = option_find_float(options, "noobject_scale", 1);
+    l.mask_scale = option_find_float(options, "mask_scale", 1);
     l.class_scale = option_find_float(options, "class_scale", 1);
     l.bias_match = option_find_int_quiet(options, "bias_match",0);
 
@@ -281,6 +401,7 @@ layer parse_region(list *options, size_params params)
     }
     return l;
 }
+
 detection_layer parse_detection(list *options, size_params params)
 {
     int coords = option_find_int(options, "coords", 1);
@@ -293,7 +414,7 @@ detection_layer parse_detection(list *options, size_params params)
     layer.softmax = option_find_int(options, "softmax", 0);
     layer.sqrt = option_find_int(options, "sqrt", 0);
 
-    layer.max_boxes = option_find_int_quiet(options, "max",30);
+    layer.max_boxes = option_find_int_quiet(options, "max",90);
     layer.coord_scale = option_find_float(options, "coord_scale", 1);
     layer.forced = option_find_int(options, "forced", 0);
     layer.object_scale = option_find_float(options, "object_scale", 1);
@@ -312,6 +433,8 @@ cost_layer parse_cost(list *options, size_params params)
     float scale = option_find_float_quiet(options, "scale",1);
     cost_layer layer = make_cost_layer(params.batch, params.inputs, type, scale);
     layer.ratio =  option_find_float_quiet(options, "ratio",0);
+    layer.noobject_scale =  option_find_float_quiet(options, "noobj", 1);
+    layer.thresh =  option_find_float_quiet(options, "thresh",0);
     return layer;
 }
 
@@ -343,6 +466,8 @@ layer parse_reorg(list *options, size_params params)
 {
     int stride = option_find_int(options, "stride",1);
     int reverse = option_find_int_quiet(options, "reverse",0);
+    int flatten = option_find_int_quiet(options, "flatten",0);
+    int extra = option_find_int_quiet(options, "extra",0);
 
     int batch,h,w,c;
     h = params.h;
@@ -351,7 +476,7 @@ layer parse_reorg(list *options, size_params params)
     batch=params.batch;
     if(!(h && w && c)) error("Layer before reorg layer must output image.");
 
-    layer layer = make_reorg_layer(batch,w,h,c,stride,reverse);
+    layer layer = make_reorg_layer(batch,w,h,c,stride,reverse, flatten, extra);
     return layer;
 }
 
@@ -359,7 +484,7 @@ maxpool_layer parse_maxpool(list *options, size_params params)
 {
     int stride = option_find_int(options, "stride",1);
     int size = option_find_int(options, "size",stride);
-    int padding = option_find_int_quiet(options, "padding", (size-1)/2);
+    int padding = option_find_int_quiet(options, "padding", size-1);
 
     int batch,h,w,c;
     h = params.h;
@@ -411,24 +536,45 @@ layer parse_batchnorm(list *options, size_params params)
     return l;
 }
 
-layer parse_shortcut(list *options, size_params params, network net)
+layer parse_shortcut(list *options, size_params params, network *net)
 {
-    char *l = option_find(options, "from");   
+    char *l = option_find(options, "from");
     int index = atoi(l);
     if(index < 0) index = params.index + index;
 
     int batch = params.batch;
-    layer from = net.layers[index];
+    layer from = net->layers[index];
 
     layer s = make_shortcut_layer(batch, index, params.w, params.h, params.c, from.out_w, from.out_h, from.out_c);
 
     char *activation_s = option_find_str(options, "activation", "linear");
     ACTIVATION activation = get_activation(activation_s);
     s.activation = activation;
+    s.alpha = option_find_float_quiet(options, "alpha", 1);
+    s.beta = option_find_float_quiet(options, "beta", 1);
     return s;
 }
 
 
+layer parse_l2norm(list *options, size_params params)  // Build an l2norm layer sized from params; `options` is not read here.
+{
+    layer l = make_l2norm_layer(params.batch, params.inputs);
+    l.h = l.out_h = params.h;  // input and output dimensions are both set to the incoming h/w/c
+    l.w = l.out_w = params.w;
+    l.c = l.out_c = params.c;
+    return l;
+}
+
+
+layer parse_logistic(list *options, size_params params)  // Build a logistic layer sized from params; `options` is not read here.
+{
+    layer l = make_logistic_layer(params.batch, params.inputs);
+    l.h = l.out_h = params.h;  // input and output dimensions are both set to the incoming h/w/c
+    l.w = l.out_w = params.w;
+    l.c = l.out_c = params.c;
+    return l;
+}
+
 layer parse_activation(list *options, size_params params)
 {
     char *activation_s = option_find_str(options, "activation", "linear");
@@ -436,19 +582,25 @@ layer parse_activation(list *options, size_params params)
 
     layer l = make_activation_layer(params.batch, params.inputs, activation);
 
-    l.out_h = params.h;
-    l.out_w = params.w;
-    l.out_c = params.c;
-    l.h = params.h;
-    l.w = params.w;
-    l.c = params.c;
+    l.h = l.out_h = params.h;
+    l.w = l.out_w = params.w;
+    l.c = l.out_c = params.c;
+
+    return l;
+}
+
+layer parse_upsample(list *options, size_params params, network *net)
+{
 
+    int stride = option_find_int(options, "stride",2);
+    layer l = make_upsample_layer(params.batch, params.w, params.h, params.c, stride);
+    l.scale = option_find_float_quiet(options, "scale", 1);
     return l;
 }
 
-route_layer parse_route(list *options, size_params params, network net)
+route_layer parse_route(list *options, size_params params, network *net)
 {
-    char *l = option_find(options, "layers");   
+    char *l = option_find(options, "layers");
     int len = strlen(l);
     if(!l) error("Route Layer must specify input layers");
     int n = 1;
@@ -464,19 +616,19 @@ route_layer parse_route(list *options, size_params params, network net)
         l = strchr(l, ',')+1;
         if(index < 0) index = params.index + index;
         layers[i] = index;
-        sizes[i] = net.layers[index].outputs;
+        sizes[i] = net->layers[index].outputs;
     }
     int batch = params.batch;
 
     route_layer layer = make_route_layer(batch, n, layers, sizes);
 
-    convolutional_layer first = net.layers[layers[0]];
+    convolutional_layer first = net->layers[layers[0]];
     layer.out_w = first.out_w;
     layer.out_h = first.out_h;
     layer.out_c = first.out_c;
     for(i = 1; i < n; ++i){
         int index = layers[i];
-        convolutional_layer next = net.layers[index];
+        convolutional_layer next = net->layers[index];
         if(next.out_w == first.out_w && next.out_h == first.out_h){
             layer.out_c += next.out_c;
         }else{
@@ -508,15 +660,17 @@ void parse_net_options(list *options, network *net)
     net->decay = option_find_float(options, "decay", .0001);
     int subdivs = option_find_int(options, "subdivisions",1);
     net->time_steps = option_find_int_quiet(options, "time_steps",1);
+    net->notruth = option_find_int_quiet(options, "notruth",0);
     net->batch /= subdivs;
     net->batch *= net->time_steps;
     net->subdivisions = subdivs;
+    net->random = option_find_int_quiet(options, "random", 0);
 
     net->adam = option_find_int_quiet(options, "adam", 0);
     if(net->adam){
         net->B1 = option_find_float(options, "B1", .9);
         net->B2 = option_find_float(options, "B2", .999);
-        net->eps = option_find_float(options, "eps", .000001);
+        net->eps = option_find_float(options, "eps", .0000001);
     }
 
     net->h = option_find_int_quiet(options, "height",0);
@@ -525,6 +679,10 @@ void parse_net_options(list *options, network *net)
     net->inputs = option_find_int_quiet(options, "inputs", net->h * net->w * net->c);
     net->max_crop = option_find_int_quiet(options, "max_crop",net->w*2);
     net->min_crop = option_find_int_quiet(options, "min_crop",net->w);
+    net->max_ratio = option_find_float_quiet(options, "max_ratio", (float) net->max_crop / net->w);
+    net->min_ratio = option_find_float_quiet(options, "min_ratio", (float) net->min_crop / net->w);
+    net->center = option_find_int_quiet(options, "center",0);
+    net->clip = option_find_float_quiet(options, "clip", 0);
 
     net->angle = option_find_float_quiet(options, "angle", 0);
     net->aspect = option_find_float_quiet(options, "aspect", 1);
@@ -537,12 +695,13 @@ void parse_net_options(list *options, network *net)
     char *policy_s = option_find_str(options, "policy", "constant");
     net->policy = get_policy(policy_s);
     net->burn_in = option_find_int_quiet(options, "burn_in", 0);
+    net->power = option_find_float_quiet(options, "power", 4);
     if(net->policy == STEP){
         net->step = option_find_int(options, "step", 1);
         net->scale = option_find_float(options, "scale", 1);
     } else if (net->policy == STEPS){
-        char *l = option_find(options, "steps");   
-        char *p = option_find(options, "scales");   
+        char *l = option_find(options, "steps");
+        char *p = option_find(options, "scales");
         if(!l || !p) error("STEPS policy must have steps and scales in cfg file");
 
         int len = strlen(l);
@@ -570,7 +729,6 @@ void parse_net_options(list *options, network *net)
         net->gamma = option_find_float(options, "gamma", 1);
         net->step = option_find_int(options, "step", 1);
     } else if (net->policy == POLY || net->policy == RANDOM){
-        net->power = option_find_float(options, "power", 1);
     }
     net->max_batches = option_find_int(options, "max_batches", 0);
 }
@@ -581,26 +739,26 @@ int is_network(section *s)
             || strcmp(s->type, "[network]")==0);
 }
 
-network parse_network_cfg(char *filename)
+network *parse_network_cfg(char *filename)
 {
     list *sections = read_cfg(filename);
     node *n = sections->front;
     if(!n) error("Config file has no sections");
-    network net = make_network(sections->size - 1);
-    net.gpu_index = gpu_index;
+    network *net = make_network(sections->size - 1);
+    net->gpu_index = gpu_index;
     size_params params;
 
     section *s = (section *)n->val;
     list *options = s->options;
     if(!is_network(s)) error("First section must be [net] or [network]");
-    parse_net_options(options, &net);
-
-    params.h = net.h;
-    params.w = net.w;
-    params.c = net.c;
-    params.inputs = net.inputs;
-    params.batch = net.batch;
-    params.time_steps = net.time_steps;
+    parse_net_options(options, net);
+
+    params.h = net->h;
+    params.w = net->w;
+    params.c = net->c;
+    params.inputs = net->inputs;
+    params.batch = net->batch;
+    params.time_steps = net->time_steps;
     params.net = net;
 
     size_t workspace_size = 0;
@@ -617,14 +775,22 @@ network parse_network_cfg(char *filename)
         LAYER_TYPE lt = string_to_layer_type(s->type);
         if(lt == CONVOLUTIONAL){
             l = parse_convolutional(options, params);
+        }else if(lt == DECONVOLUTIONAL){
+            l = parse_deconvolutional(options, params);
         }else if(lt == LOCAL){
             l = parse_local(options, params);
         }else if(lt == ACTIVE){
             l = parse_activation(options, params);
+        }else if(lt == LOGXENT){
+            l = parse_logistic(options, params);
+        }else if(lt == L2NORM){
+            l = parse_l2norm(options, params);
         }else if(lt == RNN){
             l = parse_rnn(options, params);
         }else if(lt == GRU){
             l = parse_gru(options, params);
+        }else if (lt == LSTM) {
+            l = parse_lstm(options, params);
         }else if(lt == CRNN){
             l = parse_crnn(options, params);
         }else if(lt == CONNECTED){
@@ -635,11 +801,15 @@ network parse_network_cfg(char *filename)
             l = parse_cost(options, params);
         }else if(lt == REGION){
             l = parse_region(options, params);
+        }else if(lt == YOLO){
+            l = parse_yolo(options, params);
+        }else if(lt == ISEG){
+            l = parse_iseg(options, params);
         }else if(lt == DETECTION){
             l = parse_detection(options, params);
         }else if(lt == SOFTMAX){
             l = parse_softmax(options, params);
-            net.hierarchy = l.softmax_tree;
+            net->hierarchy = l.softmax_tree;
         }else if(lt == NORMALIZATION){
             l = parse_normalization(options, params);
         }else if(lt == BATCHNORM){
@@ -652,23 +822,33 @@ network parse_network_cfg(char *filename)
             l = parse_avgpool(options, params);
         }else if(lt == ROUTE){
             l = parse_route(options, params, net);
+        }else if(lt == UPSAMPLE){
+            l = parse_upsample(options, params, net);
         }else if(lt == SHORTCUT){
             l = parse_shortcut(options, params, net);
         }else if(lt == DROPOUT){
             l = parse_dropout(options, params);
-            l.output = net.layers[count-1].output;
-            l.delta = net.layers[count-1].delta;
+            l.output = net->layers[count-1].output;
+            l.delta = net->layers[count-1].delta;
 #ifdef GPU
-            l.output_gpu = net.layers[count-1].output_gpu;
-            l.delta_gpu = net.layers[count-1].delta_gpu;
+            l.output_gpu = net->layers[count-1].output_gpu;
+            l.delta_gpu = net->layers[count-1].delta_gpu;
 #endif
         }else{
             fprintf(stderr, "Type not recognized: %s\n", s->type);
         }
+        l.clip = net->clip;
+        l.truth = option_find_int_quiet(options, "truth", 0);
+        l.onlyforward = option_find_int_quiet(options, "onlyforward", 0);
+        l.stopbackward = option_find_int_quiet(options, "stopbackward", 0);
+        l.dontsave = option_find_int_quiet(options, "dontsave", 0);
         l.dontload = option_find_int_quiet(options, "dontload", 0);
+        l.numload = option_find_int_quiet(options, "numload", 0);
         l.dontloadscales = option_find_int_quiet(options, "dontloadscales", 0);
+        l.learning_rate_scale = option_find_float_quiet(options, "learning_rate", 1);
+        l.smooth = option_find_float_quiet(options, "smooth", 0);
         option_unused(options);
-        net.layers[count] = l;
+        net->layers[count] = l;
         if (l.workspace_size > workspace_size) workspace_size = l.workspace_size;
         free_section(s);
         n = n->next;
@@ -679,20 +859,30 @@ network parse_network_cfg(char *filename)
             params.c = l.out_c;
             params.inputs = l.outputs;
         }
-    }   
+    }
     free_list(sections);
-    net.outputs = get_network_output_size(net);
-    net.output = get_network_output(net);
+    layer out = get_network_output_layer(net);
+    net->outputs = out.outputs;
+    net->truths = out.outputs;
+    if(net->layers[net->n-1].truths) net->truths = net->layers[net->n-1].truths;
+    net->output = out.output;
+    net->input = calloc(net->inputs*net->batch, sizeof(float));
+    net->truth = calloc(net->truths*net->batch, sizeof(float));
+#ifdef GPU
+    net->output_gpu = out.output_gpu;
+    net->input_gpu = cuda_make_array(net->input, net->inputs*net->batch);
+    net->truth_gpu = cuda_make_array(net->truth, net->truths*net->batch);
+#endif
     if(workspace_size){
         //printf("%ld\n", workspace_size);
 #ifdef GPU
         if(gpu_index >= 0){
-            net.workspace = cuda_make_array(0, (workspace_size-1)/sizeof(float)+1);
+            net->workspace = cuda_make_array(0, (workspace_size-1)/sizeof(float)+1);
         }else {
-            net.workspace = calloc(1, workspace_size);
+            net->workspace = calloc(1, workspace_size);
         }
 #else
-        net.workspace = calloc(1, workspace_size);
+        net->workspace = calloc(1, workspace_size);
 #endif
     }
     return net;
@@ -704,7 +894,7 @@ list *read_cfg(char *filename)
     if(file == 0) file_error(filename);
     char *line;
     int nu = 0;
-    list *sections = make_list();
+    list *options = make_list();
     section *current = 0;
     while((line=fgetl(file)) != 0){
         ++ nu;
@@ -712,7 +902,7 @@ list *read_cfg(char *filename)
         switch(line[0]){
             case '[':
                 current = malloc(sizeof(section));
-                list_insert(sections, current);
+                list_insert(options, current);
                 current->options = make_list();
                 current->type = line;
                 break;
@@ -730,7 +920,7 @@ list *read_cfg(char *filename)
         }
     }
     fclose(file);
-    return sections;
+    return options;
 }
 
 void save_convolutional_weights_binary(layer l, FILE *fp)
@@ -776,7 +966,7 @@ void save_convolutional_weights(layer l, FILE *fp)
         pull_convolutional_layer(l);
     }
 #endif
-    int num = l.n*l.c*l.size*l.size;
+    int num = l.nweights;
     fwrite(l.biases, sizeof(float), l.n, fp);
     if (l.batch_normalize){
         fwrite(l.scales, sizeof(float), l.n, fp);
@@ -784,10 +974,6 @@ void save_convolutional_weights(layer l, FILE *fp)
         fwrite(l.rolling_variance, sizeof(float), l.n, fp);
     }
     fwrite(l.weights, sizeof(float), num, fp);
-    if(l.adam){
-        fwrite(l.m, sizeof(float), num, fp);
-        fwrite(l.v, sizeof(float), num, fp);
-    }
 }
 
 void save_batchnorm_weights(layer l, FILE *fp)
@@ -818,11 +1004,11 @@ void save_connected_weights(layer l, FILE *fp)
     }
 }
 
-void save_weights_upto(network net, char *filename, int cutoff)
+void save_weights_upto(network *net, char *filename, int cutoff)
 {
 #ifdef GPU
-    if(net.gpu_index >= 0){
-        cuda_set_device(net.gpu_index);
+    if(net->gpu_index >= 0){
+        cuda_set_device(net->gpu_index);
     }
 #endif
     fprintf(stderr, "Saving weights to %s\n", filename);
@@ -830,17 +1016,18 @@ void save_weights_upto(network net, char *filename, int cutoff)
     if(!fp) file_error(filename);
 
     int major = 0;
-    int minor = 1;
+    int minor = 2;
     int revision = 0;
     fwrite(&major, sizeof(int), 1, fp);
     fwrite(&minor, sizeof(int), 1, fp);
     fwrite(&revision, sizeof(int), 1, fp);
-    fwrite(net.seen, sizeof(int), 1, fp);
+    fwrite(net->seen, sizeof(size_t), 1, fp);
 
     int i;
-    for(i = 0; i < net.n && i < cutoff; ++i){
-        layer l = net.layers[i];
-        if(l.type == CONVOLUTIONAL){
+    for(i = 0; i < net->n && i < cutoff; ++i){
+        layer l = net->layers[i];
+        if (l.dontsave) continue;
+        if(l.type == CONVOLUTIONAL || l.type == DECONVOLUTIONAL){
             save_convolutional_weights(l, fp);
         } if(l.type == CONNECTED){
             save_connected_weights(l, fp);
@@ -850,14 +1037,29 @@ void save_weights_upto(network net, char *filename, int cutoff)
             save_connected_weights(*(l.input_layer), fp);
             save_connected_weights(*(l.self_layer), fp);
             save_connected_weights(*(l.output_layer), fp);
-        } if(l.type == GRU){
-            save_connected_weights(*(l.input_z_layer), fp);
-            save_connected_weights(*(l.input_r_layer), fp);
-            save_connected_weights(*(l.input_h_layer), fp);
-            save_connected_weights(*(l.state_z_layer), fp);
-            save_connected_weights(*(l.state_r_layer), fp);
-            save_connected_weights(*(l.state_h_layer), fp);
-        } if(l.type == CRNN){
+        } if (l.type == LSTM) {
+            save_connected_weights(*(l.wi), fp);
+            save_connected_weights(*(l.wf), fp);
+            save_connected_weights(*(l.wo), fp);
+            save_connected_weights(*(l.wg), fp);
+            save_connected_weights(*(l.ui), fp);
+            save_connected_weights(*(l.uf), fp);
+            save_connected_weights(*(l.uo), fp);
+            save_connected_weights(*(l.ug), fp);
+        } if (l.type == GRU) {
+            if(1){
+                save_connected_weights(*(l.wz), fp);
+                save_connected_weights(*(l.wr), fp);
+                save_connected_weights(*(l.wh), fp);
+                save_connected_weights(*(l.uz), fp);
+                save_connected_weights(*(l.ur), fp);
+                save_connected_weights(*(l.uh), fp);
+            }else{
+                save_connected_weights(*(l.reset_layer), fp);
+                save_connected_weights(*(l.update_layer), fp);
+                save_connected_weights(*(l.state_layer), fp);
+            }
+        }  if(l.type == CRNN){
             save_convolutional_weights(*(l.input_layer), fp);
             save_convolutional_weights(*(l.self_layer), fp);
             save_convolutional_weights(*(l.output_layer), fp);
@@ -875,9 +1077,9 @@ void save_weights_upto(network net, char *filename, int cutoff)
     }
     fclose(fp);
 }
-void save_weights(network net, char *filename)
+void save_weights(network *net, char *filename)
 {
-    save_weights_upto(net, filename, net.n);
+    save_weights_upto(net, filename, net->n);
 }
 
 void transpose_matrix(float *a, int rows, int cols)
@@ -965,7 +1167,8 @@ void load_convolutional_weights(layer l, FILE *fp)
         //load_convolutional_weights_binary(l, fp);
         //return;
     }
-    int num = l.n*l.c*l.size*l.size;
+    if(l.numload) l.n = l.numload;
+    int num = l.c/l.groups*l.n*l.size*l.size;
     fread(l.biases, sizeof(float), l.n, fp);
     if (l.batch_normalize && (!l.dontloadscales)){
         fread(l.scales, sizeof(float), l.n, fp);
@@ -986,12 +1189,19 @@ void load_convolutional_weights(layer l, FILE *fp)
             fill_cpu(l.n, 0, l.rolling_mean, 1);
             fill_cpu(l.n, 0, l.rolling_variance, 1);
         }
+        if(0){
+            int i;
+            for(i = 0; i < l.n; ++i){
+                printf("%g, ", l.rolling_mean[i]);
+            }
+            printf("\n");
+            for(i = 0; i < l.n; ++i){
+                printf("%g, ", l.rolling_variance[i]);
+            }
+            printf("\n");
+        }
     }
     fread(l.weights, sizeof(float), num, fp);
-    if(l.adam){
-        fread(l.m, sizeof(float), num, fp);
-        fread(l.v, sizeof(float), num, fp);
-    }
     //if(l.c == 3) scal_cpu(num, 1./256, l.weights, 1);
     if (l.flipped) {
         transpose_matrix(l.weights, l.c*l.size*l.size, l.n);
@@ -1005,7 +1215,7 @@ void load_convolutional_weights(layer l, FILE *fp)
 }
 
 
-void load_weights_upto(network *net, char *filename, int cutoff)
+void load_weights_upto(network *net, char *filename, int start, int cutoff)
 {
 #ifdef GPU
     if(net->gpu_index >= 0){
@@ -1023,14 +1233,20 @@ void load_weights_upto(network *net, char *filename, int cutoff)
     fread(&major, sizeof(int), 1, fp);
     fread(&minor, sizeof(int), 1, fp);
     fread(&revision, sizeof(int), 1, fp);
-    fread(net->seen, sizeof(int), 1, fp);
+    if ((major*10 + minor) >= 2 && major < 1000 && minor < 1000){
+        fread(net->seen, sizeof(size_t), 1, fp);
+    } else {
+        int iseen = 0;
+        fread(&iseen, sizeof(int), 1, fp);
+        *net->seen = iseen;
+    }
     int transpose = (major > 1000) || (minor > 1000);
 
     int i;
-    for(i = 0; i < net->n && i < cutoff; ++i){
+    for(i = start; i < net->n && i < cutoff; ++i){
         layer l = net->layers[i];
         if (l.dontload) continue;
-        if(l.type == CONVOLUTIONAL){
+        if(l.type == CONVOLUTIONAL || l.type == DECONVOLUTIONAL){
             load_convolutional_weights(l, fp);
         }
         if(l.type == CONNECTED){
@@ -1049,13 +1265,29 @@ void load_weights_upto(network *net, char *filename, int cutoff)
             load_connected_weights(*(l.self_layer), fp, transpose);
             load_connected_weights(*(l.output_layer), fp, transpose);
         }
-        if(l.type == GRU){
-            load_connected_weights(*(l.input_z_layer), fp, transpose);
-            load_connected_weights(*(l.input_r_layer), fp, transpose);
-            load_connected_weights(*(l.input_h_layer), fp, transpose);
-            load_connected_weights(*(l.state_z_layer), fp, transpose);
-            load_connected_weights(*(l.state_r_layer), fp, transpose);
-            load_connected_weights(*(l.state_h_layer), fp, transpose);
+        if (l.type == LSTM) {
+            load_connected_weights(*(l.wi), fp, transpose);
+            load_connected_weights(*(l.wf), fp, transpose);
+            load_connected_weights(*(l.wo), fp, transpose);
+            load_connected_weights(*(l.wg), fp, transpose);
+            load_connected_weights(*(l.ui), fp, transpose);
+            load_connected_weights(*(l.uf), fp, transpose);
+            load_connected_weights(*(l.uo), fp, transpose);
+            load_connected_weights(*(l.ug), fp, transpose);
+        }
+        if (l.type == GRU) {
+            if(1){
+                load_connected_weights(*(l.wz), fp, transpose);
+                load_connected_weights(*(l.wr), fp, transpose);
+                load_connected_weights(*(l.wh), fp, transpose);
+                load_connected_weights(*(l.uz), fp, transpose);
+                load_connected_weights(*(l.ur), fp, transpose);
+                load_connected_weights(*(l.uh), fp, transpose);
+            }else{
+                load_connected_weights(*(l.reset_layer), fp, transpose);
+                load_connected_weights(*(l.update_layer), fp, transpose);
+                load_connected_weights(*(l.state_layer), fp, transpose);
+            }
         }
         if(l.type == LOCAL){
             int locations = l.out_w*l.out_h;
@@ -1075,6 +1307,6 @@ void load_weights_upto(network *net, char *filename, int cutoff)
 
 void load_weights(network *net, char *filename)
 {
-    load_weights_upto(net, filename, net->n);
+    load_weights_upto(net, filename, 0, net->n);
 }
 
diff --git a/image.darknet/src/parser.h b/image.darknet/src/parser.h
index 6cff4fb..81aef2c 100644
--- a/image.darknet/src/parser.h
+++ b/image.darknet/src/parser.h
@@ -1,13 +1,9 @@
 #ifndef PARSER_H
 #define PARSER_H
+#include "darknet.h"
 #include "network.h"
 
-network parse_network_cfg(char *filename);
 void save_network(network net, char *filename);
-void save_weights(network net, char *filename);
-void save_weights_upto(network net, char *filename, int cutoff);
 void save_weights_double(network net, char *filename);
-void load_weights(network *net, char *filename);
-void load_weights_upto(network *net, char *filename, int cutoff);
 
 #endif
diff --git a/image.darknet/src/region_layer.c b/image.darknet/src/region_layer.c
index f5522c3..179f5e3 100644
--- a/image.darknet/src/region_layer.c
+++ b/image.darknet/src/region_layer.c
@@ -4,6 +4,7 @@
 #include "box.h"
 #include "cuda.h"
 #include "utils.h"
+
 #include <stdio.h>
 #include <assert.h>
 #include <string.h>
@@ -18,6 +19,10 @@ layer make_region_layer(int batch, int w, int h, int n, int classes, int coords)
     l.batch = batch;
     l.h = h;
     l.w = w;
+    l.c = n*(classes + coords + 1);
+    l.out_w = l.w;
+    l.out_h = l.h;
+    l.out_c = l.c;
     l.classes = classes;
     l.coords = coords;
     l.cost = calloc(1, sizeof(float));
@@ -25,7 +30,7 @@ layer make_region_layer(int batch, int w, int h, int n, int classes, int coords)
     l.bias_updates = calloc(n*2, sizeof(float));
     l.outputs = h*w*n*(classes + coords + 1);
     l.inputs = l.outputs;
-    l.truths = 30*(5);
+    l.truths = 30*(l.coords + 1);
     l.delta = calloc(batch*l.outputs, sizeof(float));
     l.output = calloc(batch*l.outputs, sizeof(float));
     int i;
@@ -68,19 +73,19 @@ void resize_region_layer(layer *l, int w, int h)
 #endif
 }
 
-box get_region_box(float *x, float *biases, int n, int index, int i, int j, int w, int h)
+box get_region_box(float *x, float *biases, int n, int index, int i, int j, int w, int h, int stride)
 {
     box b;
-    b.x = (i + logistic_activate(x[index + 0])) / w;
-    b.y = (j + logistic_activate(x[index + 1])) / h;
-    b.w = exp(x[index + 2]) * biases[2*n]   / w;
-    b.h = exp(x[index + 3]) * biases[2*n+1] / h;
+    b.x = (i + x[index + 0*stride]) / w;
+    b.y = (j + x[index + 1*stride]) / h;
+    b.w = exp(x[index + 2*stride]) * biases[2*n]   / w;
+    b.h = exp(x[index + 3*stride]) * biases[2*n+1] / h;
     return b;
 }
 
-float delta_region_box(box truth, float *x, float *biases, int n, int index, int i, int j, int w, int h, float *delta, float scale)
+float delta_region_box(box truth, float *x, float *biases, int n, int index, int i, int j, int w, int h, float *delta, float scale, int stride)
 {
-    box pred = get_region_box(x, biases, n, index, i, j, w, h);
+    box pred = get_region_box(x, biases, n, index, i, j, w, h, stride);
     float iou = box_iou(pred, truth);
 
     float tx = (truth.x*w - i);
@@ -88,34 +93,47 @@ float delta_region_box(box truth, float *x, float *biases, int n, int index, int
     float tw = log(truth.w*w / biases[2*n]);
     float th = log(truth.h*h / biases[2*n + 1]);
 
-    delta[index + 0] = scale * (tx - logistic_activate(x[index + 0])) * logistic_gradient(logistic_activate(x[index + 0]));
-    delta[index + 1] = scale * (ty - logistic_activate(x[index + 1])) * logistic_gradient(logistic_activate(x[index + 1]));
-    delta[index + 2] = scale * (tw - x[index + 2]);
-    delta[index + 3] = scale * (th - x[index + 3]);
+    delta[index + 0*stride] = scale * (tx - x[index + 0*stride]);
+    delta[index + 1*stride] = scale * (ty - x[index + 1*stride]);
+    delta[index + 2*stride] = scale * (tw - x[index + 2*stride]);
+    delta[index + 3*stride] = scale * (th - x[index + 3*stride]);
     return iou;
 }
 
-void delta_region_class(float *output, float *delta, int index, int class, int classes, tree *hier, float scale, float *avg_cat)
+/* Writes delta[index + i*stride] = scale*(truth[i] - x[index + i*stride]) for i in [0, n); `scale` is float so a fractional weight is not truncated to 0. */
+void delta_region_mask(float *truth, float *x, int n, int index, float *delta, int stride, float scale)
+{
+    int i;
+    for(i = 0; i < n; ++i){
+        delta[index + i*stride] = scale*(truth[i] - x[index + i*stride]);
+    }
+}
+
+void delta_region_class(float *output, float *delta, int index, int class, int classes, tree *hier, float scale, int stride, float *avg_cat, int tag)
 {
     int i, n;
     if(hier){
         float pred = 1;
         while(class >= 0){
-            pred *= output[index + class];
+            pred *= output[index + stride*class];
             int g = hier->group[class];
             int offset = hier->group_offset[g];
             for(i = 0; i < hier->group_size[g]; ++i){
-                delta[index + offset + i] = scale * (0 - output[index + offset + i]);
+                delta[index + stride*(offset + i)] = scale * (0 - output[index + stride*(offset + i)]);
             }
-            delta[index + class] = scale * (1 - output[index + class]);
+            delta[index + stride*class] = scale * (1 - output[index + stride*class]);
 
             class = hier->parent[class];
         }
         *avg_cat += pred;
     } else {
+        if (delta[index] && tag){
+            delta[index + stride*class] = scale * (1 - output[index + stride*class]);
+            return;
+        }
         for(n = 0; n < classes; ++n){
-            delta[index + n] = scale * (((n == class)?1 : 0) - output[index + n]);
-            if(n == class) *avg_cat += output[index + n];
+            delta[index + stride*n] = scale * (((n == class)?1 : 0) - output[index + stride*n]);
+            if(n == class) *avg_cat += output[index + stride*n];
         }
     }
 }
@@ -130,42 +148,45 @@ float tisnan(float x)
     return (x != x);
 }
 
-void softmax_tree(float *input, int batch, int inputs, float temp, tree *hierarchy, float *output);
-void forward_region_layer(const layer l, network_state state)
+/* Flat offset: batch*l.outputs + (location / (w*h))*(w*h)*(coords+classes+1) + entry*(w*h) + (location % (w*h)). */
+int entry_index(layer l, int batch, int location, int entry)
+{
+    int plane = l.w*l.h, anchor = location / plane, cell = location % plane;
+    return batch*l.outputs + anchor*plane*(l.coords+l.classes+1) + entry*plane + cell;
+}
+
+void forward_region_layer(const layer l, network net)
 {
     int i,j,b,t,n;
-    int size = l.coords + l.classes + 1;
-    memcpy(l.output, state.input, l.outputs*l.batch*sizeof(float));
+    memcpy(l.output, net.input, l.outputs*l.batch*sizeof(float));
+
 #ifndef GPU
-    flatten(l.output, l.w*l.h, size*l.n, l.batch, 1);
-#endif
     for (b = 0; b < l.batch; ++b){
-        for(i = 0; i < l.h*l.w*l.n; ++i){
-            int index = size*i + b*l.outputs;
-            l.output[index + 4] = logistic_activate(l.output[index + 4]);
+        for(n = 0; n < l.n; ++n){
+            int index = entry_index(l, b, n*l.w*l.h, 0);
+            activate_array(l.output + index, 2*l.w*l.h, LOGISTIC);
+            index = entry_index(l, b, n*l.w*l.h, l.coords);
+            if(!l.background) activate_array(l.output + index,   l.w*l.h, LOGISTIC);
+            index = entry_index(l, b, n*l.w*l.h, l.coords + 1);
+            if(!l.softmax && !l.softmax_tree) activate_array(l.output + index, l.classes*l.w*l.h, LOGISTIC);
         }
     }
-
-
-#ifndef GPU
     if (l.softmax_tree){
-        for (b = 0; b < l.batch; ++b){
-            for(i = 0; i < l.h*l.w*l.n; ++i){
-                int index = size*i + b*l.outputs;
-                softmax_tree(l.output + index + 5, 1, 0, 1, l.softmax_tree, l.output + index + 5);
-            }
+        int i;
+        int count = l.coords + 1;
+        for (i = 0; i < l.softmax_tree->groups; ++i) {
+            int group_size = l.softmax_tree->group_size[i];
+            softmax_cpu(net.input + count, group_size, l.batch, l.inputs, l.n*l.w*l.h, 1, l.n*l.w*l.h, l.temperature, l.output + count);
+            count += group_size;
         }
     } else if (l.softmax){
-        for (b = 0; b < l.batch; ++b){
-            for(i = 0; i < l.h*l.w*l.n; ++i){
-                int index = size*i + b*l.outputs;
-                softmax(l.output + index + 5, l.classes, 1, l.output + index + 5);
-            }
-        }
+        int index = entry_index(l, 0, 0, l.coords + !l.background);
+        softmax_cpu(net.input + index, l.classes + l.background, l.batch*l.n, l.inputs/l.n, l.w*l.h, 1, l.w*l.h, 1, l.output + index);
     }
 #endif
-    if(!state.train) return;
+
     memset(l.delta, 0, l.outputs * l.batch * sizeof(float));
+    if(!net.train) return;
     float avg_iou = 0;
     float recall = 0;
     float avg_cat = 0;
@@ -178,26 +199,29 @@ void forward_region_layer(const layer l, network_state state)
         if(l.softmax_tree){
             int onlyclass = 0;
             for(t = 0; t < 30; ++t){
-                box truth = float_to_box(state.truth + t*5 + b*l.truths);
+                box truth = float_to_box(net.truth + t*(l.coords + 1) + b*l.truths, 1);
                 if(!truth.x) break;
-                int class = state.truth[t*5 + b*l.truths + 4];
+                int class = net.truth[t*(l.coords + 1) + b*l.truths + l.coords];
                 float maxp = 0;
                 int maxi = 0;
                 if(truth.x > 100000 && truth.y > 100000){
                     for(n = 0; n < l.n*l.w*l.h; ++n){
-                        int index = size*n + b*l.outputs + 5;
-                        float scale =  l.output[index-1];
-                        l.delta[index - 1] = l.noobject_scale * ((0 - l.output[index - 1]) * logistic_gradient(l.output[index - 1]));
-                        float p = scale*get_hierarchy_probability(l.output + index, l.softmax_tree, class);
+                        int class_index = entry_index(l, b, n, l.coords + 1);
+                        int obj_index = entry_index(l, b, n, l.coords);
+                        float scale =  l.output[obj_index];
+                        l.delta[obj_index] = l.noobject_scale * (0 - l.output[obj_index]);
+                        float p = scale*get_hierarchy_probability(l.output + class_index, l.softmax_tree, class, l.w*l.h);
                         if(p > maxp){
                             maxp = p;
                             maxi = n;
                         }
                     }
-                    int index = size*maxi + b*l.outputs + 5;
-                    delta_region_class(l.output, l.delta, index, class, l.classes, l.softmax_tree, l.class_scale, &avg_cat);
-                    if(l.output[index - 1] < .3) l.delta[index - 1] = l.object_scale * ((.3 - l.output[index - 1]) * logistic_gradient(l.output[index - 1]));
-                    else  l.delta[index - 1] = 0;
+                    int class_index = entry_index(l, b, maxi, l.coords + 1);
+                    int obj_index = entry_index(l, b, maxi, l.coords);
+                    delta_region_class(l.output, l.delta, class_index, class, l.classes, l.softmax_tree, l.class_scale, l.w*l.h, &avg_cat, !l.softmax);
+                    if(l.output[obj_index] < .3) l.delta[obj_index] = l.object_scale * (.3 - l.output[obj_index]);
+                    else  l.delta[obj_index] = 0;
+                    l.delta[obj_index] = 0;
                     ++class_count;
                     onlyclass = 1;
                     break;
@@ -208,190 +232,276 @@ void forward_region_layer(const layer l, network_state state)
         for (j = 0; j < l.h; ++j) {
             for (i = 0; i < l.w; ++i) {
                 for (n = 0; n < l.n; ++n) {
-                    int index = size*(j*l.w*l.n + i*l.n + n) + b*l.outputs;
-                    box pred = get_region_box(l.output, l.biases, n, index, i, j, l.w, l.h);
+                    int box_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 0);
+                    box pred = get_region_box(l.output, l.biases, n, box_index, i, j, l.w, l.h, l.w*l.h);
                     float best_iou = 0;
                     for(t = 0; t < 30; ++t){
-                        box truth = float_to_box(state.truth + t*5 + b*l.truths);
+                        box truth = float_to_box(net.truth + t*(l.coords + 1) + b*l.truths, 1);
                         if(!truth.x) break;
                         float iou = box_iou(pred, truth);
                         if (iou > best_iou) {
                             best_iou = iou;
                         }
                     }
-                    avg_anyobj += l.output[index + 4];
-                    l.delta[index + 4] = l.noobject_scale * ((0 - l.output[index + 4]) * logistic_gradient(l.output[index + 4]));
+                    int obj_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, l.coords);
+                    avg_anyobj += l.output[obj_index];
+                    l.delta[obj_index] = l.noobject_scale * (0 - l.output[obj_index]);
+                    if(l.background) l.delta[obj_index] = l.noobject_scale * (1 - l.output[obj_index]);
                     if (best_iou > l.thresh) {
-                        l.delta[index + 4] = 0;
+                        l.delta[obj_index] = 0;
                     }
 
-                    if(*(state.net.seen) < 12800){
+                    if(*(net.seen) < 12800){
                         box truth = {0};
                         truth.x = (i + .5)/l.w;
                         truth.y = (j + .5)/l.h;
                         truth.w = l.biases[2*n]/l.w;
                         truth.h = l.biases[2*n+1]/l.h;
-                        delta_region_box(truth, l.output, l.biases, n, index, i, j, l.w, l.h, l.delta, .01);
+                        delta_region_box(truth, l.output, l.biases, n, box_index, i, j, l.w, l.h, l.delta, .01, l.w*l.h);
                     }
                 }
             }
         }
         for(t = 0; t < 30; ++t){
-            box truth = float_to_box(state.truth + t*5 + b*l.truths);
+            box truth = float_to_box(net.truth + t*(l.coords + 1) + b*l.truths, 1);
 
             if(!truth.x) break;
             float best_iou = 0;
-            int best_index = 0;
             int best_n = 0;
             i = (truth.x * l.w);
             j = (truth.y * l.h);
-            //printf("%d %f %d %f\n", i, truth.x*l.w, j, truth.y*l.h);
             box truth_shift = truth;
             truth_shift.x = 0;
             truth_shift.y = 0;
-            //printf("index %d %d\n",i, j);
             for(n = 0; n < l.n; ++n){
-                int index = size*(j*l.w*l.n + i*l.n + n) + b*l.outputs;
-                box pred = get_region_box(l.output, l.biases, n, index, i, j, l.w, l.h);
+                int box_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 0);
+                box pred = get_region_box(l.output, l.biases, n, box_index, i, j, l.w, l.h, l.w*l.h);
                 if(l.bias_match){
                     pred.w = l.biases[2*n]/l.w;
                     pred.h = l.biases[2*n+1]/l.h;
                 }
-                //printf("pred: (%f, %f) %f x %f\n", pred.x, pred.y, pred.w, pred.h);
                 pred.x = 0;
                 pred.y = 0;
                 float iou = box_iou(pred, truth_shift);
                 if (iou > best_iou){
-                    best_index = index;
                     best_iou = iou;
                     best_n = n;
                 }
             }
-            //printf("%d %f (%f, %f) %f x %f\n", best_n, best_iou, truth.x, truth.y, truth.w, truth.h);
 
-            float iou = delta_region_box(truth, l.output, l.biases, best_n, best_index, i, j, l.w, l.h, l.delta, l.coord_scale);
+            int box_index = entry_index(l, b, best_n*l.w*l.h + j*l.w + i, 0);
+            float iou = delta_region_box(truth, l.output, l.biases, best_n, box_index, i, j, l.w, l.h, l.delta, l.coord_scale *  (2 - truth.w*truth.h), l.w*l.h);
+            if(l.coords > 4){
+                int mask_index = entry_index(l, b, best_n*l.w*l.h + j*l.w + i, 4);
+                delta_region_mask(net.truth + t*(l.coords + 1) + b*l.truths + 5, l.output, l.coords - 4, mask_index, l.delta, l.w*l.h, l.mask_scale);
+            }
             if(iou > .5) recall += 1;
             avg_iou += iou;
 
-            //l.delta[best_index + 4] = iou - l.output[best_index + 4];
-            avg_obj += l.output[best_index + 4];
-            l.delta[best_index + 4] = l.object_scale * (1 - l.output[best_index + 4]) * logistic_gradient(l.output[best_index + 4]);
+            int obj_index = entry_index(l, b, best_n*l.w*l.h + j*l.w + i, l.coords);
+            avg_obj += l.output[obj_index];
+            l.delta[obj_index] = l.object_scale * (1 - l.output[obj_index]);
             if (l.rescore) {
-                l.delta[best_index + 4] = l.object_scale * (iou - l.output[best_index + 4]) * logistic_gradient(l.output[best_index + 4]);
+                l.delta[obj_index] = l.object_scale * (iou - l.output[obj_index]);
+            }
+            if(l.background){
+                l.delta[obj_index] = l.object_scale * (0 - l.output[obj_index]);
             }
 
-
-            int class = state.truth[t*5 + b*l.truths + 4];
+            int class = net.truth[t*(l.coords + 1) + b*l.truths + l.coords];
             if (l.map) class = l.map[class];
-            delta_region_class(l.output, l.delta, best_index + 5, class, l.classes, l.softmax_tree, l.class_scale, &avg_cat);
+            int class_index = entry_index(l, b, best_n*l.w*l.h + j*l.w + i, l.coords + 1);
+            delta_region_class(l.output, l.delta, class_index, class, l.classes, l.softmax_tree, l.class_scale, l.w*l.h, &avg_cat, !l.softmax);
             ++count;
             ++class_count;
         }
     }
-    //printf("\n");
-#ifndef GPU
-    flatten(l.delta, l.w*l.h, size*l.n, l.batch, 0);
-#endif
     *(l.cost) = pow(mag_array(l.delta, l.outputs * l.batch), 2);
     printf("Region Avg IOU: %f, Class: %f, Obj: %f, No Obj: %f, Avg Recall: %f,  count: %d\n", avg_iou/count, avg_cat/class_count, avg_obj/count, avg_anyobj/(l.w*l.h*l.n*l.batch), recall/count, count);
 }
 
-void backward_region_layer(const layer l, network_state state)
+void backward_region_layer(const layer l, network net)
 {
-    axpy_cpu(l.batch*l.inputs, 1, l.delta, 1, state.delta, 1);
+    /*
+       int b;
+       int size = l.coords + l.classes + 1;
+       for (b = 0; b < l.batch*l.n; ++b){
+       int index = (b*size + 4)*l.w*l.h;
+       gradient_array(l.output + index, l.w*l.h, LOGISTIC, l.delta + index);
+       }
+       axpy_cpu(l.batch*l.inputs, 1, l.delta, 1, net.delta, 1);
+     */
+}
+
+void correct_region_boxes(detection *dets, int n, int w, int h, int netw, int neth, int relative)
+{
+    int i;
+    int new_w=0;
+    int new_h=0;
+    if (((float)netw/w) < ((float)neth/h)) {
+        new_w = netw;
+        new_h = (h * netw)/w;
+    } else {
+        new_h = neth;
+        new_w = (w * neth)/h;
+    }
+    for (i = 0; i < n; ++i){
+        box b = dets[i].bbox;
+        b.x =  (b.x - (netw - new_w)/2./netw) / ((float)new_w/netw); 
+        b.y =  (b.y - (neth - new_h)/2./neth) / ((float)new_h/neth); 
+        b.w *= (float)netw/new_w;
+        b.h *= (float)neth/new_h;
+        if(!relative){
+            b.x *= w;
+            b.w *= w;
+            b.y *= h;
+            b.h *= h;
+        }
+        dets[i].bbox = b;
+    }
 }
 
-void get_region_boxes(layer l, int w, int h, float thresh, float **probs, box *boxes, int only_objectness, int *map, float tree_thresh)
+void get_region_detections(layer l, int w, int h, int netw, int neth, float thresh, int *map, float tree_thresh, int relative, detection *dets)
 {
-    int i,j,n;
+    int i,j,n,z;
     float *predictions = l.output;
+    if (l.batch == 2) {
+        float *flip = l.output + l.outputs;
+        for (j = 0; j < l.h; ++j) {
+            for (i = 0; i < l.w/2; ++i) {
+                for (n = 0; n < l.n; ++n) {
+                    for(z = 0; z < l.classes + l.coords + 1; ++z){
+                        int i1 = z*l.w*l.h*l.n + n*l.w*l.h + j*l.w + i;
+                        int i2 = z*l.w*l.h*l.n + n*l.w*l.h + j*l.w + (l.w - i - 1);
+                        float swap = flip[i1];
+                        flip[i1] = flip[i2];
+                        flip[i2] = swap;
+                        if(z == 0){
+                            flip[i1] = -flip[i1];
+                            flip[i2] = -flip[i2];
+                        }
+                    }
+                }
+            }
+        }
+        for(i = 0; i < l.outputs; ++i){
+            l.output[i] = (l.output[i] + flip[i])/2.;
+        }
+    }
     for (i = 0; i < l.w*l.h; ++i){
         int row = i / l.w;
         int col = i % l.w;
         for(n = 0; n < l.n; ++n){
-            int index = i*l.n + n;
-            int p_index = index * (l.classes + 5) + 4;
-            float scale = predictions[p_index];
-            int box_index = index * (l.classes + 5);
-            boxes[index] = get_region_box(predictions, l.biases, n, box_index, col, row, l.w, l.h);
-            boxes[index].x *= w;
-            boxes[index].y *= h;
-            boxes[index].w *= w;
-            boxes[index].h *= h;
-
-            int class_index = index * (l.classes + 5) + 5;
+            int index = n*l.w*l.h + i;
+            for(j = 0; j < l.classes; ++j){
+                dets[index].prob[j] = 0;
+            }
+            int obj_index  = entry_index(l, 0, n*l.w*l.h + i, l.coords);
+            int box_index  = entry_index(l, 0, n*l.w*l.h + i, 0);
+            int mask_index = entry_index(l, 0, n*l.w*l.h + i, 4);
+            float scale = l.background ? 1 : predictions[obj_index];
+            dets[index].bbox = get_region_box(predictions, l.biases, n, box_index, col, row, l.w, l.h, l.w*l.h);
+            dets[index].objectness = scale > thresh ? scale : 0;
+            if(dets[index].mask){
+                for(j = 0; j < l.coords - 4; ++j){
+                    dets[index].mask[j] = l.output[mask_index + j*l.w*l.h];
+                }
+            }
+
+            int class_index = entry_index(l, 0, n*l.w*l.h + i, l.coords + !l.background);
             if(l.softmax_tree){
 
-                hierarchy_predictions(predictions + class_index, l.classes, l.softmax_tree, 0);
+                hierarchy_predictions(predictions + class_index, l.classes, l.softmax_tree, 0, l.w*l.h);
                 if(map){
                     for(j = 0; j < 200; ++j){
-                        float prob = scale*predictions[class_index+map[j]];
-                        probs[index][j] = (prob > thresh) ? prob : 0;
+                        int class_index = entry_index(l, 0, n*l.w*l.h + i, l.coords + 1 + map[j]);
+                        float prob = scale*predictions[class_index];
+                        dets[index].prob[j] = (prob > thresh) ? prob : 0;
                     }
                 } else {
-                    int j =  hierarchy_top_prediction(predictions + class_index, l.softmax_tree, tree_thresh);
-                    probs[index][j] = (scale > thresh) ? scale : 0;
-                    probs[index][l.classes] = scale;
+                    int j =  hierarchy_top_prediction(predictions + class_index, l.softmax_tree, tree_thresh, l.w*l.h);
+                    dets[index].prob[j] = (scale > thresh) ? scale : 0;
                 }
             } else {
-                for(j = 0; j < l.classes; ++j){
-                    float prob = scale*predictions[class_index+j];
-                    probs[index][j] = (prob > thresh) ? prob : 0;
+                if(dets[index].objectness){
+                    for(j = 0; j < l.classes; ++j){
+                        int class_index = entry_index(l, 0, n*l.w*l.h + i, l.coords + 1 + j);
+                        float prob = scale*predictions[class_index];
+                        dets[index].prob[j] = (prob > thresh) ? prob : 0;
+                    }
                 }
             }
-            if(only_objectness){
-                probs[index][0] = scale;
-            }
         }
     }
+    correct_region_boxes(dets, l.w*l.h*l.n, w, h, netw, neth, relative);
 }
 
 #ifdef GPU
 
-void forward_region_layer_gpu(const layer l, network_state state)
+void forward_region_layer_gpu(const layer l, network net)
 {
-    /*
-       if(!state.train){
-       copy_ongpu(l.batch*l.inputs, state.input, 1, l.output_gpu, 1);
-       return;
-       }
-     */
-    flatten_ongpu(state.input, l.h*l.w, l.n*(l.coords + l.classes + 1), l.batch, 1, l.output_gpu);
-    if(l.softmax_tree){
-        int i;
-        int count = 5;
-        for (i = 0; i < l.softmax_tree->groups; ++i) {
-            int group_size = l.softmax_tree->group_size[i];
-            softmax_gpu(l.output_gpu+count, group_size, l.classes + 5, l.w*l.h*l.n*l.batch, 1, l.output_gpu + count);
-            count += group_size;
+    copy_gpu(l.batch*l.inputs, net.input_gpu, 1, l.output_gpu, 1);
+    int b, n;
+    for (b = 0; b < l.batch; ++b){
+        for(n = 0; n < l.n; ++n){
+            int index = entry_index(l, b, n*l.w*l.h, 0);
+            activate_array_gpu(l.output_gpu + index, 2*l.w*l.h, LOGISTIC);
+            if(l.coords > 4){
+                index = entry_index(l, b, n*l.w*l.h, 4);
+                activate_array_gpu(l.output_gpu + index, (l.coords - 4)*l.w*l.h, LOGISTIC);
+            }
+            index = entry_index(l, b, n*l.w*l.h, l.coords);
+            if(!l.background) activate_array_gpu(l.output_gpu + index,   l.w*l.h, LOGISTIC);
+            index = entry_index(l, b, n*l.w*l.h, l.coords + 1);
+            if(!l.softmax && !l.softmax_tree) activate_array_gpu(l.output_gpu + index, l.classes*l.w*l.h, LOGISTIC);
         }
-    }else if (l.softmax){
-        softmax_gpu(l.output_gpu+5, l.classes, l.classes + 5, l.w*l.h*l.n*l.batch, 1, l.output_gpu + 5);
     }
-
-    float *in_cpu = calloc(l.batch*l.inputs, sizeof(float));
-    float *truth_cpu = 0;
-    if(state.truth){
-        int num_truth = l.batch*l.truths;
-        truth_cpu = calloc(num_truth, sizeof(float));
-        cuda_pull_array(state.truth, truth_cpu, num_truth);
+    if (l.softmax_tree){
+        int index = entry_index(l, 0, 0, l.coords + 1);
+        softmax_tree(net.input_gpu + index, l.w*l.h, l.batch*l.n, l.inputs/l.n, 1, l.output_gpu + index, *l.softmax_tree);
+    } else if (l.softmax) {
+        int index = entry_index(l, 0, 0, l.coords + !l.background);
+        softmax_gpu(net.input_gpu + index, l.classes + l.background, l.batch*l.n, l.inputs/l.n, l.w*l.h, 1, l.w*l.h, 1, l.output_gpu + index);
     }
-    cuda_pull_array(l.output_gpu, in_cpu, l.batch*l.inputs);
-    network_state cpu_state = state;
-    cpu_state.train = state.train;
-    cpu_state.truth = truth_cpu;
-    cpu_state.input = in_cpu;
-    forward_region_layer(l, cpu_state);
+    if(!net.train || l.onlyforward){
+        cuda_pull_array(l.output_gpu, l.output, l.batch*l.outputs);
+        return;
+    }
+
+    cuda_pull_array(l.output_gpu, net.input, l.batch*l.inputs);
+    forward_region_layer(l, net);
     //cuda_push_array(l.output_gpu, l.output, l.batch*l.outputs);
-    free(cpu_state.input);
-    if(!state.train) return;
+    if(!net.train) return;
     cuda_push_array(l.delta_gpu, l.delta, l.batch*l.outputs);
-    if(cpu_state.truth) free(cpu_state.truth);
 }
 
-void backward_region_layer_gpu(layer l, network_state state)
+void backward_region_layer_gpu(const layer l, network net)
 {
-    flatten_ongpu(l.delta_gpu, l.h*l.w, l.n*(l.coords + l.classes + 1), l.batch, 0, state.delta);
+    int b, n;
+    for (b = 0; b < l.batch; ++b){
+        for(n = 0; n < l.n; ++n){
+            int index = entry_index(l, b, n*l.w*l.h, 0);
+            gradient_array_gpu(l.output_gpu + index, 2*l.w*l.h, LOGISTIC, l.delta_gpu + index);
+            if(l.coords > 4){
+                index = entry_index(l, b, n*l.w*l.h, 4);
+                gradient_array_gpu(l.output_gpu + index, (l.coords - 4)*l.w*l.h, LOGISTIC, l.delta_gpu + index);
+            }
+            index = entry_index(l, b, n*l.w*l.h, l.coords);
+            if(!l.background) gradient_array_gpu(l.output_gpu + index,   l.w*l.h, LOGISTIC, l.delta_gpu + index);
+        }
+    }
+    axpy_gpu(l.batch*l.inputs, 1, l.delta_gpu, 1, net.delta_gpu, 1);
 }
 #endif
 
+void zero_objectness(layer l)
+{
+    int i, n;
+    for (i = 0; i < l.w*l.h; ++i){
+        for(n = 0; n < l.n; ++n){
+            int obj_index = entry_index(l, 0, n*l.w*l.h + i, l.coords);
+            l.output[obj_index] = 0;
+        }
+    }
+}
+
diff --git a/image.darknet/src/region_layer.h b/image.darknet/src/region_layer.h
index 9a3b7cd..9f12fd1 100644
--- a/image.darknet/src/region_layer.h
+++ b/image.darknet/src/region_layer.h
@@ -1,18 +1,18 @@
 #ifndef REGION_LAYER_H
 #define REGION_LAYER_H
 
+#include "darknet.h"
 #include "layer.h"
 #include "network.h"
 
-layer make_region_layer(int batch, int h, int w, int n, int classes, int coords);
-void forward_region_layer(const layer l, network_state state);
-void backward_region_layer(const layer l, network_state state);
-void get_region_boxes(layer l, int w, int h, float thresh, float **probs, box *boxes, int only_objectness, int *map, float tree_thresh);
+layer make_region_layer(int batch, int w, int h, int n, int classes, int coords);
+void forward_region_layer(const layer l, network net);
+void backward_region_layer(const layer l, network net);
 void resize_region_layer(layer *l, int w, int h);
 
 #ifdef GPU
-void forward_region_layer_gpu(const layer l, network_state state);
-void backward_region_layer_gpu(layer l, network_state state);
+void forward_region_layer_gpu(const layer l, network net);
+void backward_region_layer_gpu(layer l, network net);
 #endif
 
 #endif
diff --git a/image.darknet/src/reorg_layer.c b/image.darknet/src/reorg_layer.c
index 2abca8f..31d6b84 100644
--- a/image.darknet/src/reorg_layer.c
+++ b/image.darknet/src/reorg_layer.c
@@ -1,18 +1,21 @@
 #include "reorg_layer.h"
 #include "cuda.h"
 #include "blas.h"
+
 #include <stdio.h>
 
 
-layer make_reorg_layer(int batch, int w, int h, int c, int stride, int reverse)
+layer make_reorg_layer(int batch, int w, int h, int c, int stride, int reverse, int flatten, int extra)
 {
     layer l = {0};
     l.type = REORG;
     l.batch = batch;
     l.stride = stride;
+    l.extra = extra;
     l.h = h;
     l.w = w;
     l.c = c;
+    l.flatten = flatten;
     if(reverse){
         l.out_w = w*stride;
         l.out_h = h*stride;
@@ -23,10 +26,20 @@ layer make_reorg_layer(int batch, int w, int h, int c, int stride, int reverse)
         l.out_c = c*(stride*stride);
     }
     l.reverse = reverse;
-    fprintf(stderr, "reorg              /%2d  %4d x%4d x%4d   ->  %4d x%4d x%4d\n",  stride, w, h, c, l.out_w, l.out_h, l.out_c);
+
     l.outputs = l.out_h * l.out_w * l.out_c;
     l.inputs = h*w*c;
-    int output_size = l.out_h * l.out_w * l.out_c * batch;
+    if(l.extra){
+        l.out_w = l.out_h = l.out_c = 0;
+        l.outputs = l.inputs + l.extra;
+    }
+
+    if(extra){
+        fprintf(stderr, "reorg              %4d   ->  %4d\n",  l.inputs, l.outputs);
+    } else {
+        fprintf(stderr, "reorg              /%2d  %4d x%4d x%4d   ->  %4d x%4d x%4d\n",  stride, w, h, c, l.out_w, l.out_h, l.out_c);
+    }
+    int output_size = l.outputs * batch;
     l.output =  calloc(output_size, sizeof(float));
     l.delta =   calloc(output_size, sizeof(float));
 
@@ -75,40 +88,86 @@ void resize_reorg_layer(layer *l, int w, int h)
 #endif
 }
 
-void forward_reorg_layer(const layer l, network_state state)
+void forward_reorg_layer(const layer l, network net)
 {
-    if(l.reverse){
-        reorg_cpu(state.input, l.w, l.h, l.c, l.batch, l.stride, 1, l.output);
-    }else {
-        reorg_cpu(state.input, l.w, l.h, l.c, l.batch, l.stride, 0, l.output);
+    int i;
+    if(l.flatten){
+        memcpy(l.output, net.input, l.outputs*l.batch*sizeof(float));
+        if(l.reverse){
+            flatten(l.output, l.w*l.h, l.c, l.batch, 0);
+        }else{
+            flatten(l.output, l.w*l.h, l.c, l.batch, 1);
+        }
+    } else if (l.extra) {
+        for(i = 0; i < l.batch; ++i){
+            copy_cpu(l.inputs, net.input + i*l.inputs, 1, l.output + i*l.outputs, 1);
+        }
+    } else if (l.reverse){
+        reorg_cpu(net.input, l.w, l.h, l.c, l.batch, l.stride, 1, l.output);
+    } else {
+        reorg_cpu(net.input, l.w, l.h, l.c, l.batch, l.stride, 0, l.output);
     }
 }
 
-void backward_reorg_layer(const layer l, network_state state)
+void backward_reorg_layer(const layer l, network net)
 {
-    if(l.reverse){
-        reorg_cpu(l.delta, l.w, l.h, l.c, l.batch, l.stride, 0, state.delta);
+    int i;
+    if(l.flatten){
+        memcpy(net.delta, l.delta, l.outputs*l.batch*sizeof(float));
+        if(l.reverse){
+            flatten(net.delta, l.w*l.h, l.c, l.batch, 1);
+        }else{
+            flatten(net.delta, l.w*l.h, l.c, l.batch, 0);
+        }
+    } else if(l.reverse){
+        reorg_cpu(l.delta, l.w, l.h, l.c, l.batch, l.stride, 0, net.delta);
+    } else if (l.extra) {
+        for(i = 0; i < l.batch; ++i){
+            copy_cpu(l.inputs, l.delta + i*l.outputs, 1, net.delta + i*l.inputs, 1);
+        }
     }else{
-        reorg_cpu(l.delta, l.w, l.h, l.c, l.batch, l.stride, 1, state.delta);
+        reorg_cpu(l.delta, l.w, l.h, l.c, l.batch, l.stride, 1, net.delta);
     }
 }
 
 #ifdef GPU
-void forward_reorg_layer_gpu(layer l, network_state state)
+void forward_reorg_layer_gpu(layer l, network net)
 {
-    if(l.reverse){
-        reorg_ongpu(state.input, l.w, l.h, l.c, l.batch, l.stride, 1, l.output_gpu);
+    int i;
+    if(l.flatten){
+        if(l.reverse){
+            flatten_gpu(net.input_gpu, l.w*l.h, l.c, l.batch, 0, l.output_gpu);
+        }else{
+            flatten_gpu(net.input_gpu, l.w*l.h, l.c, l.batch, 1, l.output_gpu);
+        }
+    } else if (l.extra) {
+        for(i = 0; i < l.batch; ++i){
+            copy_gpu(l.inputs, net.input_gpu + i*l.inputs, 1, l.output_gpu + i*l.outputs, 1);
+        }
+    } else if (l.reverse) {
+        reorg_gpu(net.input_gpu, l.w, l.h, l.c, l.batch, l.stride, 1, l.output_gpu);
     }else {
-        reorg_ongpu(state.input, l.w, l.h, l.c, l.batch, l.stride, 0, l.output_gpu);
+        reorg_gpu(net.input_gpu, l.w, l.h, l.c, l.batch, l.stride, 0, l.output_gpu);
     }
 }
 
-void backward_reorg_layer_gpu(layer l, network_state state)
+void backward_reorg_layer_gpu(layer l, network net)
 {
-    if(l.reverse){
-        reorg_ongpu(l.delta_gpu, l.w, l.h, l.c, l.batch, l.stride, 0, state.delta);
-    }else{
-        reorg_ongpu(l.delta_gpu, l.w, l.h, l.c, l.batch, l.stride, 1, state.delta);
+    if(l.flatten){
+        if(l.reverse){
+            flatten_gpu(l.delta_gpu, l.w*l.h, l.c, l.batch, 1, net.delta_gpu);
+        }else{
+            flatten_gpu(l.delta_gpu, l.w*l.h, l.c, l.batch, 0, net.delta_gpu);
+        }
+    } else if (l.extra) {
+        int i;
+        for(i = 0; i < l.batch; ++i){
+            copy_gpu(l.inputs, l.delta_gpu + i*l.outputs, 1, net.delta_gpu + i*l.inputs, 1);
+        }
+    } else if(l.reverse){
+        reorg_gpu(l.delta_gpu, l.w, l.h, l.c, l.batch, l.stride, 0, net.delta_gpu);
+    } else {
+        reorg_gpu(l.delta_gpu, l.w, l.h, l.c, l.batch, l.stride, 1, net.delta_gpu);
     }
 }
 #endif
diff --git a/image.darknet/src/reorg_layer.h b/image.darknet/src/reorg_layer.h
index 21c22cd..e6513a5 100644
--- a/image.darknet/src/reorg_layer.h
+++ b/image.darknet/src/reorg_layer.h
@@ -6,14 +6,14 @@
 #include "layer.h"
 #include "network.h"
 
-layer make_reorg_layer(int batch, int h, int w, int c, int stride, int reverse);
+layer make_reorg_layer(int batch, int w, int h, int c, int stride, int reverse, int flatten, int extra);
 void resize_reorg_layer(layer *l, int w, int h);
-void forward_reorg_layer(const layer l, network_state state);
-void backward_reorg_layer(const layer l, network_state state);
+void forward_reorg_layer(const layer l, network net);
+void backward_reorg_layer(const layer l, network net);
 
 #ifdef GPU
-void forward_reorg_layer_gpu(layer l, network_state state);
-void backward_reorg_layer_gpu(layer l, network_state state);
+void forward_reorg_layer_gpu(layer l, network net);
+void backward_reorg_layer_gpu(layer l, network net);
 #endif
 
 #endif
diff --git a/image.darknet/src/rnn_layer.c b/image.darknet/src/rnn_layer.c
index 83fda13..8c9b457 100644
--- a/image.darknet/src/rnn_layer.c
+++ b/image.darknet/src/rnn_layer.c
@@ -26,7 +26,7 @@ static void increment_layer(layer *l, int steps)
 #endif
 }
 
-layer make_rnn_layer(int batch, int inputs, int hidden, int outputs, int steps, ACTIVATION activation, int batch_normalize, int log)
+layer make_rnn_layer(int batch, int inputs, int outputs, int steps, ACTIVATION activation, int batch_normalize, int adam)
 {
     fprintf(stderr, "RNN Layer: %d inputs, %d outputs\n", inputs, outputs);
     batch = batch / steps;
@@ -34,24 +34,24 @@ layer make_rnn_layer(int batch, int inputs, int hidden, int outputs, int steps,
     l.batch = batch;
     l.type = RNN;
     l.steps = steps;
-    l.hidden = hidden;
     l.inputs = inputs;
 
-    l.state = calloc(batch*hidden*(steps+1), sizeof(float));
+    l.state = calloc(batch*outputs, sizeof(float));
+    l.prev_state = calloc(batch*outputs, sizeof(float));
 
     l.input_layer = malloc(sizeof(layer));
     fprintf(stderr, "\t\t");
-    *(l.input_layer) = make_connected_layer(batch*steps, inputs, hidden, activation, batch_normalize);
+    *(l.input_layer) = make_connected_layer(batch*steps, inputs, outputs, activation, batch_normalize, adam);
     l.input_layer->batch = batch;
 
     l.self_layer = malloc(sizeof(layer));
     fprintf(stderr, "\t\t");
-    *(l.self_layer) = make_connected_layer(batch*steps, hidden, hidden, (log==2)?LOGGY:(log==1?LOGISTIC:activation), batch_normalize);
+    *(l.self_layer) = make_connected_layer(batch*steps, outputs, outputs, activation, batch_normalize, adam);
     l.self_layer->batch = batch;
 
     l.output_layer = malloc(sizeof(layer));
     fprintf(stderr, "\t\t");
-    *(l.output_layer) = make_connected_layer(batch*steps, hidden, outputs, activation, batch_normalize);
+    *(l.output_layer) = make_connected_layer(batch*steps, outputs, outputs, activation, batch_normalize, adam);
     l.output_layer->batch = batch;
 
     l.outputs = outputs;
@@ -65,66 +65,72 @@ layer make_rnn_layer(int batch, int inputs, int hidden, int outputs, int steps,
     l.forward_gpu = forward_rnn_layer_gpu;
     l.backward_gpu = backward_rnn_layer_gpu;
     l.update_gpu = update_rnn_layer_gpu;
-    l.state_gpu = cuda_make_array(l.state, batch*hidden*(steps+1));
+    l.state_gpu = cuda_make_array(0, batch*outputs);
+    l.prev_state_gpu = cuda_make_array(0, batch*outputs);
     l.output_gpu = l.output_layer->output_gpu;
     l.delta_gpu = l.output_layer->delta_gpu;
+#ifdef CUDNN
+    cudnnSetTensor4dDescriptor(l.input_layer->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.input_layer->out_c, l.input_layer->out_h, l.input_layer->out_w); 
+    cudnnSetTensor4dDescriptor(l.self_layer->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.self_layer->out_c, l.self_layer->out_h, l.self_layer->out_w); 
+    cudnnSetTensor4dDescriptor(l.output_layer->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.output_layer->out_c, l.output_layer->out_h, l.output_layer->out_w); 
+#endif
 #endif
 
     return l;
 }
 
-void update_rnn_layer(layer l, int batch, float learning_rate, float momentum, float decay)
+void update_rnn_layer(layer l, update_args a)
 {
-    update_connected_layer(*(l.input_layer), batch, learning_rate, momentum, decay);
-    update_connected_layer(*(l.self_layer), batch, learning_rate, momentum, decay);
-    update_connected_layer(*(l.output_layer), batch, learning_rate, momentum, decay);
+    update_connected_layer(*(l.input_layer),  a);
+    update_connected_layer(*(l.self_layer),   a);
+    update_connected_layer(*(l.output_layer), a);
 }
 
-void forward_rnn_layer(layer l, network_state state)
+void forward_rnn_layer(layer l, network net)
 {
-    network_state s = {0};
-    s.train = state.train;
+    network s = net;
+    s.train = net.train;
     int i;
     layer input_layer = *(l.input_layer);
     layer self_layer = *(l.self_layer);
     layer output_layer = *(l.output_layer);
 
     fill_cpu(l.outputs * l.batch * l.steps, 0, output_layer.delta, 1);
-    fill_cpu(l.hidden * l.batch * l.steps, 0, self_layer.delta, 1);
-    fill_cpu(l.hidden * l.batch * l.steps, 0, input_layer.delta, 1);
-    if(state.train) fill_cpu(l.hidden * l.batch, 0, l.state, 1);
+    fill_cpu(l.outputs * l.batch * l.steps, 0, self_layer.delta, 1);
+    fill_cpu(l.outputs * l.batch * l.steps, 0, input_layer.delta, 1);
+    if(net.train) fill_cpu(l.outputs * l.batch, 0, l.state, 1);
 
     for (i = 0; i < l.steps; ++i) {
-        s.input = state.input;
+        s.input = net.input;
         forward_connected_layer(input_layer, s);
 
         s.input = l.state;
         forward_connected_layer(self_layer, s);
 
         float *old_state = l.state;
-        if(state.train) l.state += l.hidden*l.batch;
+        if(net.train) l.state += l.outputs*l.batch;
         if(l.shortcut){
-            copy_cpu(l.hidden * l.batch, old_state, 1, l.state, 1);
+            copy_cpu(l.outputs * l.batch, old_state, 1, l.state, 1);
         }else{
-            fill_cpu(l.hidden * l.batch, 0, l.state, 1);
+            fill_cpu(l.outputs * l.batch, 0, l.state, 1);
         }
-        axpy_cpu(l.hidden * l.batch, 1, input_layer.output, 1, l.state, 1);
-        axpy_cpu(l.hidden * l.batch, 1, self_layer.output, 1, l.state, 1);
+        axpy_cpu(l.outputs * l.batch, 1, input_layer.output, 1, l.state, 1);
+        axpy_cpu(l.outputs * l.batch, 1, self_layer.output, 1, l.state, 1);
 
         s.input = l.state;
         forward_connected_layer(output_layer, s);
 
-        state.input += l.inputs*l.batch;
+        net.input += l.inputs*l.batch;
         increment_layer(&input_layer, 1);
         increment_layer(&self_layer, 1);
         increment_layer(&output_layer, 1);
     }
 }
 
-void backward_rnn_layer(layer l, network_state state)
+void backward_rnn_layer(layer l, network net)
 {
-    network_state s = {0};
-    s.train = state.train;
+    network s = net;
+    s.train = net.train;
     int i;
     layer input_layer = *(l.input_layer);
     layer self_layer = *(l.self_layer);
@@ -134,34 +140,34 @@ void backward_rnn_layer(layer l, network_state state)
     increment_layer(&self_layer, l.steps-1);
     increment_layer(&output_layer, l.steps-1);
 
-    l.state += l.hidden*l.batch*l.steps;
+    l.state += l.outputs*l.batch*l.steps;
     for (i = l.steps-1; i >= 0; --i) {
-        copy_cpu(l.hidden * l.batch, input_layer.output, 1, l.state, 1);
-        axpy_cpu(l.hidden * l.batch, 1, self_layer.output, 1, l.state, 1);
+        copy_cpu(l.outputs * l.batch, input_layer.output, 1, l.state, 1);
+        axpy_cpu(l.outputs * l.batch, 1, self_layer.output, 1, l.state, 1);
 
         s.input = l.state;
         s.delta = self_layer.delta;
         backward_connected_layer(output_layer, s);
 
-        l.state -= l.hidden*l.batch;
+        l.state -= l.outputs*l.batch;
         /*
            if(i > 0){
-           copy_cpu(l.hidden * l.batch, input_layer.output - l.hidden*l.batch, 1, l.state, 1);
-           axpy_cpu(l.hidden * l.batch, 1, self_layer.output - l.hidden*l.batch, 1, l.state, 1);
+           copy_cpu(l.outputs * l.batch, input_layer.output - l.outputs*l.batch, 1, l.state, 1);
+           axpy_cpu(l.outputs * l.batch, 1, self_layer.output - l.outputs*l.batch, 1, l.state, 1);
            }else{
-           fill_cpu(l.hidden * l.batch, 0, l.state, 1);
+           fill_cpu(l.outputs * l.batch, 0, l.state, 1);
            }
          */
 
         s.input = l.state;
-        s.delta = self_layer.delta - l.hidden*l.batch;
+        s.delta = self_layer.delta - l.outputs*l.batch;
         if (i == 0) s.delta = 0;
         backward_connected_layer(self_layer, s);
 
-        copy_cpu(l.hidden*l.batch, self_layer.delta, 1, input_layer.delta, 1);
-        if (i > 0 && l.shortcut) axpy_cpu(l.hidden*l.batch, 1, self_layer.delta, 1, self_layer.delta - l.hidden*l.batch, 1);
-        s.input = state.input + i*l.inputs*l.batch;
-        if(state.delta) s.delta = state.delta + i*l.inputs*l.batch;
+        copy_cpu(l.outputs*l.batch, self_layer.delta, 1, input_layer.delta, 1);
+        if (i > 0 && l.shortcut) axpy_cpu(l.outputs*l.batch, 1, self_layer.delta, 1, self_layer.delta - l.outputs*l.batch, 1);
+        s.input = net.input + i*l.inputs*l.batch;
+        if(net.delta) s.delta = net.delta + i*l.inputs*l.batch;
         else s.delta = 0;
         backward_connected_layer(input_layer, s);
 
@@ -187,58 +193,56 @@ void push_rnn_layer(layer l)
     push_connected_layer(*(l.output_layer));
 }
 
-void update_rnn_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay)
+void update_rnn_layer_gpu(layer l, update_args a)
 {
-    update_connected_layer_gpu(*(l.input_layer), batch, learning_rate, momentum, decay);
-    update_connected_layer_gpu(*(l.self_layer), batch, learning_rate, momentum, decay);
-    update_connected_layer_gpu(*(l.output_layer), batch, learning_rate, momentum, decay);
+    update_connected_layer_gpu(*(l.input_layer),  a);
+    update_connected_layer_gpu(*(l.self_layer),   a);
+    update_connected_layer_gpu(*(l.output_layer), a);
 }
 
-void forward_rnn_layer_gpu(layer l, network_state state)
+void forward_rnn_layer_gpu(layer l, network net)
 {
-    network_state s = {0};
-    s.train = state.train;
+    network s = {0};
+    s.train = net.train;
     int i;
     layer input_layer = *(l.input_layer);
     layer self_layer = *(l.self_layer);
     layer output_layer = *(l.output_layer);
 
-    fill_ongpu(l.outputs * l.batch * l.steps, 0, output_layer.delta_gpu, 1);
-    fill_ongpu(l.hidden * l.batch * l.steps, 0, self_layer.delta_gpu, 1);
-    fill_ongpu(l.hidden * l.batch * l.steps, 0, input_layer.delta_gpu, 1);
-    if(state.train) fill_ongpu(l.hidden * l.batch, 0, l.state_gpu, 1);
+    fill_gpu(l.outputs * l.batch * l.steps, 0, output_layer.delta_gpu, 1);
+    fill_gpu(l.outputs * l.batch * l.steps, 0, self_layer.delta_gpu, 1);
+    fill_gpu(l.outputs * l.batch * l.steps, 0, input_layer.delta_gpu, 1);
+
+    if(net.train) {
+        fill_gpu(l.outputs * l.batch * l.steps, 0, l.delta_gpu, 1);
+        copy_gpu(l.outputs*l.batch, l.state_gpu, 1, l.prev_state_gpu, 1);
+    }
 
     for (i = 0; i < l.steps; ++i) {
-        s.input = state.input;
+        s.input_gpu = net.input_gpu;
         forward_connected_layer_gpu(input_layer, s);
 
-        s.input = l.state_gpu;
+        s.input_gpu = l.state_gpu;
         forward_connected_layer_gpu(self_layer, s);
 
-        float *old_state = l.state_gpu;
-        if(state.train) l.state_gpu += l.hidden*l.batch;
-        if(l.shortcut){
-            copy_ongpu(l.hidden * l.batch, old_state, 1, l.state_gpu, 1);
-        }else{
-            fill_ongpu(l.hidden * l.batch, 0, l.state_gpu, 1);
-        }
-        axpy_ongpu(l.hidden * l.batch, 1, input_layer.output_gpu, 1, l.state_gpu, 1);
-        axpy_ongpu(l.hidden * l.batch, 1, self_layer.output_gpu, 1, l.state_gpu, 1);
+        fill_gpu(l.outputs * l.batch, 0, l.state_gpu, 1);
+        axpy_gpu(l.outputs * l.batch, 1, input_layer.output_gpu, 1, l.state_gpu, 1);
+        axpy_gpu(l.outputs * l.batch, 1, self_layer.output_gpu, 1, l.state_gpu, 1);
 
-        s.input = l.state_gpu;
+        s.input_gpu = l.state_gpu;
         forward_connected_layer_gpu(output_layer, s);
 
-        state.input += l.inputs*l.batch;
+        net.input_gpu += l.inputs*l.batch;
         increment_layer(&input_layer, 1);
         increment_layer(&self_layer, 1);
         increment_layer(&output_layer, 1);
     }
 }
 
-void backward_rnn_layer_gpu(layer l, network_state state)
+void backward_rnn_layer_gpu(layer l, network net)
 {
-    network_state s = {0};
-    s.train = state.train;
+    network s = {0};
+    s.train = net.train;
     int i;
     layer input_layer = *(l.input_layer);
     layer self_layer = *(l.self_layer);
@@ -246,32 +250,43 @@ void backward_rnn_layer_gpu(layer l, network_state state)
     increment_layer(&input_layer,  l.steps - 1);
     increment_layer(&self_layer,   l.steps - 1);
     increment_layer(&output_layer, l.steps - 1);
-    l.state_gpu += l.hidden*l.batch*l.steps;
+    float *last_input = input_layer.output_gpu;
+    float *last_self = self_layer.output_gpu;
     for (i = l.steps-1; i >= 0; --i) {
+        fill_gpu(l.outputs * l.batch, 0, l.state_gpu, 1);
+        axpy_gpu(l.outputs * l.batch, 1, input_layer.output_gpu, 1, l.state_gpu, 1);
+        axpy_gpu(l.outputs * l.batch, 1, self_layer.output_gpu, 1, l.state_gpu, 1);
 
-        s.input = l.state_gpu;
-        s.delta = self_layer.delta_gpu;
+        s.input_gpu = l.state_gpu;
+        s.delta_gpu = self_layer.delta_gpu;
         backward_connected_layer_gpu(output_layer, s);
 
-        l.state_gpu -= l.hidden*l.batch;
+        if(i != 0) {
+            fill_gpu(l.outputs * l.batch, 0, l.state_gpu, 1);
+            axpy_gpu(l.outputs * l.batch, 1, input_layer.output_gpu - l.outputs*l.batch, 1, l.state_gpu, 1);
+            axpy_gpu(l.outputs * l.batch, 1, self_layer.output_gpu - l.outputs*l.batch, 1, l.state_gpu, 1);
+        }else {
+            copy_gpu(l.outputs*l.batch, l.prev_state_gpu, 1, l.state_gpu, 1);
+        }
 
-        copy_ongpu(l.hidden*l.batch, self_layer.delta_gpu, 1, input_layer.delta_gpu, 1);
+        copy_gpu(l.outputs*l.batch, self_layer.delta_gpu, 1, input_layer.delta_gpu, 1);
 
-        s.input = l.state_gpu;
-        s.delta = self_layer.delta_gpu - l.hidden*l.batch;
-        if (i == 0) s.delta = 0;
+        s.input_gpu = l.state_gpu;
+        s.delta_gpu = (i > 0) ? self_layer.delta_gpu - l.outputs*l.batch : 0;
+        if (i == 0) s.delta_gpu = 0;
         backward_connected_layer_gpu(self_layer, s);
 
-        //copy_ongpu(l.hidden*l.batch, self_layer.delta_gpu, 1, input_layer.delta_gpu, 1);
-        if (i > 0 && l.shortcut) axpy_ongpu(l.hidden*l.batch, 1, self_layer.delta_gpu, 1, self_layer.delta_gpu - l.hidden*l.batch, 1);
-        s.input = state.input + i*l.inputs*l.batch;
-        if(state.delta) s.delta = state.delta + i*l.inputs*l.batch;
-        else s.delta = 0;
+        s.input_gpu = net.input_gpu + i*l.inputs*l.batch;
+        if(net.delta_gpu) s.delta_gpu = net.delta_gpu + i*l.inputs*l.batch;
+        else s.delta_gpu = 0;
         backward_connected_layer_gpu(input_layer, s);
 
         increment_layer(&input_layer,  -1);
         increment_layer(&self_layer,   -1);
         increment_layer(&output_layer, -1);
     }
+    fill_gpu(l.outputs * l.batch, 0, l.state_gpu, 1);
+    axpy_gpu(l.outputs * l.batch, 1, last_input, 1, l.state_gpu, 1);
+    axpy_gpu(l.outputs * l.batch, 1, last_self, 1, l.state_gpu, 1);
 }
 #endif
diff --git a/image.darknet/src/rnn_layer.h b/image.darknet/src/rnn_layer.h
index bb9478b..270a63f 100644
--- a/image.darknet/src/rnn_layer.h
+++ b/image.darknet/src/rnn_layer.h
@@ -7,16 +7,16 @@
 #include "network.h"
 #define USET
 
-layer make_rnn_layer(int batch, int inputs, int hidden, int outputs, int steps, ACTIVATION activation, int batch_normalize, int log);
+layer make_rnn_layer(int batch, int inputs, int outputs, int steps, ACTIVATION activation, int batch_normalize, int adam);
 
-void forward_rnn_layer(layer l, network_state state);
-void backward_rnn_layer(layer l, network_state state);
-void update_rnn_layer(layer l, int batch, float learning_rate, float momentum, float decay);
+void forward_rnn_layer(layer l, network net);
+void backward_rnn_layer(layer l, network net);
+void update_rnn_layer(layer l, update_args a);
 
 #ifdef GPU
-void forward_rnn_layer_gpu(layer l, network_state state);
-void backward_rnn_layer_gpu(layer l, network_state state);
-void update_rnn_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay);
+void forward_rnn_layer_gpu(layer l, network net);
+void backward_rnn_layer_gpu(layer l, network net);
+void update_rnn_layer_gpu(layer l, update_args a);
 void push_rnn_layer(layer l);
 void pull_rnn_layer(layer l);
 #endif
diff --git a/image.darknet/src/rnn_vid.c b/image.darknet/src/rnn_vid.c
deleted file mode 100644
index 36912d6..0000000
--- a/image.darknet/src/rnn_vid.c
+++ /dev/null
@@ -1,211 +0,0 @@
-#include "network.h"
-#include "cost_layer.h"
-#include "utils.h"
-#include "parser.h"
-#include "blas.h"
-
-#ifdef OPENCV
-#include "opencv2/highgui/highgui_c.h"
-image get_image_from_stream(CvCapture *cap);
-image ipl_to_image(IplImage* src);
-
-void reconstruct_picture(network net, float *features, image recon, image update, float rate, float momentum, float lambda, int smooth_size, int iters);
-
-
-typedef struct {
-    float *x;
-    float *y;
-} float_pair;
-
-float_pair get_rnn_vid_data(network net, char **files, int n, int batch, int steps)
-{
-    int b;
-    assert(net.batch == steps + 1);
-    image out_im = get_network_image(net);
-    int output_size = out_im.w*out_im.h*out_im.c;
-    printf("%d %d %d\n", out_im.w, out_im.h, out_im.c);
-    float *feats = calloc(net.batch*batch*output_size, sizeof(float));
-    for(b = 0; b < batch; ++b){
-        int input_size = net.w*net.h*net.c;
-        float *input = calloc(input_size*net.batch, sizeof(float));
-        char *filename = files[rand()%n];
-        CvCapture *cap = cvCaptureFromFile(filename);
-        int frames = cvGetCaptureProperty(cap, CV_CAP_PROP_FRAME_COUNT);
-        int index = rand() % (frames - steps - 2);
-        if (frames < (steps + 4)){
-            --b;
-            free(input);
-            continue;
-        }
-
-        printf("frames: %d, index: %d\n", frames, index);
-        cvSetCaptureProperty(cap, CV_CAP_PROP_POS_FRAMES, index);
-
-        int i;
-        for(i = 0; i < net.batch; ++i){
-            IplImage* src = cvQueryFrame(cap);
-            image im = ipl_to_image(src);
-            rgbgr_image(im);
-            image re = resize_image(im, net.w, net.h);
-            //show_image(re, "loaded");
-            //cvWaitKey(10);
-            memcpy(input + i*input_size, re.data, input_size*sizeof(float));
-            free_image(im);
-            free_image(re);
-        }
-        float *output = network_predict(net, input);
-
-        free(input);
-
-        for(i = 0; i < net.batch; ++i){
-            memcpy(feats + (b + i*batch)*output_size, output + i*output_size, output_size*sizeof(float));
-        }
-
-        cvReleaseCapture(&cap);
-    }
-
-    //printf("%d %d %d\n", out_im.w, out_im.h, out_im.c);
-    float_pair p = {0};
-    p.x = feats;
-    p.y = feats + output_size*batch; //+ out_im.w*out_im.h*out_im.c;
-
-    return p;
-}
-
-
-void train_vid_rnn(char *cfgfile, char *weightfile)
-{
-    char *train_videos = "data/vid/train.txt";
-    char *backup_directory = "/home/pjreddie/backup/";
-    srand(time(0));
-    char *base = basecfg(cfgfile);
-    printf("%s\n", base);
-    float avg_loss = -1;
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
-    int imgs = net.batch*net.subdivisions;
-    int i = *net.seen/imgs;
-
-    list *plist = get_paths(train_videos);
-    int N = plist->size;
-    char **paths = (char **)list_to_array(plist);
-    clock_t time;
-    int steps = net.time_steps;
-    int batch = net.batch / net.time_steps;
-
-    network extractor = parse_network_cfg("cfg/extractor.cfg");
-    load_weights(&extractor, "/home/pjreddie/trained/yolo-coco.conv");
-
-    while(get_current_batch(net) < net.max_batches){
-        i += 1;
-        time=clock();
-        float_pair p = get_rnn_vid_data(extractor, paths, N, batch, steps);
-
-        float loss = train_network_datum(net, p.x, p.y) / (net.batch);
-
-
-        free(p.x);
-        if (avg_loss < 0) avg_loss = loss;
-        avg_loss = avg_loss*.9 + loss*.1;
-
-        fprintf(stderr, "%d: %f, %f avg, %f rate, %lf seconds\n", i, loss, avg_loss, get_current_rate(net), sec(clock()-time));
-        if(i%100==0){
-            char buff[256];
-            sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i);
-            save_weights(net, buff);
-        }
-        if(i%10==0){
-            char buff[256];
-            sprintf(buff, "%s/%s.backup", backup_directory, base);
-            save_weights(net, buff);
-        }
-    }
-    char buff[256];
-    sprintf(buff, "%s/%s_final.weights", backup_directory, base);
-    save_weights(net, buff);
-}
-
-
-image save_reconstruction(network net, image *init, float *feat, char *name, int i)
-{
-    image recon;
-    if (init) {
-        recon = copy_image(*init);
-    } else {
-        recon = make_random_image(net.w, net.h, 3);
-    }
-
-    image update = make_image(net.w, net.h, 3);
-    reconstruct_picture(net, feat, recon, update, .01, .9, .1, 2, 50);
-    char buff[256];
-    sprintf(buff, "%s%d", name, i);
-    save_image(recon, buff);
-    free_image(update);
-    return recon;
-}
-
-void generate_vid_rnn(char *cfgfile, char *weightfile)
-{
-    network extractor = parse_network_cfg("cfg/extractor.recon.cfg");
-    load_weights(&extractor, "/home/pjreddie/trained/yolo-coco.conv");
-
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    set_batch_network(&extractor, 1);
-    set_batch_network(&net, 1);
-
-    int i;
-    CvCapture *cap = cvCaptureFromFile("/extra/vid/ILSVRC2015/Data/VID/snippets/val/ILSVRC2015_val_00007030.mp4");
-    float *feat;
-    float *next;
-    image last;
-    for(i = 0; i < 25; ++i){
-        image im = get_image_from_stream(cap);
-        image re = resize_image(im, extractor.w, extractor.h);
-        feat = network_predict(extractor, re.data);
-        if(i > 0){
-            printf("%f %f\n", mean_array(feat, 14*14*512), variance_array(feat, 14*14*512));
-            printf("%f %f\n", mean_array(next, 14*14*512), variance_array(next, 14*14*512));
-            printf("%f\n", mse_array(feat, 14*14*512));
-            axpy_cpu(14*14*512, -1, feat, 1, next, 1);
-            printf("%f\n", mse_array(next, 14*14*512));
-        }
-        next = network_predict(net, feat);
-
-        free_image(im);
-
-        free_image(save_reconstruction(extractor, 0, feat, "feat", i));
-        free_image(save_reconstruction(extractor, 0, next, "next", i));
-        if (i==24) last = copy_image(re);
-        free_image(re);
-    }
-    for(i = 0; i < 30; ++i){
-        next = network_predict(net, next);
-        image new = save_reconstruction(extractor, &last, next, "new", i);
-        free_image(last);
-        last = new;
-    }
-}
-
-void run_vid_rnn(int argc, char **argv)
-{
-    if(argc < 4){
-        fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]);
-        return;
-    }
-
-    char *cfg = argv[3];
-    char *weights = (argc > 4) ? argv[4] : 0;
-    //char *filename = (argc > 5) ? argv[5]: 0;
-    if(0==strcmp(argv[2], "train")) train_vid_rnn(cfg, weights);
-    else if(0==strcmp(argv[2], "generate")) generate_vid_rnn(cfg, weights);
-}
-#else
-void run_vid_rnn(int argc, char **argv){}
-#endif
-
diff --git a/image.darknet/src/route_layer.c b/image.darknet/src/route_layer.c
index dce7118..a8970a4 100644
--- a/image.darknet/src/route_layer.c
+++ b/image.darknet/src/route_layer.c
@@ -1,6 +1,7 @@
 #include "route_layer.h"
 #include "cuda.h"
 #include "blas.h"
+
 #include <stdio.h>
 
 route_layer make_route_layer(int batch, int n, int *input_layers, int *input_sizes)
@@ -70,13 +71,13 @@ void resize_route_layer(route_layer *l, network *net)
     
 }
 
-void forward_route_layer(const route_layer l, network_state state)
+void forward_route_layer(const route_layer l, network net)
 {
     int i, j;
     int offset = 0;
     for(i = 0; i < l.n; ++i){
         int index = l.input_layers[i];
-        float *input = state.net.layers[index].output;
+        float *input = net.layers[index].output;
         int input_size = l.input_sizes[i];
         for(j = 0; j < l.batch; ++j){
             copy_cpu(input_size, input + j*input_size, 1, l.output + offset + j*l.outputs, 1);
@@ -85,13 +86,13 @@ void forward_route_layer(const route_layer l, network_state state)
     }
 }
 
-void backward_route_layer(const route_layer l, network_state state)
+void backward_route_layer(const route_layer l, network net)
 {
     int i, j;
     int offset = 0;
     for(i = 0; i < l.n; ++i){
         int index = l.input_layers[i];
-        float *delta = state.net.layers[index].delta;
+        float *delta = net.layers[index].delta;
         int input_size = l.input_sizes[i];
         for(j = 0; j < l.batch; ++j){
             axpy_cpu(input_size, 1, l.delta + offset + j*l.outputs, 1, delta + j*input_size, 1);
@@ -101,31 +102,31 @@ void backward_route_layer(const route_layer l, network_state state)
 }
 
 #ifdef GPU
-void forward_route_layer_gpu(const route_layer l, network_state state)
+void forward_route_layer_gpu(const route_layer l, network net)
 {
     int i, j;
     int offset = 0;
     for(i = 0; i < l.n; ++i){
         int index = l.input_layers[i];
-        float *input = state.net.layers[index].output_gpu;
+        float *input = net.layers[index].output_gpu;
         int input_size = l.input_sizes[i];
         for(j = 0; j < l.batch; ++j){
-            copy_ongpu(input_size, input + j*input_size, 1, l.output_gpu + offset + j*l.outputs, 1);
+            copy_gpu(input_size, input + j*input_size, 1, l.output_gpu + offset + j*l.outputs, 1);
         }
         offset += input_size;
     }
 }
 
-void backward_route_layer_gpu(const route_layer l, network_state state)
+void backward_route_layer_gpu(const route_layer l, network net)
 {
     int i, j;
     int offset = 0;
     for(i = 0; i < l.n; ++i){
         int index = l.input_layers[i];
-        float *delta = state.net.layers[index].delta_gpu;
+        float *delta = net.layers[index].delta_gpu;
         int input_size = l.input_sizes[i];
         for(j = 0; j < l.batch; ++j){
-            axpy_ongpu(input_size, 1, l.delta_gpu + offset + j*l.outputs, 1, delta + j*input_size, 1);
+            axpy_gpu(input_size, 1, l.delta_gpu + offset + j*l.outputs, 1, delta + j*input_size, 1);
         }
         offset += input_size;
     }
diff --git a/image.darknet/src/route_layer.h b/image.darknet/src/route_layer.h
index 45467d9..1d40330 100644
--- a/image.darknet/src/route_layer.h
+++ b/image.darknet/src/route_layer.h
@@ -6,13 +6,13 @@
 typedef layer route_layer;
 
 route_layer make_route_layer(int batch, int n, int *input_layers, int *input_size);
-void forward_route_layer(const route_layer l, network_state state);
-void backward_route_layer(const route_layer l, network_state state);
+void forward_route_layer(const route_layer l, network net);
+void backward_route_layer(const route_layer l, network net);
 void resize_route_layer(route_layer *l, network *net);
 
 #ifdef GPU
-void forward_route_layer_gpu(const route_layer l, network_state state);
-void backward_route_layer_gpu(const route_layer l, network_state state);
+void forward_route_layer_gpu(const route_layer l, network net);
+void backward_route_layer_gpu(const route_layer l, network net);
 #endif
 
 #endif
diff --git a/image.darknet/src/shortcut_layer.c b/image.darknet/src/shortcut_layer.c
index 8bca50f..49d17f5 100644
--- a/image.darknet/src/shortcut_layer.c
+++ b/image.darknet/src/shortcut_layer.c
@@ -1,12 +1,14 @@
 #include "shortcut_layer.h"
 #include "cuda.h"
 #include "blas.h"
+#include "activations.h"
+
 #include <stdio.h>
 #include <assert.h>
 
 layer make_shortcut_layer(int batch, int index, int w, int h, int c, int w2, int h2, int c2)
 {
-    fprintf(stderr,"Shortcut Layer: %d\n", index);
+    fprintf(stderr, "res  %3d                %4d x%4d x%4d   ->  %4d x%4d x%4d\n",index, w2,h2,c2, w,h,c);
     layer l = {0};
     l.type = SHORTCUT;
     l.batch = batch;
@@ -36,32 +38,53 @@ layer make_shortcut_layer(int batch, int index, int w, int h, int c, int w2, int
     return l;
 }
 
-void forward_shortcut_layer(const layer l, network_state state)
+void resize_shortcut_layer(layer *l, int w, int h)
+{
+    assert(l->w == l->out_w);
+    assert(l->h == l->out_h);
+    l->w = l->out_w = w;
+    l->h = l->out_h = h;
+    l->outputs = w*h*l->out_c;
+    l->inputs = l->outputs;
+    l->delta =  realloc(l->delta, l->outputs*l->batch*sizeof(float));
+    l->output = realloc(l->output, l->outputs*l->batch*sizeof(float));
+
+#ifdef GPU
+    cuda_free(l->output_gpu);
+    cuda_free(l->delta_gpu);
+    l->output_gpu  = cuda_make_array(l->output, l->outputs*l->batch);
+    l->delta_gpu   = cuda_make_array(l->delta,  l->outputs*l->batch);
+#endif
+    
+}
+
+
+void forward_shortcut_layer(const layer l, network net)
 {
-    copy_cpu(l.outputs*l.batch, state.input, 1, l.output, 1);
-    shortcut_cpu(l.batch, l.w, l.h, l.c, state.net.layers[l.index].output, l.out_w, l.out_h, l.out_c, l.output);
+    copy_cpu(l.outputs*l.batch, net.input, 1, l.output, 1);
+    shortcut_cpu(l.batch, l.w, l.h, l.c, net.layers[l.index].output, l.out_w, l.out_h, l.out_c, l.alpha, l.beta, l.output);
     activate_array(l.output, l.outputs*l.batch, l.activation);
 }
 
-void backward_shortcut_layer(const layer l, network_state state)
+void backward_shortcut_layer(const layer l, network net)
 {
     gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);
-    axpy_cpu(l.outputs*l.batch, 1, l.delta, 1, state.delta, 1);
-    shortcut_cpu(l.batch, l.out_w, l.out_h, l.out_c, l.delta, l.w, l.h, l.c, state.net.layers[l.index].delta);
+    axpy_cpu(l.outputs*l.batch, l.alpha, l.delta, 1, net.delta, 1);
+    shortcut_cpu(l.batch, l.out_w, l.out_h, l.out_c, l.delta, l.w, l.h, l.c, 1, l.beta, net.layers[l.index].delta);
 }
 
 #ifdef GPU
-void forward_shortcut_layer_gpu(const layer l, network_state state)
+void forward_shortcut_layer_gpu(const layer l, network net)
 {
-    copy_ongpu(l.outputs*l.batch, state.input, 1, l.output_gpu, 1);
-    shortcut_gpu(l.batch, l.w, l.h, l.c, state.net.layers[l.index].output_gpu, l.out_w, l.out_h, l.out_c, l.output_gpu);
-    activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
+    copy_gpu(l.outputs*l.batch, net.input_gpu, 1, l.output_gpu, 1);
+    shortcut_gpu(l.batch, l.w, l.h, l.c, net.layers[l.index].output_gpu, l.out_w, l.out_h, l.out_c, l.alpha, l.beta, l.output_gpu);
+    activate_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation);
 }
 
-void backward_shortcut_layer_gpu(const layer l, network_state state)
+void backward_shortcut_layer_gpu(const layer l, network net)
 {
-    gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
-    axpy_ongpu(l.outputs*l.batch, 1, l.delta_gpu, 1, state.delta, 1);
-    shortcut_gpu(l.batch, l.out_w, l.out_h, l.out_c, l.delta_gpu, l.w, l.h, l.c, state.net.layers[l.index].delta_gpu);
+    gradient_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
+    axpy_gpu(l.outputs*l.batch, l.alpha, l.delta_gpu, 1, net.delta_gpu, 1);
+    shortcut_gpu(l.batch, l.out_w, l.out_h, l.out_c, l.delta_gpu, l.w, l.h, l.c, 1, l.beta, net.layers[l.index].delta_gpu);
 }
 #endif
diff --git a/image.darknet/src/shortcut_layer.h b/image.darknet/src/shortcut_layer.h
index c09a809..5f684fc 100644
--- a/image.darknet/src/shortcut_layer.h
+++ b/image.darknet/src/shortcut_layer.h
@@ -5,12 +5,13 @@
 #include "network.h"
 
 layer make_shortcut_layer(int batch, int index, int w, int h, int c, int w2, int h2, int c2);
-void forward_shortcut_layer(const layer l, network_state state);
-void backward_shortcut_layer(const layer l, network_state state);
+void forward_shortcut_layer(const layer l, network net);
+void backward_shortcut_layer(const layer l, network net);
+void resize_shortcut_layer(layer *l, int w, int h);
 
 #ifdef GPU
-void forward_shortcut_layer_gpu(const layer l, network_state state);
-void backward_shortcut_layer_gpu(const layer l, network_state state);
+void forward_shortcut_layer_gpu(const layer l, network net);
+void backward_shortcut_layer_gpu(const layer l, network net);
 #endif
 
 #endif
diff --git a/image.darknet/src/softmax_layer.c b/image.darknet/src/softmax_layer.c
index 5d15314..9cbc6be 100644
--- a/image.darknet/src/softmax_layer.c
+++ b/image.darknet/src/softmax_layer.c
@@ -1,6 +1,7 @@
 #include "softmax_layer.h"
 #include "blas.h"
 #include "cuda.h"
+
 #include <float.h>
 #include <math.h>
 #include <stdlib.h>
@@ -17,8 +18,10 @@ softmax_layer make_softmax_layer(int batch, int inputs, int groups)
     l.groups = groups;
     l.inputs = inputs;
     l.outputs = inputs;
+    l.loss = calloc(inputs*batch, sizeof(float));
     l.output = calloc(inputs*batch, sizeof(float));
     l.delta = calloc(inputs*batch, sizeof(float));
+    l.cost = calloc(1, sizeof(float));
 
     l.forward = forward_softmax_layer;
     l.backward = backward_softmax_layer;
@@ -27,45 +30,35 @@ softmax_layer make_softmax_layer(int batch, int inputs, int groups)
     l.backward_gpu = backward_softmax_layer_gpu;
 
     l.output_gpu = cuda_make_array(l.output, inputs*batch); 
+    l.loss_gpu = cuda_make_array(l.loss, inputs*batch); 
     l.delta_gpu = cuda_make_array(l.delta, inputs*batch); 
     #endif
     return l;
 }
 
-void softmax_tree(float *input, int batch, int inputs, float temp, tree *hierarchy, float *output)
+void forward_softmax_layer(const softmax_layer l, network net)
 {
-    int b;
-    for(b = 0; b < batch; ++b){
+    if(l.softmax_tree){
         int i;
         int count = 0;
-        for(i = 0; i < hierarchy->groups; ++i){
-            int group_size = hierarchy->group_size[i];
-            softmax(input+b*inputs + count, group_size, temp, output+b*inputs + count);
+        for (i = 0; i < l.softmax_tree->groups; ++i) {
+            int group_size = l.softmax_tree->group_size[i];
+            softmax_cpu(net.input + count, group_size, l.batch, l.inputs, 1, 0, 1, l.temperature, l.output + count);
             count += group_size;
         }
+    } else {
+        softmax_cpu(net.input, l.inputs/l.groups, l.batch, l.inputs, l.groups, l.inputs/l.groups, 1, l.temperature, l.output);
     }
-}
 
-void forward_softmax_layer(const softmax_layer l, network_state state)
-{
-    int b;
-    int inputs = l.inputs / l.groups;
-    int batch = l.batch * l.groups;
-    if(l.softmax_tree){
-        softmax_tree(state.input, batch, inputs, l.temperature, l.softmax_tree, l.output);
-    } else {
-        for(b = 0; b < batch; ++b){
-            softmax(state.input+b*inputs, inputs, l.temperature, l.output+b*inputs);
-        }
+    if(net.truth && !l.noloss){
+        softmax_x_ent_cpu(l.batch*l.inputs, l.output, net.truth, l.delta, l.loss);
+        l.cost[0] = sum_array(l.loss, l.batch*l.inputs);
     }
 }
 
-void backward_softmax_layer(const softmax_layer l, network_state state)
+void backward_softmax_layer(const softmax_layer l, network net)
 {
-    int i;
-    for(i = 0; i < l.inputs*l.batch; ++i){
-        state.delta[i] += l.delta[i];
-    }
+    axpy_cpu(l.inputs*l.batch, 1, l.delta, 1, net.delta, 1);
 }
 
 #ifdef GPU
@@ -75,26 +68,40 @@ void pull_softmax_layer_output(const softmax_layer layer)
     cuda_pull_array(layer.output_gpu, layer.output, layer.inputs*layer.batch);
 }
 
-void forward_softmax_layer_gpu(const softmax_layer l, network_state state)
+void forward_softmax_layer_gpu(const softmax_layer l, network net)
 {
-    int inputs = l.inputs / l.groups;
-    int batch = l.batch * l.groups;
     if(l.softmax_tree){
+        softmax_tree(net.input_gpu, 1, l.batch, l.inputs, l.temperature, l.output_gpu, *l.softmax_tree);
+        /*
         int i;
         int count = 0;
         for (i = 0; i < l.softmax_tree->groups; ++i) {
             int group_size = l.softmax_tree->group_size[i];
-            softmax_gpu(state.input+count, group_size, inputs, batch, l.temperature, l.output_gpu + count);
+            softmax_gpu(net.input_gpu + count, group_size, l.batch, l.inputs, 1, 0, 1, l.temperature, l.output_gpu + count);
             count += group_size;
         }
+        */
     } else {
-        softmax_gpu(state.input, inputs, inputs, batch, l.temperature, l.output_gpu);
+        if(l.spatial){
+            softmax_gpu(net.input_gpu, l.c, l.batch*l.c, l.inputs/l.c, l.w*l.h, 1, l.w*l.h, 1, l.output_gpu);
+        }else{
+            softmax_gpu(net.input_gpu, l.inputs/l.groups, l.batch, l.inputs, l.groups, l.inputs/l.groups, 1, l.temperature, l.output_gpu);
+        }
+    }
+    if(net.truth && !l.noloss){
+        softmax_x_ent_gpu(l.batch*l.inputs, l.output_gpu, net.truth_gpu, l.delta_gpu, l.loss_gpu);
+        if(l.softmax_tree){
+            mask_gpu(l.batch*l.inputs, l.delta_gpu, SECRET_NUM, net.truth_gpu, 0);
+            mask_gpu(l.batch*l.inputs, l.loss_gpu, SECRET_NUM, net.truth_gpu, 0);
+        }
+        cuda_pull_array(l.loss_gpu, l.loss, l.batch*l.inputs);
+        l.cost[0] = sum_array(l.loss, l.batch*l.inputs);
     }
 }
 
-void backward_softmax_layer_gpu(const softmax_layer layer, network_state state)
+void backward_softmax_layer_gpu(const softmax_layer layer, network net)
 {
-    axpy_ongpu(layer.batch*layer.inputs, 1, layer.delta_gpu, 1, state.delta, 1);
+    axpy_gpu(layer.batch*layer.inputs, 1, layer.delta_gpu, 1, net.delta_gpu, 1);
 }
 
 #endif
diff --git a/image.darknet/src/softmax_layer.h b/image.darknet/src/softmax_layer.h
index 821a8dd..2e3ffe0 100644
--- a/image.darknet/src/softmax_layer.h
+++ b/image.darknet/src/softmax_layer.h
@@ -7,13 +7,13 @@ typedef layer softmax_layer;
 
 void softmax_array(float *input, int n, float temp, float *output);
 softmax_layer make_softmax_layer(int batch, int inputs, int groups);
-void forward_softmax_layer(const softmax_layer l, network_state state);
-void backward_softmax_layer(const softmax_layer l, network_state state);
+void forward_softmax_layer(const softmax_layer l, network net);
+void backward_softmax_layer(const softmax_layer l, network net);
 
 #ifdef GPU
 void pull_softmax_layer_output(const softmax_layer l);
-void forward_softmax_layer_gpu(const softmax_layer l, network_state state);
-void backward_softmax_layer_gpu(const softmax_layer l, network_state state);
+void forward_softmax_layer_gpu(const softmax_layer l, network net);
+void backward_softmax_layer_gpu(const softmax_layer l, network net);
 #endif
 
 #endif
diff --git a/image.darknet/src/stb_image.h b/image.darknet/src/stb_image.h
index d0fa9c2..d9c21bc 100644
--- a/image.darknet/src/stb_image.h
+++ b/image.darknet/src/stb_image.h
@@ -1,5 +1,5 @@
-/* stb_image - v2.06 - public domain image loader - http://nothings.org/stb_image.h
-                                     no warranty implied; use at your own risk
+/* stb_image - v2.19 - public domain image loader - http://nothings.org/stb
+                                  no warranty implied; use at your own risk
 
    Do this:
       #define STB_IMAGE_IMPLEMENTATION
@@ -21,17 +21,20 @@
           avoid problematic images and only need the trivial interface
 
       JPEG baseline & progressive (12 bpc/arithmetic not supported, same as stock IJG lib)
-      PNG 1/2/4/8-bit-per-channel (16 bpc not supported)
+      PNG 1/2/4/8/16-bit-per-channel
 
       TGA (not sure what subset, if a subset)
       BMP non-1bpp, non-RLE
-      PSD (composited view only, no extra channels)
+      PSD (composited view only, no extra channels, 8/16 bit-per-channel)
 
       GIF (*comp always reports as 4-channel)
       HDR (radiance rgbE format)
       PIC (Softimage PIC)
       PNM (PPM and PGM binary only)
 
+      Animated GIF still needs a proper API, but here's one way to do it:
+          http://gist.github.com/urraka/685d9a6340b26b830d49
+
       - decode from memory or through FILE (define STBI_NO_STDIO to remove code)
       - decode from arbitrary I/O callbacks
       - SIMD acceleration on x86/x64 (SSE2) and ARM (NEON)
@@ -39,176 +42,65 @@
    Full documentation under "DOCUMENTATION" below.
 
 
-   Revision 2.00 release notes:
-
-      - Progressive JPEG is now supported.
-
-      - PPM and PGM binary formats are now supported, thanks to Ken Miller.
-
-      - x86 platforms now make use of SSE2 SIMD instructions for
-        JPEG decoding, and ARM platforms can use NEON SIMD if requested.
-        This work was done by Fabian "ryg" Giesen. SSE2 is used by
-        default, but NEON must be enabled explicitly; see docs.
-
-        With other JPEG optimizations included in this version, we see
-        2x speedup on a JPEG on an x86 machine, and a 1.5x speedup
-        on a JPEG on an ARM machine, relative to previous versions of this
-        library. The same results will not obtain for all JPGs and for all
-        x86/ARM machines. (Note that progressive JPEGs are significantly
-        slower to decode than regular JPEGs.) This doesn't mean that this
-        is the fastest JPEG decoder in the land; rather, it brings it
-        closer to parity with standard libraries. If you want the fastest
-        decode, look elsewhere. (See "Philosophy" section of docs below.)
-
-        See final bullet items below for more info on SIMD.
-
-      - Added STBI_MALLOC, STBI_REALLOC, and STBI_FREE macros for replacing
-        the memory allocator. Unlike other STBI libraries, these macros don't
-        support a context parameter, so if you need to pass a context in to
-        the allocator, you'll have to store it in a global or a thread-local
-        variable.
-
-      - Split existing STBI_NO_HDR flag into two flags, STBI_NO_HDR and
-        STBI_NO_LINEAR.
-            STBI_NO_HDR:     suppress implementation of .hdr reader format
-            STBI_NO_LINEAR:  suppress high-dynamic-range light-linear float API
-
-      - You can suppress implementation of any of the decoders to reduce
-        your code footprint by #defining one or more of the following
-        symbols before creating the implementation.
-
-            STBI_NO_JPEG
-            STBI_NO_PNG
-            STBI_NO_BMP
-            STBI_NO_PSD
-            STBI_NO_TGA
-            STBI_NO_GIF
-            STBI_NO_HDR
-            STBI_NO_PIC
-            STBI_NO_PNM   (.ppm and .pgm)
-
-      - You can request *only* certain decoders and suppress all other ones
-        (this will be more forward-compatible, as addition of new decoders
-        doesn't require you to disable them explicitly):
-
-            STBI_ONLY_JPEG
-            STBI_ONLY_PNG
-            STBI_ONLY_BMP
-            STBI_ONLY_PSD
-            STBI_ONLY_TGA
-            STBI_ONLY_GIF
-            STBI_ONLY_HDR
-            STBI_ONLY_PIC
-            STBI_ONLY_PNM   (.ppm and .pgm)
-
-         Note that you can define multiples of these, and you will get all
-         of them ("only x" and "only y" is interpreted to mean "only x&y").
-
-       - If you use STBI_NO_PNG (or _ONLY_ without PNG), and you still
-         want the zlib decoder to be available, #define STBI_SUPPORT_ZLIB
-
-      - Compilation of all SIMD code can be suppressed with
-            #define STBI_NO_SIMD
-        It should not be necessary to disable SIMD unless you have issues
-        compiling (e.g. using an x86 compiler which doesn't support SSE
-        intrinsics or that doesn't support the method used to detect
-        SSE2 support at run-time), and even those can be reported as
-        bugs so I can refine the built-in compile-time checking to be
-        smarter.
-
-      - The old STBI_SIMD system which allowed installing a user-defined
-        IDCT etc. has been removed. If you need this, don't upgrade. My
-        assumption is that almost nobody was doing this, and those who
-        were will find the built-in SIMD more satisfactory anyway.
-
-      - RGB values computed for JPEG images are slightly different from
-        previous versions of stb_image. (This is due to using less
-        integer precision in SIMD.) The C code has been adjusted so
-        that the same RGB values will be computed regardless of whether
-        SIMD support is available, so your app should always produce
-        consistent results. But these results are slightly different from
-        previous versions. (Specifically, about 3% of available YCbCr values
-        will compute different RGB results from pre-1.49 versions by +-1;
-        most of the deviating values are one smaller in the G channel.)
-
-      - If you must produce consistent results with previous versions of
-        stb_image, #define STBI_JPEG_OLD and you will get the same results
-        you used to; however, you will not get the SIMD speedups for
-        the YCbCr-to-RGB conversion step (although you should still see
-        significant JPEG speedup from the other changes).
-
-        Please note that STBI_JPEG_OLD is a temporary feature; it will be
-        removed in future versions of the library. It is only intended for
-        near-term back-compatibility use.
-
-
-   Latest revision history:
-      2.06  (2015-04-19) fix bug where PSD returns wrong '*comp' value
-      2.05  (2015-04-19) fix bug in progressive JPEG handling, fix warning
-      2.04  (2015-04-15) try to re-enable SIMD on MinGW 64-bit
-      2.03  (2015-04-12) additional corruption checking
-                         stbi_set_flip_vertically_on_load
-                         fix NEON support; fix mingw support
-      2.02  (2015-01-19) fix incorrect assert, fix warning
-      2.01  (2015-01-17) fix various warnings
-      2.00b (2014-12-25) fix STBI_MALLOC in progressive JPEG
-      2.00  (2014-12-25) optimize JPEG, including x86 SSE2 & ARM NEON SIMD
-                         progressive JPEG
-                         PGM/PPM support
-                         STBI_MALLOC,STBI_REALLOC,STBI_FREE
-                         STBI_NO_*, STBI_ONLY_*
-                         GIF bugfix
-      1.48  (2014-12-14) fix incorrectly-named assert()
-      1.47  (2014-12-14) 1/2/4-bit PNG support (both grayscale and paletted)
-                         optimize PNG
-                         fix bug in interlaced PNG with user-specified channel count
+LICENSE
+
+  See end of file for license information.
+
+RECENT REVISION HISTORY:
+
+      2.19  (2018-02-11) fix warning
+      2.18  (2018-01-30) fix warnings
+      2.17  (2018-01-29) bugfix, 1-bit BMP, 16-bitness query, fix warnings
+      2.16  (2017-07-23) all functions have 16-bit variants; optimizations; bugfixes
+      2.15  (2017-03-18) fix png-1,2,4; all Imagenet JPGs; no runtime SSE detection on GCC
+      2.14  (2017-03-03) remove deprecated STBI_JPEG_OLD; fixes for Imagenet JPGs
+      2.13  (2016-12-04) experimental 16-bit API, only for PNG so far; fixes
+      2.12  (2016-04-02) fix typo in 2.11 PSD fix that caused crashes
+      2.11  (2016-04-02) 16-bit PNGS; enable SSE2 in non-gcc x64
+                         RGB-format JPEG; remove white matting in PSD;
+                         allocate large structures on the stack;
+                         correct channel count for PNG & BMP
+      2.10  (2016-01-22) avoid warning introduced in 2.09
+      2.09  (2016-01-16) 16-bit TGA; comments in PNM files; STBI_REALLOC_SIZED
 
    See end of file for full revision history.
 
 
  ============================    Contributors    =========================
 
- Image formats                                Bug fixes & warning fixes
-    Sean Barrett (jpeg, png, bmp)                Marc LeBlanc
-    Nicolas Schulz (hdr, psd)                    Christpher Lloyd
-    Jonathan Dummer (tga)                        Dave Moore
-    Jean-Marc Lienher (gif)                      Won Chun
-    Tom Seddon (pic)                             the Horde3D community
-    Thatcher Ulrich (psd)                        Janez Zemva
-    Ken Miller (pgm, ppm)                        Jonathan Blow
-                                                 Laurent Gomila
-                                                 Aruelien Pocheville
- Extensions, features                            Ryamond Barbiero
-    Jetro Lauha (stbi_info)                      David Woo
-    Martin "SpartanJ" Golini (stbi_info)         Martin Golini
-    James "moose2000" Brown (iPhone PNG)         Roy Eltham
-    Ben "Disch" Wenger (io callbacks)            Luke Graham
-    Omar Cornut (1/2/4-bit PNG)                  Thomas Ruf
-    Nicolas Guillemot (vertical flip)            John Bartholomew
-                                                 Ken Hamada
- Optimizations & bugfixes                        Cort Stratton
-    Fabian "ryg" Giesen                          Blazej Dariusz Roszkowski
-    Arseny Kapoulkine                            Thibault Reuille
-                                                 Paul Du Bois
-                                                 Guillaume George
-  If your name should be here but                Jerry Jansson
-  isn't, let Sean know.                          Hayaki Saito
-                                                 Johan Duparc
-                                                 Ronny Chevalier
-                                                 Michal Cichon
-                                                 Tero Hanninen
-                                                 Sergio Gonzalez
-                                                 Cass Everitt
-                                                 Engin Manap
-                                                 Martins Mozeiko
-                                                 Joseph Thomson
-                                                 Phil Jordan
-
-License:
-   This software is in the public domain. Where that dedication is not
-   recognized, you are granted a perpetual, irrevocable license to copy
-   and modify this file however you want.
-
+ Image formats                          Extensions, features
+    Sean Barrett (jpeg, png, bmp)          Jetro Lauha (stbi_info)
+    Nicolas Schulz (hdr, psd)              Martin "SpartanJ" Golini (stbi_info)
+    Jonathan Dummer (tga)                  James "moose2000" Brown (iPhone PNG)
+    Jean-Marc Lienher (gif)                Ben "Disch" Wenger (io callbacks)
+    Tom Seddon (pic)                       Omar Cornut (1/2/4-bit PNG)
+    Thatcher Ulrich (psd)                  Nicolas Guillemot (vertical flip)
+    Ken Miller (pgm, ppm)                  Richard Mitton (16-bit PSD)
+    github:urraka (animated gif)           Junggon Kim (PNM comments)
+    Christopher Forseth (animated gif)     Daniel Gibson (16-bit TGA)
+                                           socks-the-fox (16-bit PNG)
+                                           Jeremy Sawicki (handle all ImageNet JPGs)
+ Optimizations & bugfixes                  Mikhail Morozov (1-bit BMP)
+    Fabian "ryg" Giesen                    Anael Seghezzi (is-16-bit query)
+    Arseny Kapoulkine
+    John-Mark Allen
+
+ Bug & warning fixes
+    Marc LeBlanc            David Woo          Guillaume George   Martins Mozeiko
+    Christpher Lloyd        Jerry Jansson      Joseph Thomson     Phil Jordan
+    Dave Moore              Roy Eltham         Hayaki Saito       Nathan Reed
+    Won Chun                Luke Graham        Johan Duparc       Nick Verigakis
+    the Horde3D community   Thomas Ruf         Ronny Chevalier    github:rlyeh
+    Janez Zemva             John Bartholomew   Michal Cichon      github:romigrou
+    Jonathan Blow           Ken Hamada         Tero Hanninen      github:svdijk
+    Laurent Gomila          Cort Stratton      Sergio Gonzalez    github:snagar
+    Aruelien Pocheville     Thibault Reuille   Cass Everitt       github:Zelex
+    Ryamond Barbiero        Paul Du Bois       Engin Manap        github:grim210
+    Aldo Culquicondor       Philipp Wiesemann  Dale Weiler        github:sammyhw
+    Oriol Ferrer Mesia      Josh Tobin         Matthew Gregan     github:phprus
+    Julian Raschke          Gregory Mullen     Baldur Karlsson    github:poppolopoppo
+    Christian Floisand      Kevin Schmidt                         github:darealshinji
+    Blazej Dariusz Roszkowski                                     github:Michaelangel007
 */
 
 #ifndef STBI_INCLUDE_STB_IMAGE_H
@@ -217,10 +109,8 @@
 // DOCUMENTATION
 //
 // Limitations:
-//    - no 16-bit-per-channel PNG
 //    - no 12-bit-per-channel JPEG
 //    - no JPEGs with arithmetic coding
-//    - no 1-bit BMP
 //    - GIF always returns *comp=4
 //
 // Basic usage (see HDR discussion below for HDR usage):
@@ -233,10 +123,10 @@
 //    stbi_image_free(data)
 //
 // Standard parameters:
-//    int *x       -- outputs image width in pixels
-//    int *y       -- outputs image height in pixels
-//    int *comp    -- outputs # of image components in image file
-//    int req_comp -- if non-zero, # of image components requested in result
+//    int *x                 -- outputs image width in pixels
+//    int *y                 -- outputs image height in pixels
+//    int *channels_in_file  -- outputs # of image components in image file
+//    int desired_channels   -- if non-zero, # of image components requested in result
 //
 // The return value from an image loader is an 'unsigned char *' which points
 // to the pixel data, or NULL on an allocation failure or if the image is
@@ -244,11 +134,12 @@
 // with each pixel consisting of N interleaved 8-bit components; the first
 // pixel pointed to is top-left-most in the image. There is no padding between
 // image scanlines or between pixels, regardless of format. The number of
-// components N is 'req_comp' if req_comp is non-zero, or *comp otherwise.
-// If req_comp is non-zero, *comp has the number of components that _would_
-// have been output otherwise. E.g. if you set req_comp to 4, you will always
-// get RGBA output, but you can check *comp to see if it's trivially opaque
-// because e.g. there were only 3 channels in the source image.
+// components N is 'desired_channels' if desired_channels is non-zero, or
+// *channels_in_file otherwise. If desired_channels is non-zero,
+// *channels_in_file has the number of components that _would_ have been
+// output otherwise. E.g. if you set desired_channels to 4, you will always
+// get RGBA output, but you can check *channels_in_file to see if it's trivially
+// opaque because e.g. there were only 3 channels in the source image.
 //
 // An output image with N components has the following components interleaved
 // in this order in each pixel:
@@ -260,10 +151,10 @@
 //       4           red, green, blue, alpha
 //
 // If image loading fails for any reason, the return value will be NULL,
-// and *x, *y, *comp will be unchanged. The function stbi_failure_reason()
-// can be queried for an extremely brief, end-user unfriendly explanation
-// of why the load failed. Define STBI_NO_FAILURE_STRINGS to avoid
-// compiling these strings at all, and STBI_FAILURE_USERMSG to get slightly
+// and *x, *y, *channels_in_file will be unchanged. The function
+// stbi_failure_reason() can be queried for an extremely brief, end-user
+// unfriendly explanation of why the load failed. Define STBI_NO_FAILURE_STRINGS
+// to avoid compiling these strings at all, and STBI_FAILURE_USERMSG to get slightly
 // more user-friendly ones.
 //
 // Paletted PNG, BMP, GIF, and PIC images are automatically depalettized.
@@ -282,13 +173,13 @@
 // and for best performance I may provide less-easy-to-use APIs that give higher
 // performance, in addition to the easy to use ones. Nevertheless, it's important
 // to keep in mind that from the standpoint of you, a client of this library,
-// all you care about is #1 and #3, and stb libraries do not emphasize #3 above all.
+// all you care about is #1 and #3, and stb libraries DO NOT emphasize #3 above all.
 //
 // Some secondary priorities arise directly from the first two, some of which
 // make more explicit reasons why performance can't be emphasized.
 //
 //    - Portable ("ease of use")
-//    - Small footprint ("easy to maintain")
+//    - Small source code footprint ("easy to maintain")
 //    - No dependencies ("ease of use")
 //
 // ===========================================================================
@@ -320,13 +211,6 @@
 // (at least this is true for iOS and Android). Therefore, the NEON support is
 // toggled by a build flag: define STBI_NEON to get NEON loops.
 //
-// The output of the JPEG decoder is slightly different from versions where
-// SIMD support was introduced (that is, for versions before 1.49). The
-// difference is only +-1 in the 8-bit RGB channels, and only on a small
-// fraction of pixels. You can force the pre-1.49 behavior by defining
-// STBI_JPEG_OLD, but this will disable some of the SIMD decoding path
-// and hence cost some performance.
-//
 // If for some reason you do not want to use any of SIMD code, or if
 // you have issues compiling it, you can disable it entirely by
 // defining STBI_NO_SIMD.
@@ -382,6 +266,41 @@
 // says there's premultiplied data (currently only happens in iPhone images,
 // and only if iPhone convert-to-rgb processing is on).
 //
+// ===========================================================================
+//
+// ADDITIONAL CONFIGURATION
+//
+//  - You can suppress implementation of any of the decoders to reduce
+//    your code footprint by #defining one or more of the following
+//    symbols before creating the implementation.
+//
+//        STBI_NO_JPEG
+//        STBI_NO_PNG
+//        STBI_NO_BMP
+//        STBI_NO_PSD
+//        STBI_NO_TGA
+//        STBI_NO_GIF
+//        STBI_NO_HDR
+//        STBI_NO_PIC
+//        STBI_NO_PNM   (.ppm and .pgm)
+//
+//  - You can request *only* certain decoders and suppress all other ones
+//    (this will be more forward-compatible, as addition of new decoders
+//    doesn't require you to disable them explicitly):
+//
+//        STBI_ONLY_JPEG
+//        STBI_ONLY_PNG
+//        STBI_ONLY_BMP
+//        STBI_ONLY_PSD
+//        STBI_ONLY_TGA
+//        STBI_ONLY_GIF
+//        STBI_ONLY_HDR
+//        STBI_ONLY_PIC
+//        STBI_ONLY_PNM   (.ppm and .pgm)
+//
+//   - If you use STBI_NO_PNG (or _ONLY_ without PNG), and you still
+//     want the zlib decoder to be available, #define STBI_SUPPORT_ZLIB
+//
 
 
 #ifndef STBI_NO_STDIO
@@ -392,7 +311,7 @@
 
 enum
 {
-   STBI_default = 0, // only used for req_comp
+   STBI_default = 0, // only used for desired_channels
 
    STBI_grey       = 1,
    STBI_grey_alpha = 2,
@@ -401,6 +320,7 @@ enum
 };
 
 typedef unsigned char stbi_uc;
+typedef unsigned short stbi_us;
 
 #ifdef __cplusplus
 extern "C" {
@@ -428,34 +348,60 @@ typedef struct
    int      (*eof)   (void *user);                       // returns nonzero if we are at end of file/data
 } stbi_io_callbacks;
 
-STBIDEF stbi_uc *stbi_load               (char              const *filename,           int *x, int *y, int *comp, int req_comp);
-STBIDEF stbi_uc *stbi_load_from_memory   (stbi_uc           const *buffer, int len   , int *x, int *y, int *comp, int req_comp);
-STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk  , void *user, int *x, int *y, int *comp, int req_comp);
+////////////////////////////////////
+//
+// 8-bits-per-channel interface
+//
+
+STBIDEF stbi_uc *stbi_load_from_memory   (stbi_uc           const *buffer, int len   , int *x, int *y, int *channels_in_file, int desired_channels);
+STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk  , void *user, int *x, int *y, int *channels_in_file, int desired_channels);
+#ifndef STBI_NO_GIF
+STBIDEF stbi_uc *stbi_load_gif_from_memory(stbi_uc const *buffer, int len, int **delays, int *x, int *y, int *z, int *comp, int req_comp);
+#endif
+
 
 #ifndef STBI_NO_STDIO
-STBIDEF stbi_uc *stbi_load_from_file  (FILE *f,                  int *x, int *y, int *comp, int req_comp);
+STBIDEF stbi_uc *stbi_load            (char const *filename, int *x, int *y, int *channels_in_file, int desired_channels);
+STBIDEF stbi_uc *stbi_load_from_file  (FILE *f, int *x, int *y, int *channels_in_file, int desired_channels);
 // for stbi_load_from_file, file pointer is left pointing immediately after image
 #endif
 
+////////////////////////////////////
+//
+// 16-bits-per-channel interface
+//
+
+STBIDEF stbi_us *stbi_load_16_from_memory   (stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels);
+STBIDEF stbi_us *stbi_load_16_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *channels_in_file, int desired_channels);
+
+#ifndef STBI_NO_STDIO
+STBIDEF stbi_us *stbi_load_16          (char const *filename, int *x, int *y, int *channels_in_file, int desired_channels);
+STBIDEF stbi_us *stbi_load_from_file_16(FILE *f, int *x, int *y, int *channels_in_file, int desired_channels);
+#endif
+
+////////////////////////////////////
+//
+// float-per-channel interface
+//
 #ifndef STBI_NO_LINEAR
-   STBIDEF float *stbi_loadf                 (char const *filename,           int *x, int *y, int *comp, int req_comp);
-   STBIDEF float *stbi_loadf_from_memory     (stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp);
-   STBIDEF float *stbi_loadf_from_callbacks  (stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp);
+   STBIDEF float *stbi_loadf_from_memory     (stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels);
+   STBIDEF float *stbi_loadf_from_callbacks  (stbi_io_callbacks const *clbk, void *user, int *x, int *y,  int *channels_in_file, int desired_channels);
 
    #ifndef STBI_NO_STDIO
-   STBIDEF float *stbi_loadf_from_file  (FILE *f,                int *x, int *y, int *comp, int req_comp);
+   STBIDEF float *stbi_loadf            (char const *filename, int *x, int *y, int *channels_in_file, int desired_channels);
+   STBIDEF float *stbi_loadf_from_file  (FILE *f, int *x, int *y, int *channels_in_file, int desired_channels);
    #endif
 #endif
 
 #ifndef STBI_NO_HDR
    STBIDEF void   stbi_hdr_to_ldr_gamma(float gamma);
    STBIDEF void   stbi_hdr_to_ldr_scale(float scale);
-#endif
+#endif // STBI_NO_HDR
 
 #ifndef STBI_NO_LINEAR
    STBIDEF void   stbi_ldr_to_hdr_gamma(float gamma);
    STBIDEF void   stbi_ldr_to_hdr_scale(float scale);
-#endif // STBI_NO_HDR
+#endif // STBI_NO_LINEAR
 
 // stbi_is_hdr is always defined, but always returns false if STBI_NO_HDR
 STBIDEF int    stbi_is_hdr_from_callbacks(stbi_io_callbacks const *clbk, void *user);
@@ -476,11 +422,14 @@ STBIDEF void     stbi_image_free      (void *retval_from_stbi_load);
 // get image dimensions & components without fully decoding
 STBIDEF int      stbi_info_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp);
 STBIDEF int      stbi_info_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp);
+STBIDEF int      stbi_is_16_bit_from_memory(stbi_uc const *buffer, int len);
+STBIDEF int      stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *clbk, void *user);
 
 #ifndef STBI_NO_STDIO
-STBIDEF int      stbi_info            (char const *filename,     int *x, int *y, int *comp);
-STBIDEF int      stbi_info_from_file  (FILE *f,                  int *x, int *y, int *comp);
-
+STBIDEF int      stbi_info               (char const *filename,     int *x, int *y, int *comp);
+STBIDEF int      stbi_info_from_file     (FILE *f,                  int *x, int *y, int *comp);
+STBIDEF int      stbi_is_16_bit          (char const *filename);
+STBIDEF int      stbi_is_16_bit_from_file(FILE *f);
 #endif
 
 
@@ -561,9 +510,10 @@ STBIDEF int   stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const ch
 #include <stddef.h> // ptrdiff_t on osx
 #include <stdlib.h>
 #include <string.h>
+#include <limits.h>
 
 #if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR)
-#include <math.h>  // ldexp
+#include <math.h>  // ldexp, pow
 #endif
 
 #ifndef STBI_NO_STDIO
@@ -619,18 +569,22 @@ typedef unsigned char validate_uint32[sizeof(stbi__uint32)==4 ? 1 : -1];
    #define stbi_lrot(x,y)  (((x) << (y)) | ((x) >> (32 - (y))))
 #endif
 
-#if defined(STBI_MALLOC) && defined(STBI_FREE) && defined(STBI_REALLOC)
+#if defined(STBI_MALLOC) && defined(STBI_FREE) && (defined(STBI_REALLOC) || defined(STBI_REALLOC_SIZED))
 // ok
-#elif !defined(STBI_MALLOC) && !defined(STBI_FREE) && !defined(STBI_REALLOC)
+#elif !defined(STBI_MALLOC) && !defined(STBI_FREE) && !defined(STBI_REALLOC) && !defined(STBI_REALLOC_SIZED)
 // ok
 #else
-#error "Must define all or none of STBI_MALLOC, STBI_FREE, and STBI_REALLOC."
+#error "Must define all or none of STBI_MALLOC, STBI_FREE, and STBI_REALLOC (or STBI_REALLOC_SIZED)."
 #endif
 
 #ifndef STBI_MALLOC
-#define STBI_MALLOC(sz)    malloc(sz)
-#define STBI_REALLOC(p,sz) realloc(p,sz)
-#define STBI_FREE(p)       free(p)
+#define STBI_MALLOC(sz)           malloc(sz)
+#define STBI_REALLOC(p,newsz)     realloc(p,newsz)
+#define STBI_FREE(p)              free(p)
+#endif
+
+#ifndef STBI_REALLOC_SIZED
+#define STBI_REALLOC_SIZED(p,oldsz,newsz) STBI_REALLOC(p,newsz)
 #endif
 
 // x86/x64 detection
@@ -640,12 +594,14 @@ typedef unsigned char validate_uint32[sizeof(stbi__uint32)==4 ? 1 : -1];
 #define STBI__X86_TARGET
 #endif
 
-#if defined(__GNUC__) && (defined(STBI__X86_TARGET) || defined(STBI__X64_TARGET)) && !defined(__SSE2__) && !defined(STBI_NO_SIMD)
-// NOTE: not clear do we actually need this for the 64-bit path?
+#if defined(__GNUC__) && defined(STBI__X86_TARGET) && !defined(__SSE2__) && !defined(STBI_NO_SIMD)
 // gcc doesn't support sse2 intrinsics unless you compile with -msse2,
-// (but compiling with -msse2 allows the compiler to use SSE2 everywhere;
-// this is just broken and gcc are jerks for not fixing it properly
-// http://www.virtualdub.org/blog/pivot/entry.php?id=363 )
+// which in turn means it gets to use SSE2 everywhere. This is unfortunate,
+// but previous attempts to provide the SSE2 functions with runtime
+// detection caused numerous issues. The way architecture extensions are
+// exposed in GCC/Clang is, sadly, not really suited for one-file libs.
+// New behavior: if compiled with -msse2, we use SSE2 without any
+// detection; if not, we don't use it at all.
 #define STBI_NO_SIMD
 #endif
 
@@ -664,7 +620,7 @@ typedef unsigned char validate_uint32[sizeof(stbi__uint32)==4 ? 1 : -1];
 #define STBI_NO_SIMD
 #endif
 
-#if !defined(STBI_NO_SIMD) && defined(STBI__X86_TARGET)
+#if !defined(STBI_NO_SIMD) && (defined(STBI__X86_TARGET) || defined(STBI__X64_TARGET))
 #define STBI_SSE2
 #include <emmintrin.h>
 
@@ -693,7 +649,7 @@ static int stbi__cpuid3(void)
 
 #define STBI_SIMD_ALIGN(type, name) __declspec(align(16)) type name
 
-static int stbi__sse2_available()
+static int stbi__sse2_available(void)
 {
    int info3 = stbi__cpuid3();
    return ((info3 >> 26) & 1) != 0;
@@ -701,16 +657,12 @@ static int stbi__sse2_available()
 #else // assume GCC-style if not VC++
 #define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
 
-static int stbi__sse2_available()
+static int stbi__sse2_available(void)
 {
-#if defined(__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__) >= 408 // GCC 4.8 or later
-   // GCC 4.8+ has a nice way to do this
-   return __builtin_cpu_supports("sse2");
-#else
-   // portable way to do this, preferably without using GCC inline ASM?
-   // just bail for now.
-   return 0;
-#endif
+   // If we're even attempting to compile this on GCC/Clang, that means
+   // -msse2 is on, which means the compiler is allowed to use SSE2
+   // instructions at will, and so are we.
+   return 1;
 }
 #endif
 #endif
@@ -749,7 +701,7 @@ typedef struct
    stbi_uc buffer_start[128];
 
    stbi_uc *img_buffer, *img_buffer_end;
-   stbi_uc *img_buffer_original;
+   stbi_uc *img_buffer_original, *img_buffer_original_end;
 } stbi__context;
 
 
@@ -761,7 +713,7 @@ static void stbi__start_mem(stbi__context *s, stbi_uc const *buffer, int len)
    s->io.read = NULL;
    s->read_from_callbacks = 0;
    s->img_buffer = s->img_buffer_original = (stbi_uc *) buffer;
-   s->img_buffer_end = (stbi_uc *) buffer+len;
+   s->img_buffer_end = s->img_buffer_original_end = (stbi_uc *) buffer+len;
 }
 
 // initialize a callback-based context
@@ -773,6 +725,7 @@ static void stbi__start_callbacks(stbi__context *s, stbi_io_callbacks *c, void *
    s->read_from_callbacks = 1;
    s->img_buffer_original = s->buffer_start;
    stbi__refill_buffer(s);
+   s->img_buffer_original_end = s->img_buffer_end;
 }
 
 #ifndef STBI_NO_STDIO
@@ -814,59 +767,76 @@ static void stbi__rewind(stbi__context *s)
    // but we just rewind to the beginning of the initial buffer, because
    // we only use it after doing 'test', which only ever looks at at most 92 bytes
    s->img_buffer = s->img_buffer_original;
+   s->img_buffer_end = s->img_buffer_original_end;
 }
 
+enum // channel-order tags; NOTE(review): presumably the values stored in stbi__result_info.channel_order - confirm
+{
+   STBI_ORDER_RGB, // implicit value 0
+   STBI_ORDER_BGR  // implicit value 1
+};
+
+typedef struct // handed by pointer (the `ri` argument) to the stbi__*_load prototypes declared after this struct
+{
+   int bits_per_channel; // NOTE(review): presumably 8 or 16, matching the 8/16-bit public interfaces - confirm
+   int num_channels;     // NOTE(review): presumably the channel count of the decoded result - confirm
+   int channel_order;    // NOTE(review): presumably STBI_ORDER_RGB or STBI_ORDER_BGR - confirm
+} stbi__result_info;
+
 #ifndef STBI_NO_JPEG
 static int      stbi__jpeg_test(stbi__context *s);
-static stbi_uc *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static void    *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
 static int      stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp);
 #endif
 
 #ifndef STBI_NO_PNG
 static int      stbi__png_test(stbi__context *s);
-static stbi_uc *stbi__png_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static void    *stbi__png_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
 static int      stbi__png_info(stbi__context *s, int *x, int *y, int *comp);
+static int      stbi__png_is16(stbi__context *s);
 #endif
 
 #ifndef STBI_NO_BMP
 static int      stbi__bmp_test(stbi__context *s);
-static stbi_uc *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static void    *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
 static int      stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp);
 #endif
 
 #ifndef STBI_NO_TGA
 static int      stbi__tga_test(stbi__context *s);
-static stbi_uc *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static void    *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
 static int      stbi__tga_info(stbi__context *s, int *x, int *y, int *comp);
 #endif
 
 #ifndef STBI_NO_PSD
 static int      stbi__psd_test(stbi__context *s);
-static stbi_uc *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static void    *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc);
 static int      stbi__psd_info(stbi__context *s, int *x, int *y, int *comp);
+static int      stbi__psd_is16(stbi__context *s);
 #endif
 
 #ifndef STBI_NO_HDR
 static int      stbi__hdr_test(stbi__context *s);
-static float   *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static float   *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
 static int      stbi__hdr_info(stbi__context *s, int *x, int *y, int *comp);
 #endif
 
 #ifndef STBI_NO_PIC
 static int      stbi__pic_test(stbi__context *s);
-static stbi_uc *stbi__pic_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static void    *stbi__pic_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
 static int      stbi__pic_info(stbi__context *s, int *x, int *y, int *comp);
 #endif
 
 #ifndef STBI_NO_GIF
 static int      stbi__gif_test(stbi__context *s);
-static stbi_uc *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static void    *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
+static void    *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y, int *z, int *comp, int req_comp);
 static int      stbi__gif_info(stbi__context *s, int *x, int *y, int *comp);
 #endif
 
 #ifndef STBI_NO_PNM
 static int      stbi__pnm_test(stbi__context *s);
-static stbi_uc *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp);
+static void    *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
 static int      stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp);
 #endif
 
@@ -889,6 +859,81 @@ static void *stbi__malloc(size_t size)
     return STBI_MALLOC(size);
 }
 
+// stb_image uses ints pervasively, including for offset calculations.
+// therefore the largest decoded image size we can support with the
+// current code, even on 64-bit targets, is INT_MAX. this is not a
+// significant limitation for the intended use case.
+//
+// we do, however, need to make sure our size calculations don't
+// overflow. hence a few helper functions for size calculations that
+// multiply integers together, making sure that they're non-negative
+// and no overflow occurs.
+
+// return 1 if the sum is valid, 0 on overflow.
+// negative terms are considered invalid.
+static int stbi__addsizes_valid(int a, int b)
+{
+   if (b < 0) return 0;
+   // now 0 <= b <= INT_MAX, hence also
+   // 0 <= INT_MAX - b <= INT_MAX.
+   // And "a + b <= INT_MAX" (which might overflow) is the
+   // same as a <= INT_MAX - b (no overflow)
+   return a <= INT_MAX - b;
+}
+
+// returns 1 if the product is valid, 0 on overflow.
+// negative factors are considered invalid.
+static int stbi__mul2sizes_valid(int a, int b)
+{
+   if (a < 0 || b < 0) return 0;
+   if (b == 0) return 1; // mul-by-0 is always safe
+   // portable way to check for no overflows in a*b
+   return a <= INT_MAX/b;
+}
+
+// returns 1 if "a*b + add" has no negative terms/factors and doesn't overflow
+static int stbi__mad2sizes_valid(int a, int b, int add)
+{
+   return stbi__mul2sizes_valid(a, b) && stbi__addsizes_valid(a*b, add);
+}
+
+// returns 1 if "a*b*c + add" has no negative terms/factors and doesn't overflow
+static int stbi__mad3sizes_valid(int a, int b, int c, int add)
+{
+   return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a*b, c) &&
+      stbi__addsizes_valid(a*b*c, add);
+}
+
+// returns 1 if "a*b*c*d + add" has no negative terms/factors and doesn't overflow
+#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR)
+static int stbi__mad4sizes_valid(int a, int b, int c, int d, int add)
+{
+   return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a*b, c) &&
+      stbi__mul2sizes_valid(a*b*c, d) && stbi__addsizes_valid(a*b*c*d, add);
+}
+#endif
+
+// mallocs with size overflow checking
+static void *stbi__malloc_mad2(int a, int b, int add)
+{
+   if (!stbi__mad2sizes_valid(a, b, add)) return NULL;
+   return stbi__malloc(a*b + add);
+}
+
+static void *stbi__malloc_mad3(int a, int b, int c, int add)
+{
+   if (!stbi__mad3sizes_valid(a, b, c, add)) return NULL;
+   return stbi__malloc(a*b*c + add);
+}
+
+#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR)
+static void *stbi__malloc_mad4(int a, int b, int c, int d, int add)
+{
+   if (!stbi__mad4sizes_valid(a, b, c, d, add)) return NULL;
+   return stbi__malloc(a*b*c*d + add);
+}
+#endif
+
 // stbi__err - error
 // stbi__errpf - error returning pointer to float
 // stbi__errpuc - error returning pointer to unsigned char
@@ -901,8 +946,8 @@ static void *stbi__malloc(size_t size)
    #define stbi__err(x,y)  stbi__err(x)
 #endif
 
-#define stbi__errpf(x,y)   ((float *) (stbi__err(x,y)?NULL:NULL))
-#define stbi__errpuc(x,y)  ((unsigned char *) (stbi__err(x,y)?NULL:NULL))
+#define stbi__errpf(x,y)   ((float *)(size_t) (stbi__err(x,y)?NULL:NULL))
+#define stbi__errpuc(x,y)  ((unsigned char *)(size_t) (stbi__err(x,y)?NULL:NULL))
 
 STBIDEF void stbi_image_free(void *retval_from_stbi_load)
 {
@@ -924,33 +969,38 @@ STBIDEF void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip)
     stbi__vertically_flip_on_load = flag_true_if_should_flip;
 }
 
-static unsigned char *stbi__load_main(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+static void *stbi__load_main(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc)
 {
+   memset(ri, 0, sizeof(*ri)); // make sure it's initialized if we add new fields
+   ri->bits_per_channel = 8; // default is 8 so most paths don't have to be changed
+   ri->channel_order = STBI_ORDER_RGB; // all current input & output are this, but this is here so we can add BGR order
+   ri->num_channels = 0;
+
    #ifndef STBI_NO_JPEG
-   if (stbi__jpeg_test(s)) return stbi__jpeg_load(s,x,y,comp,req_comp);
+   if (stbi__jpeg_test(s)) return stbi__jpeg_load(s,x,y,comp,req_comp, ri);
    #endif
    #ifndef STBI_NO_PNG
-   if (stbi__png_test(s))  return stbi__png_load(s,x,y,comp,req_comp);
+   if (stbi__png_test(s))  return stbi__png_load(s,x,y,comp,req_comp, ri);
    #endif
    #ifndef STBI_NO_BMP
-   if (stbi__bmp_test(s))  return stbi__bmp_load(s,x,y,comp,req_comp);
+   if (stbi__bmp_test(s))  return stbi__bmp_load(s,x,y,comp,req_comp, ri);
    #endif
    #ifndef STBI_NO_GIF
-   if (stbi__gif_test(s))  return stbi__gif_load(s,x,y,comp,req_comp);
+   if (stbi__gif_test(s))  return stbi__gif_load(s,x,y,comp,req_comp, ri);
    #endif
    #ifndef STBI_NO_PSD
-   if (stbi__psd_test(s))  return stbi__psd_load(s,x,y,comp,req_comp);
+   if (stbi__psd_test(s))  return stbi__psd_load(s,x,y,comp,req_comp, ri, bpc);
    #endif
    #ifndef STBI_NO_PIC
-   if (stbi__pic_test(s))  return stbi__pic_load(s,x,y,comp,req_comp);
+   if (stbi__pic_test(s))  return stbi__pic_load(s,x,y,comp,req_comp, ri);
    #endif
    #ifndef STBI_NO_PNM
-   if (stbi__pnm_test(s))  return stbi__pnm_load(s,x,y,comp,req_comp);
+   if (stbi__pnm_test(s))  return stbi__pnm_load(s,x,y,comp,req_comp, ri);
    #endif
 
    #ifndef STBI_NO_HDR
    if (stbi__hdr_test(s)) {
-      float *hdr = stbi__hdr_load(s, x,y,comp,req_comp);
+      float *hdr = stbi__hdr_load(s, x,y,comp,req_comp, ri);
       return stbi__hdr_to_ldr(hdr, *x, *y, req_comp ? req_comp : *comp);
    }
    #endif
@@ -958,58 +1008,138 @@ static unsigned char *stbi__load_main(stbi__context *s, int *x, int *y, int *com
    #ifndef STBI_NO_TGA
    // test tga last because it's a crappy test!
    if (stbi__tga_test(s))
-      return stbi__tga_load(s,x,y,comp,req_comp);
+      return stbi__tga_load(s,x,y,comp,req_comp, ri);
    #endif
 
    return stbi__errpuc("unknown image type", "Image not of any known type, or corrupt");
 }
 
-static unsigned char *stbi__load_flip(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+static stbi_uc *stbi__convert_16_to_8(stbi__uint16 *orig, int w, int h, int channels)
 {
-   unsigned char *result = stbi__load_main(s, x, y, comp, req_comp);
+   int i;
+   int img_len = w * h * channels;
+   stbi_uc *reduced;
 
-   if (stbi__vertically_flip_on_load && result != NULL) {
-      int w = *x, h = *y;
-      int depth = req_comp ? req_comp : *comp;
-      int row,col,z;
-      stbi_uc temp;
-
-      // @OPTIMIZE: use a bigger temp buffer and memcpy multiple pixels at once
-      for (row = 0; row < (h>>1); row++) {
-         for (col = 0; col < w; col++) {
-            for (z = 0; z < depth; z++) {
-               temp = result[(row * w + col) * depth + z];
-               result[(row * w + col) * depth + z] = result[((h - row - 1) * w + col) * depth + z];
-               result[((h - row - 1) * w + col) * depth + z] = temp;
-            }
-         }
+   reduced = (stbi_uc *) stbi__malloc(img_len);
+   if (reduced == NULL) return stbi__errpuc("outofmem", "Out of memory");
+
+   for (i = 0; i < img_len; ++i)
+      reduced[i] = (stbi_uc)((orig[i] >> 8) & 0xFF); // top byte of each 16-bit sample is a sufficient approximation of 16->8 bit scaling
+
+   STBI_FREE(orig);
+   return reduced;
+}
+
+static stbi__uint16 *stbi__convert_8_to_16(stbi_uc *orig, int w, int h, int channels)
+{
+   int i;
+   int img_len = w * h * channels;
+   stbi__uint16 *enlarged;
+
+   enlarged = (stbi__uint16 *) stbi__malloc(img_len*2);
+   if (enlarged == NULL) return (stbi__uint16 *) stbi__errpuc("outofmem", "Out of memory");
+
+   for (i = 0; i < img_len; ++i)
+      enlarged[i] = (stbi__uint16)((orig[i] << 8) + orig[i]); // replicate to high and low byte, maps 0->0, 255->0xffff
+
+   STBI_FREE(orig);
+   return enlarged;
+}
+
+static void stbi__vertical_flip(void *image, int w, int h, int bytes_per_pixel)
+{
+   int row;
+   size_t bytes_per_row = (size_t)w * bytes_per_pixel;
+   stbi_uc temp[2048];
+   stbi_uc *bytes = (stbi_uc *)image;
+
+   for (row = 0; row < (h>>1); row++) {
+      stbi_uc *row0 = bytes + row*bytes_per_row;
+      stbi_uc *row1 = bytes + (h - row - 1)*bytes_per_row;
+      // swap row0 with row1
+      size_t bytes_left = bytes_per_row;
+      while (bytes_left) {
+         size_t bytes_copy = (bytes_left < sizeof(temp)) ? bytes_left : sizeof(temp);
+         memcpy(temp, row0, bytes_copy);
+         memcpy(row0, row1, bytes_copy);
+         memcpy(row1, temp, bytes_copy);
+         row0 += bytes_copy;
+         row1 += bytes_copy;
+         bytes_left -= bytes_copy;
       }
    }
+}
 
-   return result;
+static void stbi__vertical_flip_slices(void *image, int w, int h, int z, int bytes_per_pixel)
+{
+   int slice;
+   int slice_size = w * h * bytes_per_pixel;
+
+   stbi_uc *bytes = (stbi_uc *)image;
+   for (slice = 0; slice < z; ++slice) {
+      stbi__vertical_flip(bytes, w, h, bytes_per_pixel); 
+      bytes += slice_size; 
+   }
+}
+
+static unsigned char *stbi__load_and_postprocess_8bit(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__result_info ri;
+   void *result = stbi__load_main(s, x, y, comp, req_comp, &ri, 8);
+
+   if (result == NULL)
+      return NULL;
+
+   if (ri.bits_per_channel != 8) {
+      STBI_ASSERT(ri.bits_per_channel == 16);
+      result = stbi__convert_16_to_8((stbi__uint16 *) result, *x, *y, req_comp == 0 ? *comp : req_comp);
+      ri.bits_per_channel = 8;
+   }
+
+   // @TODO: move stbi__convert_format to here
+
+   if (stbi__vertically_flip_on_load) {
+      int channels = req_comp ? req_comp : *comp;
+      stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi_uc));
+   }
+
+   return (unsigned char *) result;
 }
 
+static stbi__uint16 *stbi__load_and_postprocess_16bit(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__result_info ri;
+   void *result = stbi__load_main(s, x, y, comp, req_comp, &ri, 16);
+
+   if (result == NULL)
+      return NULL;
+
+   if (ri.bits_per_channel != 16) {
+      STBI_ASSERT(ri.bits_per_channel == 8);
+      result = stbi__convert_8_to_16((stbi_uc *) result, *x, *y, req_comp == 0 ? *comp : req_comp);
+      ri.bits_per_channel = 16;
+   }
+
+   // @TODO: move stbi__convert_format16 to here
+   // @TODO: special case RGB-to-Y (and RGBA-to-YA) for 8-bit-to-16-bit case to keep more precision
+
+   if (stbi__vertically_flip_on_load) {
+      int channels = req_comp ? req_comp : *comp;
+      stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi__uint16));
+   }
+
+   return (stbi__uint16 *) result;
+}
+
+#if !defined(STBI_NO_HDR) || !defined(STBI_NO_LINEAR)
 static void stbi__float_postprocess(float *result, int *x, int *y, int *comp, int req_comp)
 {
    if (stbi__vertically_flip_on_load && result != NULL) {
-      int w = *x, h = *y;
-      int depth = req_comp ? req_comp : *comp;
-      int row,col,z;
-      float temp;
-
-      // @OPTIMIZE: use a bigger temp buffer and memcpy multiple pixels at once
-      for (row = 0; row < (h>>1); row++) {
-         for (col = 0; col < w; col++) {
-            for (z = 0; z < depth; z++) {
-               temp = result[(row * w + col) * depth + z];
-               result[(row * w + col) * depth + z] = result[((h - row - 1) * w + col) * depth + z];
-               result[((h - row - 1) * w + col) * depth + z] = temp;
-            }
-         }
-      }
+      int channels = req_comp ? req_comp : *comp;
+      stbi__vertical_flip(result, *x, *y, channels * sizeof(float));
    }
 }
-
+#endif
 
 #ifndef STBI_NO_STDIO
 
@@ -1041,28 +1171,83 @@ STBIDEF stbi_uc *stbi_load_from_file(FILE *f, int *x, int *y, int *comp, int req
    unsigned char *result;
    stbi__context s;
    stbi__start_file(&s,f);
-   result = stbi__load_flip(&s,x,y,comp,req_comp);
+   result = stbi__load_and_postprocess_8bit(&s,x,y,comp,req_comp);
    if (result) {
       // need to 'unget' all the characters in the IO buffer
       fseek(f, - (int) (s.img_buffer_end - s.img_buffer), SEEK_CUR);
    }
    return result;
 }
+
+STBIDEF stbi__uint16 *stbi_load_from_file_16(FILE *f, int *x, int *y, int *comp, int req_comp)
+{
+   stbi__uint16 *result;
+   stbi__context s;
+   stbi__start_file(&s,f);
+   result = stbi__load_and_postprocess_16bit(&s,x,y,comp,req_comp);
+   if (result) {
+      // need to 'unget' all the characters in the IO buffer
+      fseek(f, - (int) (s.img_buffer_end - s.img_buffer), SEEK_CUR);
+   }
+   return result;
+}
+
+STBIDEF stbi_us *stbi_load_16(char const *filename, int *x, int *y, int *comp, int req_comp)
+{
+   FILE *f = stbi__fopen(filename, "rb");
+   stbi__uint16 *result;
+   if (!f) return (stbi_us *) stbi__errpuc("can't fopen", "Unable to open file");
+   result = stbi_load_from_file_16(f,x,y,comp,req_comp);
+   fclose(f);
+   return result;
+}
+
+
 #endif //!STBI_NO_STDIO
 
+STBIDEF stbi_us *stbi_load_16_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels)
+{
+   stbi__context s;
+   stbi__start_mem(&s,buffer,len);
+   return stbi__load_and_postprocess_16bit(&s,x,y,channels_in_file,desired_channels);
+}
+
+STBIDEF stbi_us *stbi_load_16_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *channels_in_file, int desired_channels)
+{
+   stbi__context s;
+   stbi__start_callbacks(&s, (stbi_io_callbacks *)clbk, user);
+   return stbi__load_and_postprocess_16bit(&s,x,y,channels_in_file,desired_channels);
+}
+
 STBIDEF stbi_uc *stbi_load_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp)
 {
    stbi__context s;
    stbi__start_mem(&s,buffer,len);
-   return stbi__load_flip(&s,x,y,comp,req_comp);
+   return stbi__load_and_postprocess_8bit(&s,x,y,comp,req_comp);
 }
 
 STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp)
 {
    stbi__context s;
    stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user);
-   return stbi__load_flip(&s,x,y,comp,req_comp);
+   return stbi__load_and_postprocess_8bit(&s,x,y,comp,req_comp);
+}
+
+#ifndef STBI_NO_GIF
+STBIDEF stbi_uc *stbi_load_gif_from_memory(stbi_uc const *buffer, int len, int **delays, int *x, int *y, int *z, int *comp, int req_comp)
+{
+   unsigned char *result;
+   stbi__context s; 
+   stbi__start_mem(&s,buffer,len); 
+   
+   result = (unsigned char*) stbi__load_gif_main(&s, delays, x, y, z, comp, req_comp);
+   if (stbi__vertically_flip_on_load) {
+      stbi__vertical_flip_slices( result, *x, *y, *z, *comp ); 
+   }
+
+   return result; 
 }
+#endif
 
 #ifndef STBI_NO_LINEAR
 static float *stbi__loadf_main(stbi__context *s, int *x, int *y, int *comp, int req_comp)
@@ -1070,13 +1255,14 @@ static float *stbi__loadf_main(stbi__context *s, int *x, int *y, int *comp, int
    unsigned char *data;
    #ifndef STBI_NO_HDR
    if (stbi__hdr_test(s)) {
-      float *hdr_data = stbi__hdr_load(s,x,y,comp,req_comp);
+      stbi__result_info ri;
+      float *hdr_data = stbi__hdr_load(s,x,y,comp,req_comp, &ri);
       if (hdr_data)
          stbi__float_postprocess(hdr_data,x,y,comp,req_comp);
       return hdr_data;
    }
    #endif
-   data = stbi__load_flip(s, x, y, comp, req_comp);
+   data = stbi__load_and_postprocess_8bit(s, x, y, comp, req_comp);
    if (data)
       return stbi__ldr_to_hdr(data, *x, *y, req_comp ? req_comp : *comp);
    return stbi__errpf("unknown image type", "Image not of any known type, or corrupt");
@@ -1146,13 +1332,18 @@ STBIDEF int      stbi_is_hdr          (char const *filename)
    return result;
 }
 
-STBIDEF int      stbi_is_hdr_from_file(FILE *f)
+STBIDEF int stbi_is_hdr_from_file(FILE *f)
 {
    #ifndef STBI_NO_HDR
+   long pos = ftell(f);
+   int res;
    stbi__context s;
    stbi__start_file(&s,f);
-   return stbi__hdr_test(&s);
+   res = stbi__hdr_test(&s);
+   fseek(f, pos, SEEK_SET);
+   return res;
    #else
+   STBI_NOTUSED(f);
    return 0;
    #endif
 }
@@ -1165,18 +1356,21 @@ STBIDEF int      stbi_is_hdr_from_callbacks(stbi_io_callbacks const *clbk, void
    stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user);
    return stbi__hdr_test(&s);
    #else
+   STBI_NOTUSED(clbk);
+   STBI_NOTUSED(user);
    return 0;
    #endif
 }
 
-static float stbi__h2l_gamma_i=1.0f/2.2f, stbi__h2l_scale_i=1.0f;
+#ifndef STBI_NO_LINEAR
 static float stbi__l2h_gamma=2.2f, stbi__l2h_scale=1.0f;
 
-#ifndef STBI_NO_LINEAR
 STBIDEF void   stbi_ldr_to_hdr_gamma(float gamma) { stbi__l2h_gamma = gamma; }
 STBIDEF void   stbi_ldr_to_hdr_scale(float scale) { stbi__l2h_scale = scale; }
 #endif
 
+static float stbi__h2l_gamma_i=1.0f/2.2f, stbi__h2l_scale_i=1.0f;
+
 STBIDEF void   stbi_hdr_to_ldr_gamma(float gamma) { stbi__h2l_gamma_i = 1/gamma; }
 STBIDEF void   stbi_hdr_to_ldr_scale(float scale) { stbi__h2l_scale_i = 1/scale; }
 
@@ -1285,17 +1479,23 @@ static stbi__uint32 stbi__get32be(stbi__context *s)
    return (z << 16) + stbi__get16be(s);
 }
 
+#if defined(STBI_NO_BMP) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF)
+// nothing
+#else
 static int stbi__get16le(stbi__context *s)
 {
    int z = stbi__get8(s);
    return z + (stbi__get8(s) << 8);
 }
+#endif
 
+#ifndef STBI_NO_BMP
 static stbi__uint32 stbi__get32le(stbi__context *s)
 {
    stbi__uint32 z = stbi__get16le(s);
    return z + (stbi__get16le(s) << 16);
 }
+#endif
 
 #define STBI__BYTECAST(x)  ((stbi_uc) ((x) & 255))  // truncate int to byte without warnings
 
@@ -1324,7 +1524,7 @@ static unsigned char *stbi__convert_format(unsigned char *data, int img_n, int r
    if (req_comp == img_n) return data;
    STBI_ASSERT(req_comp >= 1 && req_comp <= 4);
 
-   good = (unsigned char *) stbi__malloc(req_comp * x * y);
+   good = (unsigned char *) stbi__malloc_mad3(req_comp, x, y, 0);
    if (good == NULL) {
       STBI_FREE(data);
       return stbi__errpuc("outofmem", "Out of memory");
@@ -1334,26 +1534,75 @@ static unsigned char *stbi__convert_format(unsigned char *data, int img_n, int r
       unsigned char *src  = data + j * x * img_n   ;
       unsigned char *dest = good + j * x * req_comp;
 
-      #define COMBO(a,b)  ((a)*8+(b))
-      #define CASE(a,b)   case COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b)
+      #define STBI__COMBO(a,b)  ((a)*8+(b))
+      #define STBI__CASE(a,b)   case STBI__COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b)
+      // convert source image with img_n components to one with req_comp components;
+      // avoid switch per pixel, so use switch per scanline and massive macros
+      switch (STBI__COMBO(img_n, req_comp)) {
+         STBI__CASE(1,2) { dest[0]=src[0], dest[1]=255;                                     } break;
+         STBI__CASE(1,3) { dest[0]=dest[1]=dest[2]=src[0];                                  } break;
+         STBI__CASE(1,4) { dest[0]=dest[1]=dest[2]=src[0], dest[3]=255;                     } break;
+         STBI__CASE(2,1) { dest[0]=src[0];                                                  } break;
+         STBI__CASE(2,3) { dest[0]=dest[1]=dest[2]=src[0];                                  } break;
+         STBI__CASE(2,4) { dest[0]=dest[1]=dest[2]=src[0], dest[3]=src[1];                  } break;
+         STBI__CASE(3,4) { dest[0]=src[0],dest[1]=src[1],dest[2]=src[2],dest[3]=255;        } break;
+         STBI__CASE(3,1) { dest[0]=stbi__compute_y(src[0],src[1],src[2]);                   } break;
+         STBI__CASE(3,2) { dest[0]=stbi__compute_y(src[0],src[1],src[2]), dest[1] = 255;    } break;
+         STBI__CASE(4,1) { dest[0]=stbi__compute_y(src[0],src[1],src[2]);                   } break;
+         STBI__CASE(4,2) { dest[0]=stbi__compute_y(src[0],src[1],src[2]), dest[1] = src[3]; } break;
+         STBI__CASE(4,3) { dest[0]=src[0],dest[1]=src[1],dest[2]=src[2];                    } break;
+         default: STBI_ASSERT(0);
+      }
+      #undef STBI__CASE
+   }
+
+   STBI_FREE(data);
+   return good;
+}
+
+static stbi__uint16 stbi__compute_y_16(int r, int g, int b)
+{
+   return (stbi__uint16) (((r*77) + (g*150) +  (29*b)) >> 8);
+}
+
+static stbi__uint16 *stbi__convert_format16(stbi__uint16 *data, int img_n, int req_comp, unsigned int x, unsigned int y)
+{
+   int i,j;
+   stbi__uint16 *good;
+
+   if (req_comp == img_n) return data;
+   STBI_ASSERT(req_comp >= 1 && req_comp <= 4);
+
+   good = (stbi__uint16 *) stbi__malloc(req_comp * x * y * 2);
+   if (good == NULL) {
+      STBI_FREE(data);
+      return (stbi__uint16 *) stbi__errpuc("outofmem", "Out of memory");
+   }
+
+   for (j=0; j < (int) y; ++j) {
+      stbi__uint16 *src  = data + j * x * img_n   ;
+      stbi__uint16 *dest = good + j * x * req_comp;
+
+      #define STBI__COMBO(a,b)  ((a)*8+(b))
+      #define STBI__CASE(a,b)   case STBI__COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b)
       // convert source image with img_n components to one with req_comp components;
       // avoid switch per pixel, so use switch per scanline and massive macros
-      switch (COMBO(img_n, req_comp)) {
-         CASE(1,2) dest[0]=src[0], dest[1]=255; break;
-         CASE(1,3) dest[0]=dest[1]=dest[2]=src[0]; break;
-         CASE(1,4) dest[0]=dest[1]=dest[2]=src[0], dest[3]=255; break;
-         CASE(2,1) dest[0]=src[0]; break;
-         CASE(2,3) dest[0]=dest[1]=dest[2]=src[0]; break;
-         CASE(2,4) dest[0]=dest[1]=dest[2]=src[0], dest[3]=src[1]; break;
-         CASE(3,4) dest[0]=src[0],dest[1]=src[1],dest[2]=src[2],dest[3]=255; break;
-         CASE(3,1) dest[0]=stbi__compute_y(src[0],src[1],src[2]); break;
-         CASE(3,2) dest[0]=stbi__compute_y(src[0],src[1],src[2]), dest[1] = 255; break;
-         CASE(4,1) dest[0]=stbi__compute_y(src[0],src[1],src[2]); break;
-         CASE(4,2) dest[0]=stbi__compute_y(src[0],src[1],src[2]), dest[1] = src[3]; break;
-         CASE(4,3) dest[0]=src[0],dest[1]=src[1],dest[2]=src[2]; break;
+      switch (STBI__COMBO(img_n, req_comp)) {
+         STBI__CASE(1,2) { dest[0]=src[0], dest[1]=0xffff;                                     } break;
+         STBI__CASE(1,3) { dest[0]=dest[1]=dest[2]=src[0];                                     } break;
+         STBI__CASE(1,4) { dest[0]=dest[1]=dest[2]=src[0], dest[3]=0xffff;                     } break;
+         STBI__CASE(2,1) { dest[0]=src[0];                                                     } break;
+         STBI__CASE(2,3) { dest[0]=dest[1]=dest[2]=src[0];                                     } break;
+         STBI__CASE(2,4) { dest[0]=dest[1]=dest[2]=src[0], dest[3]=src[1];                     } break;
+         STBI__CASE(3,4) { dest[0]=src[0],dest[1]=src[1],dest[2]=src[2],dest[3]=0xffff;        } break;
+         STBI__CASE(3,1) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]);                   } break;
+         STBI__CASE(3,2) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]), dest[1] = 0xffff; } break;
+         STBI__CASE(4,1) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]);                   } break;
+         STBI__CASE(4,2) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]), dest[1] = src[3]; } break;
+         STBI__CASE(4,3) { dest[0]=src[0],dest[1]=src[1],dest[2]=src[2];                       } break;
          default: STBI_ASSERT(0);
       }
-      #undef CASE
+      #undef STBI__CASE
    }
 
    STBI_FREE(data);
@@ -1364,7 +1613,9 @@ static unsigned char *stbi__convert_format(unsigned char *data, int img_n, int r
 static float   *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp)
 {
    int i,k,n;
-   float *output = (float *) stbi__malloc(x * y * comp * sizeof(float));
+   float *output;
+   if (!data) return NULL;
+   output = (float *) stbi__malloc_mad4(x, y, comp, sizeof(float), 0);
    if (output == NULL) { STBI_FREE(data); return stbi__errpf("outofmem", "Out of memory"); }
    // compute number of non-alpha components
    if (comp & 1) n = comp; else n = comp-1;
@@ -1384,7 +1635,9 @@ static float   *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp)
 static stbi_uc *stbi__hdr_to_ldr(float   *data, int x, int y, int comp)
 {
    int i,k,n;
-   stbi_uc *output = (stbi_uc *) stbi__malloc(x * y * comp);
+   stbi_uc *output;
+   if (!data) return NULL;
+   output = (stbi_uc *) stbi__malloc_mad3(x, y, comp, 0);
    if (output == NULL) { STBI_FREE(data); return stbi__errpuc("outofmem", "Out of memory"); }
    // compute number of non-alpha components
    if (comp & 1) n = comp; else n = comp-1;
@@ -1449,7 +1702,7 @@ typedef struct
    stbi__context *s;
    stbi__huffman huff_dc[4];
    stbi__huffman huff_ac[4];
-   stbi_uc dequant[4][64];
+   stbi__uint16 dequant[4][64];
    stbi__int16 fast_ac[4][1 << FAST_BITS];
 
 // sizes for components, interleaved MCUs
@@ -1485,6 +1738,9 @@ typedef struct
    int            succ_high;
    int            succ_low;
    int            eob_run;
+   int            jfif;
+   int            app14_color_transform; // Adobe APP14 tag
+   int            rgb;
 
    int scan_n, order[4];
    int restart_interval, todo;
@@ -1497,7 +1753,8 @@ typedef struct
 
 static int stbi__build_huffman(stbi__huffman *h, int *count)
 {
-   int i,j,k=0,code;
+   int i,j,k=0;
+   unsigned int code;
    // build size list for each symbol (from JPEG spec)
    for (i=0; i < 16; ++i)
       for (j=0; j < count[i]; ++j)
@@ -1513,7 +1770,7 @@ static int stbi__build_huffman(stbi__huffman *h, int *count)
       if (h->size[k] == j) {
          while (h->size[k] == j)
             h->code[k++] = (stbi__uint16) (code++);
-         if (code-1 >= (1 << j)) return stbi__err("bad code lengths","Corrupt JPEG");
+         if (code-1 >= (1u << j)) return stbi__err("bad code lengths","Corrupt JPEG");
       }
       // compute largest code + 1 for this size, preshifted as needed later
       h->maxcode[j] = code << (16-j);
@@ -1554,10 +1811,10 @@ static void stbi__build_fast_ac(stbi__int16 *fast_ac, stbi__huffman *h)
             // magnitude code followed by receive_extend code
             int k = ((i << len) & ((1 << FAST_BITS) - 1)) >> (FAST_BITS - magbits);
             int m = 1 << (magbits - 1);
-            if (k < m) k += (-1 << magbits) + 1;
+            if (k < m) k += (~0U << magbits) + 1;
             // if the result is small enough, we can fit it in fast_ac table
             if (k >= -128 && k <= 127)
-               fast_ac[i] = (stbi__int16) ((k << 8) + (run << 4) + (len + magbits));
+               fast_ac[i] = (stbi__int16) ((k * 256) + (run * 16) + (len + magbits));
          }
       }
    }
@@ -1566,9 +1823,10 @@ static void stbi__build_fast_ac(stbi__int16 *fast_ac, stbi__huffman *h)
 static void stbi__grow_buffer_unsafe(stbi__jpeg *j)
 {
    do {
-      int b = j->nomore ? 0 : stbi__get8(j->s);
+      unsigned int b = j->nomore ? 0 : stbi__get8(j->s);
       if (b == 0xff) {
          int c = stbi__get8(j->s);
+         while (c == 0xff) c = stbi__get8(j->s); // consume fill bytes
          if (c != 0) {
             j->marker = (unsigned char) c;
             j->nomore = 1;
@@ -1581,7 +1839,7 @@ static void stbi__grow_buffer_unsafe(stbi__jpeg *j)
 }
 
 // (1 << n) - 1
-static stbi__uint32 stbi__bmask[17]={0,1,3,7,15,31,63,127,255,511,1023,2047,4095,8191,16383,32767,65535};
+static const stbi__uint32 stbi__bmask[17]={0,1,3,7,15,31,63,127,255,511,1023,2047,4095,8191,16383,32767,65535};
 
 // decode a jpeg huffman value from the bitstream
 stbi_inline static int stbi__jpeg_huff_decode(stbi__jpeg *j, stbi__huffman *h)
@@ -1634,7 +1892,7 @@ stbi_inline static int stbi__jpeg_huff_decode(stbi__jpeg *j, stbi__huffman *h)
 }
 
 // bias[n] = (-1<<n) + 1
-static int const stbi__jbias[16] = {0,-1,-3,-7,-15,-31,-63,-127,-255,-511,-1023,-2047,-4095,-8191,-16383,-32767};
+static const int stbi__jbias[16] = {0,-1,-3,-7,-15,-31,-63,-127,-255,-511,-1023,-2047,-4095,-8191,-16383,-32767};
 
 // combined JPEG 'receive' and JPEG 'extend', since baseline
 // always extends everything it receives.
@@ -1677,7 +1935,7 @@ stbi_inline static int stbi__jpeg_get_bit(stbi__jpeg *j)
 
 // given a value that's at position X in the zigzag stream,
 // where does it appear in the 8x8 matrix coded as row-major?
-static stbi_uc stbi__jpeg_dezigzag[64+15] =
+static const stbi_uc stbi__jpeg_dezigzag[64+15] =
 {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
@@ -1693,7 +1951,7 @@ static stbi_uc stbi__jpeg_dezigzag[64+15] =
 };
 
 // decode one 64-entry block--
-static int stbi__jpeg_decode_block(stbi__jpeg *j, short data[64], stbi__huffman *hdc, stbi__huffman *hac, stbi__int16 *fac, int b, stbi_uc *dequant)
+static int stbi__jpeg_decode_block(stbi__jpeg *j, short data[64], stbi__huffman *hdc, stbi__huffman *hac, stbi__int16 *fac, int b, stbi__uint16 *dequant)
 {
    int diff,dc,k;
    int t;
@@ -1903,7 +2161,7 @@ stbi_inline static stbi_uc stbi__clamp(int x)
 }
 
 #define stbi__f2f(x)  ((int) (((x) * 4096 + 0.5)))
-#define stbi__fsh(x)  ((x) << 12)
+#define stbi__fsh(x)  ((x) * 4096)
 
 // derived from jidctint -- DCT_ISLOW
 #define STBI__IDCT_1D(s0,s1,s2,s3,s4,s5,s6,s7) \
@@ -1958,7 +2216,7 @@ static void stbi__idct_block(stbi_uc *out, int out_stride, short data[64])
          //    (1|2|3|4|5|6|7)==0          0     seconds
          //    all separate               -0.047 seconds
          //    1 && 2|3 && 4|5 && 6|7:    -0.047 seconds
-         int dcterm = d[0] << 2;
+         int dcterm = d[0]*4;
          v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm;
       } else {
          STBI__IDCT_1D(d[ 0],d[ 8],d[16],d[24],d[32],d[40],d[48],d[56])
@@ -2402,7 +2660,7 @@ static stbi_uc stbi__get_marker(stbi__jpeg *j)
    x = stbi__get8(j->s);
    if (x != 0xff) return STBI__MARKER_none;
    while (x == 0xff)
-      x = stbi__get8(j->s);
+      x = stbi__get8(j->s); // consume repeated 0xff fill bytes
    return x;
 }
 
@@ -2417,7 +2675,7 @@ static void stbi__jpeg_reset(stbi__jpeg *j)
    j->code_bits = 0;
    j->code_buffer = 0;
    j->nomore = 0;
-   j->img_comp[0].dc_pred = j->img_comp[1].dc_pred = j->img_comp[2].dc_pred = 0;
+   j->img_comp[0].dc_pred = j->img_comp[1].dc_pred = j->img_comp[2].dc_pred = j->img_comp[3].dc_pred = 0;
    j->marker = STBI__MARKER_none;
    j->todo = j->restart_interval ? j->restart_interval : 0x7fffffff;
    j->eob_run = 0;
@@ -2549,7 +2807,7 @@ static int stbi__parse_entropy_coded_data(stbi__jpeg *z)
    }
 }
 
-static void stbi__jpeg_dequantize(short *data, stbi_uc *dequant)
+static void stbi__jpeg_dequantize(short *data, stbi__uint16 *dequant)
 {
    int i;
    for (i=0; i < 64; ++i)
@@ -2591,13 +2849,14 @@ static int stbi__process_marker(stbi__jpeg *z, int m)
          L = stbi__get16be(z->s)-2;
          while (L > 0) {
             int q = stbi__get8(z->s);
-            int p = q >> 4;
+            int p = q >> 4, sixteen = (p != 0);
             int t = q & 15,i;
-            if (p != 0) return stbi__err("bad DQT type","Corrupt JPEG");
+            if (p != 0 && p != 1) return stbi__err("bad DQT type","Corrupt JPEG");
             if (t > 3) return stbi__err("bad DQT table","Corrupt JPEG");
+
             for (i=0; i < 64; ++i)
-               z->dequant[t][stbi__jpeg_dezigzag[i]] = stbi__get8(z->s);
-            L -= 65;
+               z->dequant[t][stbi__jpeg_dezigzag[i]] = (stbi__uint16)(sixteen ? stbi__get16be(z->s) : stbi__get8(z->s));
+            L -= (sixteen ? 129 : 65);
          }
          return L==0;
 
@@ -2630,12 +2889,50 @@ static int stbi__process_marker(stbi__jpeg *z, int m)
          }
          return L==0;
    }
+
    // check for comment block or APP blocks
    if ((m >= 0xE0 && m <= 0xEF) || m == 0xFE) {
-      stbi__skip(z->s, stbi__get16be(z->s)-2);
+      L = stbi__get16be(z->s);
+      if (L < 2) {
+         if (m == 0xFE)
+            return stbi__err("bad COM len","Corrupt JPEG");
+         else
+            return stbi__err("bad APP len","Corrupt JPEG");
+      }
+      L -= 2;
+
+      if (m == 0xE0 && L >= 5) { // JFIF APP0 segment
+         static const unsigned char tag[5] = {'J','F','I','F','\0'};
+         int ok = 1;
+         int i;
+         for (i=0; i < 5; ++i)
+            if (stbi__get8(z->s) != tag[i])
+               ok = 0;
+         L -= 5;
+         if (ok)
+            z->jfif = 1;
+      } else if (m == 0xEE && L >= 12) { // Adobe APP14 segment
+         static const unsigned char tag[6] = {'A','d','o','b','e','\0'};
+         int ok = 1;
+         int i;
+         for (i=0; i < 6; ++i)
+            if (stbi__get8(z->s) != tag[i])
+               ok = 0;
+         L -= 6;
+         if (ok) {
+            stbi__get8(z->s); // version
+            stbi__get16be(z->s); // flags0
+            stbi__get16be(z->s); // flags1
+            z->app14_color_transform = stbi__get8(z->s); // color transform
+            L -= 6;
+         }
+      }
+
+      stbi__skip(z->s, L);
       return 1;
    }
-   return 0;
+
+   return stbi__err("unknown marker","Corrupt JPEG");
 }
 
 // after we see SOS
@@ -2678,6 +2975,28 @@ static int stbi__process_scan_header(stbi__jpeg *z)
    return 1;
 }
 
+static int stbi__free_jpeg_components(stbi__jpeg *z, int ncomp, int why) // frees the buffers of components [0, ncomp) and hands 'why' back untouched
+{
+   int i;
+   for (i=0; i < ncomp; ++i) {
+      if (z->img_comp[i].raw_data) { // only allocated slots are freed, so partially set-up components are fine
+         STBI_FREE(z->img_comp[i].raw_data);
+         z->img_comp[i].raw_data = NULL;
+         z->img_comp[i].data = NULL; // data is the 16-byte-aligned pointer into raw_data, so it dies with it
+      }
+      if (z->img_comp[i].raw_coeff) {
+         STBI_FREE(z->img_comp[i].raw_coeff);
+         z->img_comp[i].raw_coeff = 0;
+         z->img_comp[i].coeff = 0; // coeff is the aligned pointer into raw_coeff
+      }
+      if (z->img_comp[i].linebuf) {
+         STBI_FREE(z->img_comp[i].linebuf);
+         z->img_comp[i].linebuf = NULL;
+      }
+   }
+   return why; // pass-through so error paths can write: return stbi__free_jpeg_components(z, n, stbi__err(...))
+}
+
 static int stbi__process_frame_header(stbi__jpeg *z, int scan)
 {
    stbi__context *s = z->s;
@@ -2687,7 +3006,7 @@ static int stbi__process_frame_header(stbi__jpeg *z, int scan)
    s->img_y = stbi__get16be(s);   if (s->img_y == 0) return stbi__err("no header height", "JPEG format not supported: delayed height"); // Legal, but we don't handle it--but neither does IJG
    s->img_x = stbi__get16be(s);   if (s->img_x == 0) return stbi__err("0 width","Corrupt JPEG"); // JPEG requires
    c = stbi__get8(s);
-   if (c != 3 && c != 1) return stbi__err("bad component count","Corrupt JPEG");    // JFIF requires
+   if (c != 3 && c != 1 && c != 4) return stbi__err("bad component count","Corrupt JPEG");
    s->img_n = c;
    for (i=0; i < c; ++i) {
       z->img_comp[i].data = NULL;
@@ -2696,11 +3015,12 @@ static int stbi__process_frame_header(stbi__jpeg *z, int scan)
 
    if (Lf != 8+3*s->img_n) return stbi__err("bad SOF len","Corrupt JPEG");
 
+   z->rgb = 0;
    for (i=0; i < s->img_n; ++i) {
+      static const unsigned char rgb[3] = { 'R', 'G', 'B' };
       z->img_comp[i].id = stbi__get8(s);
-      if (z->img_comp[i].id != i+1)   // JFIF requires
-         if (z->img_comp[i].id != i)  // some version of jpegtran outputs non-JFIF-compliant files!
-            return stbi__err("bad component ID","Corrupt JPEG");
+      if (s->img_n == 3 && z->img_comp[i].id == rgb[i])
+         ++z->rgb;
       q = stbi__get8(s);
       z->img_comp[i].h = (q >> 4);  if (!z->img_comp[i].h || z->img_comp[i].h > 4) return stbi__err("bad H","Corrupt JPEG");
       z->img_comp[i].v = q & 15;    if (!z->img_comp[i].v || z->img_comp[i].v > 4) return stbi__err("bad V","Corrupt JPEG");
@@ -2709,7 +3029,7 @@ static int stbi__process_frame_header(stbi__jpeg *z, int scan)
 
    if (scan != STBI__SCAN_load) return 1;
 
-   if ((1 << 30) / s->img_x / s->img_n < s->img_y) return stbi__err("too large", "Image too large to decode");
+   if (!stbi__mad3sizes_valid(s->img_x, s->img_y, s->img_n, 0)) return stbi__err("too large", "Image too large to decode");
 
    for (i=0; i < s->img_n; ++i) {
       if (z->img_comp[i].h > h_max) h_max = z->img_comp[i].h;
@@ -2721,6 +3041,7 @@ static int stbi__process_frame_header(stbi__jpeg *z, int scan)
    z->img_v_max = v_max;
    z->img_mcu_w = h_max * 8;
    z->img_mcu_h = v_max * 8;
+   // these sizes can't be more than 17 bits
    z->img_mcu_x = (s->img_x + z->img_mcu_w-1) / z->img_mcu_w;
    z->img_mcu_y = (s->img_y + z->img_mcu_h-1) / z->img_mcu_h;
 
@@ -2732,28 +3053,27 @@ static int stbi__process_frame_header(stbi__jpeg *z, int scan)
       // the bogus oversized data from using interleaved MCUs and their
       // big blocks (e.g. a 16x16 iMCU on an image of width 33); we won't
       // discard the extra data until colorspace conversion
+      //
+      // img_mcu_x, img_mcu_y: <=17 bits; comp[i].h and .v are <=4 (checked earlier)
+      // so these muls can't overflow with 32-bit ints (which we require)
       z->img_comp[i].w2 = z->img_mcu_x * z->img_comp[i].h * 8;
       z->img_comp[i].h2 = z->img_mcu_y * z->img_comp[i].v * 8;
-      z->img_comp[i].raw_data = stbi__malloc(z->img_comp[i].w2 * z->img_comp[i].h2+15);
-
-      if (z->img_comp[i].raw_data == NULL) {
-         for(--i; i >= 0; --i) {
-            STBI_FREE(z->img_comp[i].raw_data);
-            z->img_comp[i].data = NULL;
-         }
-         return stbi__err("outofmem", "Out of memory");
-      }
+      z->img_comp[i].coeff = 0;
+      z->img_comp[i].raw_coeff = 0;
+      z->img_comp[i].linebuf = NULL;
+      z->img_comp[i].raw_data = stbi__malloc_mad2(z->img_comp[i].w2, z->img_comp[i].h2, 15);
+      if (z->img_comp[i].raw_data == NULL)
+         return stbi__free_jpeg_components(z, i+1, stbi__err("outofmem", "Out of memory"));
       // align blocks for idct using mmx/sse
       z->img_comp[i].data = (stbi_uc*) (((size_t) z->img_comp[i].raw_data + 15) & ~15);
-      z->img_comp[i].linebuf = NULL;
       if (z->progressive) {
-         z->img_comp[i].coeff_w = (z->img_comp[i].w2 + 7) >> 3;
-         z->img_comp[i].coeff_h = (z->img_comp[i].h2 + 7) >> 3;
-         z->img_comp[i].raw_coeff = STBI_MALLOC(z->img_comp[i].coeff_w * z->img_comp[i].coeff_h * 64 * sizeof(short) + 15);
+         // w2, h2 are multiples of 8 (see above)
+         z->img_comp[i].coeff_w = z->img_comp[i].w2 / 8;
+         z->img_comp[i].coeff_h = z->img_comp[i].h2 / 8;
+         z->img_comp[i].raw_coeff = stbi__malloc_mad3(z->img_comp[i].w2, z->img_comp[i].h2, sizeof(short), 15);
+         if (z->img_comp[i].raw_coeff == NULL)
+            return stbi__free_jpeg_components(z, i+1, stbi__err("outofmem", "Out of memory"));
          z->img_comp[i].coeff = (short*) (((size_t) z->img_comp[i].raw_coeff + 15) & ~15);
-      } else {
-         z->img_comp[i].coeff = 0;
-         z->img_comp[i].raw_coeff = 0;
       }
    }
 
@@ -2772,6 +3092,8 @@ static int stbi__process_frame_header(stbi__jpeg *z, int scan)
 static int stbi__decode_jpeg_header(stbi__jpeg *z, int scan)
 {
    int m;
+   z->jfif = 0;
+   z->app14_color_transform = -1; // valid values are 0,1,2
    z->marker = STBI__MARKER_none; // initialize cached marker to empty
    m = stbi__get_marker(z);
    if (!stbi__SOI(m)) return stbi__err("no SOI","Corrupt JPEG");
@@ -2813,12 +3135,15 @@ static int stbi__decode_jpeg_image(stbi__jpeg *j)
                if (x == 255) {
                   j->marker = stbi__get8(j->s);
                   break;
-               } else if (x != 0) {
-                  return stbi__err("junk before marker", "Corrupt JPEG");
                }
             }
             // if we reach eof without hitting a marker, stbi__get_marker() below will fail and we'll eventually return 0
          }
+      } else if (stbi__DNL(m)) {
+         int Ld = stbi__get16be(j->s);
+         stbi__uint32 NL = stbi__get16be(j->s);
+         if (Ld != 4) return stbi__err("bad DNL len", "Corrupt JPEG");
+         if (NL != j->s->img_y) return stbi__err("bad DNL height", "Corrupt JPEG");
       } else {
          if (!stbi__process_marker(j, m)) return 0;
       }
@@ -3037,38 +3362,9 @@ static stbi_uc *stbi__resample_row_generic(stbi_uc *out, stbi_uc *in_near, stbi_
    return out;
 }
 
-#ifdef STBI_JPEG_OLD
-// this is the same YCbCr-to-RGB calculation that stb_image has used
-// historically before the algorithm changes in 1.49
-#define float2fixed(x)  ((int) ((x) * 65536 + 0.5))
-static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y, const stbi_uc *pcb, const stbi_uc *pcr, int count, int step)
-{
-   int i;
-   for (i=0; i < count; ++i) {
-      int y_fixed = (y[i] << 16) + 32768; // rounding
-      int r,g,b;
-      int cr = pcr[i] - 128;
-      int cb = pcb[i] - 128;
-      r = y_fixed + cr*float2fixed(1.40200f);
-      g = y_fixed - cr*float2fixed(0.71414f) - cb*float2fixed(0.34414f);
-      b = y_fixed                            + cb*float2fixed(1.77200f);
-      r >>= 16;
-      g >>= 16;
-      b >>= 16;
-      if ((unsigned) r > 255) { if (r < 0) r = 0; else r = 255; }
-      if ((unsigned) g > 255) { if (g < 0) g = 0; else g = 255; }
-      if ((unsigned) b > 255) { if (b < 0) b = 0; else b = 255; }
-      out[0] = (stbi_uc)r;
-      out[1] = (stbi_uc)g;
-      out[2] = (stbi_uc)b;
-      out[3] = 255;
-      out += step;
-   }
-}
-#else
 // this is a reduced-precision calculation of YCbCr-to-RGB introduced
 // to make sure the code produces the same results in both SIMD and scalar
-#define float2fixed(x)  (((int) ((x) * 4096.0f + 0.5f)) << 8)
+#define stbi__float2fixed(x)  (((int) ((x) * 4096.0f + 0.5f)) << 8)
 static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y, const stbi_uc *pcb, const stbi_uc *pcr, int count, int step)
 {
    int i;
@@ -3077,9 +3373,9 @@ static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y, const stbi_uc
       int r,g,b;
       int cr = pcr[i] - 128;
       int cb = pcb[i] - 128;
-      r = y_fixed +  cr* float2fixed(1.40200f);
-      g = y_fixed + (cr*-float2fixed(0.71414f)) + ((cb*-float2fixed(0.34414f)) & 0xffff0000);
-      b = y_fixed                               +   cb* float2fixed(1.77200f);
+      r = y_fixed +  cr* stbi__float2fixed(1.40200f);
+      g = y_fixed + (cr*-stbi__float2fixed(0.71414f)) + ((cb*-stbi__float2fixed(0.34414f)) & 0xffff0000);
+      b = y_fixed                                     +   cb* stbi__float2fixed(1.77200f);
       r >>= 20;
       g >>= 20;
       b >>= 20;
@@ -3093,7 +3389,6 @@ static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y, const stbi_uc
       out += step;
    }
 }
-#endif
 
 #if defined(STBI_SSE2) || defined(STBI_NEON)
 static void stbi__YCbCr_to_RGB_simd(stbi_uc *out, stbi_uc const *y, stbi_uc const *pcb, stbi_uc const *pcr, int count, int step)
@@ -3212,9 +3507,9 @@ static void stbi__YCbCr_to_RGB_simd(stbi_uc *out, stbi_uc const *y, stbi_uc cons
       int r,g,b;
       int cr = pcr[i] - 128;
       int cb = pcb[i] - 128;
-      r = y_fixed + cr* float2fixed(1.40200f);
-      g = y_fixed + cr*-float2fixed(0.71414f) + ((cb*-float2fixed(0.34414f)) & 0xffff0000);
-      b = y_fixed                             +   cb* float2fixed(1.77200f);
+      r = y_fixed + cr* stbi__float2fixed(1.40200f);
+      g = y_fixed + cr*-stbi__float2fixed(0.71414f) + ((cb*-stbi__float2fixed(0.34414f)) & 0xffff0000);
+      b = y_fixed                                   +   cb* stbi__float2fixed(1.77200f);
       r >>= 20;
       g >>= 20;
       b >>= 20;
@@ -3240,18 +3535,14 @@ static void stbi__setup_jpeg(stbi__jpeg *j)
 #ifdef STBI_SSE2
    if (stbi__sse2_available()) {
       j->idct_block_kernel = stbi__idct_simd;
-      #ifndef STBI_JPEG_OLD
       j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd;
-      #endif
       j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd;
    }
 #endif
 
 #ifdef STBI_NEON
    j->idct_block_kernel = stbi__idct_simd;
-   #ifndef STBI_JPEG_OLD
    j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd;
-   #endif
    j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd;
 #endif
 }
@@ -3259,23 +3550,7 @@ static void stbi__setup_jpeg(stbi__jpeg *j)
 // clean up the temporary component buffers
 static void stbi__cleanup_jpeg(stbi__jpeg *j)
 {
-   int i;
-   for (i=0; i < j->s->img_n; ++i) {
-      if (j->img_comp[i].raw_data) {
-         STBI_FREE(j->img_comp[i].raw_data);
-         j->img_comp[i].raw_data = NULL;
-         j->img_comp[i].data = NULL;
-      }
-      if (j->img_comp[i].raw_coeff) {
-         STBI_FREE(j->img_comp[i].raw_coeff);
-         j->img_comp[i].raw_coeff = 0;
-         j->img_comp[i].coeff = 0;
-      }
-      if (j->img_comp[i].linebuf) {
-         STBI_FREE(j->img_comp[i].linebuf);
-         j->img_comp[i].linebuf = NULL;
-      }
-   }
+   stbi__free_jpeg_components(j, j->s->img_n, 0); // frees all img_n components via the shared helper; its pass-through return value (0) is ignored
 }
 
 typedef struct
@@ -3288,9 +3563,16 @@ typedef struct
    int ypos;    // which pre-expansion row we're on
 } stbi__resample;
 
+// fast 0..255 * 0..255 => 0..255 rounded multiplication
+static stbi_uc stbi__blinn_8x8(stbi_uc x, stbi_uc y)
+{
+   unsigned int t = x*y + 128; // at most 255*255 + 128 = 65153; the +128 bias makes the shifts below round instead of truncate
+   return (stbi_uc) ((t + (t >>8)) >> 8); // shift-and-add stand-in for dividing by 255, keeping the result in 0..255
+}
+
 static stbi_uc *load_jpeg_image(stbi__jpeg *z, int *out_x, int *out_y, int *comp, int req_comp)
 {
-   int n, decode_n;
+   int n, decode_n, is_rgb;
    z->s->img_n = 0; // make stbi__cleanup_jpeg safe
 
    // validate req_comp
@@ -3300,9 +3582,11 @@ static stbi_uc *load_jpeg_image(stbi__jpeg *z, int *out_x, int *out_y, int *comp
    if (!stbi__decode_jpeg_image(z)) { stbi__cleanup_jpeg(z); return NULL; }
 
    // determine actual number of components to generate
-   n = req_comp ? req_comp : z->s->img_n;
+   n = req_comp ? req_comp : z->s->img_n >= 3 ? 3 : 1;
+
+   is_rgb = z->s->img_n == 3 && (z->rgb == 3 || (z->app14_color_transform == 0 && !z->jfif));
 
-   if (z->s->img_n == 3 && n < 3)
+   if (z->s->img_n == 3 && n < 3 && !is_rgb)
       decode_n = 1;
    else
       decode_n = z->s->img_n;
@@ -3339,7 +3623,7 @@ static stbi_uc *load_jpeg_image(stbi__jpeg *z, int *out_x, int *out_y, int *comp
       }
 
       // can't error after this so, this is safe
-      output = (stbi_uc *) stbi__malloc(n * z->s->img_x * z->s->img_y + 1);
+      output = (stbi_uc *) stbi__malloc_mad3(n, z->s->img_x, z->s->img_y, 1);
       if (!output) { stbi__cleanup_jpeg(z); return stbi__errpuc("outofmem", "Out of memory"); }
 
       // now go ahead and resample
@@ -3362,7 +3646,39 @@ static stbi_uc *load_jpeg_image(stbi__jpeg *z, int *out_x, int *out_y, int *comp
          if (n >= 3) {
             stbi_uc *y = coutput[0];
             if (z->s->img_n == 3) {
-               z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
+               if (is_rgb) {
+                  for (i=0; i < z->s->img_x; ++i) {
+                     out[0] = y[i];
+                     out[1] = coutput[1][i];
+                     out[2] = coutput[2][i];
+                     out[3] = 255;
+                     out += n;
+                  }
+               } else {
+                  z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
+               }
+            } else if (z->s->img_n == 4) {
+               if (z->app14_color_transform == 0) { // CMYK
+                  for (i=0; i < z->s->img_x; ++i) {
+                     stbi_uc m = coutput[3][i];
+                     out[0] = stbi__blinn_8x8(coutput[0][i], m);
+                     out[1] = stbi__blinn_8x8(coutput[1][i], m);
+                     out[2] = stbi__blinn_8x8(coutput[2][i], m);
+                     out[3] = 255;
+                     out += n;
+                  }
+               } else if (z->app14_color_transform == 2) { // YCCK
+                  z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
+                  for (i=0; i < z->s->img_x; ++i) {
+                     stbi_uc m = coutput[3][i];
+                     out[0] = stbi__blinn_8x8(255 - out[0], m);
+                     out[1] = stbi__blinn_8x8(255 - out[1], m);
+                     out[2] = stbi__blinn_8x8(255 - out[2], m);
+                     out += n;
+                  }
+               } else { // YCbCr + alpha?  Ignore the fourth channel for now
+                  z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
+               }
             } else
                for (i=0; i < z->s->img_x; ++i) {
                   out[0] = out[1] = out[2] = y[i];
@@ -3370,37 +3686,70 @@ static stbi_uc *load_jpeg_image(stbi__jpeg *z, int *out_x, int *out_y, int *comp
                   out += n;
                }
          } else {
-            stbi_uc *y = coutput[0];
-            if (n == 1)
-               for (i=0; i < z->s->img_x; ++i) out[i] = y[i];
-            else
-               for (i=0; i < z->s->img_x; ++i) *out++ = y[i], *out++ = 255;
+            if (is_rgb) {
+               if (n == 1)
+                  for (i=0; i < z->s->img_x; ++i)
+                     *out++ = stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]);
+               else {
+                  for (i=0; i < z->s->img_x; ++i, out += 2) {
+                     out[0] = stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]);
+                     out[1] = 255;
+                  }
+               }
+            } else if (z->s->img_n == 4 && z->app14_color_transform == 0) {
+               for (i=0; i < z->s->img_x; ++i) {
+                  stbi_uc m = coutput[3][i];
+                  stbi_uc r = stbi__blinn_8x8(coutput[0][i], m);
+                  stbi_uc g = stbi__blinn_8x8(coutput[1][i], m);
+                  stbi_uc b = stbi__blinn_8x8(coutput[2][i], m);
+                  out[0] = stbi__compute_y(r, g, b);
+                  out[1] = 255;
+                  out += n;
+               }
+            } else if (z->s->img_n == 4 && z->app14_color_transform == 2) {
+               for (i=0; i < z->s->img_x; ++i) {
+                  out[0] = stbi__blinn_8x8(255 - coutput[0][i], coutput[3][i]);
+                  out[1] = 255;
+                  out += n;
+               }
+            } else {
+               stbi_uc *y = coutput[0];
+               if (n == 1)
+                  for (i=0; i < z->s->img_x; ++i) out[i] = y[i];
+               else
+                  for (i=0; i < z->s->img_x; ++i) *out++ = y[i], *out++ = 255;
+            }
          }
       }
       stbi__cleanup_jpeg(z);
       *out_x = z->s->img_x;
       *out_y = z->s->img_y;
-      if (comp) *comp  = z->s->img_n; // report original components, not output
+      if (comp) *comp = z->s->img_n >= 3 ? 3 : 1; // report original components, not output
       return output;
    }
 }
 
-static unsigned char *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+static void *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
 {
-   stbi__jpeg j;
-   j.s = s;
-   stbi__setup_jpeg(&j);
-   return load_jpeg_image(&j, x,y,comp,req_comp);
+   unsigned char* result; // decoded pixels, or NULL on failure
+   stbi__jpeg* j = (stbi__jpeg*) stbi__malloc(sizeof(stbi__jpeg)); // decoder state lives on the heap, not the stack
+   STBI_NOTUSED(ri);
+   if (!j) return stbi__errpuc("outofmem", "Out of memory"); // allocation can fail: report it instead of dereferencing NULL
+   j->s = s; stbi__setup_jpeg(j); // bind the input stream, then install the IDCT/colour/resample kernels
+   result = load_jpeg_image(j, x,y,comp,req_comp);
+   STBI_FREE(j); // the per-component buffers were already released by load_jpeg_image
+   return result;
 }
 
 static int stbi__jpeg_test(stbi__context *s)
 {
    int r;
-   stbi__jpeg j;
-   j.s = s;
-   stbi__setup_jpeg(&j);
-   r = stbi__decode_jpeg_header(&j, STBI__SCAN_type);
+   stbi__jpeg* j = (stbi__jpeg*)stbi__malloc(sizeof(stbi__jpeg));
+   if (!j) return stbi__err("outofmem", "Out of memory"); // allocation can fail; nothing was read from s yet, so no rewind is needed
+   j->s = s; stbi__setup_jpeg(j); // bind the input stream, then install the decode kernels
+   r = stbi__decode_jpeg_header(j, STBI__SCAN_type); // type-only scan of the header
    stbi__rewind(s);
+   STBI_FREE(j); // free the probe state on every path that allocated it
    return r;
 }
 
@@ -3412,15 +3761,18 @@ static int stbi__jpeg_info_raw(stbi__jpeg *j, int *x, int *y, int *comp)
    }
    if (x) *x = j->s->img_x;
    if (y) *y = j->s->img_y;
-   if (comp) *comp = j->s->img_n;
+   if (comp) *comp = j->s->img_n >= 3 ? 3 : 1;
    return 1;
 }
 
 static int stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp)
 {
-   stbi__jpeg j;
-   j.s = s;
-   return stbi__jpeg_info_raw(&j, x, y, comp);
+   int result; // 1 when the header was parsed and the outputs filled in, 0 otherwise
+   stbi__jpeg* j = (stbi__jpeg*) (stbi__malloc(sizeof(stbi__jpeg)));
+   if (!j) return stbi__err("outofmem", "Out of memory"); // allocation can fail: report it instead of dereferencing NULL
+   j->s = s; result = stbi__jpeg_info_raw(j, x, y, comp); // bind the input stream, then read width/height/components
+   STBI_FREE(j);
+   return result;
 }
 #endif
 
@@ -3466,7 +3818,7 @@ stbi_inline static int stbi__bit_reverse(int v, int bits)
    return stbi__bitreverse16(v) >> (16-bits);
 }
 
-static int stbi__zbuild_huffman(stbi__zhuffman *z, stbi_uc *sizelist, int num)
+static int stbi__zbuild_huffman(stbi__zhuffman *z, const stbi_uc *sizelist, int num)
 {
    int i,k=0;
    int code, next_code[16], sizes[17];
@@ -3501,10 +3853,10 @@ static int stbi__zbuild_huffman(stbi__zhuffman *z, stbi_uc *sizelist, int num)
          z->size [c] = (stbi_uc     ) s;
          z->value[c] = (stbi__uint16) i;
          if (s <= STBI__ZFAST_BITS) {
-            int k = stbi__bit_reverse(next_code[s],s);
-            while (k < (1 << STBI__ZFAST_BITS)) {
-               z->fast[k] = fastv;
-               k += (1 << s);
+            int j = stbi__bit_reverse(next_code[s],s);
+            while (j < (1 << STBI__ZFAST_BITS)) {
+               z->fast[j] = fastv;
+               j += (1 << s);
             }
          }
          ++next_code[s];
@@ -3543,7 +3895,7 @@ static void stbi__fill_bits(stbi__zbuf *z)
 {
    do {
       STBI_ASSERT(z->code_buffer < (1U << z->num_bits));
-      z->code_buffer |= stbi__zget8(z) << z->num_bits;
+      z->code_buffer |= (unsigned int) stbi__zget8(z) << z->num_bits;
       z->num_bits += 8;
    } while (z->num_bits <= 24);
 }
@@ -3593,14 +3945,15 @@ stbi_inline static int stbi__zhuffman_decode(stbi__zbuf *a, stbi__zhuffman *z)
 static int stbi__zexpand(stbi__zbuf *z, char *zout, int n)  // need to make room for n bytes
 {
    char *q;
-   int cur, limit;
+   int cur, limit, old_limit;
    z->zout = zout;
    if (!z->z_expandable) return stbi__err("output buffer limit","Corrupt PNG");
    cur   = (int) (z->zout     - z->zout_start);
-   limit = (int) (z->zout_end - z->zout_start);
+   limit = old_limit = (int) (z->zout_end - z->zout_start);
    while (cur + n > limit)
       limit *= 2;
-   q = (char *) STBI_REALLOC(z->zout_start, limit);
+   q = (char *) STBI_REALLOC_SIZED(z->zout_start, old_limit, limit);
+   STBI_NOTUSED(old_limit);
    if (q == NULL) return stbi__err("outofmem", "Out of memory");
    z->zout_start = q;
    z->zout       = q + cur;
@@ -3608,18 +3961,18 @@ static int stbi__zexpand(stbi__zbuf *z, char *zout, int n)  // need to make room
    return 1;
 }
 
-static int stbi__zlength_base[31] = {
+static const int stbi__zlength_base[31] = {
    3,4,5,6,7,8,9,10,11,13,
    15,17,19,23,27,31,35,43,51,59,
    67,83,99,115,131,163,195,227,258,0,0 };
 
-static int stbi__zlength_extra[31]=
+static const int stbi__zlength_extra[31]=
 { 0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,0,0,0 };
 
-static int stbi__zdist_base[32] = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193,
+static const int stbi__zdist_base[32] = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193,
 257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577,0,0};
 
-static int stbi__zdist_extra[32] =
+static const int stbi__zdist_extra[32] =
 { 0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13};
 
 static int stbi__parse_huffman_block(stbi__zbuf *a)
@@ -3666,7 +4019,7 @@ static int stbi__parse_huffman_block(stbi__zbuf *a)
 
 static int stbi__compute_huffman_codes(stbi__zbuf *a)
 {
-   static stbi_uc length_dezigzag[19] = { 16,17,18,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15 };
+   static const stbi_uc length_dezigzag[19] = { 16,17,18,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15 };
    stbi__zhuffman z_codelength;
    stbi_uc lencodes[286+32+137];//padding for maximum single op
    stbi_uc codelength_sizes[19];
@@ -3675,6 +4028,7 @@ static int stbi__compute_huffman_codes(stbi__zbuf *a)
    int hlit  = stbi__zreceive(a,5) + 257;
    int hdist = stbi__zreceive(a,5) + 1;
    int hclen = stbi__zreceive(a,4) + 4;
+   int ntot  = hlit + hdist;
 
    memset(codelength_sizes, 0, sizeof(codelength_sizes));
    for (i=0; i < hclen; ++i) {
@@ -3684,33 +4038,35 @@ static int stbi__compute_huffman_codes(stbi__zbuf *a)
    if (!stbi__zbuild_huffman(&z_codelength, codelength_sizes, 19)) return 0;
 
    n = 0;
-   while (n < hlit + hdist) {
+   while (n < ntot) {
       int c = stbi__zhuffman_decode(a, &z_codelength);
       if (c < 0 || c >= 19) return stbi__err("bad codelengths", "Corrupt PNG");
       if (c < 16)
          lencodes[n++] = (stbi_uc) c;
-      else if (c == 16) {
-         c = stbi__zreceive(a,2)+3;
-         memset(lencodes+n, lencodes[n-1], c);
-         n += c;
-      } else if (c == 17) {
-         c = stbi__zreceive(a,3)+3;
-         memset(lencodes+n, 0, c);
-         n += c;
-      } else {
-         STBI_ASSERT(c == 18);
-         c = stbi__zreceive(a,7)+11;
-         memset(lencodes+n, 0, c);
+      else {
+         stbi_uc fill = 0;
+         if (c == 16) {
+            c = stbi__zreceive(a,2)+3;
+            if (n == 0) return stbi__err("bad codelengths", "Corrupt PNG");
+            fill = lencodes[n-1];
+         } else if (c == 17)
+            c = stbi__zreceive(a,3)+3;
+         else {
+            STBI_ASSERT(c == 18);
+            c = stbi__zreceive(a,7)+11;
+         }
+         if (ntot - n < c) return stbi__err("bad codelengths", "Corrupt PNG");
+         memset(lencodes+n, fill, c);
          n += c;
       }
    }
-   if (n != hlit+hdist) return stbi__err("bad codelengths","Corrupt PNG");
+   if (n != ntot) return stbi__err("bad codelengths","Corrupt PNG");
    if (!stbi__zbuild_huffman(&a->z_length, lencodes, hlit)) return 0;
    if (!stbi__zbuild_huffman(&a->z_distance, lencodes+hlit, hdist)) return 0;
    return 1;
 }
 
-static int stbi__parse_uncomperssed_block(stbi__zbuf *a)
+static int stbi__parse_uncompressed_block(stbi__zbuf *a)
 {
    stbi_uc header[4];
    int len,nlen,k;
@@ -3752,9 +4108,24 @@ static int stbi__parse_zlib_header(stbi__zbuf *a)
    return 1;
 }
 
-// @TODO: should statically initialize these for optimal thread safety
-static stbi_uc stbi__zdefault_length[288], stbi__zdefault_distance[32];
-static void stbi__init_zdefaults(void)
+static const stbi_uc stbi__zdefault_length[288] = // code lengths used for fixed-code blocks: symbols 0-143 -> 8, 144-255 -> 9, 256-279 -> 7, 280-287 -> 8
+{
+   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+   9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+   9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+   9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8
+};
+static const stbi_uc stbi__zdefault_distance[32] = // code lengths for the fixed distance table: all 32 entries are 5
+{
+   5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
+};
+/*
+Init algorithm:
 {
    int i;   // use <= to match clearly with spec
    for (i=0; i <= 143; ++i)     stbi__zdefault_length[i]   = 8;
@@ -3764,6 +4135,7 @@ static void stbi__init_zdefaults(void)
 
    for (i=0; i <=  31; ++i)     stbi__zdefault_distance[i] = 5;
 }
+*/
 
 static int stbi__parse_zlib(stbi__zbuf *a, int parse_header)
 {
@@ -3776,13 +4148,12 @@ static int stbi__parse_zlib(stbi__zbuf *a, int parse_header)
       final = stbi__zreceive(a,1);
       type = stbi__zreceive(a,2);
       if (type == 0) {
-         if (!stbi__parse_uncomperssed_block(a)) return 0;
+         if (!stbi__parse_uncompressed_block(a)) return 0;
       } else if (type == 3) {
          return 0;
       } else {
          if (type == 1) {
             // use fixed code lengths
-            if (!stbi__zdefault_distance[31]) stbi__init_zdefaults();
             if (!stbi__zbuild_huffman(&a->z_length  , stbi__zdefault_length  , 288)) return 0;
             if (!stbi__zbuild_huffman(&a->z_distance, stbi__zdefault_distance,  32)) return 0;
          } else {
@@ -3907,7 +4278,7 @@ static stbi__pngchunk stbi__get_chunk_header(stbi__context *s)
 
 static int stbi__check_png_header(stbi__context *s)
 {
-   static stbi_uc png_sig[8] = { 137,80,78,71,13,10,26,10 };
+   static const stbi_uc png_sig[8] = { 137,80,78,71,13,10,26,10 };
    int i;
    for (i=0; i < 8; ++i)
       if (stbi__get8(s) != png_sig[i]) return stbi__err("bad png sig","Not a PNG");
@@ -3918,6 +4289,7 @@ typedef struct
 {
    stbi__context *s;
    stbi_uc *idata, *expanded, *out;
+   int depth;
 } stbi__png;
 
 
@@ -3952,35 +4324,40 @@ static int stbi__paeth(int a, int b, int c)
    return c;
 }
 
-static stbi_uc stbi__depth_scale_table[9] = { 0, 0xff, 0x55, 0, 0x11, 0,0,0, 0x01 };
+static const stbi_uc stbi__depth_scale_table[9] = { 0, 0xff, 0x55, 0, 0x11, 0,0,0, 0x01 };
 
 // create the png data from post-deflated data
 static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 raw_len, int out_n, stbi__uint32 x, stbi__uint32 y, int depth, int color)
 {
+   int bytes = (depth == 16? 2 : 1);
    stbi__context *s = a->s;
-   stbi__uint32 i,j,stride = x*out_n;
+   stbi__uint32 i,j,stride = x*out_n*bytes;
    stbi__uint32 img_len, img_width_bytes;
    int k;
    int img_n = s->img_n; // copy it into a local for later
 
+   int output_bytes = out_n*bytes;
+   int filter_bytes = img_n*bytes;
+   int width = x;
+
    STBI_ASSERT(out_n == s->img_n || out_n == s->img_n+1);
-   a->out = (stbi_uc *) stbi__malloc(x * y * out_n); // extra bytes to write off the end into
+   a->out = (stbi_uc *) stbi__malloc_mad3(x, y, output_bytes, 0); // extra bytes to write off the end into
    if (!a->out) return stbi__err("outofmem", "Out of memory");
 
+   if (!stbi__mad3sizes_valid(img_n, x, depth, 7)) return stbi__err("too large", "Corrupt PNG");
    img_width_bytes = (((img_n * x * depth) + 7) >> 3);
    img_len = (img_width_bytes + 1) * y;
-   if (s->img_x == x && s->img_y == y) {
-      if (raw_len != img_len) return stbi__err("not enough pixels","Corrupt PNG");
-   } else { // interlaced:
-      if (raw_len < img_len) return stbi__err("not enough pixels","Corrupt PNG");
-   }
+
+   // we used to check for exact match between raw_len and img_len on non-interlaced PNGs,
+   // but issue #276 reported a PNG in the wild that had extra data at the end (all zeros),
+   // so just check for raw_len < img_len always.
+   if (raw_len < img_len) return stbi__err("not enough pixels","Corrupt PNG");
 
    for (j=0; j < y; ++j) {
       stbi_uc *cur = a->out + stride*j;
-      stbi_uc *prior = cur - stride;
+      stbi_uc *prior;
       int filter = *raw++;
-      int filter_bytes = img_n;
-      int width = x;
+
       if (filter > 4)
          return stbi__err("invalid filter","Corrupt PNG");
 
@@ -3990,6 +4367,7 @@ static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 r
          filter_bytes = 1;
          width = img_width_bytes;
       }
+      prior = cur - stride; // bugfix: need to compute this after 'cur +=' computation above
 
       // if first row, use special filter that doesn't sample previous row
       if (j == 0) filter = first_row_filter[filter];
@@ -4013,6 +4391,14 @@ static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 r
          raw += img_n;
          cur += out_n;
          prior += out_n;
+      } else if (depth == 16) {
+         if (img_n != out_n) {
+            cur[filter_bytes]   = 255; // first pixel top byte
+            cur[filter_bytes+1] = 255; // first pixel bottom byte
+         }
+         raw += filter_bytes;
+         cur += output_bytes;
+         prior += output_bytes;
       } else {
          raw += 1;
          cur += 1;
@@ -4021,38 +4407,47 @@ static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 r
 
       // this is a little gross, so that we don't switch per-pixel or per-component
       if (depth < 8 || img_n == out_n) {
-         int nk = (width - 1)*img_n;
-         #define CASE(f) \
+         int nk = (width - 1)*filter_bytes;
+         #define STBI__CASE(f) \
              case f:     \
                 for (k=0; k < nk; ++k)
          switch (filter) {
             // "none" filter turns into a memcpy here; make that explicit.
             case STBI__F_none:         memcpy(cur, raw, nk); break;
-            CASE(STBI__F_sub)          cur[k] = STBI__BYTECAST(raw[k] + cur[k-filter_bytes]); break;
-            CASE(STBI__F_up)           cur[k] = STBI__BYTECAST(raw[k] + prior[k]); break;
-            CASE(STBI__F_avg)          cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k-filter_bytes])>>1)); break;
-            CASE(STBI__F_paeth)        cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes],prior[k],prior[k-filter_bytes])); break;
-            CASE(STBI__F_avg_first)    cur[k] = STBI__BYTECAST(raw[k] + (cur[k-filter_bytes] >> 1)); break;
-            CASE(STBI__F_paeth_first)  cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes],0,0)); break;
+            STBI__CASE(STBI__F_sub)          { cur[k] = STBI__BYTECAST(raw[k] + cur[k-filter_bytes]); } break;
+            STBI__CASE(STBI__F_up)           { cur[k] = STBI__BYTECAST(raw[k] + prior[k]); } break;
+            STBI__CASE(STBI__F_avg)          { cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k-filter_bytes])>>1)); } break;
+            STBI__CASE(STBI__F_paeth)        { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes],prior[k],prior[k-filter_bytes])); } break;
+            STBI__CASE(STBI__F_avg_first)    { cur[k] = STBI__BYTECAST(raw[k] + (cur[k-filter_bytes] >> 1)); } break;
+            STBI__CASE(STBI__F_paeth_first)  { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes],0,0)); } break;
          }
-         #undef CASE
+         #undef STBI__CASE
          raw += nk;
       } else {
          STBI_ASSERT(img_n+1 == out_n);
-         #define CASE(f) \
+         #define STBI__CASE(f) \
              case f:     \
-                for (i=x-1; i >= 1; --i, cur[img_n]=255,raw+=img_n,cur+=out_n,prior+=out_n) \
-                   for (k=0; k < img_n; ++k)
+                for (i=x-1; i >= 1; --i, cur[filter_bytes]=255,raw+=filter_bytes,cur+=output_bytes,prior+=output_bytes) \
+                   for (k=0; k < filter_bytes; ++k)
          switch (filter) {
-            CASE(STBI__F_none)         cur[k] = raw[k]; break;
-            CASE(STBI__F_sub)          cur[k] = STBI__BYTECAST(raw[k] + cur[k-out_n]); break;
-            CASE(STBI__F_up)           cur[k] = STBI__BYTECAST(raw[k] + prior[k]); break;
-            CASE(STBI__F_avg)          cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k-out_n])>>1)); break;
-            CASE(STBI__F_paeth)        cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-out_n],prior[k],prior[k-out_n])); break;
-            CASE(STBI__F_avg_first)    cur[k] = STBI__BYTECAST(raw[k] + (cur[k-out_n] >> 1)); break;
-            CASE(STBI__F_paeth_first)  cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-out_n],0,0)); break;
+            STBI__CASE(STBI__F_none)         { cur[k] = raw[k]; } break;
+            STBI__CASE(STBI__F_sub)          { cur[k] = STBI__BYTECAST(raw[k] + cur[k- output_bytes]); } break;
+            STBI__CASE(STBI__F_up)           { cur[k] = STBI__BYTECAST(raw[k] + prior[k]); } break;
+            STBI__CASE(STBI__F_avg)          { cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k- output_bytes])>>1)); } break;
+            STBI__CASE(STBI__F_paeth)        { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k- output_bytes],prior[k],prior[k- output_bytes])); } break;
+            STBI__CASE(STBI__F_avg_first)    { cur[k] = STBI__BYTECAST(raw[k] + (cur[k- output_bytes] >> 1)); } break;
+            STBI__CASE(STBI__F_paeth_first)  { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k- output_bytes],0,0)); } break;
+         }
+         #undef STBI__CASE
+
+         // the loop above sets the high byte of the pixels' alpha, but for
+         // 16 bit png files we also need the low byte set. we'll do that here.
+         if (depth == 16) {
+            cur = a->out + stride*j; // start at the beginning of the row again
+            for (i=0; i < x; ++i,cur+=output_bytes) {
+               cur[filter_bytes+1] = 255;
+            }
          }
-         #undef CASE
       }
    }
 
@@ -4109,25 +4504,36 @@ static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 r
             if (k > 6) *cur++ = scale * ((*in >> 1) & 0x01);
          }
          if (img_n != out_n) {
+            int q;
             // insert alpha = 255
-            stbi_uc *cur = a->out + stride*j;
-            int i;
+            cur = a->out + stride*j;
             if (img_n == 1) {
-               for (i=x-1; i >= 0; --i) {
-                  cur[i*2+1] = 255;
-                  cur[i*2+0] = cur[i];
+               for (q=x-1; q >= 0; --q) {
+                  cur[q*2+1] = 255;
+                  cur[q*2+0] = cur[q];
                }
             } else {
                STBI_ASSERT(img_n == 3);
-               for (i=x-1; i >= 0; --i) {
-                  cur[i*4+3] = 255;
-                  cur[i*4+2] = cur[i*3+2];
-                  cur[i*4+1] = cur[i*3+1];
-                  cur[i*4+0] = cur[i*3+0];
+               for (q=x-1; q >= 0; --q) {
+                  cur[q*4+3] = 255;
+                  cur[q*4+2] = cur[q*3+2];
+                  cur[q*4+1] = cur[q*3+1];
+                  cur[q*4+0] = cur[q*3+0];
                }
             }
          }
       }
+   } else if (depth == 16) {
+      // force the image data from big-endian to platform-native.
+      // this is done in a separate pass due to the decoding relying
+      // on the data being untouched, but could probably be done
+      // per-line during decode if care is taken.
+      stbi_uc *cur = a->out;
+      stbi__uint16 *cur16 = (stbi__uint16*)cur;
+
+      for(i=0; i < x*y*out_n; ++i,cur16++,cur+=2) {
+         *cur16 = (cur[0] << 8) | cur[1];
+      }
    }
 
    return 1;
@@ -4135,13 +4541,15 @@ static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 r
 
 static int stbi__create_png_image(stbi__png *a, stbi_uc *image_data, stbi__uint32 image_data_len, int out_n, int depth, int color, int interlaced)
 {
+   int bytes = (depth == 16 ? 2 : 1);
+   int out_bytes = out_n * bytes;
    stbi_uc *final;
    int p;
    if (!interlaced)
       return stbi__create_png_image_raw(a, image_data, image_data_len, out_n, a->s->img_x, a->s->img_y, depth, color);
 
    // de-interlacing
-   final = (stbi_uc *) stbi__malloc(a->s->img_x * a->s->img_y * out_n);
+   final = (stbi_uc *) stbi__malloc_mad3(a->s->img_x, a->s->img_y, out_bytes, 0);
    for (p=0; p < 7; ++p) {
       int xorig[] = { 0,4,0,2,0,1,0 };
       int yorig[] = { 0,0,4,0,2,0,1 };
@@ -4161,8 +4569,8 @@ static int stbi__create_png_image(stbi__png *a, stbi_uc *image_data, stbi__uint3
             for (i=0; i < x; ++i) {
                int out_y = j*yspc[p]+yorig[p];
                int out_x = i*xspc[p]+xorig[p];
-               memcpy(final + out_y*a->s->img_x*out_n + out_x*out_n,
-                      a->out + (j*x+i)*out_n, out_n);
+               memcpy(final + out_y*a->s->img_x*out_bytes + out_x*out_bytes,
+                      a->out + (j*x+i)*out_bytes, out_bytes);
             }
          }
          STBI_FREE(a->out);
@@ -4200,12 +4608,37 @@ static int stbi__compute_transparency(stbi__png *z, stbi_uc tc[3], int out_n)
    return 1;
 }
 
+static int stbi__compute_transparency16(stbi__png *z, stbi__uint16 tc[3], int out_n)
+{
+   // 16-bit colour-key transparency: z->out is walked as 16-bit samples and
+   // every pixel equal to the key tc[] gets alpha 0. For out_n == 4 the alpha
+   // slot is assumed to already hold 65535; for out_n == 2 it is rewritten.
+   stbi__uint16 *px = (stbi__uint16*) z->out;
+   stbi__uint32 remaining = z->s->img_x * z->s->img_y; // pixels left to visit
+
+   STBI_ASSERT(out_n == 2 || out_n == 4);
+
+   if (out_n != 2) {
+      // colour + alpha: clear alpha only where all three channels match
+      for (; remaining > 0; --remaining, px += 4) {
+         if (px[0] == tc[0] && px[1] == tc[1] && px[2] == tc[2])
+            px[3] = 0;
+      }
+   } else {
+      // grey + alpha: alpha is 0 on a key match, fully opaque otherwise
+      for (; remaining > 0; --remaining, px += 2) {
+         px[1] = (px[0] == tc[0] ? 0 : 65535);
+      }
+   }
+   return 1; // unconditional success
+}
+
 static int stbi__expand_png_palette(stbi__png *a, stbi_uc *palette, int len, int pal_img_n)
 {
    stbi__uint32 i, pixel_count = a->s->img_x * a->s->img_y;
    stbi_uc *p, *temp_out, *orig = a->out;
 
-   p = (stbi_uc *) stbi__malloc(pixel_count * pal_img_n);
+   p = (stbi_uc *) stbi__malloc_mad2(pixel_count, pal_img_n, 0);
    if (p == NULL) return stbi__err("outofmem", "Out of memory");
 
    // between here and free(out) below, exitting would leak
@@ -4271,9 +4704,10 @@ static void stbi__de_iphone(stbi__png *z)
             stbi_uc a = p[3];
             stbi_uc t = p[0];
             if (a) {
-               p[0] = p[2] * 255 / a;
-               p[1] = p[1] * 255 / a;
-               p[2] =  t   * 255 / a;
+               stbi_uc half = a / 2;
+               p[0] = (p[2] * 255 + half) / a;
+               p[1] = (p[1] * 255 + half) / a;
+               p[2] = ( t   * 255 + half) / a;
             } else {
                p[0] = p[2];
                p[2] = t;
@@ -4292,14 +4726,15 @@ static void stbi__de_iphone(stbi__png *z)
    }
 }
 
-#define STBI__PNG_TYPE(a,b,c,d)  (((a) << 24) + ((b) << 16) + ((c) << 8) + (d))
+#define STBI__PNG_TYPE(a,b,c,d)  (((unsigned) (a) << 24) + ((unsigned) (b) << 16) + ((unsigned) (c) << 8) + (unsigned) (d))
 
 static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
 {
    stbi_uc palette[1024], pal_img_n=0;
    stbi_uc has_trans=0, tc[3];
+   stbi__uint16 tc16[3];
    stbi__uint32 ioff=0, idata_limit=0, i, pal_len=0;
-   int first=1,k,interlace=0, color=0, depth=0, is_iphone=0;
+   int first=1,k,interlace=0, color=0, is_iphone=0;
    stbi__context *s = z->s;
 
    z->expanded = NULL;
@@ -4324,8 +4759,9 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
             if (c.length != 13) return stbi__err("bad IHDR len","Corrupt PNG");
             s->img_x = stbi__get32be(s); if (s->img_x > (1 << 24)) return stbi__err("too large","Very large image (corrupt?)");
             s->img_y = stbi__get32be(s); if (s->img_y > (1 << 24)) return stbi__err("too large","Very large image (corrupt?)");
-            depth = stbi__get8(s);  if (depth != 1 && depth != 2 && depth != 4 && depth != 8)  return stbi__err("1/2/4/8-bit only","PNG not supported: 1/2/4/8-bit only");
+            z->depth = stbi__get8(s);  if (z->depth != 1 && z->depth != 2 && z->depth != 4 && z->depth != 8 && z->depth != 16)  return stbi__err("1/2/4/8/16-bit only","PNG not supported: 1/2/4/8/16-bit only");
             color = stbi__get8(s);  if (color > 6)         return stbi__err("bad ctype","Corrupt PNG");
+            if (color == 3 && z->depth == 16)                  return stbi__err("bad ctype","Corrupt PNG");
             if (color == 3) pal_img_n = 3; else if (color & 1) return stbi__err("bad ctype","Corrupt PNG");
             comp  = stbi__get8(s);  if (comp) return stbi__err("bad comp method","Corrupt PNG");
             filter= stbi__get8(s);  if (filter) return stbi__err("bad filter method","Corrupt PNG");
@@ -4373,8 +4809,11 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
                if (!(s->img_n & 1)) return stbi__err("tRNS with alpha","Corrupt PNG");
                if (c.length != (stbi__uint32) s->img_n*2) return stbi__err("bad tRNS len","Corrupt PNG");
                has_trans = 1;
-               for (k=0; k < s->img_n; ++k)
-                  tc[k] = (stbi_uc) (stbi__get16be(s) & 255) * stbi__depth_scale_table[depth]; // non 8-bit images will be larger
+               if (z->depth == 16) {
+                  for (k = 0; k < s->img_n; ++k) tc16[k] = (stbi__uint16)stbi__get16be(s); // copy the values as-is
+               } else {
+                  for (k = 0; k < s->img_n; ++k) tc[k] = (stbi_uc)(stbi__get16be(s) & 255) * stbi__depth_scale_table[z->depth]; // non 8-bit images will be larger
+               }
             }
             break;
          }
@@ -4385,11 +4824,13 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
             if (scan == STBI__SCAN_header) { s->img_n = pal_img_n; return 1; }
             if ((int)(ioff + c.length) < (int)ioff) return 0;
             if (ioff + c.length > idata_limit) {
+               stbi__uint32 idata_limit_old = idata_limit;
                stbi_uc *p;
                if (idata_limit == 0) idata_limit = c.length > 4096 ? c.length : 4096;
                while (ioff + c.length > idata_limit)
                   idata_limit *= 2;
-               p = (stbi_uc *) STBI_REALLOC(z->idata, idata_limit); if (p == NULL) return stbi__err("outofmem", "Out of memory");
+               STBI_NOTUSED(idata_limit_old);
+               p = (stbi_uc *) STBI_REALLOC_SIZED(z->idata, idata_limit_old, idata_limit); if (p == NULL) return stbi__err("outofmem", "Out of memory");
                z->idata = p;
             }
             if (!stbi__getn(s, z->idata+ioff,c.length)) return stbi__err("outofdata","Corrupt PNG");
@@ -4403,7 +4844,7 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
             if (scan != STBI__SCAN_load) return 1;
             if (z->idata == NULL) return stbi__err("no IDAT","Corrupt PNG");
             // initial guess for decoded data size to avoid unnecessary reallocs
-            bpl = (s->img_x * depth + 7) / 8; // bytes per line, per component
+            bpl = (s->img_x * z->depth + 7) / 8; // bytes per line, per component
             raw_len = bpl * s->img_y * s->img_n /* pixels */ + s->img_y /* filter mode per row */;
             z->expanded = (stbi_uc *) stbi_zlib_decode_malloc_guesssize_headerflag((char *) z->idata, ioff, raw_len, (int *) &raw_len, !is_iphone);
             if (z->expanded == NULL) return 0; // zlib should set error
@@ -4412,9 +4853,14 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
                s->img_out_n = s->img_n+1;
             else
                s->img_out_n = s->img_n;
-            if (!stbi__create_png_image(z, z->expanded, raw_len, s->img_out_n, depth, color, interlace)) return 0;
-            if (has_trans)
-               if (!stbi__compute_transparency(z, tc, s->img_out_n)) return 0;
+            if (!stbi__create_png_image(z, z->expanded, raw_len, s->img_out_n, z->depth, color, interlace)) return 0;
+            if (has_trans) {
+               if (z->depth == 16) {
+                  if (!stbi__compute_transparency16(z, tc16, s->img_out_n)) return 0;
+               } else {
+                  if (!stbi__compute_transparency(z, tc, s->img_out_n)) return 0;
+               }
+            }
             if (is_iphone && stbi__de_iphone_flag && s->img_out_n > 2)
                stbi__de_iphone(z);
             if (pal_img_n) {
@@ -4424,6 +4870,9 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
                if (req_comp >= 3) s->img_out_n = req_comp;
                if (!stbi__expand_png_palette(z, palette, pal_len, s->img_out_n))
                   return 0;
+            } else if (has_trans) {
+               // non-paletted image with tRNS -> source image has (constant) alpha
+               ++s->img_n;
             }
             STBI_FREE(z->expanded); z->expanded = NULL;
             return 1;
@@ -4451,21 +4900,28 @@ static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
    }
 }
 
-static unsigned char *stbi__do_png(stbi__png *p, int *x, int *y, int *n, int req_comp)
+static void *stbi__do_png(stbi__png *p, int *x, int *y, int *n, int req_comp, stbi__result_info *ri)
 {
-   unsigned char *result=NULL;
+   void *result=NULL;
    if (req_comp < 0 || req_comp > 4) return stbi__errpuc("bad req_comp", "Internal error");
    if (stbi__parse_png_file(p, STBI__SCAN_load, req_comp)) {
+      if (p->depth < 8)
+         ri->bits_per_channel = 8;
+      else
+         ri->bits_per_channel = p->depth;
       result = p->out;
       p->out = NULL;
       if (req_comp && req_comp != p->s->img_out_n) {
-         result = stbi__convert_format(result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y);
+         if (ri->bits_per_channel == 8)
+            result = stbi__convert_format((unsigned char *) result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y);
+         else
+            result = stbi__convert_format16((stbi__uint16 *) result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y);
          p->s->img_out_n = req_comp;
          if (result == NULL) return result;
       }
       *x = p->s->img_x;
       *y = p->s->img_y;
-      if (n) *n = p->s->img_out_n;
+      if (n) *n = p->s->img_n;
    }
    STBI_FREE(p->out);      p->out      = NULL;
    STBI_FREE(p->expanded); p->expanded = NULL;
@@ -4474,11 +4930,11 @@ static unsigned char *stbi__do_png(stbi__png *p, int *x, int *y, int *n, int req
    return result;
 }
 
-static unsigned char *stbi__png_load(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+static void *stbi__png_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
 {
    stbi__png p;
    p.s = s;
-   return stbi__do_png(&p, x,y,comp,req_comp);
+   return stbi__do_png(&p, x,y,comp,req_comp, ri); // thin wrapper: stbi__do_png decodes and records bits_per_channel in ri
 }
 
 static int stbi__png_test(stbi__context *s)
@@ -4507,6 +4963,19 @@ static int stbi__png_info(stbi__context *s, int *x, int *y, int *comp)
    p.s = s;
    return stbi__png_info_raw(&p, x, y, comp);
 }
+
+static int stbi__png_is16(stbi__context *s)
+{
+   // 1 if stbi__png_info_raw succeeds and leaves depth == 16, else 0
+   stbi__png png;
+   png.s = s;
+   if (!stbi__png_info_raw(&png, NULL, NULL, NULL))
+      return 0; // info query failed: no rewind is issued from here
+   if (png.depth == 16)
+      return 1; // 16-bit: returned without rewinding
+   stbi__rewind(png.s); // any other depth: rewind before reporting "no"
+   return 0;
+}
 #endif
 
 // Microsoft/Windows BMP image
@@ -4558,36 +5027,46 @@ static int stbi__bitcount(unsigned int a)
    return a & 0xff;
 }
 
+// extract an arbitrarily-aligned N-bit value (N=bits) from v, then widen it
+// to 8 bits by fractional extension so it covers the full 0..255 range.
+// v is unsigned so that a mask reaching bit 31 is never shifted as a negative int.
-static int stbi__shiftsigned(int v, int shift, int bits)
+static int stbi__shiftsigned(unsigned int v, int shift, int bits)
 {
-   int result;
-   int z=0;
-
-   if (shift < 0) v <<= -shift;
-   else v >>= shift;
-   result = v;
-
-   z = bits;
-   while (z < 8) {
-      result += v >> z;
-      z += bits;
-   }
-   return result;
+   static unsigned int mul_table[9] = {
+      0,
+      0xff/*0b11111111*/, 0x55/*0b01010101*/, 0x49/*0b01001001*/, 0x11/*0b00010001*/,
+      0x21/*0b00100001*/, 0x41/*0b01000001*/, 0x81/*0b10000001*/, 0x01/*0b00000001*/,
+   };
+   static unsigned int shift_table[9] = {
+      0, 0,0,1,0,2,4,6,0,
+   };
+   STBI_ASSERT(bits >= 0 && bits <= 8); // checked first: bits indexes both tables and sizes the shift below
+   if (shift < 0)
+      v <<= -shift;
+   else
+      v >>= shift;
+   STBI_ASSERT(v < 256);
+   v >>= (8-bits);
+   return (int) ((unsigned) v * mul_table[bits]) >> shift_table[bits];
 }
 
-static stbi_uc *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+typedef struct // BMP header fields that stbi__bmp_parse_header fills in for its caller
 {
-   stbi_uc *out;
-   unsigned int mr=0,mg=0,mb=0,ma=0, fake_a=0;
-   stbi_uc pal[256][4];
-   int psize=0,i,j,compress=0,width;
-   int bpp, flip_vertically, pad, target, offset, hsz;
+   int bpp, offset, hsz;             // bits per pixel, offset value read from the file header, header size
+   unsigned int mr,mg,mb,ma, all_a;  // r/g/b/a channel bit masks; all_a seeds the loader's running OR of decoded alpha values
+} stbi__bmp_data;
+
+static void *stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info)
+{
+   int hsz;
    if (stbi__get8(s) != 'B' || stbi__get8(s) != 'M') return stbi__errpuc("not BMP", "Corrupt BMP");
    stbi__get32le(s); // discard filesize
    stbi__get16le(s); // discard reserved
    stbi__get16le(s); // discard reserved
-   offset = stbi__get32le(s);
-   hsz = stbi__get32le(s);
+   info->offset = stbi__get32le(s);
+   info->hsz = hsz = stbi__get32le(s);
+   info->mr = info->mg = info->mb = info->ma = 0;
+
    if (hsz != 12 && hsz != 40 && hsz != 56 && hsz != 108 && hsz != 124) return stbi__errpuc("unknown BMP", "BMP type not supported: unknown");
    if (hsz == 12) {
       s->img_x = stbi__get16le(s);
@@ -4597,15 +5076,9 @@ static stbi_uc *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int
       s->img_y = stbi__get32le(s);
    }
    if (stbi__get16le(s) != 1) return stbi__errpuc("bad BMP", "bad BMP");
-   bpp = stbi__get16le(s);
-   if (bpp == 1) return stbi__errpuc("monochrome", "BMP type not supported: 1-bit");
-   flip_vertically = ((int) s->img_y) > 0;
-   s->img_y = abs((int) s->img_y);
-   if (hsz == 12) {
-      if (bpp < 24)
-         psize = (offset - 14 - 24) / 3;
-   } else {
-      compress = stbi__get32le(s);
+   info->bpp = stbi__get16le(s);
+   if (hsz != 12) {
+      int compress = stbi__get32le(s);
       if (compress == 1 || compress == 2) return stbi__errpuc("BMP RLE", "BMP type not supported: RLE");
       stbi__get32le(s); // discard sizeof
       stbi__get32le(s); // discard hres
@@ -4619,27 +5092,25 @@ static stbi_uc *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int
             stbi__get32le(s);
             stbi__get32le(s);
          }
-         if (bpp == 16 || bpp == 32) {
-            mr = mg = mb = 0;
+         if (info->bpp == 16 || info->bpp == 32) {
             if (compress == 0) {
-               if (bpp == 32) {
-                  mr = 0xffu << 16;
-                  mg = 0xffu <<  8;
-                  mb = 0xffu <<  0;
-                  ma = 0xffu << 24;
-                  fake_a = 1; // @TODO: check for cases like alpha value is all 0 and switch it to 255
-                  STBI_NOTUSED(fake_a);
+               if (info->bpp == 32) {
+                  info->mr = 0xffu << 16;
+                  info->mg = 0xffu <<  8;
+                  info->mb = 0xffu <<  0;
+                  info->ma = 0xffu << 24;
+                  info->all_a = 0; // if all_a is 0 at end, then we loaded alpha channel but it was all 0
                } else {
-                  mr = 31u << 10;
-                  mg = 31u <<  5;
-                  mb = 31u <<  0;
+                  info->mr = 31u << 10;
+                  info->mg = 31u <<  5;
+                  info->mb = 31u <<  0;
                }
             } else if (compress == 3) {
-               mr = stbi__get32le(s);
-               mg = stbi__get32le(s);
-               mb = stbi__get32le(s);
+               info->mr = stbi__get32le(s);
+               info->mg = stbi__get32le(s);
+               info->mb = stbi__get32le(s);
                // not documented, but generated by photoshop and handled by mspaint
-               if (mr == mg && mg == mb) {
+               if (info->mr == info->mg && info->mg == info->mb) {
                   // ?!?!?
                   return stbi__errpuc("bad BMP", "bad BMP");
                }
@@ -4647,11 +5118,13 @@ static stbi_uc *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int
                return stbi__errpuc("bad BMP", "bad BMP");
          }
       } else {
-         STBI_ASSERT(hsz == 108 || hsz == 124);
-         mr = stbi__get32le(s);
-         mg = stbi__get32le(s);
-         mb = stbi__get32le(s);
-         ma = stbi__get32le(s);
+         int i;
+         if (hsz != 108 && hsz != 124)
+            return stbi__errpuc("bad BMP", "bad BMP");
+         info->mr = stbi__get32le(s);
+         info->mg = stbi__get32le(s);
+         info->mb = stbi__get32le(s);
+         info->ma = stbi__get32le(s);
          stbi__get32le(s); // discard color space
          for (i=0; i < 12; ++i)
             stbi__get32le(s); // discard color space parameters
@@ -4662,63 +5135,119 @@ static stbi_uc *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int
             stbi__get32le(s); // discard reserved
          }
       }
-      if (bpp < 16)
-         psize = (offset - 14 - hsz) >> 2;
    }
+   return (void *) 1;
+}
+
+
+static void *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
+{
+   stbi_uc *out;
+   unsigned int mr=0,mg=0,mb=0,ma=0, all_a;
+   stbi_uc pal[256][4];
+   int psize=0,i,j,width;
+   int flip_vertically, pad, target;
+   stbi__bmp_data info;
+   STBI_NOTUSED(ri);
+
+   info.all_a = 255;
+   if (stbi__bmp_parse_header(s, &info) == NULL)
+      return NULL; // error code already set
+
+   flip_vertically = ((int) s->img_y) > 0;
+   s->img_y = abs((int) s->img_y);
+
+   mr = info.mr;
+   mg = info.mg;
+   mb = info.mb;
+   ma = info.ma;
+   all_a = info.all_a;
+
+   if (info.hsz == 12) {
+      if (info.bpp < 24)
+         psize = (info.offset - 14 - 24) / 3;
+   } else {
+      if (info.bpp < 16)
+         psize = (info.offset - 14 - info.hsz) >> 2;
+   }
+
    s->img_n = ma ? 4 : 3;
    if (req_comp && req_comp >= 3) // we can directly decode 3 or 4
       target = req_comp;
    else
       target = s->img_n; // if they want monochrome, we'll post-convert
-   out = (stbi_uc *) stbi__malloc(target * s->img_x * s->img_y);
+
+   // sanity-check size
+   if (!stbi__mad3sizes_valid(target, s->img_x, s->img_y, 0))
+      return stbi__errpuc("too large", "Corrupt BMP");
+
+   out = (stbi_uc *) stbi__malloc_mad3(target, s->img_x, s->img_y, 0);
    if (!out) return stbi__errpuc("outofmem", "Out of memory");
-   if (bpp < 16) {
+   if (info.bpp < 16) {
       int z=0;
       if (psize == 0 || psize > 256) { STBI_FREE(out); return stbi__errpuc("invalid", "Corrupt BMP"); }
       for (i=0; i < psize; ++i) {
          pal[i][2] = stbi__get8(s);
          pal[i][1] = stbi__get8(s);
          pal[i][0] = stbi__get8(s);
-         if (hsz != 12) stbi__get8(s);
+         if (info.hsz != 12) stbi__get8(s);
          pal[i][3] = 255;
       }
-      stbi__skip(s, offset - 14 - hsz - psize * (hsz == 12 ? 3 : 4));
-      if (bpp == 4) width = (s->img_x + 1) >> 1;
-      else if (bpp == 8) width = s->img_x;
+      stbi__skip(s, info.offset - 14 - info.hsz - psize * (info.hsz == 12 ? 3 : 4));
+      if (info.bpp == 1) width = (s->img_x + 7) >> 3;
+      else if (info.bpp == 4) width = (s->img_x + 1) >> 1;
+      else if (info.bpp == 8) width = s->img_x;
       else { STBI_FREE(out); return stbi__errpuc("bad bpp", "Corrupt BMP"); }
       pad = (-width)&3;
-      for (j=0; j < (int) s->img_y; ++j) {
-         for (i=0; i < (int) s->img_x; i += 2) {
-            int v=stbi__get8(s),v2=0;
-            if (bpp == 4) {
-               v2 = v & 15;
-               v >>= 4;
+      if (info.bpp == 1) {
+         for (j=0; j < (int) s->img_y; ++j) {
+            int bit_offset = 7, v = stbi__get8(s);
+            for (i=0; i < (int) s->img_x; ++i) {
+               int color = (v>>bit_offset)&0x1;
+               out[z++] = pal[color][0];
+               out[z++] = pal[color][1];
+               out[z++] = pal[color][2];
+               if((--bit_offset) < 0) {
+                  bit_offset = 7;
+                  v = stbi__get8(s);
+               }
             }
-            out[z++] = pal[v][0];
-            out[z++] = pal[v][1];
-            out[z++] = pal[v][2];
-            if (target == 4) out[z++] = 255;
-            if (i+1 == (int) s->img_x) break;
-            v = (bpp == 8) ? stbi__get8(s) : v2;
-            out[z++] = pal[v][0];
-            out[z++] = pal[v][1];
-            out[z++] = pal[v][2];
-            if (target == 4) out[z++] = 255;
+            stbi__skip(s, pad);
+         }
+      } else {
+         for (j=0; j < (int) s->img_y; ++j) {
+            for (i=0; i < (int) s->img_x; i += 2) {
+               int v=stbi__get8(s),v2=0;
+               if (info.bpp == 4) {
+                  v2 = v & 15;
+                  v >>= 4;
+               }
+               out[z++] = pal[v][0];
+               out[z++] = pal[v][1];
+               out[z++] = pal[v][2];
+               if (target == 4) out[z++] = 255;
+               if (i+1 == (int) s->img_x) break;
+               v = (info.bpp == 8) ? stbi__get8(s) : v2;
+               out[z++] = pal[v][0];
+               out[z++] = pal[v][1];
+               out[z++] = pal[v][2];
+               if (target == 4) out[z++] = 255;
+            }
+            stbi__skip(s, pad);
          }
-         stbi__skip(s, pad);
       }
    } else {
       int rshift=0,gshift=0,bshift=0,ashift=0,rcount=0,gcount=0,bcount=0,acount=0;
       int z = 0;
       int easy=0;
-      stbi__skip(s, offset - 14 - hsz);
-      if (bpp == 24) width = 3 * s->img_x;
-      else if (bpp == 16) width = 2*s->img_x;
+      stbi__skip(s, info.offset - 14 - info.hsz);
+      if (info.bpp == 24) width = 3 * s->img_x;
+      else if (info.bpp == 16) width = 2*s->img_x;
       else /* bpp = 32 and pad = 0 */ width=0;
       pad = (-width) & 3;
-      if (bpp == 24) {
+      if (info.bpp == 24) {
          easy = 1;
-      } else if (bpp == 32) {
+      } else if (info.bpp == 32) {
          if (mb == 0xff && mg == 0xff00 && mr == 0x00ff0000 && ma == 0xff000000)
             easy = 2;
       }
@@ -4739,22 +5268,31 @@ static stbi_uc *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int
                out[z+0] = stbi__get8(s);
                z += 3;
                a = (easy == 2 ? stbi__get8(s) : 255);
+               all_a |= a;
                if (target == 4) out[z++] = a;
             }
          } else {
+            int bpp = info.bpp;
             for (i=0; i < (int) s->img_x; ++i) {
                stbi__uint32 v = (bpp == 16 ? (stbi__uint32) stbi__get16le(s) : stbi__get32le(s));
-               int a;
+               unsigned int a;
                out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mr, rshift, rcount));
                out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mg, gshift, gcount));
                out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mb, bshift, bcount));
                a = (ma ? stbi__shiftsigned(v & ma, ashift, acount) : 255);
+               all_a |= a;
                if (target == 4) out[z++] = STBI__BYTECAST(a);
             }
          }
          stbi__skip(s, pad);
       }
    }
+
+   // if alpha channel is all 0s, replace with all 255s
+   if (target == 4 && all_a == 0)
+      for (i=4*s->img_x*s->img_y-1; i >= 0; i -= 4)
+         out[i] = 255;
+
    if (flip_vertically) {
       stbi_uc t;
       for (j=0; j < (int) s->img_y>>1; ++j) {
@@ -4781,20 +5319,55 @@ static stbi_uc *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int
 // Targa Truevision - TGA
 // by Jonathan Dummer
 #ifndef STBI_NO_TGA
+// Maps a TGA bit depth to a component count (STBI_grey, STBI_grey_alpha, STBI_rgb, 3 or 4); returns 0 for unsupported depths.
+static int stbi__tga_get_comp(int bits_per_pixel, int is_grey, int* is_rgb16)
+{
+   // *is_rgb16 (when non-NULL) ends up 1 only for 15-bit, or 16-bit non-grey, input
+   int packed_rgb = (bits_per_pixel == 15) || (bits_per_pixel == 16 && !is_grey);
+   if (is_rgb16) *is_rgb16 = packed_rgb ? 1 : 0;
+   if (packed_rgb)
+      return STBI_rgb;
+   if (bits_per_pixel == 8)
+      return STBI_grey;
+   if (bits_per_pixel == 16)          // grey flag set, otherwise caught above
+      return STBI_grey_alpha;
+   if (bits_per_pixel == 24 || bits_per_pixel == 32)
+      return bits_per_pixel/8;         // 3 or 4 components
+   return 0;                          // only 8/15/16/24/32 bpp are accepted
+}
+
 static int stbi__tga_info(stbi__context *s, int *x, int *y, int *comp)
 {
-    int tga_w, tga_h, tga_comp;
-    int sz;
+    int tga_w, tga_h, tga_comp, tga_image_type, tga_bits_per_pixel, tga_colormap_bpp;
+    int sz, tga_colormap_type;
     stbi__get8(s);                   // discard Offset
-    sz = stbi__get8(s);              // color type
-    if( sz > 1 ) {
+    tga_colormap_type = stbi__get8(s); // colormap type
+    if( tga_colormap_type > 1 ) {
         stbi__rewind(s);
         return 0;      // only RGB or indexed allowed
     }
-    sz = stbi__get8(s);              // image type
-    // only RGB or grey allowed, +/- RLE
-    if ((sz != 1) && (sz != 2) && (sz != 3) && (sz != 9) && (sz != 10) && (sz != 11)) return 0;
-    stbi__skip(s,9);
+    tga_image_type = stbi__get8(s); // image type
+    if ( tga_colormap_type == 1 ) { // colormapped (paletted) image
+        if (tga_image_type != 1 && tga_image_type != 9) {
+            stbi__rewind(s);
+            return 0;
+        }
+        stbi__skip(s,4);       // skip index of first colormap entry and number of entries
+        sz = stbi__get8(s);    //   check bits per palette color entry
+        if ( (sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32) ) {
+            stbi__rewind(s);
+            return 0;
+        }
+        stbi__skip(s,4);       // skip image x and y origin
+        tga_colormap_bpp = sz;
+    } else { // "normal" image w/o colormap - only RGB or grey allowed, +/- RLE
+        if ( (tga_image_type != 2) && (tga_image_type != 3) && (tga_image_type != 10) && (tga_image_type != 11) ) {
+            stbi__rewind(s);
+            return 0; // only RGB or grey allowed, +/- RLE
+        }
+        stbi__skip(s,9); // skip colormap specification and image x/y origin
+        tga_colormap_bpp = 0;
+    }
     tga_w = stbi__get16le(s);
     if( tga_w < 1 ) {
         stbi__rewind(s);
@@ -4805,45 +5378,81 @@ static int stbi__tga_info(stbi__context *s, int *x, int *y, int *comp)
         stbi__rewind(s);
         return 0;   // test height
     }
-    sz = stbi__get8(s);               // bits per pixel
-    // only RGB or RGBA or grey allowed
-    if ((sz != 8) && (sz != 16) && (sz != 24) && (sz != 32)) {
-        stbi__rewind(s);
-        return 0;
+    tga_bits_per_pixel = stbi__get8(s); // bits per pixel
+    stbi__get8(s); // ignore alpha bits
+    if (tga_colormap_bpp != 0) {
+        if((tga_bits_per_pixel != 8) && (tga_bits_per_pixel != 16)) {
+            // when using a colormap, tga_bits_per_pixel is the size of the indexes
+            // I don't think anything but 8 or 16bit indexes makes sense
+            stbi__rewind(s);
+            return 0;
+        }
+        tga_comp = stbi__tga_get_comp(tga_colormap_bpp, 0, NULL);
+    } else {
+        tga_comp = stbi__tga_get_comp(tga_bits_per_pixel, (tga_image_type == 3) || (tga_image_type == 11), NULL);
+    }
+    if(!tga_comp) {
+      stbi__rewind(s);
+      return 0;
     }
-    tga_comp = sz;
     if (x) *x = tga_w;
     if (y) *y = tga_h;
-    if (comp) *comp = tga_comp / 8;
+    if (comp) *comp = tga_comp;
     return 1;                   // seems to have passed everything
 }
 
 static int stbi__tga_test(stbi__context *s)
 {
-   int res;
-   int sz;
+   int res = 0; // result: stays 0 unless every header check below passes
+   int sz, tga_color_type; // sz is scratch: reused for image type, palette entry bits and bits per pixel
    stbi__get8(s);      //   discard Offset
-   sz = stbi__get8(s);   //   color type
-   if ( sz > 1 ) return 0;   //   only RGB or indexed allowed
+   tga_color_type = stbi__get8(s);   //   color type
+   if ( tga_color_type > 1 ) goto errorEnd;   //   only RGB or indexed allowed
    sz = stbi__get8(s);   //   image type
-   if ( (sz != 1) && (sz != 2) && (sz != 3) && (sz != 9) && (sz != 10) && (sz != 11) ) return 0;   //   only RGB or grey allowed, +/- RLE
-   stbi__get16be(s);      //   discard palette start
-   stbi__get16be(s);      //   discard palette length
-   stbi__get8(s);         //   discard bits per palette color entry
-   stbi__get16be(s);      //   discard x origin
-   stbi__get16be(s);      //   discard y origin
-   if ( stbi__get16be(s) < 1 ) return 0;      //   test width
-   if ( stbi__get16be(s) < 1 ) return 0;      //   test height
+   if ( tga_color_type == 1 ) { // colormapped (paletted) image
+      if (sz != 1 && sz != 9) goto errorEnd; // colortype 1 demands image type 1 or 9
+      stbi__skip(s,4);       // skip index of first colormap entry and number of entries
+      sz = stbi__get8(s);    //   check bits per palette color entry
+      if ( (sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32) ) goto errorEnd;
+      stbi__skip(s,4);       // skip image x and y origin
+   } else { // "normal" image w/o colormap
+      if ( (sz != 2) && (sz != 3) && (sz != 10) && (sz != 11) ) goto errorEnd; // only RGB or grey allowed, +/- RLE
+      stbi__skip(s,9); // skip colormap specification and image x/y origin
+   }
+   if ( stbi__get16le(s) < 1 ) goto errorEnd;      //   test width
+   if ( stbi__get16le(s) < 1 ) goto errorEnd;      //   test height
    sz = stbi__get8(s);   //   bits per pixel
-   if ( (sz != 8) && (sz != 16) && (sz != 24) && (sz != 32) )
-      res = 0;
-   else
-      res = 1;
+   if ( (tga_color_type == 1) && (sz != 8) && (sz != 16) ) goto errorEnd; // for colormapped images, bpp is size of an index
+   if ( (sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32) ) goto errorEnd;
+
+   res = 1; // if we got this far, everything's good and we can return 1 instead of 0
+
+errorEnd: // common exit for success and failure: rewind the stream, then return res
    stbi__rewind(s);
    return res;
 }
 
-static stbi_uc *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+// read one 16bit little-endian value from s and store it in out[0..2] as 8bit-per-channel RGB
+static void stbi__tga_read_rgb16(stbi__context *s, stbi_uc* out)
+{
+   stbi__uint16 px = (stbi__uint16)stbi__get16le(s);
+   stbi__uint16 fiveBitMask = 31;
+   // we have 3 channels with 5bits each: red in bits 10-14, green in bits 5-9, blue in bits 0-4
+   int r = (px >> 10) & fiveBitMask;
+   int g = (px >> 5) & fiveBitMask;
+   int b = px & fiveBitMask;
+   // Note that this saves the data in RGB(A) order, so it doesn't need to be swapped later
+   out[0] = (stbi_uc)((r * 255)/31); // rescale each channel from 0..31 to 0..255
+   out[1] = (stbi_uc)((g * 255)/31);
+   out[2] = (stbi_uc)((b * 255)/31);
+
+   // some people claim that the most significant bit might be used for alpha
+   // (possibly if an alpha-bit is set in the "image descriptor byte")
+   // but that only made 16bit test images completely translucent..
+   // so let's treat all 15 and 16bit TGAs as RGB with no alpha.
+}
+
+static void *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
 {
    //   read in the TGA header stuff
    int tga_offset = stbi__get8(s);
@@ -4858,16 +5467,18 @@ static stbi_uc *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int
    int tga_width = stbi__get16le(s);
    int tga_height = stbi__get16le(s);
    int tga_bits_per_pixel = stbi__get8(s);
-   int tga_comp = tga_bits_per_pixel / 8;
+   int tga_comp, tga_rgb16=0;
    int tga_inverted = stbi__get8(s);
+   // int tga_alpha_bits = tga_inverted & 15; // the 4 lowest bits - unused (useless?)
    //   image data
    unsigned char *tga_data;
    unsigned char *tga_palette = NULL;
    int i, j;
-   unsigned char raw_data[4];
+   unsigned char raw_data[4] = {0};
    int RLE_count = 0;
    int RLE_repeating = 0;
    int read_next_pixel = 1;
+   STBI_NOTUSED(ri);
 
    //   do a tiny bit of precessing
    if ( tga_image_type >= 8 )
@@ -4875,41 +5486,33 @@ static stbi_uc *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int
       tga_image_type -= 8;
       tga_is_RLE = 1;
    }
-   /* int tga_alpha_bits = tga_inverted & 15; */
    tga_inverted = 1 - ((tga_inverted >> 5) & 1);
 
-   //   error check
-   if ( //(tga_indexed) ||
-      (tga_width < 1) || (tga_height < 1) ||
-      (tga_image_type < 1) || (tga_image_type > 3) ||
-      ((tga_bits_per_pixel != 8) && (tga_bits_per_pixel != 16) &&
-      (tga_bits_per_pixel != 24) && (tga_bits_per_pixel != 32))
-      )
-   {
-      return NULL; // we don't report this as a bad TGA because we don't even know if it's TGA
-   }
-
    //   If I'm paletted, then I'll use the number of bits from the palette
-   if ( tga_indexed )
-   {
-      tga_comp = tga_palette_bits / 8;
-   }
+   if ( tga_indexed ) tga_comp = stbi__tga_get_comp(tga_palette_bits, 0, &tga_rgb16);
+   else tga_comp = stbi__tga_get_comp(tga_bits_per_pixel, (tga_image_type == 3), &tga_rgb16);
+
+   if(!tga_comp) // shouldn't really happen, stbi__tga_test() should have ensured basic consistency
+      return stbi__errpuc("bad format", "Can't find out TGA pixelformat");
 
    //   tga info
    *x = tga_width;
    *y = tga_height;
    if (comp) *comp = tga_comp;
 
-   tga_data = (unsigned char*)stbi__malloc( (size_t)tga_width * tga_height * tga_comp );
+   if (!stbi__mad3sizes_valid(tga_width, tga_height, tga_comp, 0))
+      return stbi__errpuc("too large", "Corrupt TGA");
+
+   tga_data = (unsigned char*)stbi__malloc_mad3(tga_width, tga_height, tga_comp, 0);
    if (!tga_data) return stbi__errpuc("outofmem", "Out of memory");
 
    // skip to the data's starting position (offset usually = 0)
    stbi__skip(s, tga_offset );
 
-   if ( !tga_indexed && !tga_is_RLE) {
+   if ( !tga_indexed && !tga_is_RLE && !tga_rgb16 ) {
       for (i=0; i < tga_height; ++i) {
-         int y = tga_inverted ? tga_height -i - 1 : i;
-         stbi_uc *tga_row = tga_data + y*tga_width*tga_comp;
+         int row = tga_inverted ? tga_height -i - 1 : i;
+         stbi_uc *tga_row = tga_data + row*tga_width*tga_comp;
          stbi__getn(s, tga_row, tga_width * tga_comp);
       }
    } else  {
@@ -4919,15 +5522,22 @@ static stbi_uc *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int
          //   any data to skip? (offset usually = 0)
          stbi__skip(s, tga_palette_start );
          //   load the palette
-         tga_palette = (unsigned char*)stbi__malloc( tga_palette_len * tga_palette_bits / 8 );
+         tga_palette = (unsigned char*)stbi__malloc_mad2(tga_palette_len, tga_comp, 0);
          if (!tga_palette) {
             STBI_FREE(tga_data);
             return stbi__errpuc("outofmem", "Out of memory");
          }
-         if (!stbi__getn(s, tga_palette, tga_palette_len * tga_palette_bits / 8 )) {
-            STBI_FREE(tga_data);
-            STBI_FREE(tga_palette);
-            return stbi__errpuc("bad palette", "Corrupt TGA");
+         if (tga_rgb16) {
+            stbi_uc *pal_entry = tga_palette;
+            STBI_ASSERT(tga_comp == STBI_rgb);
+            for (i=0; i < tga_palette_len; ++i) {
+               stbi__tga_read_rgb16(s, pal_entry);
+               pal_entry += tga_comp;
+            }
+         } else if (!stbi__getn(s, tga_palette, tga_palette_len * tga_comp)) {
+               STBI_FREE(tga_data);
+               STBI_FREE(tga_palette);
+               return stbi__errpuc("bad palette", "Corrupt TGA");
          }
       }
       //   load the data
@@ -4957,23 +5567,22 @@ static stbi_uc *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int
             //   load however much data we did have
             if ( tga_indexed )
             {
-               //   read in 1 byte, then perform the lookup
-               int pal_idx = stbi__get8(s);
-               if ( pal_idx >= tga_palette_len )
-               {
-                  //   invalid index
+               // read in index, then perform the lookup
+               int pal_idx = (tga_bits_per_pixel == 8) ? stbi__get8(s) : stbi__get16le(s);
+               if ( pal_idx >= tga_palette_len ) {
+                  // invalid index
                   pal_idx = 0;
                }
-               pal_idx *= tga_bits_per_pixel / 8;
-               for (j = 0; j*8 < tga_bits_per_pixel; ++j)
-               {
+               pal_idx *= tga_comp;
+               for (j = 0; j < tga_comp; ++j) {
                   raw_data[j] = tga_palette[pal_idx+j];
                }
-            } else
-            {
+            } else if(tga_rgb16) {
+               STBI_ASSERT(tga_comp == STBI_rgb);
+               stbi__tga_read_rgb16(s, raw_data);
+            } else {
                //   read in the data raw
-               for (j = 0; j*8 < tga_bits_per_pixel; ++j)
-               {
+               for (j = 0; j < tga_comp; ++j) {
                   raw_data[j] = stbi__get8(s);
                }
             }
@@ -5012,8 +5621,8 @@ static stbi_uc *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int
       }
    }
 
-   // swap RGB
-   if (tga_comp >= 3)
+   // swap RGB - if the source data was RGB16, it already is in the right order
+   if (tga_comp >= 3 && !tga_rgb16)
    {
       unsigned char* tga_pixel = tga_data;
       for (i=0; i < tga_width * tga_height; ++i)
@@ -5049,13 +5658,53 @@ static int stbi__psd_test(stbi__context *s)
    return r;
 }
 
-static stbi_uc *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+static int stbi__psd_decode_rle(stbi__context *s, stbi_uc *p, int pixelCount)
 {
-   int   pixelCount;
+   int count, nleft, len; // count: values written so far; nleft: values still to write; len: current run length
+
+   count = 0; // decode runs from s into p (one value every 4 bytes) until pixelCount values are written
+   while ((nleft = pixelCount - count) > 0) {
+      len = stbi__get8(s);
+      if (len == 128) {
+         // No-op.
+      } else if (len < 128) {
+         // Copy next len+1 bytes literally.
+         len++;
+         if (len > nleft) return 0; // corrupt data
+         count += len;
+         while (len) {
+            *p = stbi__get8(s);
+            p += 4;
+            len--;
+         }
+      } else if (len > 128) {
+         stbi_uc   val;
+         // Next -len+1 bytes in the dest are replicated from next source byte.
+         // (Interpret len as a negative 8-bit int.)
+         len = 257 - len; // len 129..255 becomes a run of 128..2 copies
+         if (len > nleft) return 0; // corrupt data
+         val = stbi__get8(s);
+         count += len;
+         while (len) {
+            *p = val;
+            p += 4;
+            len--;
+         }
+      }
+   }
+
+   return 1; // success; 0 is returned above when a run would exceed the values still to write
+}
+
+static void *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc)
+{
+   int pixelCount;
    int channelCount, compression;
-   int channel, i, count, len;
+   int channel, i;
+   int bitdepth;
    int w,h;
    stbi_uc *out;
+   STBI_NOTUSED(ri);
 
    // Check identifier
    if (stbi__get32be(s) != 0x38425053)   // "8BPS"
@@ -5078,8 +5727,9 @@ static stbi_uc *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int
    w = stbi__get32be(s);
 
    // Make sure the depth is 8 bits.
-   if (stbi__get16be(s) != 8)
-      return stbi__errpuc("unsupported bit depth", "PSD bit depth is not 8 bit");
+   bitdepth = stbi__get16be(s);
+   if (bitdepth != 8 && bitdepth != 16)
+      return stbi__errpuc("unsupported bit depth", "PSD bit depth is not 8 or 16 bit");
 
    // Make sure the color mode is RGB.
    // Valid options are:
@@ -5111,8 +5761,18 @@ static stbi_uc *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int
    if (compression > 1)
       return stbi__errpuc("bad compression", "PSD has an unknown compression format");
 
+   // Check size
+   if (!stbi__mad3sizes_valid(4, w, h, 0))
+      return stbi__errpuc("too large", "Corrupt PSD");
+
    // Create the destination image.
-   out = (stbi_uc *) stbi__malloc(4 * w*h);
+
+   if (!compression && bitdepth == 16 && bpc == 16) {
+      out = (stbi_uc *) stbi__malloc_mad3(8, w, h, 0);
+      ri->bits_per_channel = 16;
+   } else
+      out = (stbi_uc *) stbi__malloc(4 * w*h);
+
    if (!out) return stbi__errpuc("outofmem", "Out of memory");
    pixelCount = w*h;
 
@@ -5144,61 +5804,86 @@ static stbi_uc *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int
                *p = (channel == 3 ? 255 : 0);
          } else {
             // Read the RLE data.
-            count = 0;
-            while (count < pixelCount) {
-               len = stbi__get8(s);
-               if (len == 128) {
-                  // No-op.
-               } else if (len < 128) {
-                  // Copy next len+1 bytes literally.
-                  len++;
-                  count += len;
-                  while (len) {
-                     *p = stbi__get8(s);
-                     p += 4;
-                     len--;
-                  }
-               } else if (len > 128) {
-                  stbi_uc   val;
-                  // Next -len+1 bytes in the dest are replicated from next source byte.
-                  // (Interpret len as a negative 8-bit int.)
-                  len ^= 0x0FF;
-                  len += 2;
-                  val = stbi__get8(s);
-                  count += len;
-                  while (len) {
-                     *p = val;
-                     p += 4;
-                     len--;
-                  }
-               }
+            if (!stbi__psd_decode_rle(s, p, pixelCount)) {
+               STBI_FREE(out);
+               return stbi__errpuc("corrupt", "bad RLE data");
             }
          }
       }
 
    } else {
       // We're at the raw image data.  It's each channel in order (Red, Green, Blue, Alpha, ...)
-      // where each channel consists of an 8-bit value for each pixel in the image.
+      // where each channel consists of an 8-bit (or 16-bit) value for each pixel in the image.
 
       // Read the data by channel.
       for (channel = 0; channel < 4; channel++) {
-         stbi_uc *p;
-
-         p = out + channel;
-         if (channel > channelCount) {
+         if (channel >= channelCount) {
             // Fill this channel with default data.
-            for (i = 0; i < pixelCount; i++, p += 4)
-               *p = channel == 3 ? 255 : 0;
+            if (bitdepth == 16 && bpc == 16) {
+               stbi__uint16 *q = ((stbi__uint16 *) out) + channel;
+               stbi__uint16 val = channel == 3 ? 65535 : 0;
+               for (i = 0; i < pixelCount; i++, q += 4)
+                  *q = val;
+            } else {
+               stbi_uc *p = out+channel;
+               stbi_uc val = channel == 3 ? 255 : 0;
+               for (i = 0; i < pixelCount; i++, p += 4)
+                  *p = val;
+            }
          } else {
-            // Read the data.
-            for (i = 0; i < pixelCount; i++, p += 4)
-               *p = stbi__get8(s);
+            if (ri->bits_per_channel == 16) {    // output bpc
+               stbi__uint16 *q = ((stbi__uint16 *) out) + channel;
+               for (i = 0; i < pixelCount; i++, q += 4)
+                  *q = (stbi__uint16) stbi__get16be(s);
+            } else {
+               stbi_uc *p = out+channel;
+               if (bitdepth == 16) {  // input bpc
+                  for (i = 0; i < pixelCount; i++, p += 4)
+                     *p = (stbi_uc) (stbi__get16be(s) >> 8);
+               } else {
+                  for (i = 0; i < pixelCount; i++, p += 4)
+                     *p = stbi__get8(s);
+               }
+            }
+         }
+      }
+   }
+
+   // remove weird white matte from PSD
+   if (channelCount >= 4) {
+      if (ri->bits_per_channel == 16) {
+         for (i=0; i < w*h; ++i) {
+            stbi__uint16 *pixel = (stbi__uint16 *) out + 4*i;
+            if (pixel[3] != 0 && pixel[3] != 65535) {
+               float a = pixel[3] / 65535.0f;
+               float ra = 1.0f / a;
+               float inv_a = 65535.0f * (1 - ra);
+               pixel[0] = (stbi__uint16) (pixel[0]*ra + inv_a);
+               pixel[1] = (stbi__uint16) (pixel[1]*ra + inv_a);
+               pixel[2] = (stbi__uint16) (pixel[2]*ra + inv_a);
+            }
+         }
+      } else {
+         for (i=0; i < w*h; ++i) {
+            unsigned char *pixel = out + 4*i;
+            if (pixel[3] != 0 && pixel[3] != 255) {
+               float a = pixel[3] / 255.0f;
+               float ra = 1.0f / a;
+               float inv_a = 255.0f * (1 - ra);
+               pixel[0] = (unsigned char) (pixel[0]*ra + inv_a);
+               pixel[1] = (unsigned char) (pixel[1]*ra + inv_a);
+               pixel[2] = (unsigned char) (pixel[2]*ra + inv_a);
+            }
          }
       }
    }
 
+   // convert to desired output format
    if (req_comp && req_comp != 4) {
-      out = stbi__convert_format(out, 4, req_comp, w, h);
+      if (ri->bits_per_channel == 16)
+         out = (stbi_uc *) stbi__convert_format16((stbi__uint16 *) out, 4, req_comp, w, h);
+      else
+         out = stbi__convert_format(out, 4, req_comp, w, h);
       if (out == NULL) return out; // stbi__convert_format frees input on failure
    }
 
@@ -5350,7 +6035,6 @@ static stbi_uc *stbi__pic_load_core(stbi__context *s,int width,int height,int *c
 
                   if (count >= 128) { // Repeated
                      stbi_uc value[4];
-                     int i;
 
                      if (count==128)
                         count = stbi__get16be(s);
@@ -5383,10 +6067,13 @@ static stbi_uc *stbi__pic_load_core(stbi__context *s,int width,int height,int *c
    return result;
 }
 
-static stbi_uc *stbi__pic_load(stbi__context *s,int *px,int *py,int *comp,int req_comp)
+static void *stbi__pic_load(stbi__context *s,int *px,int *py,int *comp,int req_comp, stbi__result_info *ri)
 {
    stbi_uc *result;
-   int i, x,y;
+   int i, x,y, internal_comp;
+   STBI_NOTUSED(ri);
+
+   if (!comp) comp = &internal_comp;
 
    for (i=0; i<92; ++i)
       stbi__get8(s);
@@ -5394,14 +6081,14 @@ static stbi_uc *stbi__pic_load(stbi__context *s,int *px,int *py,int *comp,int re
    x = stbi__get16be(s);
    y = stbi__get16be(s);
    if (stbi__at_eof(s))  return stbi__errpuc("bad file","file too short (pic header)");
-   if ((1 << 28) / x < y) return stbi__errpuc("too large", "Image too large to decode");
+   if (!stbi__mad3sizes_valid(x, y, 4, 0)) return stbi__errpuc("too large", "PIC image too large to decode");
 
    stbi__get32be(s); //skip `ratio'
    stbi__get16be(s); //skip `fields'
    stbi__get16be(s); //skip `pad'
 
    // intermediate buffer is RGBA
-   result = (stbi_uc *) stbi__malloc(x*y*4);
+   result = (stbi_uc *) stbi__malloc_mad3(x, y, 4, 0);
    memset(result, 0xff, x*y*4);
 
    if (!stbi__pic_load_core(s,x,y,comp, result)) {
@@ -5439,10 +6126,12 @@ typedef struct
 {
    int w,h;
    stbi_uc *out;                 // output buffer (always 4 components)
+   stbi_uc *background;          // The current "background" as far as a gif is concerned
+   stbi_uc *history; 
    int flags, bgindex, ratio, transparent, eflags;
    stbi_uc  pal[256][4];
    stbi_uc lpal[256][4];
-   stbi__gif_lzw codes[4096];
+   stbi__gif_lzw codes[8192];
    stbi_uc *color_table;
    int parse, step;
    int lflags;
@@ -5450,6 +6139,7 @@ typedef struct
    int max_x, max_y;
    int cur_x, cur_y;
    int line_size;
+   int delay;
 } stbi__gif;
 
 static int stbi__gif_test_raw(stbi__context *s)
@@ -5510,19 +6200,22 @@ static int stbi__gif_header(stbi__context *s, stbi__gif *g, int *comp, int is_in
 
 static int stbi__gif_info_raw(stbi__context *s, int *x, int *y, int *comp)
 {
-   stbi__gif g;
-   if (!stbi__gif_header(s, &g, comp, 1)) {
+   stbi__gif* g = (stbi__gif*) stbi__malloc(sizeof(stbi__gif)); // heap-allocated scratch state; may be NULL on out-of-memory
+   if (!g || !stbi__gif_header(s, g, comp, 1)) { // fail on allocation failure too, never pass a NULL g to the header parser
+      STBI_FREE(g); // g may be NULL here; freeing NULL is already relied on elsewhere in this file
       stbi__rewind( s );
       return 0;
    }
-   if (x) *x = g.w;
-   if (y) *y = g.h;
+   if (x) *x = g->w; // x and y are optional outputs
+   if (y) *y = g->h;
+   STBI_FREE(g);
    return 1;
 }
 
 static void stbi__out_gif_code(stbi__gif *g, stbi__uint16 code)
 {
    stbi_uc *p, *c;
+   int idx; 
 
    // recurse to decode the prefixes, since the linked-list is backwards,
    // and working backwards through an interleaved image would be nasty
@@ -5531,10 +6224,12 @@ static void stbi__out_gif_code(stbi__gif *g, stbi__uint16 code)
 
    if (g->cur_y >= g->max_y) return;
 
-   p = &g->out[g->cur_x + g->cur_y];
-   c = &g->color_table[g->codes[code].suffix * 4];
+   idx = g->cur_x + g->cur_y; 
+   p = &g->out[idx];
+   g->history[idx / 4] = 1;  
 
-   if (c[3] >= 128) {
+   c = &g->color_table[g->codes[code].suffix * 4];
+   if (c[3] > 128) { // don't render transparent pixels; 
       p[0] = c[2];
       p[1] = c[1];
       p[2] = c[0];
@@ -5557,7 +6252,7 @@ static void stbi__out_gif_code(stbi__gif *g, stbi__uint16 code)
 static stbi_uc *stbi__process_gif_raster(stbi__context *s, stbi__gif *g)
 {
    stbi_uc lzw_cs;
-   stbi__int32 len, code;
+   stbi__int32 len, init_code;
    stbi__uint32 first;
    stbi__int32 codesize, codemask, avail, oldcode, bits, valid_bits, clear;
    stbi__gif_lzw *p;
@@ -5570,10 +6265,10 @@ static stbi_uc *stbi__process_gif_raster(stbi__context *s, stbi__gif *g)
    codemask = (1 << codesize) - 1;
    bits = 0;
    valid_bits = 0;
-   for (code = 0; code < clear; code++) {
-      g->codes[code].prefix = -1;
-      g->codes[code].first = (stbi_uc) code;
-      g->codes[code].suffix = (stbi_uc) code;
+   for (init_code = 0; init_code < clear; init_code++) {
+      g->codes[init_code].prefix = -1;
+      g->codes[init_code].first = (stbi_uc) init_code;
+      g->codes[init_code].suffix = (stbi_uc) init_code;
    }
 
    // support no starting clear code
@@ -5608,11 +6303,16 @@ static stbi_uc *stbi__process_gif_raster(stbi__context *s, stbi__gif *g)
                stbi__skip(s,len);
             return g->out;
          } else if (code <= avail) {
-            if (first) return stbi__errpuc("no clear code", "Corrupt GIF");
+            if (first) {
+               return stbi__errpuc("no clear code", "Corrupt GIF");
+            }
 
             if (oldcode >= 0) {
                p = &g->codes[avail++];
-               if (avail > 4096)        return stbi__errpuc("too many codes", "Corrupt GIF");
+               if (avail > 8192) {
+                  return stbi__errpuc("too many codes", "Corrupt GIF");
+               }
+
                p->prefix = (stbi__int16) oldcode;
                p->first = g->codes[oldcode].first;
                p->suffix = (code == avail) ? p->first : g->codes[code].first;
@@ -5634,43 +6334,70 @@ static stbi_uc *stbi__process_gif_raster(stbi__context *s, stbi__gif *g)
    }
 }
 
-static void stbi__fill_gif_background(stbi__gif *g)
-{
-   int i;
-   stbi_uc *c = g->pal[g->bgindex];
-   // @OPTIMIZE: write a dword at a time
-   for (i = 0; i < g->w * g->h * 4; i += 4) {
-      stbi_uc *p  = &g->out[i];
-      p[0] = c[2];
-      p[1] = c[1];
-      p[2] = c[0];
-      p[3] = c[3];
-   }
-}
-
 // this function is designed to support animated gifs, although stb_image doesn't support it
-static stbi_uc *stbi__gif_load_next(stbi__context *s, stbi__gif *g, int *comp, int req_comp)
+// two back is the image from two frames ago, used for a very specific disposal format
+static stbi_uc *stbi__gif_load_next(stbi__context *s, stbi__gif *g, int *comp, int req_comp, stbi_uc *two_back)
 {
-   int i;
-   stbi_uc *old_out = 0;
+   int dispose; 
+   int first_frame; 
+   int pi; 
+   int pcount; 
 
+   // on first frame, any non-written pixels get the background colour (non-transparent)
+   first_frame = 0; 
    if (g->out == 0) {
       if (!stbi__gif_header(s, g, comp,0))     return 0; // stbi__g_failure_reason set by stbi__gif_header
       g->out = (stbi_uc *) stbi__malloc(4 * g->w * g->h);
+      g->background = (stbi_uc *) stbi__malloc(4 * g->w * g->h); 
+      g->history = (stbi_uc *) stbi__malloc(g->w * g->h); 
       if (g->out == 0)                      return stbi__errpuc("outofmem", "Out of memory");
-      stbi__fill_gif_background(g);
+
+      // image is treated as "transparent" at the start - i.e., nothing overwrites the current background;
+      // background colour is only used for pixels that are not rendered in the first frame; after that, "background"
+      // refers to the colour that was there in the previous frame.
+      memset( g->out, 0x00, 4 * g->w * g->h ); 
+      memset( g->background, 0x00, 4 * g->w * g->h ); // state of the background (starts transparent)
+      memset( g->history, 0x00, g->w * g->h );        // pixels that were affected previous frame
+      first_frame = 1; 
    } else {
-      // animated-gif-only path
-      if (((g->eflags & 0x1C) >> 2) == 3) {
-         old_out = g->out;
-         g->out = (stbi_uc *) stbi__malloc(4 * g->w * g->h);
-         if (g->out == 0)                   return stbi__errpuc("outofmem", "Out of memory");
-         memcpy(g->out, old_out, g->w*g->h*4);
+      // second or later frame - how do we dispose of the previous one?
+      dispose = (g->eflags & 0x1C) >> 2; 
+      pcount = g->w * g->h; 
+
+      if ((dispose == 3) && (two_back == 0)) {
+         dispose = 2; // if I don't have an image to revert back to, default to the old background
       }
+
+      if (dispose == 3) { // use previous graphic
+         for (pi = 0; pi < pcount; ++pi) {
+            if (g->history[pi]) {
+               memcpy( &g->out[pi * 4], &two_back[pi * 4], 4 ); 
+            }
+         }
+      } else if (dispose == 2) { 
+         // restore what was changed last frame to background before that frame; 
+         for (pi = 0; pi < pcount; ++pi) {
+            if (g->history[pi]) {
+               memcpy( &g->out[pi * 4], &g->background[pi * 4], 4 ); 
+            }
+         }
+      } else {
+         // This is a non-disposal case either way, so just
+         // leave the pixels as they are, and they will become the new background
+         // 1: do not dispose
+         // 0:  not specified.
+      }
+
+      // background is what out is after the undoing of the previous frame;
+      memcpy( g->background, g->out, 4 * g->w * g->h ); 
    }
 
+   // clear my history; 
+   memset( g->history, 0x00, g->w * g->h );        // pixels that were affected previous frame
+
    for (;;) {
-      switch (stbi__get8(s)) {
+      int tag = stbi__get8(s); 
+      switch (tag) {
          case 0x2C: /* Image Descriptor */
          {
             stbi__int32 x, y, w, h;
@@ -5705,38 +6432,60 @@ static stbi_uc *stbi__gif_load_next(stbi__context *s, stbi__gif *g, int *comp, i
                stbi__gif_parse_colortable(s,g->lpal, 2 << (g->lflags & 7), g->eflags & 0x01 ? g->transparent : -1);
                g->color_table = (stbi_uc *) g->lpal;
             } else if (g->flags & 0x80) {
-               for (i=0; i < 256; ++i)  // @OPTIMIZE: stbi__jpeg_reset only the previous transparent
-                  g->pal[i][3] = 255;
-               if (g->transparent >= 0 && (g->eflags & 0x01))
-                  g->pal[g->transparent][3] = 0;
                g->color_table = (stbi_uc *) g->pal;
             } else
-               return stbi__errpuc("missing color table", "Corrupt GIF");
-
+               return stbi__errpuc("missing color table", "Corrupt GIF");            
+            
             o = stbi__process_gif_raster(s, g);
             if (o == NULL) return NULL;
 
-            if (req_comp && req_comp != 4)
-               o = stbi__convert_format(o, 4, req_comp, g->w, g->h);
+            // if this was the first frame, 
+            pcount = g->w * g->h; 
+            if (first_frame && (g->bgindex > 0)) {
+               // if first frame, any pixel not drawn to gets the background color
+               for (pi = 0; pi < pcount; ++pi) {
+                  if (g->history[pi] == 0) {
+                     g->pal[g->bgindex][3] = 255; // just in case it was made transparent, undo that; It will be reset next frame if need be; 
+                     memcpy( &g->out[pi * 4], &g->pal[g->bgindex], 4 ); 
+                  }
+               }
+            }
+
             return o;
          }
 
          case 0x21: // Comment Extension.
          {
             int len;
-            if (stbi__get8(s) == 0xF9) { // Graphic Control Extension.
+            int ext = stbi__get8(s); 
+            if (ext == 0xF9) { // Graphic Control Extension.
                len = stbi__get8(s);
                if (len == 4) {
                   g->eflags = stbi__get8(s);
-                  stbi__get16le(s); // delay
-                  g->transparent = stbi__get8(s);
+                  g->delay = 10 * stbi__get16le(s); // delay - 1/100th of a second, saving as 1/1000ths.
+
+                  // unset old transparent
+                  if (g->transparent >= 0) {
+                     g->pal[g->transparent][3] = 255; 
+                  } 
+                  if (g->eflags & 0x01) {
+                     g->transparent = stbi__get8(s);
+                     if (g->transparent >= 0) {
+                        g->pal[g->transparent][3] = 0; 
+                     }
+                  } else {
+                     // don't need transparent
+                     stbi__skip(s, 1); 
+                     g->transparent = -1; 
+                  }
                } else {
                   stbi__skip(s, len);
                   break;
                }
-            }
-            while ((len = stbi__get8(s)) != 0)
+            } 
+            while ((len = stbi__get8(s)) != 0) {
                stbi__skip(s, len);
+            }
             break;
          }
 
@@ -5749,19 +6498,90 @@ static stbi_uc *stbi__gif_load_next(stbi__context *s, stbi__gif *g, int *comp, i
    }
 }
 
-static stbi_uc *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+// Decode every frame of an animated GIF into one allocation of 4-byte pixels
+// (frame i starts at i * w * h * 4); *z gets the frame count and, when delays
+// is non-NULL, *delays gets one g.delay per frame. Allocation failure frees all.
+static void *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y, int *z, int *comp, int req_comp)
+{
+   if (stbi__gif_test(s)) {
+      int layers = 0, stride, oom = 0;
+      stbi_uc *u = 0, *out = 0, *two_back = 0;
+      stbi__gif g;
+      memset(&g, 0, sizeof(g));
+      if (delays) *delays = 0;
+
+      do {
+         u = stbi__gif_load_next(s, &g, comp, req_comp, two_back);
+         if (u == (stbi_uc *) s) u = 0;  // end of animated gif marker
+
+         if (u) {
+            void *grown;
+            *x = g.w;
+            *y = g.h;
+            ++layers;
+            stride = g.w * g.h * 4;
+
+            // grow through a temporary so the old block can still be freed on failure
+            grown = out ? STBI_REALLOC( out, layers * stride ) : stbi__malloc( layers * stride );
+            if (!grown) { oom = 1; break; }
+            out = (stbi_uc*) grown;
+            if (delays) {
+               grown = *delays ? STBI_REALLOC( *delays, sizeof(int) * layers ) : stbi__malloc( layers * sizeof(int) );
+               if (!grown) { oom = 1; break; }
+               *delays = (int*) grown;
+               (*delays)[layers - 1U] = g.delay;
+            }
+            memcpy( out + ((layers - 1) * stride), u, stride );
+            // two_back is passed to the next stbi__gif_load_next call; it has to be a
+            // stored frame inside out (the one before the frame just copied), never out - N
+            if (layers >= 2) two_back = out + (layers - 2) * stride;
+         }
+      } while (u != 0);
+
+      // free the decoder's temp buffers; every decoded frame was copied into out
+      STBI_FREE(g.out);
+      STBI_FREE(g.history);
+      STBI_FREE(g.background);
+
+      if (oom) {
+         STBI_FREE(out);
+         if (delays) { STBI_FREE(*delays); *delays = 0; }
+         return stbi__errpuc("outofmem", "Out of memory");
+      }
+
+      // do the final conversion after loading everything;
+      if (req_comp && req_comp != 4)
+         out = stbi__convert_format(out, 4, req_comp, layers * g.w, g.h);
+
+      *z = layers;
+      return out;
+   } else {
+      return stbi__errpuc("not GIF", "Image was not as a gif type.");
+   }
+}
+
+static void *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
 {
    stbi_uc *u = 0;
    stbi__gif g;
    memset(&g, 0, sizeof(g));
 
-   u = stbi__gif_load_next(s, &g, comp, req_comp);
+   u = stbi__gif_load_next(s, &g, comp, req_comp, 0);
    if (u == (stbi_uc *) s) u = 0;  // end of animated gif marker
    if (u) {
       *x = g.w;
       *y = g.h;
+      // Single-frame load: the frame is converted only after it decoded
+      // successfully, mirroring what the multi-frame loader does at its end.
+      if (req_comp && req_comp != 4)
+         u = stbi__convert_format(u, 4, req_comp, g.w, g.h);
+      g.out = 0; // frame handed to the caller or to stbi__convert_format (assumed to release its input - confirm)
    }
 
+   // a failed decode can leave g.out allocated: free it (no leak) with the other scratch buffers
+   STBI_FREE(g.out);
+   STBI_FREE(g.history);
+   STBI_FREE(g.background);
    return u;
 }
 
@@ -5775,20 +6595,24 @@ static int stbi__gif_info(stbi__context *s, int *x, int *y, int *comp)
 // Radiance RGBE HDR loader
 // originally by Nicolas Schulz
 #ifndef STBI_NO_HDR
-static int stbi__hdr_test_core(stbi__context *s)
+static int stbi__hdr_test_core(stbi__context *s, const char *signature) // returns 1 when the next bytes read from s equal signature, else 0
 {
-   const char *signature = "#?RADIANCE\n";
    int i;
    for (i=0; signature[i]; ++i)
       if (stbi__get8(s) != signature[i])
-         return 0;
+          return 0; // mismatch: give up without rewinding here (stbi__hdr_test rewinds afterwards)
+   stbi__rewind(s); // whole signature matched: rewind before reporting success
    return 1;
 }
 
 static int stbi__hdr_test(stbi__context* s)
 {
-   int r = stbi__hdr_test_core(s);
+   int r = stbi__hdr_test_core(s, "#?RADIANCE\n"); // first try the "#?RADIANCE" header line
    stbi__rewind(s);
+   if(!r) { // not RADIANCE: accept the "#?RGBE" header line as well
+       r = stbi__hdr_test_core(s, "#?RGBE\n");
+       stbi__rewind(s); // rewind after the second probe too, whatever its result
+   }
    return r;
 }
 
@@ -5842,7 +6666,7 @@ static void stbi__hdr_convert(float *output, stbi_uc *input, int req_comp)
    }
 }
 
-static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
 {
    char buffer[STBI__HDR_BUFLEN];
    char *token;
@@ -5853,10 +6677,12 @@ static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int re
    int len;
    unsigned char count, value;
    int i, j, k, c1,c2, z;
-
+   const char *headerToken;
+   STBI_NOTUSED(ri);
 
    // Check identifier
-   if (strcmp(stbi__hdr_gettoken(s,buffer), "#?RADIANCE") != 0)
+   headerToken = stbi__hdr_gettoken(s,buffer);
+   if (strcmp(headerToken, "#?RADIANCE") != 0 && strcmp(headerToken, "#?RGBE") != 0)
       return stbi__errpf("not HDR", "Corrupt HDR image");
 
    // Parse header
@@ -5885,8 +6711,13 @@ static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int re
    if (comp) *comp = 3;
    if (req_comp == 0) req_comp = 3;
 
+   if (!stbi__mad4sizes_valid(width, height, req_comp, sizeof(float), 0))
+      return stbi__errpf("too large", "HDR image is too large");
+
    // Read data
-   hdr_data = (float *) stbi__malloc(height * width * req_comp * sizeof(float));
+   hdr_data = (float *) stbi__malloc_mad4(width, height, req_comp, sizeof(float), 0);
+   if (!hdr_data)
+      return stbi__errpf("outofmem", "Out of memory");
 
    // Load image data
    // image data is stored as some number of sca
@@ -5925,20 +6756,29 @@ static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int re
          len <<= 8;
          len |= stbi__get8(s);
          if (len != width) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("invalid decoded scanline length", "corrupt HDR"); }
-         if (scanline == NULL) scanline = (stbi_uc *) stbi__malloc(width * 4);
+         if (scanline == NULL) {
+            scanline = (stbi_uc *) stbi__malloc_mad2(width, 4, 0);
+            if (!scanline) {
+               STBI_FREE(hdr_data);
+               return stbi__errpf("outofmem", "Out of memory");
+            }
+         }
 
          for (k = 0; k < 4; ++k) {
+            int nleft;
             i = 0;
-            while (i < width) {
+            while ((nleft = width - i) > 0) {
                count = stbi__get8(s);
                if (count > 128) {
                   // Run
                   value = stbi__get8(s);
                   count -= 128;
+                  if (count > nleft) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); }
                   for (z = 0; z < count; ++z)
                      scanline[i++ * 4 + k] = value;
                } else {
                   // Dump
+                  if (count > nleft) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); }
                   for (z = 0; z < count; ++z)
                      scanline[i++ * 4 + k] = stbi__get8(s);
                }
@@ -5947,7 +6787,8 @@ static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int re
          for (i=0; i < width; ++i)
             stbi__hdr_convert(hdr_data+(j*width + i)*req_comp, scanline + i*4, req_comp);
       }
-      STBI_FREE(scanline);
+      if (scanline)
+         STBI_FREE(scanline);
    }
 
    return hdr_data;
@@ -5958,8 +6799,13 @@ static int stbi__hdr_info(stbi__context *s, int *x, int *y, int *comp)
    char buffer[STBI__HDR_BUFLEN];
    char *token;
    int valid = 0;
+   int dummy;
+
+   if (!x) x = &dummy;
+   if (!y) y = &dummy;
+   if (!comp) comp = &dummy;
 
-   if (strcmp(stbi__hdr_gettoken(s,buffer), "#?RADIANCE") != 0) {
+   if (stbi__hdr_test(s) == 0) {
        stbi__rewind( s );
        return 0;
    }
@@ -5996,29 +6842,17 @@ static int stbi__hdr_info(stbi__context *s, int *x, int *y, int *comp)
 #ifndef STBI_NO_BMP
 static int stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp)
 {
-   int hsz;
-   if (stbi__get8(s) != 'B' || stbi__get8(s) != 'M') {
-       stbi__rewind( s );
-       return 0;
-   }
-   stbi__skip(s,12);
-   hsz = stbi__get32le(s);
-   if (hsz != 12 && hsz != 40 && hsz != 56 && hsz != 108 && hsz != 124) {
-       stbi__rewind( s );
-       return 0;
-   }
-   if (hsz == 12) {
-      *x = stbi__get16le(s);
-      *y = stbi__get16le(s);
-   } else {
-      *x = stbi__get32le(s);
-      *y = stbi__get32le(s);
-   }
-   if (stbi__get16le(s) != 1) {
-       stbi__rewind( s );
-       return 0;
-   }
-   *comp = stbi__get16le(s) / 8;
+   void *p;
+   stbi__bmp_data info;
+   // Reuse the shared header parser instead of a hand-rolled parse; only its verdict and s->img_x/img_y are kept.
+   info.all_a = 255;
+   p = stbi__bmp_parse_header(s, &info); // NULL means the header was rejected
+   stbi__rewind( s ); // rewound on both the success and the failure path
+   if (p == NULL)
+      return 0;
+   if (x) *x = s->img_x; // x, y and comp are optional outputs: NULL pointers are skipped
+   if (y) *y = s->img_y;
+   if (comp) *comp = info.ma ? 4 : 3; // 4 channels when info.ma is non-zero (presumably an alpha mask - confirm), else 3
    return 1;
 }
 #endif
@@ -6026,7 +6860,10 @@ static int stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp)
 #ifndef STBI_NO_PSD
 static int stbi__psd_info(stbi__context *s, int *x, int *y, int *comp)
 {
-   int channelCount;
+   int channelCount, dummy, depth;
+   if (!x) x = &dummy;
+   if (!y) y = &dummy;
+   if (!comp) comp = &dummy;
    if (stbi__get32be(s) != 0x38425053) {
        stbi__rewind( s );
        return 0;
@@ -6043,7 +6880,8 @@ static int stbi__psd_info(stbi__context *s, int *x, int *y, int *comp)
    }
    *y = stbi__get32be(s);
    *x = stbi__get32be(s);
-   if (stbi__get16be(s) != 8) {
+   depth = stbi__get16be(s);
+   if (depth != 8 && depth != 16) {
        stbi__rewind( s );
        return 0;
    }
@@ -6054,22 +6892,61 @@ static int stbi__psd_info(stbi__context *s, int *x, int *y, int *comp)
    *comp = 4;
    return 1;
 }
+
+// Probe a PSD header and report whether its depth field says 16.
+// Any failed check rewinds the stream and yields 0; on success (1) the
+// stream is left just past the depth field.
+static int stbi__psd_is16(stbi__context *s)
+{
+   int num_channels, bits;
+   int ok;
+
+   ok = (stbi__get32be(s) == 0x38425053);   // magic number, "8BPS" in ASCII
+   if (ok)
+      ok = (stbi__get16be(s) == 1);         // next 16-bit field must be 1
+   if (ok) {
+      stbi__skip(s, 6);
+      num_channels = stbi__get16be(s);
+      ok = !(num_channels < 0 || num_channels > 16);
+   }
+   if (ok) {
+      (void) stbi__get32be(s);              // the two 32-bit fields stbi__psd_info
+      (void) stbi__get32be(s);              // stores as *y and *x; unused here
+      bits = stbi__get16be(s);
+      ok = (bits == 16);
+   }
+   if (!ok)
+      stbi__rewind( s );
+   return ok;
+}
 #endif
 
 #ifndef STBI_NO_PIC
 static int stbi__pic_info(stbi__context *s, int *x, int *y, int *comp)
 {
-   int act_comp=0,num_packets=0,chained;
+   int act_comp=0,num_packets=0,chained,dummy;
    stbi__pic_packet packets[10];
 
-   stbi__skip(s, 92);
+   if (!x) x = &dummy;
+   if (!y) y = &dummy;
+   if (!comp) comp = &dummy;
+
+   if (!stbi__pic_is4(s,"\x53\x80\xF6\x34")) {
+      stbi__rewind(s);
+      return 0;
+   }
+
+   stbi__skip(s, 88);
 
    *x = stbi__get16be(s);
    *y = stbi__get16be(s);
-   if (stbi__at_eof(s))  return 0;
+   if (stbi__at_eof(s)) {
+      stbi__rewind( s);
+      return 0;
+   }
    if ( (*x) != 0 && (1 << 28) / (*x) < (*y)) {
-       stbi__rewind( s );
-       return 0;
+      stbi__rewind( s );
+      return 0;
    }
 
    stbi__skip(s, 8);
@@ -6129,16 +7006,22 @@ static int      stbi__pnm_test(stbi__context *s)
    return 1;
 }
 
-static stbi_uc *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+static void *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
 {
    stbi_uc *out;
+   STBI_NOTUSED(ri);
+
    if (!stbi__pnm_info(s, (int *)&s->img_x, (int *)&s->img_y, (int *)&s->img_n))
       return 0;
+
    *x = s->img_x;
    *y = s->img_y;
-   *comp = s->img_n;
+   if (comp) *comp = s->img_n;
+
+   if (!stbi__mad3sizes_valid(s->img_n, s->img_x, s->img_y, 0))
+      return stbi__errpuc("too large", "PNM too large");
 
-   out = (stbi_uc *) stbi__malloc(s->img_n * s->img_x * s->img_y);
+   out = (stbi_uc *) stbi__malloc_mad3(s->img_n, s->img_x, s->img_y, 0);
    if (!out) return stbi__errpuc("outofmem", "Out of memory");
    stbi__getn(s, out, s->img_n * s->img_x * s->img_y);
 
@@ -6156,8 +7039,16 @@ static int      stbi__pnm_isspace(char c)
 
 static void     stbi__pnm_skip_whitespace(stbi__context *s, char *c)
 {
-   while (!stbi__at_eof(s) && stbi__pnm_isspace(*c))
-      *c = (char) stbi__get8(s);
+   for (;;) { // alternate between whitespace runs and '#' comments until neither comes next
+      while (!stbi__at_eof(s) && stbi__pnm_isspace(*c))
+         *c = (char) stbi__get8(s);
+      // anything other than '#' (or end of input) is real data: stop with it in *c
+      if (stbi__at_eof(s) || *c != '#')
+         break;
+      // a '#' starts a comment: discard characters up to the end of that line
+      while (!stbi__at_eof(s) && *c != '\n' && *c != '\r' )
+         *c = (char) stbi__get8(s);
+   }
 }
 
 static int      stbi__pnm_isdigit(char c)
@@ -6179,16 +7070,20 @@ static int      stbi__pnm_getinteger(stbi__context *s, char *c)
 
 static int      stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp)
 {
-   int maxv;
+   int maxv, dummy;
    char c, p, t;
 
-   stbi__rewind( s );
+   if (!x) x = &dummy;
+   if (!y) y = &dummy;
+   if (!comp) comp = &dummy;
+
+   stbi__rewind(s);
 
    // Get identifier
    p = (char) stbi__get8(s);
    t = (char) stbi__get8(s);
    if (p != 'P' || (t != '5' && t != '6')) {
-       stbi__rewind( s );
+       stbi__rewind(s);
        return 0;
    }
 
@@ -6254,6 +7149,19 @@ static int stbi__info_main(stbi__context *s, int *x, int *y, int *comp)
    return stbi__err("unknown image type", "Image not of any known type, or corrupt");
 }
 
+// Ask each compiled-in 16-bit probe (PNG, then PSD) in turn; 1 on the first yes, else 0.
+static int stbi__is_16_main(stbi__context *s)
+{
+   int is16 = 0;
+   #ifndef STBI_NO_PNG
+   if (stbi__png_is16(s)) is16 = 1;
+   #endif
+   #ifndef STBI_NO_PSD
+   if (!is16 && stbi__psd_is16(s)) is16 = 1;
+   #endif
+   return is16;
+}
+
 #ifndef STBI_NO_STDIO
 STBIDEF int stbi_info(char const *filename, int *x, int *y, int *comp)
 {
@@ -6275,6 +7183,27 @@ STBIDEF int stbi_info_from_file(FILE *f, int *x, int *y, int *comp)
    fseek(f,pos,SEEK_SET);
    return r;
 }
+
+// Report whether the image file `filename` is 16 bits per channel; a file
+// that cannot be opened yields the stbi__err("can't fopen", ...) result.
+STBIDEF int stbi_is_16_bit(char const *filename)
+{
+    FILE *fp = stbi__fopen(filename, "rb");
+    int is16 = fp ? stbi_is_16_bit_from_file(fp) : stbi__err("can't fopen", "Unable to open file");
+    if (fp) fclose(fp);
+    return is16;
+}
+
+STBIDEF int stbi_is_16_bit_from_file(FILE *f) // 16-bit probe on an already-open FILE
+{
+   stbi__context ctx;
+   int is16;
+   long start = ftell(f); // remembered so the caller's file position can be put back
+   stbi__start_file(&ctx, f);
+   is16 = stbi__is_16_main(&ctx);
+   fseek(f, start, SEEK_SET);
+   return is16;
+}
 #endif // !STBI_NO_STDIO
 
 STBIDEF int stbi_info_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp)
@@ -6291,10 +7220,63 @@ STBIDEF int stbi_info_from_callbacks(stbi_io_callbacks const *c, void *user, int
    return stbi__info_main(&s,x,y,comp);
 }
 
+STBIDEF int stbi_is_16_bit_from_memory(stbi_uc const *buffer, int len)
+{
+   stbi__context ctx; // throwaway reader over the caller's len-byte buffer
+   stbi__start_mem(&ctx, buffer, len);
+   return stbi__is_16_main(&ctx);
+}
+
+STBIDEF int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *c, void *user)
+{
+   stbi__context ctx; // throwaway reader driven by the caller's I/O callbacks
+   stbi__start_callbacks(&ctx, (stbi_io_callbacks *) c, user);
+   return stbi__is_16_main(&ctx);
+}
+
 #endif // STB_IMAGE_IMPLEMENTATION
 
 /*
    revision history:
+      2.19  (2018-02-11) fix warning
+      2.18  (2018-01-30) fix warnings
+      2.17  (2018-01-29) change stbi__shiftsigned to avoid clang -O2 bug
+                         1-bit BMP
+                         *_is_16_bit api
+                         avoid warnings
+      2.16  (2017-07-23) all functions have 16-bit variants;
+                         STBI_NO_STDIO works again;
+                         compilation fixes;
+                         fix rounding in unpremultiply;
+                         optimize vertical flip;
+                         disable raw_len validation;
+                         documentation fixes
+      2.15  (2017-03-18) fix png-1,2,4 bug; now all Imagenet JPGs decode;
+                         warning fixes; disable run-time SSE detection on gcc;
+                         uniform handling of optional "return" values;
+                         thread-safe initialization of zlib tables
+      2.14  (2017-03-03) remove deprecated STBI_JPEG_OLD; fixes for Imagenet JPGs
+      2.13  (2016-11-29) add 16-bit API, only supported for PNG right now
+      2.12  (2016-04-02) fix typo in 2.11 PSD fix that caused crashes
+      2.11  (2016-04-02) allocate large structures on the stack
+                         remove white matting for transparent PSD
+                         fix reported channel count for PNG & BMP
+                         re-enable SSE2 in non-gcc 64-bit
+                         support RGB-formatted JPEG
+                         read 16-bit PNGs (only as 8-bit)
+      2.10  (2016-01-22) avoid warning introduced in 2.09 by STBI_REALLOC_SIZED
+      2.09  (2016-01-16) allow comments in PNM files
+                         16-bit-per-pixel TGA (not bit-per-component)
+                         info() for TGA could break due to .hdr handling
+                         info() for BMP now shares code instead of sloppy parse
+                         can use STBI_REALLOC_SIZED if allocator doesn't support realloc
+                         code cleanup
+      2.08  (2015-09-13) fix to 2.07 cleanup, reading RGB PSD as RGBA
+      2.07  (2015-09-13) fix compiler warnings
+                         partial animated GIF support
+                         limited 16-bpc PSD support
+                         #ifdef unused functions
+                         bug with < 92 byte PIC,PNM,HDR,TGA
       2.06  (2015-04-19) fix bug where PSD returns wrong '*comp' value
       2.05  (2015-04-19) fix bug in progressive JPEG handling, fix warning
       2.04  (2015-04-15) try to re-enable SIMD on MinGW 64-bit
@@ -6435,3 +7417,46 @@ STBIDEF int stbi_info_from_callbacks(stbi_io_callbacks const *c, void *user, int
       0.50  (2006-11-19)
               first released version
 */
+
+
+/*
+------------------------------------------------------------------------------
+This software is available under 2 licenses -- choose whichever you prefer.
+------------------------------------------------------------------------------
+ALTERNATIVE A - MIT License
+Copyright (c) 2017 Sean Barrett
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+------------------------------------------------------------------------------
+ALTERNATIVE B - Public Domain (www.unlicense.org)
+This is free and unencumbered software released into the public domain.
+Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
+software, either in source code form or as a compiled binary, for any purpose,
+commercial or non-commercial, and by any means.
+In jurisdictions that recognize copyright laws, the author or authors of this
+software dedicate any and all copyright interest in the software to the public
+domain. We make this dedication for the benefit of the public at large and to
+the detriment of our heirs and successors. We intend this dedication to be an
+overt act of relinquishment in perpetuity of all present and future rights to
+this software under copyright law.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+------------------------------------------------------------------------------
+*/
diff --git a/image.darknet/src/stb_image_write.h b/image.darknet/src/stb_image_write.h
index f5250b3..c05e958 100644
--- a/image.darknet/src/stb_image_write.h
+++ b/image.darknet/src/stb_image_write.h
@@ -1,7 +1,6 @@
-/* stb_image_write - v0.98 - public domain - http://nothings.org/stb/stb_image_write.h
-   writes out PNG/BMP/TGA images to C stdio - Sean Barrett 2010
-                            no warranty implied; use at your own risk
-
+/* stb_image_write - v1.09 - public domain - http://nothings.org/stb/stb_image_write.h
+   writes out PNG/BMP/TGA/JPEG/HDR images to C stdio - Sean Barrett 2010-2015
+                                     no warranty implied; use at your own risk
 
    Before #including,
 
@@ -11,31 +10,67 @@
 
    Will probably not work correctly with strict-aliasing optimizations.
 
+   If using a modern Microsoft Compiler, non-safe versions of CRT calls may cause 
+   compilation warnings or even errors. To avoid this, also before #including,
+
+       #define STBI_MSC_SECURE_CRT
+
 ABOUT:
 
    This header file is a library for writing images to C stdio. It could be
    adapted to write to memory or a general streaming interface; let me know.
 
    The PNG output is not optimal; it is 20-50% larger than the file
-   written by a decent optimizing implementation. This library is designed
-   for source code compactness and simplicitly, not optimal image file size
-   or run-time performance.
+   written by a decent optimizing implementation; though providing a custom
+   zlib compress function (see STBIW_ZLIB_COMPRESS) can mitigate that.
+   This library is designed for source code compactness and simplicity,
+   not optimal image file size or run-time performance.
 
 BUILDING:
 
    You can #define STBIW_ASSERT(x) before the #include to avoid using assert.h.
    You can #define STBIW_MALLOC(), STBIW_REALLOC(), and STBIW_FREE() to replace
    malloc,realloc,free.
-   You can define STBIW_MEMMOVE() to replace memmove()
+   You can #define STBIW_MEMMOVE() to replace memmove()
+   You can #define STBIW_ZLIB_COMPRESS to use a custom zlib-style compress function
+   for PNG compression (instead of the builtin one), it must have the following signature:
+   unsigned char * my_compress(unsigned char *data, int data_len, int *out_len, int quality);
+   The returned data will be freed with STBIW_FREE() (free() by default),
+   so it must be heap allocated with STBIW_MALLOC() (malloc() by default).
 
 USAGE:
 
-   There are four functions, one for each image file format:
+   There are five functions, one for each image file format:
 
      int stbi_write_png(char const *filename, int w, int h, int comp, const void *data, int stride_in_bytes);
      int stbi_write_bmp(char const *filename, int w, int h, int comp, const void *data);
      int stbi_write_tga(char const *filename, int w, int h, int comp, const void *data);
-     int stbi_write_hdr(char const *filename, int w, int h, int comp, const void *data);
+     int stbi_write_jpg(char const *filename, int w, int h, int comp, const void *data, int quality);
+     int stbi_write_hdr(char const *filename, int w, int h, int comp, const float *data);
+
+     void stbi_flip_vertically_on_write(int flag); // flag is non-zero to flip data vertically
+
+   There are also five equivalent functions that use an arbitrary write function. You are
+   expected to open/close your file-equivalent before and after calling these:
+
+     int stbi_write_png_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data, int stride_in_bytes);
+     int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data);
+     int stbi_write_tga_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data);
+     int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const float *data);
+     int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int quality);
+
+   where the callback is:
+      void stbi_write_func(void *context, void *data, int size);
+
+   You can configure it with these global variables:
+      int stbi_write_tga_with_rle;             // defaults to true; set to 0 to disable RLE
+      int stbi_write_png_compression_level;    // defaults to 8; set to higher for more compression
+      int stbi_write_force_png_filter;         // defaults to -1; set to 0..5 to force a filter mode
+
+
+   You can define STBI_WRITE_NO_STDIO to disable the file variant of these
+   functions, so the library will not use stdio.h at all. However, this will
+   also disable HDR writing, because it requires stdio for formatted output.
 
    Each function returns 0 on failure and non-0 on success.
 
@@ -59,63 +94,138 @@
    writer, both because it is in BGR order and because it may have padding
    at the end of the line.)
 
+   PNG allows you to set the deflate compression level by setting the global
+   variable 'stbi_write_png_compression_level' (it defaults to 8).
+
    HDR expects linear float data. Since the format is always 32-bit rgb(e)
    data, alpha (if provided) is discarded, and for monochrome data it is
    replicated across all three channels.
 
+   TGA supports RLE or non-RLE compressed data. To use non-RLE-compressed
+   data, set the global variable 'stbi_write_tga_with_rle' to 0.
+   
+   JPEG does ignore alpha channels in input data; quality is between 1 and 100.
+   Higher quality looks better but results in a bigger image.
+   JPEG baseline (no JPEG progressive).
+
 CREDITS:
 
-   PNG/BMP/TGA
-      Sean Barrett
-   HDR
-      Baldur Karlsson
-   TGA monochrome:
-      Jean-Sebastien Guay
-   misc enhancements:
-      Tim Kelsey
+
+   Sean Barrett           -    PNG/BMP/TGA 
+   Baldur Karlsson        -    HDR
+   Jean-Sebastien Guay    -    TGA monochrome
+   Tim Kelsey             -    misc enhancements
+   Alan Hickman           -    TGA RLE
+   Emmanuel Julien        -    initial file IO callback implementation
+   Jon Olick              -    original jo_jpeg.cpp code
+   Daniel Gibson          -    integrate JPEG, allow external zlib
+   Aarni Koskela          -    allow choosing PNG filter
+
    bugfixes:
       github:Chribba
+      Guillaume Chereau
+      github:jry2
+      github:romigrou
+      Sergio Gonzalez
+      Jonas Karlsson
+      Filip Wasil
+      Thatcher Ulrich
+      github:poppolopoppo
+      Patrick Boettcher
+      github:xeekworx
+      Cap Petschulat
+      Simon Rodriguez
+      Ivan Tikhonov
+      github:ignotion
+      Adam Schackart
+
+LICENSE
+
+  See end of file for license information.
+
 */
 
 #ifndef INCLUDE_STB_IMAGE_WRITE_H
 #define INCLUDE_STB_IMAGE_WRITE_H
 
+// if STB_IMAGE_WRITE_STATIC causes problems, try defining STBIWDEF to 'inline' or 'static inline'
+#ifndef STBIWDEF
+#ifdef STB_IMAGE_WRITE_STATIC
+#define STBIWDEF  static
+#else
 #ifdef __cplusplus
-extern "C" {
+#define STBIWDEF  extern "C"
+#else
+#define STBIWDEF  extern
+#endif
+#endif
 #endif
 
-extern int stbi_write_png(char const *filename, int w, int h, int comp, const void  *data, int stride_in_bytes);
-extern int stbi_write_bmp(char const *filename, int w, int h, int comp, const void  *data);
-extern int stbi_write_tga(char const *filename, int w, int h, int comp, const void  *data);
-extern int stbi_write_hdr(char const *filename, int w, int h, int comp, const float *data);
+#ifndef STB_IMAGE_WRITE_STATIC  // C++ forbids static forward declarations
+extern int stbi_write_tga_with_rle;
+extern int stbi_write_png_compression_level;
+extern int stbi_write_force_png_filter;
+#endif
 
-#ifdef __cplusplus
-}
+#ifndef STBI_WRITE_NO_STDIO
+STBIWDEF int stbi_write_png(char const *filename, int w, int h, int comp, const void  *data, int stride_in_bytes);
+STBIWDEF int stbi_write_bmp(char const *filename, int w, int h, int comp, const void  *data);
+STBIWDEF int stbi_write_tga(char const *filename, int w, int h, int comp, const void  *data);
+STBIWDEF int stbi_write_hdr(char const *filename, int w, int h, int comp, const float *data);
+STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void  *data, int quality);
 #endif
 
+typedef void stbi_write_func(void *context, void *data, int size);
+
+STBIWDEF int stbi_write_png_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data, int stride_in_bytes);
+STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data);
+STBIWDEF int stbi_write_tga_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data);
+STBIWDEF int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const float *data);
+STBIWDEF int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void  *data, int quality);
+
+STBIWDEF void stbi_flip_vertically_on_write(int flip_boolean);
+
 #endif//INCLUDE_STB_IMAGE_WRITE_H
 
 #ifdef STB_IMAGE_WRITE_IMPLEMENTATION
 
+#ifdef _WIN32
+   #ifndef _CRT_SECURE_NO_WARNINGS
+   #define _CRT_SECURE_NO_WARNINGS
+   #endif
+   #ifndef _CRT_NONSTDC_NO_DEPRECATE
+   #define _CRT_NONSTDC_NO_DEPRECATE
+   #endif
+#endif
+
+#ifndef STBI_WRITE_NO_STDIO
+#include <stdio.h>
+#endif // STBI_WRITE_NO_STDIO
+
 #include <stdarg.h>
 #include <stdlib.h>
-#include <stdio.h>
 #include <string.h>
 #include <math.h>
 
-#if defined(STBIW_MALLOC) && defined(STBIW_FREE) && defined(STBIW_REALLOC)
+#if defined(STBIW_MALLOC) && defined(STBIW_FREE) && (defined(STBIW_REALLOC) || defined(STBIW_REALLOC_SIZED))
 // ok
-#elif !defined(STBIW_MALLOC) && !defined(STBIW_FREE) && !defined(STBIW_REALLOC)
+#elif !defined(STBIW_MALLOC) && !defined(STBIW_FREE) && !defined(STBIW_REALLOC) && !defined(STBIW_REALLOC_SIZED)
 // ok
 #else
-#error "Must define all or none of STBIW_MALLOC, STBIW_FREE, and STBIW_REALLOC."
+#error "Must define all or none of STBIW_MALLOC, STBIW_FREE, and STBIW_REALLOC (or STBIW_REALLOC_SIZED)."
 #endif
 
 #ifndef STBIW_MALLOC
-#define STBIW_MALLOC(sz)    malloc(sz)
-#define STBIW_REALLOC(p,sz) realloc(p,sz)
-#define STBIW_FREE(p)       free(p)
+#define STBIW_MALLOC(sz)        malloc(sz)
+#define STBIW_REALLOC(p,newsz)  realloc(p,newsz)
+#define STBIW_FREE(p)           free(p)
+#endif
+
+#ifndef STBIW_REALLOC_SIZED
+#define STBIW_REALLOC_SIZED(p,oldsz,newsz) STBIW_REALLOC(p,newsz)
 #endif
+
+
 #ifndef STBIW_MEMMOVE
 #define STBIW_MEMMOVE(a,b,sz) memmove(a,b,sz)
 #endif
@@ -126,22 +236,90 @@ extern int stbi_write_hdr(char const *filename, int w, int h, int comp, const fl
 #define STBIW_ASSERT(x) assert(x)
 #endif
 
+#define STBIW_UCHAR(x) (unsigned char) ((x) & 0xff)
+
+#ifdef STB_IMAGE_WRITE_STATIC
+static int stbi__flip_vertically_on_write=0; // changed only through stbi_flip_vertically_on_write()
+static int stbi_write_png_compression_level = 8;
+static int stbi_write_tga_with_rle = 1;
+static int stbi_write_force_png_filter = -1;
+#else
+int stbi_write_png_compression_level = 8; // the three documented knobs keep external linkage
+static int stbi__flip_vertically_on_write=0; // private state: the header declares no extern for it
+int stbi_write_tga_with_rle = 1;
+int stbi_write_force_png_filter = -1;
+#endif
+
+STBIWDEF void stbi_flip_vertically_on_write(int flag)
+{
+   stbi__flip_vertically_on_write = flag; // only records the flag; per the header notes, non-zero means flip data vertically
+}
+
+typedef struct
+{
+   stbi_write_func *func; // output callback, invoked as func(context, data, size)
+   void *context; // opaque pointer passed back to func as its first argument
+} stbi__write_context;
+
+// point a write context at a caller-supplied callback and its opaque argument
+static void stbi__start_write_callbacks(stbi__write_context *s, stbi_write_func *c, void *context)
+{
+   s->context = context;
+   s->func    = c;
+}
+
+#ifndef STBI_WRITE_NO_STDIO
+
+static void stbi__stdio_write(void *context, void *data, int size)
+{
+   fwrite(data,1,size,(FILE*) context); // context is the FILE* installed by stbi__start_write_file; fwrite's result is not checked
+}
+
+// open filename for binary writing and hook the context up to it; returns 1 if the open worked, else 0
+static int stbi__start_write_file(stbi__write_context *s, const char *filename)
+{
+   FILE *fp;
+#ifdef STBI_MSC_SECURE_CRT
+   if (fopen_s(&fp, filename, "wb") != 0) fp = NULL;
+#else
+   fp = fopen(filename, "wb");
+#endif
+   stbi__start_write_callbacks(s, stbi__stdio_write, (void *) fp);
+   return fp ? 1 : 0;
+}
+
+static void stbi__end_write_file(stbi__write_context *s)
+{
+   fclose((FILE *)s->context); // s->context holds the FILE* stored by stbi__start_write_file
+}
+
+#endif // !STBI_WRITE_NO_STDIO
+
 typedef unsigned int stbiw_uint32;
 typedef int stb_image_write_test[sizeof(stbiw_uint32)==4 ? 1 : -1];
 
-static void writefv(FILE *f, const char *fmt, va_list v)
+static void stbiw__writefv(stbi__write_context *s, const char *fmt, va_list v)
 {
    while (*fmt) {
       switch (*fmt++) {
          case ' ': break;
-         case '1': { unsigned char x = (unsigned char) va_arg(v, int); fputc(x,f); break; }
-         case '2': { int x = va_arg(v,int); unsigned char b[2];
-                     b[0] = (unsigned char) x; b[1] = (unsigned char) (x>>8);
-                     fwrite(b,2,1,f); break; }
-         case '4': { stbiw_uint32 x = va_arg(v,int); unsigned char b[4];
-                     b[0]=(unsigned char)x; b[1]=(unsigned char)(x>>8);
-                     b[2]=(unsigned char)(x>>16); b[3]=(unsigned char)(x>>24);
-                     fwrite(b,4,1,f); break; }
+         case '1': { unsigned char x = STBIW_UCHAR(va_arg(v, int));
+                     s->func(s->context,&x,1);
+                     break; }
+         case '2': { int x = va_arg(v,int);
+                     unsigned char b[2];
+                     b[0] = STBIW_UCHAR(x);
+                     b[1] = STBIW_UCHAR(x>>8);
+                     s->func(s->context,b,2);
+                     break; }
+         case '4': { stbiw_uint32 x = va_arg(v,int);
+                     unsigned char b[4];
+                     b[0]=STBIW_UCHAR(x);
+                     b[1]=STBIW_UCHAR(x>>8);
+                     b[2]=STBIW_UCHAR(x>>16);
+                     b[3]=STBIW_UCHAR(x>>24);
+                     s->func(s->context,b,4);
+                     break; }
          default:
             STBIW_ASSERT(0);
             return;
@@ -149,22 +327,70 @@ static void writefv(FILE *f, const char *fmt, va_list v)
    }
 }
 
-static void write3(FILE *f, unsigned char a, unsigned char b, unsigned char c)
+static void stbiw__writef(stbi__write_context *s, const char *fmt, ...)
+{
+   va_list v;
+   va_start(v, fmt);
+   stbiw__writefv(s, fmt, v);
+   va_end(v);
+}
+
+static void stbiw__putc(stbi__write_context *s, unsigned char c)
+{
+   s->func(s->context, &c, 1);
+}
+
+static void stbiw__write3(stbi__write_context *s, unsigned char a, unsigned char b, unsigned char c)
 {
    unsigned char arr[3];
    arr[0] = a, arr[1] = b, arr[2] = c;
-   fwrite(arr, 3, 1, f);
+   s->func(s->context, arr, 3);
 }
 
-static void write_pixels(FILE *f, int rgb_dir, int vdir, int x, int y, int comp, void *data, int write_alpha, int scanline_pad, int expand_mono)
+static void stbiw__write_pixel(stbi__write_context *s, int rgb_dir, int comp, int write_alpha, int expand_mono, unsigned char *d)
 {
    unsigned char bg[3] = { 255, 0, 255}, px[3];
+   int k;
+   // write_alpha < 0: the alpha byte d[comp-1] is written before the colour; > 0: after it; == 0: no alpha byte is written.
+   if (write_alpha < 0)
+      s->func(s->context, &d[comp - 1], 1);
+   // rgb_dir selects the channel order d[1-rgb_dir], d[1], d[1+rgb_dir]; the BMP and TGA writers pass -1, i.e. d[2], d[1], d[0].
+   switch (comp) {
+      case 2: // 2 channels = mono + alpha, alpha is written separately, so same as 1-channel case
+      case 1:
+         if (expand_mono)
+            stbiw__write3(s, d[0], d[0], d[0]); // monochrome bmp
+         else
+            s->func(s->context, d, 1);  // monochrome TGA
+         break;
+      case 4:
+         if (!write_alpha) {
+            // composite against pink background
+            for (k = 0; k < 3; ++k)
+               px[k] = bg[k] + ((d[k] - bg[k]) * d[3]) / 255;
+            stbiw__write3(s, px[1 - rgb_dir], px[1], px[1 + rgb_dir]);
+            break;
+         }
+         /* FALLTHROUGH */
+      case 3:
+         stbiw__write3(s, d[1 - rgb_dir], d[1], d[1 + rgb_dir]);
+         break;
+   }
+   if (write_alpha > 0)
+      s->func(s->context, &d[comp - 1], 1);
+}
+
+static void stbiw__write_pixels(stbi__write_context *s, int rgb_dir, int vdir, int x, int y, int comp, void *data, int write_alpha, int scanline_pad, int expand_mono)
+{
    stbiw_uint32 zero = 0;
-   int i,j,k, j_end;
+   int i,j, j_end;
 
    if (y <= 0)
       return;
 
+   if (stbi__flip_vertically_on_write)
+      vdir *= -1;
+
    if (vdir < 0)
       j_end = -1, j = y-1;
    else
@@ -173,73 +399,157 @@ static void write_pixels(FILE *f, int rgb_dir, int vdir, int x, int y, int comp,
    for (; j != j_end; j += vdir) {
       for (i=0; i < x; ++i) {
          unsigned char *d = (unsigned char *) data + (j*x+i)*comp;
-         if (write_alpha < 0)
-            fwrite(&d[comp-1], 1, 1, f);
-         switch (comp) {
-            case 1: fwrite(d, 1, 1, f);
-                    break;
-            case 2: if (expand_mono)
-                       write3(f, d[0],d[0],d[0]); // monochrome bmp
-                    else
-                       fwrite(d, 1, 1, f);  // monochrome TGA
-                    break;
-            case 4:
-               if (!write_alpha) {
-                  // composite against pink background
-                  for (k=0; k < 3; ++k)
-                     px[k] = bg[k] + ((d[k] - bg[k]) * d[3])/255;
-                  write3(f, px[1-rgb_dir],px[1],px[1+rgb_dir]);
-                  break;
-               }
-               /* FALLTHROUGH */
-            case 3:
-               write3(f, d[1-rgb_dir],d[1],d[1+rgb_dir]);
-               break;
-         }
-         if (write_alpha > 0)
-            fwrite(&d[comp-1], 1, 1, f);
+         stbiw__write_pixel(s, rgb_dir, comp, write_alpha, expand_mono, d);
       }
-      fwrite(&zero,scanline_pad,1,f);
+      s->func(s->context, &zero, scanline_pad);
    }
 }
 
-static int outfile(char const *filename, int rgb_dir, int vdir, int x, int y, int comp, int expand_mono, void *data, int alpha, int pad, const char *fmt, ...)
+static int stbiw__outfile(stbi__write_context *s, int rgb_dir, int vdir, int x, int y, int comp, int expand_mono, void *data, int alpha, int pad, const char *fmt, ...)
 {
-   FILE *f;
-   if (y < 0 || x < 0) return 0;
-   f = fopen(filename, "wb");
-   if (f) {
+   if (y < 0 || x < 0) {
+      return 0;
+   } else {
       va_list v;
       va_start(v, fmt);
-      writefv(f, fmt, v);
+      stbiw__writefv(s, fmt, v);
       va_end(v);
-      write_pixels(f,rgb_dir,vdir,x,y,comp,data,alpha,pad,expand_mono);
-      fclose(f);
+      stbiw__write_pixels(s,rgb_dir,vdir,x,y,comp,data,alpha,pad, expand_mono);
+      return 1;
    }
-   return f != NULL;
 }
 
-int stbi_write_bmp(char const *filename, int x, int y, int comp, const void *data)
+static int stbi_write_bmp_core(stbi__write_context *s, int x, int y, int comp, const void *data)
 {
    int pad = (-x*3) & 3;
-   return outfile(filename,-1,-1,x,y,comp,1,(void *) data,0,pad,
+   return stbiw__outfile(s,-1,-1,x,y,comp,1,(void *) data,0,pad,
            "11 4 22 4" "4 44 22 444444",
            'B', 'M', 14+40+(x*3+pad)*y, 0,0, 14+40,  // file header
             40, x,y, 1,24, 0,0,0,0,0,0);             // bitmap header
 }
 
-int stbi_write_tga(char const *filename, int x, int y, int comp, const void *data)
+STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data)
+{
+   stbi__write_context s;
+   stbi__start_write_callbacks(&s, func, context);
+   return stbi_write_bmp_core(&s, x, y, comp, data);
+}
+
+#ifndef STBI_WRITE_NO_STDIO
+STBIWDEF int stbi_write_bmp(char const *filename, int x, int y, int comp, const void *data)
+{
+   stbi__write_context s;
+   if (stbi__start_write_file(&s,filename)) {
+      int r = stbi_write_bmp_core(&s, x, y, comp, data);
+      stbi__end_write_file(&s);
+      return r;
+   } else
+      return 0;
+}
+#endif //!STBI_WRITE_NO_STDIO
+
+static int stbi_write_tga_core(stbi__write_context *s, int x, int y, int comp, void *data)
 {
    int has_alpha = (comp == 2 || comp == 4);
    int colorbytes = has_alpha ? comp-1 : comp;
    int format = colorbytes < 2 ? 3 : 2; // 3 color channels (RGB/RGBA) = 2, 1 color channel (Y/YA) = 3
-   return outfile(filename, -1,-1, x, y, comp, 0, (void *) data, has_alpha, 0,
-                  "111 221 2222 11", 0,0,format, 0,0,0, 0,0,x,y, (colorbytes+has_alpha)*8, has_alpha*8);
+   // Writes a TGA image through s. Returns 0 for negative dimensions; otherwise 1 (the non-RLE path returns stbiw__outfile's result).
+   if (y < 0 || x < 0)
+      return 0;
+
+   if (!stbi_write_tga_with_rle) {
+      return stbiw__outfile(s, -1, -1, x, y, comp, 0, (void *) data, has_alpha, 0,
+         "111 221 2222 11", 0, 0, format, 0, 0, 0, 0, 0, x, y, (colorbytes + has_alpha) * 8, has_alpha * 8);
+   } else {
+      int i,j,k;
+      int jend, jdir;
+      // Same 18-byte header as the uncompressed path, except the third byte is format+8 to mark the pixel data as run-length encoded.
+      stbiw__writef(s, "111 221 2222 11", 0,0,format+8, 0,0,0, 0,0,x,y, (colorbytes + has_alpha) * 8, has_alpha * 8);
+      // Rows are visited from the last row to the first unless the vertical-flip flag is set.
+      if (stbi__flip_vertically_on_write) {
+         j = 0;
+         jend = y;
+         jdir = 1;
+      } else {
+         j = y-1;
+         jend = -1;
+         jdir = -1;
+      }
+      for (; j != jend; j += jdir) {
+         unsigned char *row = (unsigned char *) data + j * x * comp;
+         int len;
+         // Each row is split into packets of at most 128 pixels: either a raw run of differing pixels or one repeated pixel.
+         for (i = 0; i < x; i += len) {
+            unsigned char *begin = row + i * comp;
+            int diff = 1;
+            len = 1;
+            // diff != 0 selects a raw packet; diff == 0 means the first two pixels match, so a repeat packet is built instead.
+            if (i < x - 1) {
+               ++len;
+               diff = memcmp(begin, row + (i + 1) * comp, comp);
+               if (diff) {
+                  const unsigned char *prev = begin;
+                  for (k = i + 2; k < x && len < 128; ++k) {
+                     if (memcmp(prev, row + k * comp, comp)) {
+                        prev += comp;
+                        ++len;
+                     } else {
+                        --len;
+                        break;
+                     }
+                  }
+               } else {
+                  for (k = i + 2; k < x && len < 128; ++k) {
+                     if (!memcmp(begin, row + k * comp, comp)) {
+                        ++len;
+                     } else {
+                        break;
+                     }
+                  }
+               }
+            }
+            // Packet header byte: len-1 for raw data; len-129 for a repeat, which equals 128+(len-1) once reduced to 8 bits (assumes STBIW_UCHAR truncates to a byte - TODO confirm).
+            if (diff) {
+               unsigned char header = STBIW_UCHAR(len - 1);
+               s->func(s->context, &header, 1);
+               for (k = 0; k < len; ++k) {
+                  stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin + k * comp);
+               }
+            } else {
+               unsigned char header = STBIW_UCHAR(len - 129);
+               s->func(s->context, &header, 1);
+               stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin);
+            }
+         }
+      }
+   }
+   return 1;
+}
+
+STBIWDEF int stbi_write_tga_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data)
+{
+   stbi__write_context s;
+   stbi__start_write_callbacks(&s, func, context);
+   return stbi_write_tga_core(&s, x, y, comp, (void *) data);
+}
+
+#ifndef STBI_WRITE_NO_STDIO
+STBIWDEF int stbi_write_tga(char const *filename, int x, int y, int comp, const void *data)
+{
+   stbi__write_context s;
+   if (stbi__start_write_file(&s,filename)) {
+      int r = stbi_write_tga_core(&s, x, y, comp, (void *) data);
+      stbi__end_write_file(&s);
+      return r;
+   } else
+      return 0;
 }
+#endif
 
 // *************************************************************************************************
 // Radiance RGBE HDR writer
 // by Baldur Karlsson
+
 #define stbiw__max(a, b)  ((a) > (b) ? (a) : (b))
 
 void stbiw__linear_to_rgbe(unsigned char *rgbe, float *linear)
@@ -247,7 +557,7 @@ void stbiw__linear_to_rgbe(unsigned char *rgbe, float *linear)
    int exponent;
    float maxcomp = stbiw__max(linear[0], stbiw__max(linear[1], linear[2]));
 
-   if (maxcomp < 1e-32) {
+   if (maxcomp < 1e-32f) {
       rgbe[0] = rgbe[1] = rgbe[2] = rgbe[3] = 0;
    } else {
       float normalize = (float) frexp(maxcomp, &exponent) * 256.0f/maxcomp;
@@ -259,27 +569,27 @@ void stbiw__linear_to_rgbe(unsigned char *rgbe, float *linear)
    }
 }
 
-void stbiw__write_run_data(FILE *f, int length, unsigned char databyte)
+void stbiw__write_run_data(stbi__write_context *s, int length, unsigned char databyte)
 {
-   unsigned char lengthbyte = (unsigned char) (length+128);
+   unsigned char lengthbyte = STBIW_UCHAR(length+128);
    STBIW_ASSERT(length+128 <= 255);
-   fwrite(&lengthbyte, 1, 1, f);
-   fwrite(&databyte, 1, 1, f);
+   s->func(s->context, &lengthbyte, 1);
+   s->func(s->context, &databyte, 1);
 }
 
-void stbiw__write_dump_data(FILE *f, int length, unsigned char *data)
+void stbiw__write_dump_data(stbi__write_context *s, int length, unsigned char *data)
 {
-   unsigned char lengthbyte = (unsigned char )(length & 0xff);
+   unsigned char lengthbyte = STBIW_UCHAR(length);
    STBIW_ASSERT(length <= 128); // inconsistent with spec but consistent with official code
-   fwrite(&lengthbyte, 1, 1, f);
-   fwrite(data, length, 1, f);
+   s->func(s->context, &lengthbyte, 1);
+   s->func(s->context, data, length);
 }
 
-void stbiw__write_hdr_scanline(FILE *f, int width, int comp, unsigned char *scratch, const float *scanline)
+void stbiw__write_hdr_scanline(stbi__write_context *s, int width, int ncomp, unsigned char *scratch, float *scanline)
 {
    unsigned char scanlineheader[4] = { 2, 2, 0, 0 };
    unsigned char rgbe[4];
-   float linear[3] = {0};
+   float linear[3];
    int x;
 
    scanlineheader[2] = (width&0xff00)>>8;
@@ -288,31 +598,31 @@ void stbiw__write_hdr_scanline(FILE *f, int width, int comp, unsigned char *scra
    /* skip RLE for images too small or large */
    if (width < 8 || width >= 32768) {
       for (x=0; x < width; x++) {
-         switch (comp) {
+         switch (ncomp) {
             case 4: /* fallthrough */
-            case 3: linear[2] = scanline[x*comp + 2];
-                    linear[1] = scanline[x*comp + 1];
-                    linear[0] = scanline[x*comp + 0];
+            case 3: linear[2] = scanline[x*ncomp + 2];
+                    linear[1] = scanline[x*ncomp + 1];
+                    linear[0] = scanline[x*ncomp + 0];
                     break;
-            case 2: /* fallthrough */
-            case 1: linear[0] = linear[1] = linear[2] = scanline[x*comp + 0];
+            default:
+                    linear[0] = linear[1] = linear[2] = scanline[x*ncomp + 0];
                     break;
          }
          stbiw__linear_to_rgbe(rgbe, linear);
-         fwrite(rgbe, 4, 1, f);
+         s->func(s->context, rgbe, 4);
       }
    } else {
       int c,r;
       /* encode into scratch buffer */
       for (x=0; x < width; x++) {
-         switch(comp) {
+         switch(ncomp) {
             case 4: /* fallthrough */
-            case 3: linear[2] = scanline[x*comp + 2];
-                    linear[1] = scanline[x*comp + 1];
-                    linear[0] = scanline[x*comp + 0];
+            case 3: linear[2] = scanline[x*ncomp + 2];
+                    linear[1] = scanline[x*ncomp + 1];
+                    linear[0] = scanline[x*ncomp + 0];
                     break;
-            case 2: /* fallthrough */
-            case 1: linear[0] = linear[1] = linear[2] = scanline[x*comp + 0];
+            default:
+                    linear[0] = linear[1] = linear[2] = scanline[x*ncomp + 0];
                     break;
          }
          stbiw__linear_to_rgbe(rgbe, linear);
@@ -322,7 +632,7 @@ void stbiw__write_hdr_scanline(FILE *f, int width, int comp, unsigned char *scra
          scratch[x + width*3] = rgbe[3];
       }
 
-      fwrite(scanlineheader, 4, 1, f);
+      s->func(s->context, scanlineheader, 4);
 
       /* RLE each component separately */
       for (c=0; c < 4; c++) {
@@ -343,7 +653,7 @@ void stbiw__write_hdr_scanline(FILE *f, int width, int comp, unsigned char *scra
             while (x < r) {
                int len = r-x;
                if (len > 128) len = 128;
-               stbiw__write_dump_data(f, len, &comp[x]);
+               stbiw__write_dump_data(s, len, &comp[x]);
                x += len;
             }
             // if there's a run, output it
@@ -355,7 +665,7 @@ void stbiw__write_hdr_scanline(FILE *f, int width, int comp, unsigned char *scra
                while (x < r) {
                   int len = r-x;
                   if (len > 127) len = 127;
-                  stbiw__write_run_data(f, len, comp[x]);
+                  stbiw__write_run_data(s, len, comp[x]);
                   x += len;
                }
             }
@@ -364,28 +674,59 @@ void stbiw__write_hdr_scanline(FILE *f, int width, int comp, unsigned char *scra
    }
 }
 
-int stbi_write_hdr(char const *filename, int x, int y, int comp, const float *data)
+static int stbi_write_hdr_core(stbi__write_context *s, int x, int y, int comp, float *data)
 {
-   int i;
-   FILE *f;
-   if (y <= 0 || x <= 0 || data == NULL) return 0;
-   f = fopen(filename, "wb");
-   if (f) {
-      /* Each component is stored separately. Allocate scratch space for full output scanline. */
+   if (y <= 0 || x <= 0 || data == NULL)
+      return 0;
+   else {
+      // Each component is stored separately. Allocate scratch space for full output scanline.
       unsigned char *scratch = (unsigned char *) STBIW_MALLOC(x*4);
-      fprintf(f, "#?RADIANCE\n# Written by stb_image_write.h\nFORMAT=32-bit_rle_rgbe\n"      );
-      fprintf(f, "EXPOSURE=          1.0000000000000\n\n-Y %d +X %d\n"                 , y, x);
+      int i, len;
+      char buffer[128];
+      char header[] = "#?RADIANCE\n# Written by stb_image_write.h\nFORMAT=32-bit_rle_rgbe\n";
+      s->func(s->context, header, sizeof(header)-1);
+      // The dimension line is formatted into a local buffer; sprintf_s gets the buffer size explicitly so it also builds as C.
+#ifdef STBI_MSC_SECURE_CRT
+      len = sprintf_s(buffer, sizeof(buffer), "EXPOSURE=          1.0000000000000\n\n-Y %d +X %d\n", y, x);
+#else
+      len = sprintf(buffer, "EXPOSURE=          1.0000000000000\n\n-Y %d +X %d\n", y, x);
+#endif
+      s->func(s->context, buffer, len);
+      // Row r starts at data + comp*x*r; rows are emitted in reverse order when the vertical-flip flag is set.
       for(i=0; i < y; i++)
-         stbiw__write_hdr_scanline(f, x, comp, scratch, data + comp*i*x);
+         stbiw__write_hdr_scanline(s, x, comp, scratch, data + comp*x*(stbi__flip_vertically_on_write ? y-1-i : i));
       STBIW_FREE(scratch);
-      fclose(f);
+      return 1;
    }
-   return f != NULL;
 }
 
-/////////////////////////////////////////////////////////
-// PNG
+STBIWDEF int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const float *data)
+{
+   stbi__write_context s;
+   stbi__start_write_callbacks(&s, func, context);
+   return stbi_write_hdr_core(&s, x, y, comp, (float *) data);
+}
+
+#ifndef STBI_WRITE_NO_STDIO
+STBIWDEF int stbi_write_hdr(char const *filename, int x, int y, int comp, const float *data)
+{
+   stbi__write_context s;
+   if (stbi__start_write_file(&s,filename)) {
+      int r = stbi_write_hdr_core(&s, x, y, comp, (float *) data);
+      stbi__end_write_file(&s);
+      return r;
+   } else
+      return 0;
+}
+#endif // STBI_WRITE_NO_STDIO
+
 
+//////////////////////////////////////////////////////////////////////////////
+//
+// PNG writer
+//
+
+#ifndef STBIW_ZLIB_COMPRESS
 // stretchy buffer; stbiw__sbpush() == vector<>::push_back() -- stbiw__sbcount() == vector<>::size()
 #define stbiw__sbraw(a) ((int *) (a) - 2)
 #define stbiw__sbm(a)   stbiw__sbraw(a)[0]
@@ -402,7 +743,7 @@ int stbi_write_hdr(char const *filename, int x, int y, int comp, const float *da
 static void *stbiw__sbgrowf(void **arr, int increment, int itemsize)
 {
    int m = *arr ? 2*stbiw__sbm(*arr)+increment : increment+1;
-   void *p = STBIW_REALLOC(*arr ? stbiw__sbraw(*arr) : 0, itemsize * m + sizeof(int)*2);
+   void *p = STBIW_REALLOC_SIZED(*arr ? stbiw__sbraw(*arr) : 0, *arr ? (stbiw__sbm(*arr)*itemsize + sizeof(int)*2) : 0, itemsize * m + sizeof(int)*2);
    STBIW_ASSERT(p);
    if (p) {
       if (!*arr) ((int *) p)[1] = 0;
@@ -415,7 +756,7 @@ static void *stbiw__sbgrowf(void **arr, int increment, int itemsize)
 static unsigned char *stbiw__zlib_flushf(unsigned char *data, unsigned int *bitbuffer, int *bitcount)
 {
    while (*bitcount >= 8) {
-      stbiw__sbpush(data, (unsigned char) *bitbuffer);
+      stbiw__sbpush(data, STBIW_UCHAR(*bitbuffer));
       *bitbuffer >>= 8;
       *bitcount -= 8;
    }
@@ -466,8 +807,14 @@ static unsigned int stbiw__zhash(unsigned char *data)
 
 #define stbiw__ZHASH   16384
 
+#endif // STBIW_ZLIB_COMPRESS
+
 unsigned char * stbi_zlib_compress(unsigned char *data, int data_len, int *out_len, int quality)
 {
+#ifdef STBIW_ZLIB_COMPRESS
+   // user provided a zlib compress implementation, use that
+   return STBIW_ZLIB_COMPRESS(data, data_len, out_len, quality);
+#else // use builtin
    static unsigned short lengthc[] = { 3,4,5,6,7,8,9,10,11,13,15,17,19,23,27,31,35,43,51,59,67,83,99,115,131,163,195,227,258, 259 };
    static unsigned char  lengtheb[]= { 0,0,0,0,0,0,0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4,  4,  5,  5,  5,  5,  0 };
    static unsigned short distc[]   = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193,257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577, 32768 };
@@ -475,7 +822,9 @@ unsigned char * stbi_zlib_compress(unsigned char *data, int data_len, int *out_l
    unsigned int bitbuf=0;
    int i,j, bitcount=0;
    unsigned char *out = NULL;
-   unsigned char **hash_table[stbiw__ZHASH]; // 64KB on the stack!
+   unsigned char ***hash_table = (unsigned char***) STBIW_MALLOC(stbiw__ZHASH * sizeof(char**));
+   if (hash_table == NULL)
+      return NULL;
    if (quality < 5) quality = 5;
 
    stbiw__sbpush(out, 0x78);   // DEFLATE 32K window
@@ -547,43 +896,77 @@ unsigned char * stbi_zlib_compress(unsigned char *data, int data_len, int *out_l
 
    for (i=0; i < stbiw__ZHASH; ++i)
       (void) stbiw__sbfree(hash_table[i]);
+   STBIW_FREE(hash_table);
 
    {
       // compute adler32 on input
-      unsigned int i=0, s1=1, s2=0, blocklen = data_len % 5552;
-      int j=0;
+      unsigned int s1=1, s2=0;
+      int blocklen = (int) (data_len % 5552);
+      j=0;
       while (j < data_len) {
          for (i=0; i < blocklen; ++i) s1 += data[j+i], s2 += s1;
          s1 %= 65521, s2 %= 65521;
          j += blocklen;
          blocklen = 5552;
       }
-      stbiw__sbpush(out, (unsigned char) (s2 >> 8));
-      stbiw__sbpush(out, (unsigned char) s2);
-      stbiw__sbpush(out, (unsigned char) (s1 >> 8));
-      stbiw__sbpush(out, (unsigned char) s1);
+      stbiw__sbpush(out, STBIW_UCHAR(s2 >> 8));
+      stbiw__sbpush(out, STBIW_UCHAR(s2));
+      stbiw__sbpush(out, STBIW_UCHAR(s1 >> 8));
+      stbiw__sbpush(out, STBIW_UCHAR(s1));
    }
    *out_len = stbiw__sbn(out);
    // make returned pointer freeable
    STBIW_MEMMOVE(stbiw__sbraw(out), out, *out_len);
    return (unsigned char *) stbiw__sbraw(out);
+#endif // STBIW_ZLIB_COMPRESS
 }
 
-unsigned int stbiw__crc32(unsigned char *buffer, int len)
+static unsigned int stbiw__crc32(unsigned char *buffer, int len)
 {
-   static unsigned int crc_table[256];
+   static unsigned int crc_table[256] =
+   {
+      0x00000000, 0x77073096, 0xEE0E612C, 0x990951BA, 0x076DC419, 0x706AF48F, 0xE963A535, 0x9E6495A3,
+      0x0eDB8832, 0x79DCB8A4, 0xE0D5E91E, 0x97D2D988, 0x09B64C2B, 0x7EB17CBD, 0xE7B82D07, 0x90BF1D91,
+      0x1DB71064, 0x6AB020F2, 0xF3B97148, 0x84BE41DE, 0x1ADAD47D, 0x6DDDE4EB, 0xF4D4B551, 0x83D385C7,
+      0x136C9856, 0x646BA8C0, 0xFD62F97A, 0x8A65C9EC, 0x14015C4F, 0x63066CD9, 0xFA0F3D63, 0x8D080DF5,
+      0x3B6E20C8, 0x4C69105E, 0xD56041E4, 0xA2677172, 0x3C03E4D1, 0x4B04D447, 0xD20D85FD, 0xA50AB56B,
+      0x35B5A8FA, 0x42B2986C, 0xDBBBC9D6, 0xACBCF940, 0x32D86CE3, 0x45DF5C75, 0xDCD60DCF, 0xABD13D59,
+      0x26D930AC, 0x51DE003A, 0xC8D75180, 0xBFD06116, 0x21B4F4B5, 0x56B3C423, 0xCFBA9599, 0xB8BDA50F,
+      0x2802B89E, 0x5F058808, 0xC60CD9B2, 0xB10BE924, 0x2F6F7C87, 0x58684C11, 0xC1611DAB, 0xB6662D3D,
+      0x76DC4190, 0x01DB7106, 0x98D220BC, 0xEFD5102A, 0x71B18589, 0x06B6B51F, 0x9FBFE4A5, 0xE8B8D433,
+      0x7807C9A2, 0x0F00F934, 0x9609A88E, 0xE10E9818, 0x7F6A0DBB, 0x086D3D2D, 0x91646C97, 0xE6635C01,
+      0x6B6B51F4, 0x1C6C6162, 0x856530D8, 0xF262004E, 0x6C0695ED, 0x1B01A57B, 0x8208F4C1, 0xF50FC457,
+      0x65B0D9C6, 0x12B7E950, 0x8BBEB8EA, 0xFCB9887C, 0x62DD1DDF, 0x15DA2D49, 0x8CD37CF3, 0xFBD44C65,
+      0x4DB26158, 0x3AB551CE, 0xA3BC0074, 0xD4BB30E2, 0x4ADFA541, 0x3DD895D7, 0xA4D1C46D, 0xD3D6F4FB,
+      0x4369E96A, 0x346ED9FC, 0xAD678846, 0xDA60B8D0, 0x44042D73, 0x33031DE5, 0xAA0A4C5F, 0xDD0D7CC9,
+      0x5005713C, 0x270241AA, 0xBE0B1010, 0xC90C2086, 0x5768B525, 0x206F85B3, 0xB966D409, 0xCE61E49F,
+      0x5EDEF90E, 0x29D9C998, 0xB0D09822, 0xC7D7A8B4, 0x59B33D17, 0x2EB40D81, 0xB7BD5C3B, 0xC0BA6CAD,
+      0xEDB88320, 0x9ABFB3B6, 0x03B6E20C, 0x74B1D29A, 0xEAD54739, 0x9DD277AF, 0x04DB2615, 0x73DC1683,
+      0xE3630B12, 0x94643B84, 0x0D6D6A3E, 0x7A6A5AA8, 0xE40ECF0B, 0x9309FF9D, 0x0A00AE27, 0x7D079EB1,
+      0xF00F9344, 0x8708A3D2, 0x1E01F268, 0x6906C2FE, 0xF762575D, 0x806567CB, 0x196C3671, 0x6E6B06E7,
+      0xFED41B76, 0x89D32BE0, 0x10DA7A5A, 0x67DD4ACC, 0xF9B9DF6F, 0x8EBEEFF9, 0x17B7BE43, 0x60B08ED5,
+      0xD6D6A3E8, 0xA1D1937E, 0x38D8C2C4, 0x4FDFF252, 0xD1BB67F1, 0xA6BC5767, 0x3FB506DD, 0x48B2364B,
+      0xD80D2BDA, 0xAF0A1B4C, 0x36034AF6, 0x41047A60, 0xDF60EFC3, 0xA867DF55, 0x316E8EEF, 0x4669BE79,
+      0xCB61B38C, 0xBC66831A, 0x256FD2A0, 0x5268E236, 0xCC0C7795, 0xBB0B4703, 0x220216B9, 0x5505262F,
+      0xC5BA3BBE, 0xB2BD0B28, 0x2BB45A92, 0x5CB36A04, 0xC2D7FFA7, 0xB5D0CF31, 0x2CD99E8B, 0x5BDEAE1D,
+      0x9B64C2B0, 0xEC63F226, 0x756AA39C, 0x026D930A, 0x9C0906A9, 0xEB0E363F, 0x72076785, 0x05005713,
+      0x95BF4A82, 0xE2B87A14, 0x7BB12BAE, 0x0CB61B38, 0x92D28E9B, 0xE5D5BE0D, 0x7CDCEFB7, 0x0BDBDF21,
+      0x86D3D2D4, 0xF1D4E242, 0x68DDB3F8, 0x1FDA836E, 0x81BE16CD, 0xF6B9265B, 0x6FB077E1, 0x18B74777,
+      0x88085AE6, 0xFF0F6A70, 0x66063BCA, 0x11010B5C, 0x8F659EFF, 0xF862AE69, 0x616BFFD3, 0x166CCF45,
+      0xA00AE278, 0xD70DD2EE, 0x4E048354, 0x3903B3C2, 0xA7672661, 0xD06016F7, 0x4969474D, 0x3E6E77DB,
+      0xAED16A4A, 0xD9D65ADC, 0x40DF0B66, 0x37D83BF0, 0xA9BCAE53, 0xDEBB9EC5, 0x47B2CF7F, 0x30B5FFE9,
+      0xBDBDF21C, 0xCABAC28A, 0x53B39330, 0x24B4A3A6, 0xBAD03605, 0xCDD70693, 0x54DE5729, 0x23D967BF,
+      0xB3667A2E, 0xC4614AB8, 0x5D681B02, 0x2A6F2B94, 0xB40BBE37, 0xC30C8EA1, 0x5A05DF1B, 0x2D02EF8D
+   };
+
    unsigned int crc = ~0u;
-   int i,j;
-   if (crc_table[1] == 0)
-      for(i=0; i < 256; i++)
-         for (crc_table[i]=i, j=0; j < 8; ++j)
-            crc_table[i] = (crc_table[i] >> 1) ^ (crc_table[i] & 1 ? 0xedb88320 : 0);
+   int i;
    for (i=0; i < len; ++i)
       crc = (crc >> 8) ^ crc_table[buffer[i] ^ (crc & 0xff)];
    return ~crc;
 }
 
-#define stbiw__wpng4(o,a,b,c,d) ((o)[0]=(unsigned char)(a),(o)[1]=(unsigned char)(b),(o)[2]=(unsigned char)(c),(o)[3]=(unsigned char)(d),(o)+=4)
+#define stbiw__wpng4(o,a,b,c,d) ((o)[0]=STBIW_UCHAR(a),(o)[1]=STBIW_UCHAR(b),(o)[2]=STBIW_UCHAR(c),(o)[3]=STBIW_UCHAR(d),(o)+=4)
 #define stbiw__wp32(data,v) stbiw__wpng4(data, (v)>>24,(v)>>16,(v)>>8,(v));
 #define stbiw__wptag(data,s) stbiw__wpng4(data, s[0],s[1],s[2],s[3])
 
@@ -596,66 +979,94 @@ static void stbiw__wpcrc(unsigned char **data, int len)
 static unsigned char stbiw__paeth(int a, int b, int c)
 {
    int p = a + b - c, pa = abs(p-a), pb = abs(p-b), pc = abs(p-c);
-   if (pa <= pb && pa <= pc) return (unsigned char) a;
-   if (pb <= pc) return (unsigned char) b;
-   return (unsigned char) c;
+   if (pa <= pb && pa <= pc) return STBIW_UCHAR(a);
+   if (pb <= pc) return STBIW_UCHAR(b);
+   return STBIW_UCHAR(c);
+}
+
+// Filters row y of the image into line_buffer using PNG filter filter_type (0-4). @OPTIMIZE: provide an option that always forces left-predict or paeth predict
+static void stbiw__encode_png_line(unsigned char *pixels, int stride_bytes, int width, int height, int y, int n, int filter_type, signed char *line_buffer)
+{
+   static int mapping[] = { 0,1,2,3,4 };   // rows after the first use the requested filter as-is
+   static int firstmap[] = { 0,1,0,5,6 };  // first row: filters 2,3,4 become 0,5,6, which never read a previous row
+   int *mymap = (y != 0) ? mapping : firstmap;
+   int i;
+   int type = mymap[filter_type];
+   unsigned char *z = pixels + stride_bytes * (stbi__flip_vertically_on_write ? height-1-y : y); // source row; counted from the last row when flipping
+   int signed_stride = stbi__flip_vertically_on_write ? -stride_bytes : stride_bytes; // z - signed_stride is the row emitted just before this one
+   for (i = 0; i < n; ++i) { // first pixel (n bytes): there is no pixel to the left
+      switch (type) {
+         case 0: line_buffer[i] = z[i]; break;
+         case 1: line_buffer[i] = z[i]; break;
+         case 2: line_buffer[i] = z[i] - z[i-signed_stride]; break;
+         case 3: line_buffer[i] = z[i] - (z[i-signed_stride]>>1); break;
+         case 4: line_buffer[i] = (signed char) (z[i] - stbiw__paeth(0,z[i-signed_stride],0)); break;
+         case 5: line_buffer[i] = z[i]; break;
+         case 6: line_buffer[i] = z[i]; break;
+      }
+   }
+   for (i=n; i < width*n; ++i) { // remaining bytes: z[i-n] is the same channel of the pixel to the left
+      switch (type) {
+         case 0: line_buffer[i] = z[i]; break;
+         case 1: line_buffer[i] = z[i] - z[i-n]; break;
+         case 2: line_buffer[i] = z[i] - z[i-signed_stride]; break;
+         case 3: line_buffer[i] = z[i] - ((z[i-n] + z[i-signed_stride])>>1); break;
+         case 4: line_buffer[i] = z[i] - stbiw__paeth(z[i-n], z[i-signed_stride], z[i-signed_stride-n]); break;
+         case 5: line_buffer[i] = z[i] - (z[i-n]>>1); break;
+         case 6: line_buffer[i] = z[i] - stbiw__paeth(z[i-n], 0,0); break;
+      }
+   }
+}
 
 unsigned char *stbi_write_png_to_mem(unsigned char *pixels, int stride_bytes, int x, int y, int n, int *out_len)
 {
+   int force_filter = stbi_write_force_png_filter;
    int ctype[5] = { -1, 0, 4, 2, 6 };
    unsigned char sig[8] = { 137,80,78,71,13,10,26,10 };
    unsigned char *out,*o, *filt, *zlib;
    signed char *line_buffer;
-   int i,j,k,p,zlen;
+   int j,zlen;
 
    if (stride_bytes == 0)
       stride_bytes = x * n;
 
+   if (force_filter >= 5) {
+      force_filter = -1;
+   }
+
    filt = (unsigned char *) STBIW_MALLOC((x*n+1) * y); if (!filt) return 0;
    line_buffer = (signed char *) STBIW_MALLOC(x * n); if (!line_buffer) { STBIW_FREE(filt); return 0; }
    for (j=0; j < y; ++j) {
-      static int mapping[] = { 0,1,2,3,4 };
-      static int firstmap[] = { 0,1,0,5,6 };
-      int *mymap = j ? mapping : firstmap;
-      int best = 0, bestval = 0x7fffffff;
-      for (p=0; p < 2; ++p) {
-         for (k= p?best:0; k < 5; ++k) {
-            int type = mymap[k],est=0;
-            unsigned char *z = pixels + stride_bytes*j;
-            for (i=0; i < n; ++i)
-               switch (type) {
-                  case 0: line_buffer[i] = z[i]; break;
-                  case 1: line_buffer[i] = z[i]; break;
-                  case 2: line_buffer[i] = z[i] - z[i-stride_bytes]; break;
-                  case 3: line_buffer[i] = z[i] - (z[i-stride_bytes]>>1); break;
-                  case 4: line_buffer[i] = (signed char) (z[i] - stbiw__paeth(0,z[i-stride_bytes],0)); break;
-                  case 5: line_buffer[i] = z[i]; break;
-                  case 6: line_buffer[i] = z[i]; break;
-               }
-            for (i=n; i < x*n; ++i) {
-               switch (type) {
-                  case 0: line_buffer[i] = z[i]; break;
-                  case 1: line_buffer[i] = z[i] - z[i-n]; break;
-                  case 2: line_buffer[i] = z[i] - z[i-stride_bytes]; break;
-                  case 3: line_buffer[i] = z[i] - ((z[i-n] + z[i-stride_bytes])>>1); break;
-                  case 4: line_buffer[i] = z[i] - stbiw__paeth(z[i-n], z[i-stride_bytes], z[i-stride_bytes-n]); break;
-                  case 5: line_buffer[i] = z[i] - (z[i-n]>>1); break;
-                  case 6: line_buffer[i] = z[i] - stbiw__paeth(z[i-n], 0,0); break;
-               }
-            }
-            if (p) break;
-            for (i=0; i < x*n; ++i)
+      int filter_type;
+      if (force_filter > -1) {
+         filter_type = force_filter;
+         stbiw__encode_png_line(pixels, stride_bytes, x, y, j, n, force_filter, line_buffer);
+      } else { // Estimate the best filter by running through all of them:
+         int best_filter = 0, best_filter_val = 0x7fffffff, est, i;
+         for (filter_type = 0; filter_type < 5; filter_type++) {
+            stbiw__encode_png_line(pixels, stride_bytes, x, y, j, n, filter_type, line_buffer);
+
+            // Estimate the entropy of the line using this filter; the less, the better.
+            est = 0;
+            for (i = 0; i < x*n; ++i) {
                est += abs((signed char) line_buffer[i]);
-            if (est < bestval) { bestval = est; best = k; }
+            }
+            if (est < best_filter_val) {
+               best_filter_val = est;
+               best_filter = filter_type;
+            }
+         }
+         if (filter_type != best_filter) {  // If the last iteration already got us the best filter, don't redo it
+            stbiw__encode_png_line(pixels, stride_bytes, x, y, j, n, best_filter, line_buffer);
+            filter_type = best_filter;
          }
       }
-      // when we get here, best contains the filter type, and line_buffer contains the data
-      filt[j*(x*n+1)] = (unsigned char) best;
+      // when we get here, filter_type contains the filter type, and line_buffer contains the data
+      filt[j*(x*n+1)] = (unsigned char) filter_type;
       STBIW_MEMMOVE(filt+j*(x*n+1)+1, line_buffer, x*n);
    }
    STBIW_FREE(line_buffer);
-   zlib = stbi_zlib_compress(filt, y*( x*n+1), &zlen, 8); // increase 8 to get smaller but use more memory
+   zlib = stbi_zlib_compress(filt, y*( x*n+1), &zlen, stbi_write_png_compression_level);
    STBIW_FREE(filt);
    if (!zlib) return 0;
 
@@ -671,7 +1082,7 @@ unsigned char *stbi_write_png_to_mem(unsigned char *pixels, int stride_bytes, in
    stbiw__wp32(o, x);
    stbiw__wp32(o, y);
    *o++ = 8;
-   *o++ = (unsigned char) ctype[n];
+   *o++ = STBIW_UCHAR(ctype[n]);
    *o++ = 0;
    *o++ = 0;
    *o++ = 0;
@@ -693,22 +1104,407 @@ unsigned char *stbi_write_png_to_mem(unsigned char *pixels, int stride_bytes, in
    return out;
 }
 
-int stbi_write_png(char const *filename, int x, int y, int comp, const void *data, int stride_bytes)
+#ifndef STBI_WRITE_NO_STDIO
+STBIWDEF int stbi_write_png(char const *filename, int x, int y, int comp, const void *data, int stride_bytes)
 {
    FILE *f;
    int len;
    unsigned char *png = stbi_write_png_to_mem((unsigned char *) data, stride_bytes, x, y, comp, &len);
-   if (!png) return 0;
+   if (png == NULL) return 0;
+#ifdef STBI_MSC_SECURE_CRT
+   if (fopen_s(&f, filename, "wb"))
+      f = NULL;
+#else
    f = fopen(filename, "wb");
+#endif
    if (!f) { STBIW_FREE(png); return 0; }
    fwrite(png, 1, len, f);
    fclose(f);
    STBIW_FREE(png);
    return 1;
 }
+#endif
+// Encodes the image to an in-memory PNG and passes the whole buffer to func(context, ...) in a single call; returns 1 on success, 0 if the encoder returned no buffer.
+STBIWDEF int stbi_write_png_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int stride_bytes)
+{
+   int encoded_len;
+   unsigned char *encoded = stbi_write_png_to_mem((unsigned char *) data, stride_bytes, x, y, comp, &encoded_len);
+   if (!encoded) return 0;   // no buffer came back, so there is nothing to deliver or free
+   func(context, encoded, encoded_len);
+   STBIW_FREE(encoded);
+   return 1;
+}
+
+
+/* ***************************************************************************
+ *
+ * JPEG writer
+ *
+ * This is based on Jon Olick's jo_jpeg.cpp:
+ * public domain Simple, Minimalistic JPEG writer - http://www.jonolick.com/code.html
+ */
+
+static const unsigned char stbiw__jpg_ZigZag[] = { 0,1,5,6,14,15,27,28,2,4,7,13,16,26,29,42,3,8,12,17,25,30,41,43,9,11,18,
+      24,31,40,44,53,10,19,23,32,39,45,52,54,20,22,33,38,46,51,55,60,21,34,37,47,50,56,59,61,35,36,48,49,57,58,62,63 }; // 64-entry permutation: stbiw__jpg_ZigZag[i] is the zigzag-order slot of row-major 8x8 entry i
+// Appends the bs[1]-bit code bs[0] to the bit accumulator and writes out every completed byte, adding a 0 byte after each 255 byte.
+static void stbiw__jpg_writeBits(stbi__write_context *s, int *bitBufP, int *bitCntP, const unsigned short *bs) {
+   int pending = *bitCntP + bs[1];
+   // the new code is placed directly below the bits already pending, inside a 24-bit window
+   int acc = *bitBufP | (bs[0] << (24 - pending));
+
+   for(; pending >= 8; pending -= 8) {
+      unsigned char outByte = (acc >> 16) & 255;   // oldest 8 pending bits
+      stbiw__putc(s, outByte);
+      if(outByte == 255) {
+         stbiw__putc(s, 0);
+      }
+      acc <<= 8;
+   }
+   *bitBufP = acc;   // hand the leftover (< 8) bits back to the caller
+   *bitCntP = pending;
+}
+// One 8-point 1-D forward-DCT pass: reads eight samples through d0p..d7p and writes the eight results back through the same pointers. NOTE(review): outputs look like the scaled DCT that stbi_write_jpg_core compensates for via aasf[] -- confirm.
+static void stbiw__jpg_DCT(float *d0p, float *d1p, float *d2p, float *d3p, float *d4p, float *d5p, float *d6p, float *d7p) {
+   float d0 = *d0p, d1 = *d1p, d2 = *d2p, d3 = *d3p, d4 = *d4p, d5 = *d5p, d6 = *d6p, d7 = *d7p; // work on local copies; results are stored back at the end
+   float z1, z2, z3, z4, z5, z11, z13;
+
+   float tmp0 = d0 + d7; // first butterfly stage: sums and differences of mirrored inputs
+   float tmp7 = d0 - d7;
+   float tmp1 = d1 + d6;
+   float tmp6 = d1 - d6;
+   float tmp2 = d2 + d5;
+   float tmp5 = d2 - d5;
+   float tmp3 = d3 + d4;
+   float tmp4 = d3 - d4;
+
+   // Even part
+   float tmp10 = tmp0 + tmp3;   // phase 2
+   float tmp13 = tmp0 - tmp3;
+   float tmp11 = tmp1 + tmp2;
+   float tmp12 = tmp1 - tmp2;
+
+   d0 = tmp10 + tmp11;       // phase 3
+   d4 = tmp10 - tmp11;
+
+   z1 = (tmp12 + tmp13) * 0.707106781f; // c4
+   d2 = tmp13 + z1;       // phase 5
+   d6 = tmp13 - z1;
+
+   // Odd part
+   tmp10 = tmp4 + tmp5;       // phase 2
+   tmp11 = tmp5 + tmp6;
+   tmp12 = tmp6 + tmp7;
+
+   // The rotator is modified from fig 4-8 to avoid extra negations.
+   z5 = (tmp10 - tmp12) * 0.382683433f; // c6
+   z2 = tmp10 * 0.541196100f + z5; // c2-c6
+   z4 = tmp12 * 1.306562965f + z5; // c2+c6
+   z3 = tmp11 * 0.707106781f; // c4
+
+   z11 = tmp7 + z3;      // phase 5
+   z13 = tmp7 - z3;
+
+   *d5p = z13 + z2;         // phase 6
+   *d3p = z13 - z2;
+   *d1p = z11 + z4;
+   *d7p = z11 - z4;
+
+   *d0p = d0;  *d2p = d2;  *d4p = d4;  *d6p = d6; // even outputs were kept in locals; store them last
+}
+// Computes the raw bits written after a Huffman code for val: bits[1] = number of bits in |val| (minimum 1), bits[0] = low bits[1] bits of val (of val-1 when val is negative).
+static void stbiw__jpg_calcBits(int val, unsigned short bits[2]) {
+   int magnitude = val, nbits = 1;
+   if(val < 0) { magnitude = -val; val -= 1; }
+   for(magnitude >>= 1; magnitude != 0; magnitude >>= 1) {
+      ++nbits;
+   }
+   bits[1] = nbits;
+   bits[0] = val & ((1<<nbits)-1);
+}
+// Transforms one 8x8 block in CDU (in place: DCT over rows, then columns), quantizes it with fdtbl into zigzag order, Huffman-codes the DC difference against DC plus the AC run/size pairs, and returns the block's quantized DC value.
+static int stbiw__jpg_processDU(stbi__write_context *s, int *bitBuf, int *bitCnt, float *CDU, float *fdtbl, int DC, const unsigned short HTDC[256][2], const unsigned short HTAC[256][2]) {
+   const unsigned short EOB[2] = { HTAC[0x00][0], HTAC[0x00][1] }; // AC code for symbol 0x00, written to end a block early
+   const unsigned short M16zeroes[2] = { HTAC[0xF0][0], HTAC[0xF0][1] }; // AC code for symbol 0xF0, written once per full run of 16 zeros
+   int dataOff, i, diff, end0pos;
+   int DU[64];
+
+   // DCT rows
+   for(dataOff=0; dataOff<64; dataOff+=8) {
+      stbiw__jpg_DCT(&CDU[dataOff], &CDU[dataOff+1], &CDU[dataOff+2], &CDU[dataOff+3], &CDU[dataOff+4], &CDU[dataOff+5], &CDU[dataOff+6], &CDU[dataOff+7]);
+   }
+   // DCT columns
+   for(dataOff=0; dataOff<8; ++dataOff) {
+      stbiw__jpg_DCT(&CDU[dataOff], &CDU[dataOff+8], &CDU[dataOff+16], &CDU[dataOff+24], &CDU[dataOff+32], &CDU[dataOff+40], &CDU[dataOff+48], &CDU[dataOff+56]);
+   }
+   // Quantize/descale/zigzag the coefficients
+   for(i=0; i<64; ++i) {
+      float v = CDU[i]*fdtbl[i];
+      // DU[stbiw__jpg_ZigZag[i]] = (int)(v < 0 ? ceilf(v - 0.5f) : floorf(v + 0.5f));
+      // ceilf() and floorf() are C99, not C89, but I /think/ they're not needed here anyway?
+      DU[stbiw__jpg_ZigZag[i]] = (int)(v < 0 ? v - 0.5f : v + 0.5f); // round half away from zero via truncation
+   }
+
+   // Encode DC
+   diff = DU[0] - DC; // DC is coded as the difference from the value passed in by the caller
+   if (diff == 0) {
+      stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTDC[0]);
+   } else {
+      unsigned short bits[2];
+      stbiw__jpg_calcBits(diff, bits);
+      stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTDC[bits[1]]);
+      stbiw__jpg_writeBits(s, bitBuf, bitCnt, bits);
+   }
+   // Encode ACs
+   end0pos = 63;
+   for(; (end0pos>0)&&(DU[end0pos]==0); --end0pos) {
+   }
+   // end0pos = first element in reverse order !=0
+   if(end0pos == 0) { // every AC coefficient is zero: only the end-of-block code is needed
+      stbiw__jpg_writeBits(s, bitBuf, bitCnt, EOB);
+      return DU[0];
+   }
+   for(i = 1; i <= end0pos; ++i) {
+      int startpos = i;
+      int nrzeroes;
+      unsigned short bits[2];
+      for (; DU[i]==0 && i<=end0pos; ++i) { // skip the zero run; DU[end0pos] is non-zero, so i stops at or before end0pos
+      }
+      nrzeroes = i-startpos;
+      if ( nrzeroes >= 16 ) {
+         int lng = nrzeroes>>4; // number of complete 16-zero runs
+         int nrmarker;
+         for (nrmarker=1; nrmarker <= lng; ++nrmarker)
+            stbiw__jpg_writeBits(s, bitBuf, bitCnt, M16zeroes);
+         nrzeroes &= 15;
+      }
+      stbiw__jpg_calcBits(DU[i], bits);
+      stbiw__jpg_writeBits(s, bitBuf, bitCnt, HTAC[(nrzeroes<<4)+bits[1]]); // table index: zero-run length in the high nibble, bit count in the low nibble
+      stbiw__jpg_writeBits(s, bitBuf, bitCnt, bits);
+   }
+   if(end0pos != 63) { // trailing zeros remain, so terminate the block explicitly
+      stbiw__jpg_writeBits(s, bitBuf, bitCnt, EOB);
+   }
+   return DU[0];
+}
+// Encodes a width x height image of comp (1..4) interleaved 8-bit channels as a JPEG through s (quality 1..100, 0 selects 90); Y, U and V are coded for every 8x8 block. Returns 1 on success, 0 when data is NULL, a dimension is 0, or comp is out of range.
+static int stbi_write_jpg_core(stbi__write_context *s, int width, int height, int comp, const void* data, int quality) {
+   // Constants that don't pollute global namespace
+   static const unsigned char std_dc_luminance_nrcodes[] = {0,0,1,5,1,1,1,1,1,1,0,0,0,0,0,0,0};
+   static const unsigned char std_dc_luminance_values[] = {0,1,2,3,4,5,6,7,8,9,10,11};
+   static const unsigned char std_ac_luminance_nrcodes[] = {0,0,2,1,3,3,2,4,3,5,5,4,4,0,0,1,0x7d};
+   static const unsigned char std_ac_luminance_values[] = {
+      0x01,0x02,0x03,0x00,0x04,0x11,0x05,0x12,0x21,0x31,0x41,0x06,0x13,0x51,0x61,0x07,0x22,0x71,0x14,0x32,0x81,0x91,0xa1,0x08,
+      0x23,0x42,0xb1,0xc1,0x15,0x52,0xd1,0xf0,0x24,0x33,0x62,0x72,0x82,0x09,0x0a,0x16,0x17,0x18,0x19,0x1a,0x25,0x26,0x27,0x28,
+      0x29,0x2a,0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x53,0x54,0x55,0x56,0x57,0x58,0x59,
+      0x5a,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x83,0x84,0x85,0x86,0x87,0x88,0x89,
+      0x8a,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xb2,0xb3,0xb4,0xb5,0xb6,
+      0xb7,0xb8,0xb9,0xba,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda,0xe1,0xe2,
+      0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa
+   };
+   static const unsigned char std_dc_chrominance_nrcodes[] = {0,0,3,1,1,1,1,1,1,1,1,1,0,0,0,0,0};
+   static const unsigned char std_dc_chrominance_values[] = {0,1,2,3,4,5,6,7,8,9,10,11};
+   static const unsigned char std_ac_chrominance_nrcodes[] = {0,0,2,1,2,4,4,3,4,7,5,4,4,0,1,2,0x77};
+   static const unsigned char std_ac_chrominance_values[] = {
+      0x00,0x01,0x02,0x03,0x11,0x04,0x05,0x21,0x31,0x06,0x12,0x41,0x51,0x07,0x61,0x71,0x13,0x22,0x32,0x81,0x08,0x14,0x42,0x91,
+      0xa1,0xb1,0xc1,0x09,0x23,0x33,0x52,0xf0,0x15,0x62,0x72,0xd1,0x0a,0x16,0x24,0x34,0xe1,0x25,0xf1,0x17,0x18,0x19,0x1a,0x26,
+      0x27,0x28,0x29,0x2a,0x35,0x36,0x37,0x38,0x39,0x3a,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x53,0x54,0x55,0x56,0x57,0x58,
+      0x59,0x5a,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x82,0x83,0x84,0x85,0x86,0x87,
+      0x88,0x89,0x8a,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xb2,0xb3,0xb4,
+      0xb5,0xb6,0xb7,0xb8,0xb9,0xba,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda,
+      0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa
+   };
+   // Huffman tables
+   static const unsigned short YDC_HT[256][2] = { {0,2},{2,3},{3,3},{4,3},{5,3},{6,3},{14,4},{30,5},{62,6},{126,7},{254,8},{510,9}};
+   static const unsigned short UVDC_HT[256][2] = { {0,2},{1,2},{2,2},{6,3},{14,4},{30,5},{62,6},{126,7},{254,8},{510,9},{1022,10},{2046,11}};
+   static const unsigned short YAC_HT[256][2] = {
+      {10,4},{0,2},{1,2},{4,3},{11,4},{26,5},{120,7},{248,8},{1014,10},{65410,16},{65411,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {12,4},{27,5},{121,7},{502,9},{2038,11},{65412,16},{65413,16},{65414,16},{65415,16},{65416,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {28,5},{249,8},{1015,10},{4084,12},{65417,16},{65418,16},{65419,16},{65420,16},{65421,16},{65422,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {58,6},{503,9},{4085,12},{65423,16},{65424,16},{65425,16},{65426,16},{65427,16},{65428,16},{65429,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {59,6},{1016,10},{65430,16},{65431,16},{65432,16},{65433,16},{65434,16},{65435,16},{65436,16},{65437,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {122,7},{2039,11},{65438,16},{65439,16},{65440,16},{65441,16},{65442,16},{65443,16},{65444,16},{65445,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {123,7},{4086,12},{65446,16},{65447,16},{65448,16},{65449,16},{65450,16},{65451,16},{65452,16},{65453,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {250,8},{4087,12},{65454,16},{65455,16},{65456,16},{65457,16},{65458,16},{65459,16},{65460,16},{65461,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {504,9},{32704,15},{65462,16},{65463,16},{65464,16},{65465,16},{65466,16},{65467,16},{65468,16},{65469,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {505,9},{65470,16},{65471,16},{65472,16},{65473,16},{65474,16},{65475,16},{65476,16},{65477,16},{65478,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {506,9},{65479,16},{65480,16},{65481,16},{65482,16},{65483,16},{65484,16},{65485,16},{65486,16},{65487,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {1017,10},{65488,16},{65489,16},{65490,16},{65491,16},{65492,16},{65493,16},{65494,16},{65495,16},{65496,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {1018,10},{65497,16},{65498,16},{65499,16},{65500,16},{65501,16},{65502,16},{65503,16},{65504,16},{65505,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {2040,11},{65506,16},{65507,16},{65508,16},{65509,16},{65510,16},{65511,16},{65512,16},{65513,16},{65514,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {65515,16},{65516,16},{65517,16},{65518,16},{65519,16},{65520,16},{65521,16},{65522,16},{65523,16},{65524,16},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {2041,11},{65525,16},{65526,16},{65527,16},{65528,16},{65529,16},{65530,16},{65531,16},{65532,16},{65533,16},{65534,16},{0,0},{0,0},{0,0},{0,0},{0,0}
+   };
+   static const unsigned short UVAC_HT[256][2] = {
+      {0,2},{1,2},{4,3},{10,4},{24,5},{25,5},{56,6},{120,7},{500,9},{1014,10},{4084,12},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {11,4},{57,6},{246,8},{501,9},{2038,11},{4085,12},{65416,16},{65417,16},{65418,16},{65419,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {26,5},{247,8},{1015,10},{4086,12},{32706,15},{65420,16},{65421,16},{65422,16},{65423,16},{65424,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {27,5},{248,8},{1016,10},{4087,12},{65425,16},{65426,16},{65427,16},{65428,16},{65429,16},{65430,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {58,6},{502,9},{65431,16},{65432,16},{65433,16},{65434,16},{65435,16},{65436,16},{65437,16},{65438,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {59,6},{1017,10},{65439,16},{65440,16},{65441,16},{65442,16},{65443,16},{65444,16},{65445,16},{65446,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {121,7},{2039,11},{65447,16},{65448,16},{65449,16},{65450,16},{65451,16},{65452,16},{65453,16},{65454,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {122,7},{2040,11},{65455,16},{65456,16},{65457,16},{65458,16},{65459,16},{65460,16},{65461,16},{65462,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {249,8},{65463,16},{65464,16},{65465,16},{65466,16},{65467,16},{65468,16},{65469,16},{65470,16},{65471,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {503,9},{65472,16},{65473,16},{65474,16},{65475,16},{65476,16},{65477,16},{65478,16},{65479,16},{65480,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {504,9},{65481,16},{65482,16},{65483,16},{65484,16},{65485,16},{65486,16},{65487,16},{65488,16},{65489,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {505,9},{65490,16},{65491,16},{65492,16},{65493,16},{65494,16},{65495,16},{65496,16},{65497,16},{65498,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {506,9},{65499,16},{65500,16},{65501,16},{65502,16},{65503,16},{65504,16},{65505,16},{65506,16},{65507,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {2041,11},{65508,16},{65509,16},{65510,16},{65511,16},{65512,16},{65513,16},{65514,16},{65515,16},{65516,16},{0,0},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {16352,14},{65517,16},{65518,16},{65519,16},{65520,16},{65521,16},{65522,16},{65523,16},{65524,16},{65525,16},{0,0},{0,0},{0,0},{0,0},{0,0},
+      {1018,10},{32707,15},{65526,16},{65527,16},{65528,16},{65529,16},{65530,16},{65531,16},{65532,16},{65533,16},{65534,16},{0,0},{0,0},{0,0},{0,0},{0,0}
+   };
+   static const int YQT[] = {16,11,10,16,24,40,51,61,12,12,14,19,26,58,60,55,14,13,16,24,40,57,69,56,14,17,22,29,51,87,80,62,18,22,
+                             37,56,68,109,103,77,24,35,55,64,81,104,113,92,49,64,78,87,103,121,120,101,72,92,95,98,112,100,103,99};
+   static const int UVQT[] = {17,18,24,47,99,99,99,99,18,21,26,66,99,99,99,99,24,26,56,99,99,99,99,99,47,66,99,99,99,99,99,99,
+                              99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99,99};
+   static const float aasf[] = { 1.0f * 2.828427125f, 1.387039845f * 2.828427125f, 1.306562965f * 2.828427125f, 1.175875602f * 2.828427125f, 
+                                 1.0f * 2.828427125f, 0.785694958f * 2.828427125f, 0.541196100f * 2.828427125f, 0.275899379f * 2.828427125f };
+
+   int row, col, i, k;
+   float fdtbl_Y[64], fdtbl_UV[64];
+   unsigned char YTable[64], UVTable[64];
+
+   if(!data || !width || !height || comp > 4 || comp < 1) {
+      return 0;
+   }
+
+   quality = quality ? quality : 90; // 0 selects the default quality
+   quality = quality < 1 ? 1 : quality > 100 ? 100 : quality; // clamp to 1..100
+   quality = quality < 50 ? 5000 / quality : 200 - quality * 2; // turn it into a percentage applied to the base quantization tables
+
+   for(i = 0; i < 64; ++i) {
+      int uvti, yti = (YQT[i]*quality+50)/100;
+      YTable[stbiw__jpg_ZigZag[i]] = (unsigned char) (yti < 1 ? 1 : yti > 255 ? 255 : yti);
+      uvti = (UVQT[i]*quality+50)/100;
+      UVTable[stbiw__jpg_ZigZag[i]] = (unsigned char) (uvti < 1 ? 1 : uvti > 255 ? 255 : uvti);
+   }
+
+   for(row = 0, k = 0; row < 8; ++row) {
+      for(col = 0; col < 8; ++col, ++k) {
+         fdtbl_Y[k]  = 1 / (YTable [stbiw__jpg_ZigZag[k]] * aasf[row] * aasf[col]);
+         fdtbl_UV[k] = 1 / (UVTable[stbiw__jpg_ZigZag[k]] * aasf[row] * aasf[col]);
+      }
+   }
+
+   // Write Headers
+   {
+      static const unsigned char head0[] = { 0xFF,0xD8,0xFF,0xE0,0,0x10,'J','F','I','F',0,1,1,0,0,1,0,1,0,0,0xFF,0xDB,0,0x84,0 };
+      static const unsigned char head2[] = { 0xFF,0xDA,0,0xC,3,1,0,2,0x11,3,0x11,0,0x3F,0 };
+      const unsigned char head1[] = { 0xFF,0xC0,0,0x11,8,(unsigned char)(height>>8),STBIW_UCHAR(height),(unsigned char)(width>>8),STBIW_UCHAR(width),
+                                      3,1,0x11,0,2,0x11,1,3,0x11,1,0xFF,0xC4,0x01,0xA2,0 };
+      s->func(s->context, (void*)head0, sizeof(head0));
+      s->func(s->context, (void*)YTable, sizeof(YTable));
+      stbiw__putc(s, 1);
+      s->func(s->context, UVTable, sizeof(UVTable));
+      s->func(s->context, (void*)head1, sizeof(head1));
+      s->func(s->context, (void*)(std_dc_luminance_nrcodes+1), sizeof(std_dc_luminance_nrcodes)-1);
+      s->func(s->context, (void*)std_dc_luminance_values, sizeof(std_dc_luminance_values));
+      stbiw__putc(s, 0x10); // HTYACinfo
+      s->func(s->context, (void*)(std_ac_luminance_nrcodes+1), sizeof(std_ac_luminance_nrcodes)-1);
+      s->func(s->context, (void*)std_ac_luminance_values, sizeof(std_ac_luminance_values));
+      stbiw__putc(s, 1); // HTUDCinfo
+      s->func(s->context, (void*)(std_dc_chrominance_nrcodes+1), sizeof(std_dc_chrominance_nrcodes)-1);
+      s->func(s->context, (void*)std_dc_chrominance_values, sizeof(std_dc_chrominance_values));
+      stbiw__putc(s, 0x11); // HTUACinfo
+      s->func(s->context, (void*)(std_ac_chrominance_nrcodes+1), sizeof(std_ac_chrominance_nrcodes)-1);
+      s->func(s->context, (void*)std_ac_chrominance_values, sizeof(std_ac_chrominance_values));
+      s->func(s->context, (void*)head2, sizeof(head2));
+   }
+
+   // Encode 8x8 macroblocks
+   {
+      static const unsigned short fillBits[] = {0x7F, 7};
+      const unsigned char *imageData = (const unsigned char *)data;
+      int DCY=0, DCU=0, DCV=0;
+      int bitBuf=0, bitCnt=0;
+      // comp == 2 is grey+alpha (alpha is ignored)
+      int ofsG = comp > 2 ? 1 : 0, ofsB = comp > 2 ? 2 : 0;
+      int x, y, pos;
+      for(y = 0; y < height; y += 8) {
+         for(x = 0; x < width; x += 8) {
+            float YDU[64], UDU[64], VDU[64];
+            for(row = y, pos = 0; row < y+8; ++row) {
+               for(col = x; col < x+8; ++col, ++pos) {
+                  // Blocks that overhang the right/bottom edge are padded by repeating the
+                  // last real column/row. Clamp first and flip afterwards: flipping an
+                  // unclamped padding row (row >= height) produced a negative source row,
+                  // so the pixel offset pointed before the start of the image data.
+                  int srcRow = row < height ? row : height-1;
+                  int srcCol = col < width ? col : width-1;
+                  int p = (stbi__flip_vertically_on_write ? height-1-srcRow : srcRow)*width*comp + srcCol*comp;
+                  float r, g, b;
+
+                  r = imageData[p+0];
+                  g = imageData[p+ofsG];
+                  b = imageData[p+ofsB];
+                  YDU[pos]=+0.29900f*r+0.58700f*g+0.11400f*b-128;
+                  UDU[pos]=-0.16874f*r-0.33126f*g+0.50000f*b;
+                  VDU[pos]=+0.50000f*r-0.41869f*g-0.08131f*b;
+               }
+            }
+
+            DCY = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, YDU, fdtbl_Y, DCY, YDC_HT, YAC_HT);
+            DCU = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, UDU, fdtbl_UV, DCU, UVDC_HT, UVAC_HT);
+            DCV = stbiw__jpg_processDU(s, &bitBuf, &bitCnt, VDU, fdtbl_UV, DCV, UVDC_HT, UVAC_HT);
+         }
+      }
+
+      // Do the bit alignment of the EOI marker
+      stbiw__jpg_writeBits(s, &bitBuf, &bitCnt, fillBits);
+   }
+
+   // EOI
+   stbiw__putc(s, 0xFF);
+   stbiw__putc(s, 0xD9);
+
+   return 1;
+}
+// Encodes the image as a JPEG, delivering the output through func(context, ...); returns the result of stbi_write_jpg_core.
+STBIWDEF int stbi_write_jpg_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int quality)
+{
+   stbi__write_context ctx;
+   stbi__start_write_callbacks(&ctx, func, context);
+   return stbi_write_jpg_core(&ctx, x, y, comp, data, quality);
+}
+
+
+#ifndef STBI_WRITE_NO_STDIO
+// Encodes the image as a JPEG into the file `filename`; returns 0 when stbi__start_write_file reports failure, otherwise the encoder's result.
+STBIWDEF int stbi_write_jpg(char const *filename, int x, int y, int comp, const void *data, int quality)
+{
+   stbi__write_context ctx;
+   int ok;
+   if (!stbi__start_write_file(&ctx,filename)) return 0;
+   ok = stbi_write_jpg_core(&ctx, x, y, comp, data, quality);
+   stbi__end_write_file(&ctx);   // finish the file whatever the encoder returned
+   return ok;
+}
+#endif
+
 #endif // STB_IMAGE_WRITE_IMPLEMENTATION
 
 /* Revision history
+      1.09  (2018-02-11)
+             fix typo in zlib quality API, improve STB_I_W_STATIC in C++
+      1.08  (2018-01-29)
+             add stbi__flip_vertically_on_write, external zlib, zlib quality, choose PNG filter
+      1.07  (2017-07-24)
+             doc fix
+      1.06 (2017-07-23)
+             writing JPEG (using Jon Olick's code)
+      1.05   ???
+      1.04 (2017-03-03)
+             monochrome BMP expansion
+      1.03   ???
+      1.02 (2016-04-02)
+             avoid allocating large structures on the stack
+      1.01 (2016-01-16)
+             STBIW_REALLOC_SIZED: support allocators with no realloc support
+             avoid race-condition in crc initialization
+             minor compile issues
+      1.00 (2015-09-14)
+             installable file IO function
+      0.99 (2015-09-13)
+             warning fixes; TGA rle support
       0.98 (2015-04-08)
              added STBIW_MALLOC, STBIW_ASSERT etc
       0.97 (2015-01-18)
@@ -728,3 +1524,45 @@ int stbi_write_png(char const *filename, int x, int y, int comp, const void *dat
              first public release
       0.90   first internal release
 */
+
+/*
+------------------------------------------------------------------------------
+This software is available under 2 licenses -- choose whichever you prefer.
+------------------------------------------------------------------------------
+ALTERNATIVE A - MIT License
+Copyright (c) 2017 Sean Barrett
+Permission is hereby granted, free of charge, to any person obtaining a copy of 
+this software and associated documentation files (the "Software"), to deal in 
+the Software without restriction, including without limitation the rights to 
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 
+of the Software, and to permit persons to whom the Software is furnished to do 
+so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all 
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 
+SOFTWARE.
+------------------------------------------------------------------------------
+ALTERNATIVE B - Public Domain (www.unlicense.org)
+This is free and unencumbered software released into the public domain.
+Anyone is free to copy, modify, publish, use, compile, sell, or distribute this 
+software, either in source code form or as a compiled binary, for any purpose, 
+commercial or non-commercial, and by any means.
+In jurisdictions that recognize copyright laws, the author or authors of this 
+software dedicate any and all copyright interest in the software to the public 
+domain. We make this dedication for the benefit of the public at large and to 
+the detriment of our heirs and successors. We intend this dedication to be an 
+overt act of relinquishment in perpetuity of all present and future rights to 
+this software under copyright law.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
+AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+------------------------------------------------------------------------------
+*/
diff --git a/image.darknet/src/tree.c b/image.darknet/src/tree.c
index dd44515..67b6d43 100644
--- a/image.darknet/src/tree.c
+++ b/image.darknet/src/tree.c
@@ -24,33 +24,33 @@ void change_leaves(tree *t, char *leaf_list)
     fprintf(stderr, "Found %d leaves.\n", found);
 }
 
-float get_hierarchy_probability(float *x, tree *hier, int c)
+float get_hierarchy_probability(float *x, tree *hier, int c, int stride)
 {
     float p = 1;
     while(c >= 0){
-        p = p * x[c];
+        p = p * x[c*stride];
         c = hier->parent[c];
     }
     return p;
 }
 
-void hierarchy_predictions(float *predictions, int n, tree *hier, int only_leaves)
+void hierarchy_predictions(float *predictions, int n, tree *hier, int only_leaves, int stride)
 {
     int j;
     for(j = 0; j < n; ++j){
         int parent = hier->parent[j];
         if(parent >= 0){
-            predictions[j] *= predictions[parent]; 
+            predictions[j*stride] *= predictions[parent*stride]; 
         }
     }
     if(only_leaves){
         for(j = 0; j < n; ++j){
-            if(!hier->leaf[j]) predictions[j] = 0;
+            if(!hier->leaf[j]) predictions[j*stride] = 0;
         }
     }
 }
 
-int hierarchy_top_prediction(float *predictions, tree *hier, float thresh)
+int hierarchy_top_prediction(float *predictions, tree *hier, float thresh, int stride)
 {
     float p = 1;
     int group = 0;
@@ -61,7 +61,7 @@ int hierarchy_top_prediction(float *predictions, tree *hier, float thresh)
 
         for(i = 0; i < hier->group_size[group]; ++i){
             int index = i + hier->group_offset[group];
-            float val = predictions[i + hier->group_offset[group]];
+            float val = predictions[(i + hier->group_offset[group])*stride];
             if(val > max){
                 max_i = index;
                 max = val;
@@ -71,6 +71,8 @@ int hierarchy_top_prediction(float *predictions, tree *hier, float thresh)
             p = p*max;
             group = hier->child[max_i];
             if(hier->child[max_i] < 0) return max_i;
+        } else if (group == 0){
+            return max_i;
         } else {
             return hier->parent[hier->group_offset[group]];
         }
diff --git a/image.darknet/src/tree.h b/image.darknet/src/tree.h
index dbd4c39..3802b8e 100644
--- a/image.darknet/src/tree.h
+++ b/image.darknet/src/tree.h
@@ -1,23 +1,8 @@
 #ifndef TREE_H
 #define TREE_H
+#include "darknet.h"
 
-typedef struct{
-    int *leaf;
-    int n;
-    int *parent;
-    int *child;
-    int *group;
-    char **name;
-
-    int groups;
-    int *group_size;
-    int *group_offset;
-} tree;
-
-tree *read_tree(char *filename);
-void hierarchy_predictions(float *predictions, int n, tree *hier, int only_leaves);
-void change_leaves(tree *t, char *leaf_list);
-int hierarchy_top_prediction(float *predictions, tree *hier, float thresh);
-float get_hierarchy_probability(float *x, tree *hier, int c);
+int hierarchy_top_prediction(float *predictions, tree *hier, float thresh, int stride);
+float get_hierarchy_probability(float *x, tree *hier, int c, int stride);
 
 #endif
diff --git a/image.darknet/src/upsample_layer.c b/image.darknet/src/upsample_layer.c
new file mode 100644
index 0000000..605f21f
--- /dev/null
+++ b/image.darknet/src/upsample_layer.c
@@ -0,0 +1,106 @@
+#include "upsample_layer.h"
+#include "cuda.h"
+#include "blas.h"
+
+#include <stdio.h>
+/* Builds an UPSAMPLE layer for batch inputs of w x h x c. stride > 0 multiplies width/height by stride; stride < 0 sets l.reverse and divides them by |stride|. Allocates zeroed delta/output buffers. */
+layer make_upsample_layer(int batch, int w, int h, int c, int stride)
+{
+    layer l = {0};
+    int reverse = (stride < 0);
+    if(reverse) stride = -stride;   /* l.stride always stores the magnitude */
+
+    l.type = UPSAMPLE;
+    l.batch = batch;
+    l.w = w;
+    l.h = h;
+    l.c = c;
+    l.reverse = reverse;
+    l.stride = stride;
+    l.out_w = reverse ? w/stride : w*stride;
+    l.out_h = reverse ? h/stride : h*stride;
+    l.out_c = c;
+    l.inputs = l.w*l.h*l.c;
+    l.outputs = l.out_w*l.out_h*l.out_c;
+
+    /* host buffers: one float per output element per batch item */
+    l.delta =  calloc(l.outputs*batch, sizeof(float));
+    l.output = calloc(l.outputs*batch, sizeof(float));
+
+    l.forward = forward_upsample_layer;
+    l.backward = backward_upsample_layer;
+    #ifdef GPU
+    l.forward_gpu = forward_upsample_layer_gpu;
+    l.backward_gpu = backward_upsample_layer_gpu;
+
+    l.delta_gpu =  cuda_make_array(l.delta, l.outputs*batch);
+    l.output_gpu = cuda_make_array(l.output, l.outputs*batch);
+    #endif
+    if(l.reverse) fprintf(stderr, "downsample         %2dx  %4d x%4d x%4d   ->  %4d x%4d x%4d\n", stride, w, h, c, l.out_w, l.out_h, l.out_c);
+    else fprintf(stderr, "upsample           %2dx  %4d x%4d x%4d   ->  %4d x%4d x%4d\n", stride, w, h, c, l.out_w, l.out_h, l.out_c);
+    return l;
+}
+/* Resizes the layer for a new w x h input: recomputes output dimensions and element counts, then reallocs the host delta/output buffers (and rebuilds the GPU buffers when compiled with GPU). */
+void resize_upsample_layer(layer *l, int w, int h)
+{
+    l->w = w;
+    l->h = h;
+    if(l->reverse){
+        l->out_w = w/l->stride;
+        l->out_h = h/l->stride;
+    }else{
+        l->out_w = w*l->stride;
+        l->out_h = h*l->stride;
+    }
+    l->inputs = l->h*l->w*l->c;
+    l->outputs = l->out_w*l->out_h*l->out_c;
+    /* c, out_c and batch are not touched here; only the spatial size changes */
+    l->delta =  realloc(l->delta, l->outputs*l->batch*sizeof(float));
+    l->output = realloc(l->output, l->outputs*l->batch*sizeof(float));
+#ifdef GPU
+    cuda_free(l->output_gpu);
+    cuda_free(l->delta_gpu);
+    l->output_gpu  = cuda_make_array(l->output, l->outputs*l->batch);
+    l->delta_gpu   = cuda_make_array(l->delta,  l->outputs*l->batch);
+#endif
+}
+/* CPU forward pass: zeroes l.output, then calls upsample_cpu between net.input and l.output; when l.reverse is set the buffer roles and the 1/0 flag are swapped. NOTE(review): upsample_cpu is defined elsewhere -- confirm the flag's meaning there. */
+void forward_upsample_layer(const layer l, network net)
+{
+    fill_cpu(l.outputs*l.batch, 0, l.output, 1);
+    if(!l.reverse){
+        upsample_cpu(net.input, l.w, l.h, l.c, l.batch, l.stride, 1, l.scale, l.output);
+        return;
+    }
+    upsample_cpu(l.output, l.out_w, l.out_h, l.c, l.batch, l.stride, 0, l.scale, net.input);
+}
+/* CPU backward pass: calls upsample_cpu between l.delta and net.delta with the 1/0 flag opposite to the forward pass; l.reverse swaps the buffer roles. */
+void backward_upsample_layer(const layer l, network net)
+{
+    if(!l.reverse){
+        upsample_cpu(net.delta, l.w, l.h, l.c, l.batch, l.stride, 0, l.scale, l.delta);
+        return;
+    }
+    upsample_cpu(l.delta, l.out_w, l.out_h, l.c, l.batch, l.stride, 1, l.scale, net.delta);
+}
+
+#ifdef GPU
+/* GPU forward pass: zeroes l.output_gpu with fill_gpu, then calls upsample_gpu between net.input_gpu and l.output_gpu (roles and flag swapped when l.reverse is set). */
+void forward_upsample_layer_gpu(const layer l, network net)
+{
+    fill_gpu(l.outputs*l.batch, 0, l.output_gpu, 1);
+    if(l.reverse) upsample_gpu(l.output_gpu, l.out_w, l.out_h, l.c, l.batch, l.stride, 0, l.scale, net.input_gpu);
+    else upsample_gpu(net.input_gpu, l.w, l.h, l.c, l.batch, l.stride, 1, l.scale, l.output_gpu);
+}
+
+/* GPU backward pass: calls upsample_gpu between l.delta_gpu and net.delta_gpu with the flag opposite to the forward pass. */
+void backward_upsample_layer_gpu(const layer l, network net)
+{
+    if(!l.reverse){
+        upsample_gpu(net.delta_gpu, l.w, l.h, l.c, l.batch, l.stride, 0, l.scale, l.delta_gpu);
+        return;
+    }
+    /* reverse layer: the two delta buffers trade places */
+    upsample_gpu(l.delta_gpu, l.out_w, l.out_h, l.c, l.batch, l.stride, 1, l.scale, net.delta_gpu);
+}
+#endif
diff --git a/image.darknet/src/upsample_layer.h b/image.darknet/src/upsample_layer.h
new file mode 100644
index 0000000..86790d1
--- /dev/null
+++ b/image.darknet/src/upsample_layer.h
@@ -0,0 +1,15 @@
+#ifndef UPSAMPLE_LAYER_H
+#define UPSAMPLE_LAYER_H
+#include "darknet.h"
+
+layer make_upsample_layer(int batch, int w, int h, int c, int stride);
+void forward_upsample_layer(const layer l, network net);
+void backward_upsample_layer(const layer l, network net);
+void resize_upsample_layer(layer *l, int w, int h);
+
+#ifdef GPU
+void forward_upsample_layer_gpu(const layer l, network net);
+void backward_upsample_layer_gpu(const layer l, network net);
+#endif
+
+#endif
diff --git a/image.darknet/src/utils.c b/image.darknet/src/utils.c
index b5181d7..626b467 100644
--- a/image.darknet/src/utils.c
+++ b/image.darknet/src/utils.c
@@ -6,9 +6,56 @@
 #include <unistd.h>
 #include <float.h>
 #include <limits.h>
+#include <time.h>
+#include <sys/time.h>
 
 #include "utils.h"
 
+
+/*
+// old timing. is it better? who knows!!
+double get_wall_time()
+{
+    struct timeval time;
+    if (gettimeofday(&time,NULL)){
+        return 0;
+    }
+    return (double)time.tv_sec + (double)time.tv_usec * .000001;
+}
+*/
+/* Returns the current time from gettimeofday() as seconds in a double (microsecond resolution); returns 0 if gettimeofday() fails. */
+double what_time_is_it_now()
+{
+    struct timeval now;
+    if (gettimeofday(&now, NULL) != 0) return 0;
+
+    /* whole seconds plus the microsecond part scaled to seconds */
+    return (double)now.tv_sec + (double)now.tv_usec * .000001;
+}
+/* Parses a comma-separated integer list (e.g. "0,1,3") into a calloc'd int array and stores its length in *ngpus; a NULL list yields the one-element array {d}. The caller frees the result. */
+int *read_intlist(char *gpu_list, int *ngpus, int d)
+{
+    int *gpus = 0;
+    if(gpu_list){
+        int len = strlen(gpu_list);
+        int i;
+        *ngpus = 1;   /* N commas separate N+1 entries */
+        for(i = 0; i < len; ++i){
+            if (gpu_list[i] == ',') ++*ngpus;
+        }
+        gpus = calloc(*ngpus, sizeof(int));
+        for(i = 0; i < *ngpus; ++i){
+            gpus[i] = atoi(gpu_list);
+            if(i+1 < *ngpus) gpu_list = strchr(gpu_list, ',')+1;   /* no comma follows the last entry; the unguarded strchr()+1 computed NULL+1 */
+        }
+    } else {
+        gpus = calloc(1, sizeof(int));   /* element type is int (was sizeof(float)) */
+        *gpus = d;
+        *ngpus = 1;
+    }
+    return gpus;
+}
+
 int *read_map(char *filename)
 {
     int n = 0;
@@ -47,6 +94,22 @@ void shuffle(void *arr, size_t n, size_t size)
     }
 }
 
+int *random_index_order(int min, int max)
+{
+    /* Returns a calloc'd, caller-freed array of max-min ints holding min..max-1 in random order.
+     * Slots are addressed relative to min so a nonzero min no longer writes past the buffer. */
+    int i, n = max - min;
+    int *inds = calloc(n, sizeof(int));
+    for(i = 0; i < n; ++i) inds[i] = min + i;
+    for(i = 0; i < n-1; ++i){
+        int swap = inds[i];
+        int index = i + rand()%(n-i);   /* pick from the not-yet-fixed tail [i, n) */
+        inds[i] = inds[index];
+        inds[index] = swap;
+    }
+    return inds;
+}
+
 void del_arg(int argc, char **argv, int index)
 {
     int i;
@@ -194,6 +257,21 @@ void error(const char *s)
     exit(-1);
 }
 
+unsigned char *read_file(char *filename)
+{
+    /* Read a whole file into a calloc'd buffer with one extra zero byte at the end; the
+     * caller frees it.  Returns 0 if the file cannot be opened, sized, or fully read. */
+    FILE *fp = fopen(filename, "rb");
+    long size = -1;
+    unsigned char *text = 0;
+    if(!fp) return 0;                                   /* previously NULL went straight to fseek */
+    if(fseek(fp, 0, SEEK_END) == 0) size = ftell(fp);   /* ftell reports failure as -1 */
+    if(size >= 0 && fseek(fp, 0, SEEK_SET) == 0) text = calloc((size_t)size+1, sizeof(char));
+    if(text && fread(text, 1, (size_t)size, fp) != (size_t)size){ free(text); text = 0; }
+    fclose(fp);
+    return text;
+}
+
 void malloc_error()
 {
     fprintf(stderr, "Malloc error\n");
@@ -524,6 +602,20 @@ int sample_array(float *a, int n)
     return n-1;
 }
 
+int max_int_index(int *a, int n)
+{
+    /* Position of the largest value in a[0..n-1], or -1 when n <= 0.  Only a strictly
+     * greater value replaces the current best, so the earliest position wins ties. */
+    int k, best = 0;
+    if(n <= 0) return -1;
+    for(k = 1; k < n; ++k){
+        if(a[best] < a[k]){
+            best = k;
+        }
+    }
+    return best;
+}
+
 int max_index(float *a, int n)
 {
     if(n <= 0) return -1;
@@ -538,6 +630,15 @@ int max_index(float *a, int n)
     return max_i;
 }
 
+int int_index(int *a, int val, int n)
+{
+    /* Linear scan: position of the first element of a[0..n-1] equal to val,
+     * or -1 when val is absent (including n <= 0). */
+    int pos = 0;
+    while(pos < n && a[pos] != val) ++pos;
+    return (pos < n) ? pos : -1;
+}
+
 int rand_int(int min, int max)
 {
     if (max < min){
@@ -585,13 +686,13 @@ float rand_normal()
 size_t rand_size_t()
 {
     return  ((size_t)(rand()&0xff) << 56) | 
-            ((size_t)(rand()&0xff) << 48) |
-            ((size_t)(rand()&0xff) << 40) |
-            ((size_t)(rand()&0xff) << 32) |
-            ((size_t)(rand()&0xff) << 24) |
-            ((size_t)(rand()&0xff) << 16) |
-            ((size_t)(rand()&0xff) << 8) |
-            ((size_t)(rand()&0xff) << 0);
+        ((size_t)(rand()&0xff) << 48) |  /* each term places the low 8 bits of one rand() call */
+        ((size_t)(rand()&0xff) << 40) |  /* into its own byte of the result */
+        ((size_t)(rand()&0xff) << 32) |  /* NOTE(review): shifts of 32..56 assume a 64-bit size_t - confirm for 32-bit targets */
+        ((size_t)(rand()&0xff) << 24) |
+        ((size_t)(rand()&0xff) << 16) |
+        ((size_t)(rand()&0xff) << 8) |
+        ((size_t)(rand()&0xff) << 0);
 }
 
 float rand_uniform(float min, float max)
diff --git a/image.darknet/src/utils.h b/image.darknet/src/utils.h
index bbc6765..ef24da7 100644
--- a/image.darknet/src/utils.h
+++ b/image.darknet/src/utils.h
@@ -2,16 +2,22 @@
 #define UTILS_H
 #include <stdio.h>
 #include <time.h>
+#include "darknet.h"
 #include "list.h"
 
-#define SECRET_NUM -1234
-#define TWO_PI 6.2831853071795864769252866
+#define TIME(a) /* run `a`, then print its source text and the elapsed wall-clock seconds */ \
+    do { \
+    double start = what_time_is_it_now(); /* declared in the same scope `a` expands into */ \
+    a; \
+    printf("%s took: %f seconds\n", #a, what_time_is_it_now() - start); \
+    } while (0)
 
-int *read_map(char *filename);
+#define TWO_PI 6.2831853071795864769252866f
+
+double what_time_is_it_now();
 void shuffle(void *arr, size_t n, size_t size);
 void sorta_shuffle(void *arr, size_t n, size_t size, size_t sections);
 void free_ptrs(void **ptrs, int n);
-char *basecfg(char *cfgfile);
 int alphanum_to_int(char c);
 char int_to_alphanum(int i);
 int read_int(int fd);
@@ -21,44 +27,27 @@ void write_all(int fd, char *buffer, size_t bytes);
 int read_all_fail(int fd, char *buffer, size_t bytes);
 int write_all_fail(int fd, char *buffer, size_t bytes);
 void find_replace(char *str, char *orig, char *rep, char *output);
-void error(const char *s);
 void malloc_error();
 void file_error(char *s);
 void strip(char *s);
 void strip_char(char *s, char bad);
-void top_k(float *a, int n, int k, int *index);
 list *split_str(char *s, char delim);
 char *fgetl(FILE *fp);
 list *parse_csv_line(char *line);
 char *copy_string(char *s);
 int count_fields(char *line);
 float *parse_fields(char *line, int n);
-void normalize_array(float *a, int n);
-void scale_array(float *a, int n, float s);
 void translate_array(float *a, int n, float s);
-int max_index(float *a, int n);
 float constrain(float min, float max, float a);
 int constrain_int(int a, int min, int max);
-float mse_array(float *a, int n);
-float rand_normal();
-size_t rand_size_t();
-float rand_uniform(float min, float max);
 float rand_scale(float s);
 int rand_int(int min, int max);
-float sum_array(float *a, int n);
-float mean_array(float *a, int n);
 void mean_arrays(float **a, int n, int els, float *avg);
-float variance_array(float *a, int n);
-float mag_array(float *a, int n);
 float dist_array(float *a, float *b, int n, int sub);
 float **one_hot_encode(float *a, int n, int k);
 float sec(clock_t clocks);
-int find_int_arg(int argc, char **argv, char *arg, int def);
-float find_float_arg(int argc, char **argv, char *arg, float def);
-int find_arg(int argc, char* argv[], char *arg);
-char *find_char_arg(int argc, char **argv, char *arg, char *def);
-int sample_array(float *a, int n);
 void print_statistics(float *a, int n);
+int int_index(int *a, int val, int n);
 
 #endif
 
diff --git a/image.darknet/src/writing.c b/image.darknet/src/writing.c
deleted file mode 100644
index 0a76d48..0000000
--- a/image.darknet/src/writing.c
+++ /dev/null
@@ -1,150 +0,0 @@
-#include "network.h"
-#include "utils.h"
-#include "parser.h"
-
-#ifdef OPENCV
-#include "opencv2/highgui/highgui_c.h"
-#endif
-
-void train_writing(char *cfgfile, char *weightfile)
-{
-    char *backup_directory = "/home/pjreddie/backup/";
-    srand(time(0));
-    float avg_loss = -1;
-    char *base = basecfg(cfgfile);
-    printf("%s\n", base);
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
-    int imgs = net.batch*net.subdivisions;
-    list *plist = get_paths("figures.list");
-    char **paths = (char **)list_to_array(plist);
-    clock_t time;
-    int N = plist->size;
-    printf("N: %d\n", N);
-    image out = get_network_image(net);
-
-    data train, buffer;
-
-    load_args args = {0};
-    args.w = net.w;
-    args.h = net.h;
-    args.out_w = out.w;
-    args.out_h = out.h;
-    args.paths = paths;
-    args.n = imgs;
-    args.m = N;
-    args.d = &buffer;
-    args.type = WRITING_DATA;
-
-    pthread_t load_thread = load_data_in_thread(args);
-    int epoch = (*net.seen)/N;
-    while(get_current_batch(net) < net.max_batches || net.max_batches == 0){
-        time=clock();
-        pthread_join(load_thread, 0);
-        train = buffer;
-        load_thread = load_data_in_thread(args);
-        printf("Loaded %lf seconds\n",sec(clock()-time));
-
-        time=clock();
-        float loss = train_network(net, train);
-
-        /*
-           image pred = float_to_image(64, 64, 1, out);
-           print_image(pred);
-         */
-
-        /*
-           image im = float_to_image(256, 256, 3, train.X.vals[0]);
-           image lab = float_to_image(64, 64, 1, train.y.vals[0]);
-           image pred = float_to_image(64, 64, 1, out);
-           show_image(im, "image");
-           show_image(lab, "label");
-           print_image(lab);
-           show_image(pred, "pred");
-           cvWaitKey(0);
-         */
-
-        if(avg_loss == -1) avg_loss = loss;
-        avg_loss = avg_loss*.9 + loss*.1;
-        printf("%d, %.3f: %f, %f avg, %f rate, %lf seconds, %d images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen);
-        free_data(train);
-        if(get_current_batch(net)%100 == 0){
-            char buff[256];
-            sprintf(buff, "%s/%s_batch_%d.weights", backup_directory, base, get_current_batch(net));
-            save_weights(net, buff);
-        }
-        if(*net.seen/N > epoch){
-            epoch = *net.seen/N;
-            char buff[256];
-            sprintf(buff, "%s/%s_%d.weights",backup_directory,base, epoch);
-            save_weights(net, buff);
-        }
-    }
-}
-
-void test_writing(char *cfgfile, char *weightfile, char *filename)
-{
-    network net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    set_batch_network(&net, 1);
-    srand(2222222);
-    clock_t time;
-    char buff[256];
-    char *input = buff;
-    while(1){
-        if(filename){
-            strncpy(input, filename, 256);
-        }else{
-            printf("Enter Image Path: ");
-            fflush(stdout);
-            input = fgets(input, 256, stdin);
-            if(!input) return;
-            strtok(input, "\n");
-        }
-
-        image im = load_image_color(input, 0, 0);
-        resize_network(&net, im.w, im.h);
-        printf("%d %d %d\n", im.h, im.w, im.c);
-        float *X = im.data;
-        time=clock();
-        network_predict(net, X);
-        printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time));
-        image pred = get_network_image(net);
-
-        image upsampled = resize_image(pred, im.w, im.h);
-        image thresh = threshold_image(upsampled, .5);
-        pred = thresh;
-
-        show_image(pred, "prediction");
-        show_image(im, "orig");
-#ifdef OPENCV
-        cvWaitKey(0);
-        cvDestroyAllWindows();
-#endif
-
-        free_image(upsampled);
-        free_image(thresh);
-        free_image(im);
-        if (filename) break;
-    }
-}
-
-void run_writing(int argc, char **argv)
-{
-    if(argc < 4){
-        fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]);
-        return;
-    }
-
-    char *cfg = argv[3];
-    char *weights = (argc > 4) ? argv[4] : 0;
-    char *filename = (argc > 5) ? argv[5] : 0;
-    if(0==strcmp(argv[2], "train")) train_writing(cfg, weights);
-    else if(0==strcmp(argv[2], "test")) test_writing(cfg, weights, filename);
-}
-
diff --git a/image.darknet/src/yolo_layer.c b/image.darknet/src/yolo_layer.c
new file mode 100644
index 0000000..c338036
--- /dev/null
+++ b/image.darknet/src/yolo_layer.c
@@ -0,0 +1,374 @@
+#include "yolo_layer.h"
+#include "activations.h"
+#include "blas.h"
+#include "box.h"
+#include "cuda.h"
+#include "utils.h"
+
+#include <stdio.h>
+#include <assert.h>
+#include <string.h>
+#include <stdlib.h>
+
+layer make_yolo_layer(int batch, int w, int h, int n, int total, int *mask, int classes)
+{
+    int k;
+    layer l = {0};
+    /* Geometry: n boxes per grid cell, each carrying 4 coords + 1 objectness + classes scores. */
+    l.type = YOLO;
+    l.batch = batch;
+    l.w = w;
+    l.h = h;
+    l.n = n;
+    l.total = total;
+    l.classes = classes;
+    l.c = n*(classes + 4 + 1);
+    l.out_w = l.w;
+    l.out_h = l.h;
+    l.out_c = l.c;
+    l.outputs = h*w*n*(classes + 4 + 1);
+    l.inputs = l.outputs;
+    l.truths = 90*(4 + 1);
+    /* Buffers.  A caller-supplied mask is adopted as-is; otherwise the mask is 0..n-1. */
+    l.cost = calloc(1, sizeof(float));
+    l.biases = calloc(total*2, sizeof(float));
+    l.bias_updates = calloc(n*2, sizeof(float));
+    l.delta = calloc(batch*l.outputs, sizeof(float));
+    l.output = calloc(batch*l.outputs, sizeof(float));
+    l.mask = mask;
+    if(!mask){
+        l.mask = calloc(n, sizeof(int));
+        for(k = 0; k < n; ++k){
+            l.mask[k] = k;
+        }
+    }
+    for(k = 0; k < total*2; ++k){
+        l.biases[k] = .5;
+    }
+    /* Entry points: CPU always, plus the device ones and device buffers when built with GPU. */
+    l.forward = forward_yolo_layer;
+    l.backward = backward_yolo_layer;
+#ifdef GPU
+    l.forward_gpu = forward_yolo_layer_gpu;
+    l.backward_gpu = backward_yolo_layer_gpu;
+    l.output_gpu = cuda_make_array(l.output, batch*l.outputs);
+    l.delta_gpu = cuda_make_array(l.delta, batch*l.outputs);
+#endif
+    /* Side effects: a banner on stderr and a reseed of rand() with 0. */
+    fprintf(stderr, "yolo\n");
+    srand(0);
+    return l;
+}
+
+void resize_yolo_layer(layer *l, int w, int h)
+{
+    /* Adopt a new w x h grid and resize the output/delta buffers to the new element count. */
+    size_t bytes;
+    l->w = w;
+    l->h = h;
+    l->outputs = h*w*l->n*(l->classes + 4 + 1);
+    l->inputs = l->outputs;
+    bytes = l->batch*l->outputs*sizeof(float);
+    l->output = realloc(l->output, bytes);
+    l->delta = realloc(l->delta, bytes);
+#ifdef GPU
+    /* Device buffers are released and recreated through cuda_make_array at the new size. */
+    cuda_free(l->delta_gpu);
+    cuda_free(l->output_gpu);
+    l->delta_gpu =     cuda_make_array(l->delta, l->batch*l->outputs);
+    l->output_gpu =    cuda_make_array(l->output, l->batch*l->outputs);
+#endif
+}
+
+box get_yolo_box(float *x, float *biases, int n, int index, int i, int j, int lw, int lh, int w, int h, int stride)
+{
+    box out; /* centre = (cell + stored offset) / grid size; size = exp(stored value) * bias pair n / (w, h) */
+    out.x = (x[index] + i) / lw;
+    out.y = (x[index + stride] + j) / lh;
+    out.w = biases[2*n]   * exp(x[index + 2*stride]) / w;
+    out.h = biases[2*n+1] * exp(x[index + 3*stride]) / h;
+    return out;
+}
+
+float delta_yolo_box(box truth, float *x, float *biases, int n, int index, int i, int j, int lw, int lh, int w, int h, float *delta, float scale, int stride)
+{
+    /* Writes scale*(target - current) into the four box slots of delta for ground-truth box
+     * `truth`, and returns the IOU between the box decoded from x and truth. */
+    int k;
+    float target[4];
+    box pred = get_yolo_box(x, biases, n, index, i, j, lw, lh, w, h, stride);
+    target[0] = (truth.x*lw - i);
+    target[1] = (truth.y*lh - j);
+    target[2] = log(truth.w*w / biases[2*n]);
+    target[3] = log(truth.h*h / biases[2*n + 1]);
+    for(k = 0; k < 4; ++k){
+        delta[index + k*stride] = scale * (target[k] - x[index + k*stride]);
+    }
+    return box_iou(pred, truth);
+}
+
+
+void delta_yolo_class(float *output, float *delta, int index, int class, int classes, int stride, float *avg_cat)
+{
+    int k, hit = index + stride*class;   /* slot of the true class; avg_cat (optional) sums its output */
+    if (delta[index] == 0){              /* first class delta is zero: write (one-hot - output) for every class */
+        for(k = 0; k < classes; ++k){
+            delta[index + stride*k] = ((k == class) ? 1 : 0) - output[index + stride*k];
+            if(avg_cat && k == class) *avg_cat += output[hit];
+        }
+        return;
+    }
+    delta[hit] = 1 - output[hit];        /* first class delta already nonzero: only the true class is updated */
+    if(avg_cat) *avg_cat += output[hit];
+}
+
+static int entry_index(layer l, int batch, int location, int entry)
+{
+    int plane = l.w*l.h;   /* location = box*plane + cell; each box owns (4+classes+1) planes of w*h cells */
+    int n = location / plane, loc = location % plane;
+    return batch*l.outputs + n*plane*(4+l.classes+1) + entry*plane + loc;
+}
+
+void forward_yolo_layer(const layer l, network net)
+{
+    int i,j,b,t,n;
+    memcpy(l.output, net.input, l.outputs*l.batch*sizeof(float)); /* start from a copy of the layer input */
+    /* Non-GPU builds only: logistic on entries 0-1 (x,y offsets) and 4.. (objectness + classes) of every box. */
+#ifndef GPU
+    for (b = 0; b < l.batch; ++b){
+        for(n = 0; n < l.n; ++n){
+            int index = entry_index(l, b, n*l.w*l.h, 0);
+            activate_array(l.output + index, 2*l.w*l.h, LOGISTIC);
+            index = entry_index(l, b, n*l.w*l.h, 4);
+            activate_array(l.output + index, (1+l.classes)*l.w*l.h, LOGISTIC);
+        }
+    }
+#endif
+    /* Deltas are always cleared; the loss bookkeeping below runs only when net.train is set. */
+    memset(l.delta, 0, l.outputs * l.batch * sizeof(float));
+    if(!net.train) return;
+    float avg_iou = 0;
+    float recall = 0;
+    float recall75 = 0;
+    float avg_cat = 0;
+    float avg_obj = 0;
+    float avg_anyobj = 0;
+    int count = 0;
+    int class_count = 0;
+    *(l.cost) = 0;
+    for (b = 0; b < l.batch; ++b) {
+        for (j = 0; j < l.h; ++j) { /* pass 1: visit every predicted box of every cell */
+            for (i = 0; i < l.w; ++i) {
+                for (n = 0; n < l.n; ++n) {
+                    int box_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 0);
+                    box pred = get_yolo_box(l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, net.w, net.h, l.w*l.h);
+                    float best_iou = 0;
+                    int best_t = 0;
+                    for(t = 0; t < l.max_boxes; ++t){ /* find the truth box that overlaps this prediction most */
+                        box truth = float_to_box(net.truth + t*(4 + 1) + b*l.truths, 1);
+                        if(!truth.x) break; /* a truth with x == 0 ends the scan */
+                        float iou = box_iou(pred, truth);
+                        if (iou > best_iou) {
+                            best_iou = iou;
+                            best_t = t;
+                        }
+                    }
+                    int obj_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4);
+                    avg_anyobj += l.output[obj_index];
+                    l.delta[obj_index] = 0 - l.output[obj_index]; /* default target for objectness is 0 */
+                    if (best_iou > l.ignore_thresh) {
+                        l.delta[obj_index] = 0; /* overlap above ignore_thresh: no objectness delta */
+                    }
+                    if (best_iou > l.truth_thresh) { /* overlap above truth_thresh: target objectness 1, plus class and box deltas */
+                        l.delta[obj_index] = 1 - l.output[obj_index];
+
+                        int class = net.truth[best_t*(4 + 1) + b*l.truths + 4];
+                        if (l.map) class = l.map[class];
+                        int class_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4 + 1);
+                        delta_yolo_class(l.output, l.delta, class_index, class, l.classes, l.w*l.h, 0);
+                        box truth = float_to_box(net.truth + best_t*(4 + 1) + b*l.truths, 1);
+                        delta_yolo_box(truth, l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, net.w, net.h, l.delta, (2-truth.w*truth.h), l.w*l.h);
+                    }
+                }
+            }
+        }
+        for(t = 0; t < l.max_boxes; ++t){ /* pass 2: visit every truth box of this batch item */
+            box truth = float_to_box(net.truth + t*(4 + 1) + b*l.truths, 1);
+
+            if(!truth.x) break;
+            float best_iou = 0;
+            int best_n = 0;
+            i = (truth.x * l.w); /* NOTE(review): truth.x == 1 would give i == l.w, one past the last column - confirm truths stay below 1 */
+            j = (truth.y * l.h);
+            box truth_shift = truth;
+            truth_shift.x = truth_shift.y = 0;
+            for(n = 0; n < l.total; ++n){ /* compare shapes only: both boxes sit at the origin */
+                box pred = {0};
+                pred.w = l.biases[2*n]/net.w;
+                pred.h = l.biases[2*n+1]/net.h;
+                float iou = box_iou(pred, truth_shift);
+                if (iou > best_iou){
+                    best_iou = iou;
+                    best_n = n;
+                }
+            }
+            /* The truth is handled here only if its best bias pair appears in this layer's mask. */
+            int mask_n = int_index(l.mask, best_n, l.n);
+            if(mask_n >= 0){
+                int box_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 0);
+                float iou = delta_yolo_box(truth, l.output, l.biases, best_n, box_index, i, j, l.w, l.h, net.w, net.h, l.delta, (2-truth.w*truth.h), l.w*l.h);
+
+                int obj_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 4);
+                avg_obj += l.output[obj_index];
+                l.delta[obj_index] = 1 - l.output[obj_index];
+
+                int class = net.truth[t*(4 + 1) + b*l.truths + 4];
+                if (l.map) class = l.map[class];
+                int class_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 4 + 1);
+                delta_yolo_class(l.output, l.delta, class_index, class, l.classes, l.w*l.h, &avg_cat);
+
+                ++count;
+                ++class_count;
+                if(iou > .5) recall += 1;
+                if(iou > .75) recall75 += 1;
+                avg_iou += iou;
+            }
+        }
+    }
+    *(l.cost) = pow(mag_array(l.delta, l.outputs * l.batch), 2); /* cost is the square of mag_array over all deltas */
+    printf("Region %d Avg IOU: %f, Class: %f, Obj: %f, No Obj: %f, .5R: %f, .75R: %f,  count: %d\n", net.index, avg_iou/count, avg_cat/class_count, avg_obj/count, avg_anyobj/(l.w*l.h*l.n*l.batch), recall/count, recall75/count, count); /* NOTE(review): count and class_count can be 0 here, making the averages 0/0 - confirm that is acceptable */
+}
+
+void backward_yolo_layer(const layer l, network net)
+{
+   axpy_cpu(l.batch*l.inputs, 1, l.delta, 1, net.delta, 1); /* presumably net.delta += 1 * l.delta over batch*inputs floats - confirm against blas.h */
+}
+
+void correct_yolo_boxes(detection *dets, int n, int w, int h, int netw, int neth, int relative)
+{
+    /* Rewrites the bbox of dets[0..n-1] in place.  new_w x new_h is the w x h image scaled,
+     * aspect ratio kept, until one side equals the matching side of netw x neth.  Each box
+     * has the margins (netw-new_w)/2 and (neth-new_h)/2 removed and is rescaled by
+     * netw/new_w and neth/new_h.  When relative is 0 the result is further multiplied
+     * by w and h. */
+    int k;
+    int width_bound = ((float)netw/w) < ((float)neth/h);
+    int new_w = width_bound ? netw : (w * neth)/h;
+    int new_h = width_bound ? (h * netw)/w : neth;
+
+    for (k = 0; k < n; ++k){
+        box *b = &dets[k].bbox;
+        /* remove the margin first, then undo the scale */
+        b->x =  (b->x - (netw - new_w)/2./netw) / ((float)new_w/netw);
+        b->y =  (b->y - (neth - new_h)/2./neth) / ((float)new_h/neth);
+        b->w *= (float)netw/new_w;
+        b->h *= (float)neth/new_h;
+        /* relative != 0 stops here; otherwise scale by the image size */
+        if(relative) continue;
+        b->x *= w;
+        b->w *= w;
+        b->y *= h;
+        b->h *= h;
+    }
+}
+
+int yolo_num_detections(layer l, float thresh)
+{
+    /* Number of boxes in batch item 0 whose objectness (entry 4) is strictly above
+     * thresh.  Locations 0 .. l.n*l.w*l.h-1 enumerate every (box, cell) pair once,
+     * so one flat loop visits the same boxes as nested cell/box loops. */
+    int loc;
+    int hits = 0;
+    int total = l.n*l.w*l.h;
+    for (loc = 0; loc < total; ++loc){
+        float objectness = l.output[entry_index(l, 0, loc, 4)];
+        if(objectness > thresh) ++hits;
+    }
+    return hits;
+}
+
+void avg_flipped_yolo(layer l)
+{ /* Mirrors the second batch item left-to-right, then averages it into the first batch item. */
+    int i,j,n,z;
+    float *flip = l.output + l.outputs; /* start of the second batch item's outputs */
+    for (j = 0; j < l.h; ++j) {
+        for (i = 0; i < l.w/2; ++i) { /* swap column i with column w-1-i */
+            for (n = 0; n < l.n; ++n) {
+                for(z = 0; z < l.classes + 4 + 1; ++z){
+                    int i1 = z*l.w*l.h*l.n + n*l.w*l.h + j*l.w + i;
+                    int i2 = z*l.w*l.h*l.n + n*l.w*l.h + j*l.w + (l.w - i - 1);
+                    float swap = flip[i1];
+                    flip[i1] = flip[i2];
+                    flip[i2] = swap;
+                    if(z == 0){ /* NOTE(review): this indexing is z-major (z*w*h*n + n*w*h) while entry_index is box-major (n*w*h*(4+classes+1) + entry*w*h), so z == 0 may not select the x entries - confirm */
+                        flip[i1] = -flip[i1];
+                        flip[i2] = -flip[i2];
+                    }
+                }
+            }
+        }
+    }
+    for(i = 0; i < l.outputs; ++i){
+        l.output[i] = (l.output[i] + flip[i])/2.; /* first batch item becomes the mean of both */
+    }
+}
+
+int get_yolo_detections(layer l, int w, int h, int netw, int neth, float thresh, int *map, int relative, detection *dets)
+{
+    /* Fills dets with every box of batch item 0 whose objectness exceeds thresh, visiting
+     * cells in order and boxes within each cell.  Class probs are objectness*class score,
+     * zeroed unless above thresh.  Returns the number written; `map` is not used here. */
+    int cell, anchor, cls, found = 0;
+    int cells = l.w*l.h;
+    float *out = l.output;
+    if (l.batch == 2) avg_flipped_yolo(l);
+    for (cell = 0; cell < cells; ++cell){
+        for(anchor = 0; anchor < l.n; ++anchor){
+            int location = anchor*cells + cell;
+            float objectness = out[entry_index(l, 0, location, 4)];
+            if(objectness <= thresh) continue;
+            detection *d = dets + found;
+            d->bbox = get_yolo_box(out, l.biases, l.mask[anchor], entry_index(l, 0, location, 0), cell % l.w, cell / l.w, l.w, l.h, netw, neth, cells);
+            d->objectness = objectness;
+            d->classes = l.classes;
+            for(cls = 0; cls < l.classes; ++cls){
+                float prob = objectness*out[entry_index(l, 0, location, 4 + 1 + cls)];
+                d->prob[cls] = (prob > thresh) ? prob : 0;
+            }
+            ++found;
+        }
+    }
+    correct_yolo_boxes(dets, found, w, h, netw, neth, relative);
+    return found;
+}
+
+#ifdef GPU
+
+void forward_yolo_layer_gpu(const layer l, network net)
+{
+    copy_gpu(l.batch*l.inputs, net.input_gpu, 1, l.output_gpu, 1); /* net.input_gpu -> l.output_gpu */
+    int b, n;
+    for (b = 0; b < l.batch; ++b){
+        for(n = 0; n < l.n; ++n){ /* same two logistic ranges as the CPU path: entries 0-1 and 4.. */
+            int index = entry_index(l, b, n*l.w*l.h, 0);
+            activate_array_gpu(l.output_gpu + index, 2*l.w*l.h, LOGISTIC);
+            index = entry_index(l, b, n*l.w*l.h, 4);
+            activate_array_gpu(l.output_gpu + index, (1+l.classes)*l.w*l.h, LOGISTIC);
+        }
+    }
+    if(!net.train || l.onlyforward){ /* no loss needed: fetch the activated output into l.output and stop */
+        cuda_pull_array(l.output_gpu, l.output, l.batch*l.outputs);
+        return;
+    }
+    /* Training: the activated output becomes net.input for the CPU routine; assumes pull = device->host, push = host->device - confirm in cuda.h */
+    cuda_pull_array(l.output_gpu, net.input, l.batch*l.inputs);
+    forward_yolo_layer(l, net);
+    cuda_push_array(l.delta_gpu, l.delta, l.batch*l.outputs);
+}
+
+void backward_yolo_layer_gpu(const layer l, network net)
+{
+    axpy_gpu(l.batch*l.inputs, 1, l.delta_gpu, 1, net.delta_gpu, 1); /* presumably net.delta_gpu += 1 * l.delta_gpu over batch*inputs floats - confirm against blas.h */
+}
+#endif
+
diff --git a/image.darknet/src/yolo_layer.h b/image.darknet/src/yolo_layer.h
new file mode 100644
index 0000000..d2a0243
--- /dev/null
+++ b/image.darknet/src/yolo_layer.h
@@ -0,0 +1,19 @@
+#ifndef YOLO_LAYER_H
+#define YOLO_LAYER_H
+
+#include "darknet.h"
+#include "layer.h"
+#include "network.h"
+
+layer make_yolo_layer(int batch, int w, int h, int n, int total, int *mask, int classes);
+void forward_yolo_layer(const layer l, network net);
+void backward_yolo_layer(const layer l, network net);
+void resize_yolo_layer(layer *l, int w, int h);
+int yolo_num_detections(layer l, float thresh);
+
+#ifdef GPU
+void forward_yolo_layer_gpu(const layer l, network net);
+void backward_yolo_layer_gpu(const layer l, network net); /* parameter list matches the definition in yolo_layer.c */
+#endif
+
+#endif